In [1]:
# Mount Google Drive at /content/drive so files stored there are reachable from this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Binary Classification using a Perceptron and MLP

In [2]:
# Loading the required libraries
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc

from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt

In [3]:
# Silence every Python warning for the rest of the session to keep cell output short.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by the libraries used below — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

#### Loading the data

In [4]:
# Work from the course folder on the mounted Drive so relative file names resolve there
os.chdir("/content/drive/MyDrive/ANN/")
# Confirm the working directory with the standard library rather than the IPython-only `!pwd` shell escape
print(os.getcwd())

/content/drive/MyDrive/ANN


In [5]:
# Load the back-order dataset from the current working directory; header = 0 takes the column names from the first row
data = pd.read_csv("BackOrders.csv", header = 0)

#### Understand the Data

In [6]:
# Dimensions of the raw data as a (rows, columns) tuple
data.shape

(61589, 23)

In [7]:
# List the column names
data.columns

Index(['sku', 'national_inv', 'lead_time', 'in_transit_qty',
       'forecast_3_month', 'forecast_6_month', 'forecast_9_month',
       'sales_1_month', 'sales_3_month', 'sales_6_month', 'sales_9_month',
       'min_bank', 'potential_issue', 'pieces_past_due', 'perf_6_month_avg',
       'perf_12_month_avg', 'local_bo_qty', 'deck_risk', 'oe_constraint',
       'ppap_risk', 'stop_auto_buy', 'rev_stop', 'went_on_backorder'],
      dtype='object')

In [8]:
# Preview the first 4 rows of the data
data.head(4)

Unnamed: 0,sku,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,sales_9_month,min_bank,potential_issue,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop,went_on_backorder
0,1888279,117,,0,0,0,0,0,0,15,15,1,No,0,-99.0,-99.0,0,No,No,Yes,Yes,No,No
1,1870557,7,2.0,0,0,0,0,0,0,0,0,0,No,0,0.5,0.28,0,Yes,No,No,Yes,No,No
2,1475481,258,15.0,10,10,77,184,46,132,256,365,47,No,0,0.54,0.7,0,No,No,No,Yes,No,No
3,1758220,46,2.0,0,0,0,0,1,2,6,9,1,No,0,0.75,0.9,0,Yes,No,No,Yes,No,No


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns;
# the object-typed columns are left out of describe() by default
data.describe()

Unnamed: 0,sku,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,sales_9_month,min_bank,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty
count,61589.0,61589.0,58186.0,61589.0,61589.0,61589.0,61589.0,61589.0,61589.0,61589.0,61589.0,61589.0,61589.0,61589.0,61589.0,61589.0
mean,2037188.0,287.721882,7.559619,30.192843,169.2728,315.0413,453.576,44.742957,150.732631,283.5465,419.6427,43.087256,1.6054,-6.264182,-5.863664,1.205361
std,656417.8,4233.906931,6.498952,792.869253,5286.742,9774.362,14202.01,1373.805831,5224.959649,8872.27,12698.58,959.614135,42.309229,25.537906,24.844514,29.981155
min,1068628.0,-2999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,0.0
25%,1498574.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.62,0.64,0.0
50%,1898033.0,10.0,8.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,6.0,0.0,0.0,0.82,0.8,0.0
75%,2314826.0,57.0,8.0,0.0,12.0,25.0,36.0,6.0,17.0,34.0,51.0,3.0,0.0,0.96,0.95,0.0
max,3284895.0,673445.0,52.0,170976.0,1126656.0,2094336.0,3062016.0,295197.0,934593.0,1799099.0,2631590.0,192978.0,7392.0,1.0,1.0,2999.0


In [10]:
# Data type pandas inferred for each attribute when reading the file
data.dtypes

sku                    int64
national_inv           int64
lead_time            float64
in_transit_qty         int64
forecast_3_month       int64
forecast_6_month       int64
forecast_9_month       int64
sales_1_month          int64
sales_3_month          int64
sales_6_month          int64
sales_9_month          int64
min_bank               int64
potential_issue       object
pieces_past_due        int64
perf_6_month_avg     float64
perf_12_month_avg    float64
local_bo_qty           int64
deck_risk             object
oe_constraint         object
ppap_risk             object
stop_auto_buy         object
rev_stop              object
went_on_backorder     object
dtype: object

In [11]:
def understand_data(data):
    """Build a per-column overview of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the columns "Datatype" (dtype),
        "No of NAs" (count of missing values), "No of Levels" (number of
        distinct non-missing values) and "Levels" (string form of the
        distinct values).
    """
    overview = {
        "Datatype": data.dtypes,
        "No of NAs": data.isna().sum(),
        "No of Levels": data.nunique(),
        # str() keeps numpy's abbreviated repr, so long value lists stay short
        "Levels": data.apply(lambda column: str(column.unique())),
    }
    return pd.DataFrame(overview)

In [12]:
# Per-column overview of the raw data: dtype, NA count, number of distinct values, and the values themselves
understand_data(data)

Unnamed: 0,Datatype,No of NAs,No of Levels,Levels
sku,int64,0,61589,[1888279 1870557 1475481 ... 1909363 1845783 1...
national_inv,int64,0,2916,[ 117 7 258 ... 2701 6362 -84]
lead_time,float64,3403,28,[nan 2. 15. 12. 8. 9. 4. 10. 16. 52. 0. ...
in_transit_qty,int64,0,908,[ 0 10 562 11 812 1 ...
forecast_3_month,int64,0,1623,[ 0 10 4 ... 1206 1205 715]
forecast_6_month,int64,0,2195,[ 0 77 6 ... 1205 6830 1573]
forecast_9_month,int64,0,2664,[ 0 184 10 ... 976 12415 2431]
sales_1_month,int64,0,1092,[ 0 46 1 ... 2065 342 1796]
sales_3_month,int64,0,1928,[ 0 132 2 ... 792 6664 1715]
sales_6_month,int64,0,2679,[ 15 0 256 ... 11606 2252 3425]


**Observations**

sku is categorical but is interpreted as int64.  
potential_issue, deck_risk, oe_constraint, ppap_risk, stop_auto_buy, rev_stop, and went_on_backorder are also categorical but are interpreted as object.

**Convert all the attributes to appropriate type**

Using astype('category') to convert the sku, potential_issue, deck_risk, oe_constraint, ppap_risk, stop_auto_buy, rev_stop, and went_on_backorder attributes to categorical attributes.

In [13]:
# Identifier and Yes/No columns are stored as pandas categoricals rather than int64/object
columns_to_categorize = ['sku', 'potential_issue', 'deck_risk', 'oe_constraint',
                         'ppap_risk', 'stop_auto_buy', 'rev_stop', 'went_on_backorder']
for column_name in columns_to_categorize:
    data[column_name] = data[column_name].astype('category')

In [14]:
# Display the data type of each variable after the datatype conversion
data.dtypes

sku                  category
national_inv            int64
lead_time             float64
in_transit_qty          int64
forecast_3_month        int64
forecast_6_month        int64
forecast_9_month        int64
sales_1_month           int64
sales_3_month           int64
sales_6_month           int64
sales_9_month           int64
min_bank                int64
potential_issue      category
pieces_past_due         int64
perf_6_month_avg      float64
perf_12_month_avg     float64
local_bo_qty            int64
deck_risk            category
oe_constraint        category
ppap_risk            category
stop_auto_buy        category
rev_stop             category
went_on_backorder    category
dtype: object

In [15]:
# Analyze sku attribute: its distinct values and how many times each one occurs
np.unique(data.sku, return_counts = True)

(array([1068628, 1111587, 1111623, ..., 3284776, 3284852, 3284895]),
 array([1, 1, 1, ..., 1, 1, 1]))

In [16]:
# Compare the number of distinct sku values with the number of rows
print(np.unique(data.sku).size)
print(len(data))

61589
61589


Notice that it is some kind of an ID column with all distinct values. It will not be useful in model building

In [17]:
# Delete the sku attribute. Rebinding `data` (instead of inplace = True) together with
# errors = 'ignore' keeps this cell safe to re-run once the column is already gone.
data = data.drop(columns = 'sku', errors = 'ignore')

**Missing Values**

In [18]:
# Count of missing values in each column
data.isna().sum()

national_inv            0
lead_time            3403
in_transit_qty          0
forecast_3_month        0
forecast_6_month        0
forecast_9_month        0
sales_1_month           0
sales_3_month           0
sales_6_month           0
sales_9_month           0
min_bank                0
potential_issue         0
pieces_past_due         0
perf_6_month_avg        0
perf_12_month_avg       0
local_bo_qty            0
deck_risk               0
oe_constraint           0
ppap_risk               0
stop_auto_buy           0
rev_stop                0
went_on_backorder       0
dtype: int64

In [19]:
# Dimensions of the data before the rows with missing values are removed
print (data.shape)

(61589, 22)


In [20]:
# Missing values affect only about 5% of the rows, so for this initial analysis those rows are simply removed
data = data.dropna(axis = 'index', how = 'any')

In [21]:
# Verify that no missing values remain and check how many rows are left
print(data.isna().sum())
print(data.shape)

national_inv         0
lead_time            0
in_transit_qty       0
forecast_3_month     0
forecast_6_month     0
forecast_9_month     0
sales_1_month        0
sales_3_month        0
sales_6_month        0
sales_9_month        0
min_bank             0
potential_issue      0
pieces_past_due      0
perf_6_month_avg     0
perf_12_month_avg    0
local_bo_qty         0
deck_risk            0
oe_constraint        0
ppap_risk            0
stop_auto_buy        0
rev_stop             0
went_on_backorder    0
dtype: int64
(58186, 22)


**Converting Categorical to Numeric**

Some models require all independent attributes to be numeric, but this data set has some categorical attributes.

'pandas.get_dummies' converts categorical variables into dummy/indicator variables.

**Creating dummy variables**

If we have k levels in a category, we create k-1 dummy variables because the last one would be redundant. So we use the drop_first parameter of pd.get_dummies, which drops the first level of each category.

In [22]:
# Names of all columns currently stored with the 'category' dtype
categorical_Attributes = data.select_dtypes(include = ['category']).columns
categorical_Attributes

Index(['potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk',
       'stop_auto_buy', 'rev_stop', 'went_on_backorder'],
      dtype='object')

In [23]:
# One-hot encode every categorical column, dropping the first level of each so that
# k levels yield k-1 indicator columns.
# dtype is pinned to uint8: pandas >= 2.0 defaults to bool, which would leave the uint8
# check in the next cell empty and make a later `.values` call return an object array.
data = pd.get_dummies(data = data,
                      columns = categorical_Attributes,
                      prefix = categorical_Attributes,
                      prefix_sep = "_",
                      drop_first = True,
                      dtype = np.uint8)

In [24]:
# List the indicator columns produced by get_dummies. Both uint8 (default before pandas 2.0)
# and bool (default from pandas 2.0) are accepted so the listing is not empty on newer versions.
print(data.select_dtypes(include = ['uint8', 'bool']).columns)

Index(['potential_issue_Yes', 'deck_risk_Yes', 'oe_constraint_Yes',
       'ppap_risk_Yes', 'stop_auto_buy_Yes', 'rev_stop_Yes',
       'went_on_backorder_Yes'],
      dtype='object')


In [25]:
# Preview the first five rows after dummy encoding
data.head()

Unnamed: 0,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,sales_9_month,min_bank,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,potential_issue_Yes,deck_risk_Yes,oe_constraint_Yes,ppap_risk_Yes,stop_auto_buy_Yes,rev_stop_Yes,went_on_backorder_Yes
1,7,2.0,0,0,0,0,0,0,0,0,0,0,0.5,0.28,0,0,1,0,0,1,0,0
2,258,15.0,10,10,77,184,46,132,256,365,47,0,0.54,0.7,0,0,0,0,0,1,0,0
3,46,2.0,0,0,0,0,1,2,6,9,1,0,0.75,0.9,0,0,1,0,0,1,0,0
4,2,2.0,0,4,6,10,2,2,5,6,0,0,0.97,0.92,0,0,0,0,0,1,0,0
5,297,12.0,0,0,0,0,5,6,44,57,3,0,0.58,0.75,0,0,1,0,0,1,0,0


**Target attribute distribution**

In [26]:
# Class balance of the target. Series.value_counts replaces the top-level
# pd.value_counts, which is deprecated in recent pandas releases.
data['went_on_backorder_Yes'].value_counts()

0    47217
1    10969
Name: went_on_backorder_Yes, dtype: int64

**Split the data into train and test**

sklearn.model_selection.train_test_split                                                                               
Split arrays or matrices into random train and test subsets

In [27]:
# Separate the predictors from the target column, then hold out 30% of the rows for testing
target_col = 'went_on_backorder_Yes'
X = data.drop(columns = target_col).values
y = data[target_col].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 123)

In [28]:
# Distribution of the target in the train and test splits.
# The arrays are wrapped in a Series because the top-level pd.value_counts is deprecated.
print(pd.Series(y_train).value_counts())
print(pd.Series(y_test).value_counts())

0    33067
1     7663
dtype: int64
0    14150
1     3306
dtype: int64


**Perceptron Model Building**

In [29]:
# Single-layer perceptron: one sigmoid unit fed directly by the input features.
# The input width is read from the training matrix instead of being hard-coded to 21,
# so the model still builds if the number of feature columns changes.
perceptron_model = Sequential()

perceptron_model.add(Dense(1, input_dim = X_train.shape[1], activation = 'sigmoid', kernel_initializer = 'normal'))

Once the model is created, we configure it with a loss and metrics using model.compile(), train it with model.fit(), and use it to make predictions with model.predict().

Remember training a network means finding the best set of weights to map inputs to outputs in our dataset.

We must specify the loss function to use to evaluate a set of weights, the optimizer is used to search through different weights for the network and any optional metrics we would like to collect and report during training.

In [30]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy reported as a metric
perceptron_model.compile(
    loss = 'binary_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'],
)

We can train or fit our model on our loaded data by calling the fit() function on the model.

Training occurs over epochs and each epoch is split into batches.

Epoch: One pass through all of the rows in the training dataset.
Batch: One or more samples considered by the model within an epoch before weights are updated.

In [31]:
# Train for 30 epochs with a batch size of 64.
# NOTE(review): the features are passed in unscaled — standardizing them first may help; TODO confirm.
perceptron_model.fit(X_train, y_train, epochs = 30, batch_size = 64)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f22cde8d2d0>

**Predictions**

In [32]:
# Raw model outputs for both splits (the model's only layer uses a sigmoid activation)
train_pred = perceptron_model.predict(X_train)
test_pred = perceptron_model.predict(X_test)

In [33]:
# Turn the model outputs into hard 0/1 labels using a 0.5 cut-off
train_pred = (train_pred >= 0.5).astype(int)
test_pred = (test_pred >= 0.5).astype(int)

In [34]:
# Label counts for actual vs. predicted classes: train split first, then test split
for labels in (y_train, train_pred, y_test, test_pred):
    print(np.unique(labels, return_counts = True))

(array([0, 1], dtype=uint8), array([33067,  7663]))
(array([0, 1]), array([38986,  1744]))
(array([0, 1], dtype=uint8), array([14150,  3306]))
(array([0, 1]), array([16698,   758]))


**Getting evaluation metrics for evaluating the model performance**

In [35]:
# Class counts and confusion matrix for each split.
# pd.Series(...).value_counts() replaces the top-level pd.value_counts, which is deprecated
# in recent pandas releases.
print("Train data target \n", pd.Series(y_train).value_counts())
confusion_matrix_train = confusion_matrix(y_train, train_pred)
print("\n Confusion matrix \n", confusion_matrix_train)

print("\n Test data target \n", pd.Series(y_test).value_counts())
confusion_matrix_test = confusion_matrix(y_test, test_pred)
print("\n Confusion matrix \n", confusion_matrix_test)

Train data target 
 0    33067
1     7663
dtype: int64

 Confusion matrix 
 [[32555   512]
 [ 6431  1232]]

 Test data target 
 0    14150
1     3306
dtype: int64

 Confusion matrix 
 [[13913   237]
 [ 2785   521]]


**Calculate Accuracy, True Positive Rate and True Negative Rates**

In [36]:
# Unpack the 2x2 confusion matrix row by row: [0,0], [0,1], [1,0], [1,1]
tn, fp, fn, tp = confusion_matrix_train.ravel()

# Accuracy over all four cells; TNR from the first row, TPR from the second row
Accuracy_Train = (tn + tp) / (tn + fp + fn + tp)
TNR_Train = tn / (tn + fp)
TPR_Train = tp / (fn + tp)

print("Train TNR: ", TNR_Train)
print("Train TPR: ", TPR_Train)
print("Train Accuracy: ", Accuracy_Train)

Train TNR:  0.9845162851180935
Train TPR:  0.1607725433903171
Train Accuracy:  0.8295359685735331


In [37]:
# Unpack the 2x2 confusion matrix row by row: [0,0], [0,1], [1,0], [1,1]
tn, fp, fn, tp = confusion_matrix_test.ravel()

# Accuracy over all four cells; TNR from the first row, TPR from the second row
Accuracy_Test = (tn + tp) / (tn + fp + fn + tp)
TNR_Test = tn / (tn + fp)
TPR_Test = tp / (fn + tp)

print("Test TNR: ", TNR_Test)
print("Test TPR: ", TPR_Test)
print("Test Accuracy: ", Accuracy_Test)

Test TNR:  0.9832508833922261
Test TPR:  0.15759225650332728
Test Accuracy:  0.8268790100824931


**MLP Model Building**

In [38]:
# Multi-layer perceptron: one hidden layer of 12 ReLU units followed by a single sigmoid output unit.
# The input width is read from the training matrix instead of being hard-coded to 21,
# so the model still builds if the number of feature columns changes.
mlp_model = Sequential()

mlp_model.add(Dense(12, input_dim = X_train.shape[1], activation='relu', kernel_initializer = 'normal'))
mlp_model.add(Dense(1, activation = 'sigmoid', kernel_initializer = 'normal'))

In [39]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy reported as a metric
mlp_model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy'],
)

In [40]:
# Train for 30 epochs with a batch size of 64.
# NOTE(review): the features are passed in unscaled — standardizing them first may help; TODO confirm.
mlp_model.fit(X_train, y_train, epochs = 30, batch_size = 64)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f22cdd80cd0>

**Predictions**

In [41]:
# Raw model outputs for both splits (the model's output layer uses a sigmoid activation)
test_pred = mlp_model.predict(X_test)
train_pred = mlp_model.predict(X_train)

In [42]:
# Turn the model outputs into hard 0/1 labels using a 0.5 cut-off
train_pred = (train_pred >= 0.5).astype(int)
test_pred = (test_pred >= 0.5).astype(int)

In [43]:
# Label counts for actual vs. predicted classes: train split first, then test split
for labels in (y_train, train_pred, y_test, test_pred):
    print(np.unique(labels, return_counts = True))

(array([0, 1], dtype=uint8), array([33067,  7663]))
(array([0, 1]), array([34409,  6321]))
(array([0, 1], dtype=uint8), array([14150,  3306]))
(array([0, 1]), array([14719,  2737]))


**Getting evaluation metrics for evaluating the model performance**

In [44]:
# Class counts and confusion matrix for each split.
# pd.Series(...).value_counts() replaces the top-level pd.value_counts, which is deprecated
# in recent pandas releases.
print("Train data target \n", pd.Series(y_train).value_counts())
confusion_matrix_train = confusion_matrix(y_train, train_pred)
print("\n Confusion matrix \n", confusion_matrix_train)

print("\n Test data target \n", pd.Series(y_test).value_counts())
confusion_matrix_test = confusion_matrix(y_test, test_pred)
print("\n Confusion matrix \n", confusion_matrix_test)

Train data target 
 0    33067
1     7663
dtype: int64

 Confusion matrix 
 [[31537  1530]
 [ 2872  4791]]

 Test data target 
 0    14150
1     3306
dtype: int64

 Confusion matrix 
 [[13487   663]
 [ 1232  2074]]


**Calculate Accuracy, True Positive Rate and True Negative Rates**

In [45]:
# Unpack the 2x2 confusion matrix row by row: [0,0], [0,1], [1,0], [1,1]
tn, fp, fn, tp = confusion_matrix_train.ravel()

# Accuracy over all four cells; TNR from the first row, TPR from the second row
Accuracy_Train = (tn + tp) / (tn + fp + fn + tp)
TNR_Train = tn / (tn + fp)
TPR_Train = tp / (fn + tp)

print("Train TNR: ", TNR_Train)
print("Train TPR: ", TPR_Train)
print("Train Accuracy: ", Accuracy_Train)

Train TNR:  0.953730305138053
Train TPR:  0.6252120579407543
Train Accuracy:  0.8919224159096489


In [46]:
# Unpack the 2x2 confusion matrix row by row: [0,0], [0,1], [1,0], [1,1]
tn, fp, fn, tp = confusion_matrix_test.ravel()

# Accuracy over all four cells; TNR from the first row, TPR from the second row
Accuracy_Test = (tn + tp) / (tn + fp + fn + tp)
TNR_Test = tn / (tn + fp)
TPR_Test = tp / (fn + tp)

print("Test TNR: ", TNR_Test)
print("Test TPR: ", TPR_Test)
print("Test Accuracy: ", Accuracy_Test)

Test TNR:  0.9531448763250884
Test TPR:  0.6273442226255294
Test Accuracy:  0.8914413382218148


In [47]:
# Notice that the TPR is very poor with a single perceptron and improves significantly with the MLP architecture.