##  Perception for Binary classification

#### Loading the required libraries

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

import matplotlib.pyplot as plt

In [1]:
import keras

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### Loading the data

In [3]:
data = pd.read_csv("BackOrders.csv",header=0)

#### Understand the Data

In [4]:
#Check number of rows and columns
data.shape

(10000, 23)

In [5]:
#Display columns
data.columns

Index(['sku', 'national_inv', 'lead_time', 'in_transit_qty',
       'forecast_3_month', 'forecast_6_month', 'forecast_9_month',
       'sales_1_month', 'sales_3_month', 'sales_6_month', 'sales_9_month',
       'min_bank', 'potential_issue', 'pieces_past_due', 'perf_6_month_avg',
       'perf_12_month_avg', 'local_bo_qty', 'deck_risk', 'oe_constraint',
       'ppap_risk', 'stop_auto_buy', 'rev_stop', 'went_on_backorder'],
      dtype='object')

In [6]:
#Display index
data.index

RangeIndex(start=0, stop=10000, step=1)

See the top rows of the data

In [7]:
# Check top 'n' rows
data[:3]

Unnamed: 0,sku,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,...,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop,went_on_backorder
0,1888279,117,,0,0,0,0,0,0,15,...,0,-99.0,-99.0,0,No,No,Yes,Yes,No,No
1,1870557,7,2.0,0,0,0,0,0,0,0,...,0,0.5,0.28,0,Yes,No,No,Yes,No,No
2,1475481,258,15.0,10,10,77,184,46,132,256,...,0,0.54,0.7,0,No,No,No,Yes,No,No


In [8]:
#Summary statistics of each column
data.describe()

Unnamed: 0,sku,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,sales_9_month,min_bank,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty
count,10000.0,10000.0,9433.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,2046242.0,267.3887,7.561645,26.1158,137.9128,248.7809,355.8195,36.4921,108.2983,209.8898,314.2486,36.0632,1.5674,-6.284197,-5.869804,1.7169
std,659018.4,3937.266534,6.47056,340.288412,1458.350464,2511.049468,3425.324204,288.688666,801.982281,1515.773389,2200.196573,304.328827,27.482543,25.575629,24.859075,44.798882
min,1111652.0,-2999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,0.0
25%,1510146.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.63,0.64,0.0
50%,1898042.0,10.0,8.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,6.0,0.0,0.0,0.82,0.8,0.0
75%,2811447.0,55.0,8.0,0.0,12.0,25.0,37.0,5.0,17.0,34.0,49.0,3.0,0.0,0.96,0.95,0.0
max,3284750.0,364065.0,52.0,20047.0,98548.0,169720.0,214848.0,10698.0,35076.0,67129.0,89727.0,16642.0,1488.0,1.0,1.0,2999.0


Display data type of each variable

In [9]:
# Check data type of each attribute
data.dtypes

sku                    int64
national_inv           int64
lead_time            float64
in_transit_qty         int64
forecast_3_month       int64
forecast_6_month       int64
forecast_9_month       int64
sales_1_month          int64
sales_3_month          int64
sales_6_month          int64
sales_9_month          int64
min_bank               int64
potential_issue       object
pieces_past_due        int64
perf_6_month_avg     float64
perf_12_month_avg    float64
local_bo_qty           int64
deck_risk             object
oe_constraint         object
ppap_risk             object
stop_auto_buy         object
rev_stop              object
went_on_backorder     object
dtype: object

#### Observations

    sku is Categorical but is interpreted as int64 
    potential_issue, deck_risk, oe_constraint, ppap_risk, stop_auto_buy, rev_stop, and went_on_backorder are also categorical but is interpreted as object. 

#### Convert all the attributes to appropriate type

Data type conversion

    Using astype('category') to convert potential_issue, deck_risk, oe_constraint, ppap_risk, stop_auto_buy, rev_stop, and went_on_backorder attributes to categorical attributes.


In [10]:
for col in ['sku', 'potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk', 'stop_auto_buy', 'rev_stop', 'went_on_backorder']:
    data[col] = data[col].astype('category')

Display data type of each variable

In [11]:
#Display data type of each variable
data.dtypes

sku                  category
national_inv            int64
lead_time             float64
in_transit_qty          int64
forecast_3_month        int64
forecast_6_month        int64
forecast_9_month        int64
sales_1_month           int64
sales_3_month           int64
sales_6_month           int64
sales_9_month           int64
min_bank                int64
potential_issue      category
pieces_past_due         int64
perf_6_month_avg      float64
perf_12_month_avg     float64
local_bo_qty            int64
deck_risk            category
oe_constraint        category
ppap_risk            category
stop_auto_buy        category
rev_stop             category
went_on_backorder    category
dtype: object

#### Delete sku attribute

In [12]:
np.size(np.unique(data.sku, return_counts=True)[0])

10000

In [13]:
data.drop('sku', axis=1, inplace=True)

#### Missing Data

    Missing value analysis and dropping the records with missing values

In [14]:
data.isnull().sum()

national_inv           0
lead_time            567
in_transit_qty         0
forecast_3_month       0
forecast_6_month       0
forecast_9_month       0
sales_1_month          0
sales_3_month          0
sales_6_month          0
sales_9_month          0
min_bank               0
potential_issue        0
pieces_past_due        0
perf_6_month_avg       0
perf_12_month_avg      0
local_bo_qty           0
deck_risk              0
oe_constraint          0
ppap_risk              0
stop_auto_buy          0
rev_stop               0
went_on_backorder      0
dtype: int64

Observing the number of records before and after missing value records removal

In [15]:
print (data.shape)

(10000, 22)


In [16]:
#Since the number of missing values is about 5%. For initial analysis we ignore all these records
data = data.dropna(axis=0)

In [17]:
data.isnull().sum()
print(data.shape)

(9433, 22)


#### Converting Categorical to Numeric

For some of the models all the independent attribute should be of type numeric and Linear Regression model is one among them.
But this data set has some categorial attributes.

'pandas.get_dummies' To convert convert categorical variable into dummy/indicator variables


In [18]:
print (data.columns)

Index(['national_inv', 'lead_time', 'in_transit_qty', 'forecast_3_month',
       'forecast_6_month', 'forecast_9_month', 'sales_1_month',
       'sales_3_month', 'sales_6_month', 'sales_9_month', 'min_bank',
       'potential_issue', 'pieces_past_due', 'perf_6_month_avg',
       'perf_12_month_avg', 'local_bo_qty', 'deck_risk', 'oe_constraint',
       'ppap_risk', 'stop_auto_buy', 'rev_stop', 'went_on_backorder'],
      dtype='object')


Creating dummy variables.

    If we have k levels in a category, then we create k-1 dummy variables as the last one would be redundant. So we use the parameter drop_first in pd.get_dummies function that drops the first level in each of the category


In [19]:
categorical_Attributes = data.select_dtypes(include=['category']).columns

In [20]:
data = pd.get_dummies(columns=categorical_Attributes, data=data, prefix=categorical_Attributes, prefix_sep="_",
                      drop_first=True)

In [21]:
print (data.columns, data.shape)

Index(['national_inv', 'lead_time', 'in_transit_qty', 'forecast_3_month',
       'forecast_6_month', 'forecast_9_month', 'sales_1_month',
       'sales_3_month', 'sales_6_month', 'sales_9_month', 'min_bank',
       'pieces_past_due', 'perf_6_month_avg', 'perf_12_month_avg',
       'local_bo_qty', 'potential_issue_Yes', 'deck_risk_Yes',
       'oe_constraint_Yes', 'ppap_risk_Yes', 'stop_auto_buy_Yes',
       'rev_stop_Yes', 'went_on_backorder_Yes'],
      dtype='object') (9433, 22)


#### Target attribute distribution

In [22]:
pd.value_counts(data['went_on_backorder_Yes'])

0    7632
1    1801
Name: went_on_backorder_Yes, dtype: int64

#### Split the data in to train and test

sklearn.model_selection.train_test_split

    Split arrays or matrices into random train and test subsets

In [23]:
#Performing train test split on the data


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)


In [24]:
#To get the distribution in the target in train and test
print(pd.value_counts(y_train))
print(pd.value_counts(y_test))

0    5372
1    1231
dtype: int64
0    2260
1     570
dtype: int64


#### Perceptron Model Building

In [25]:
perceptron_model = Sequential()

perceptron_model.add(Dense(1, input_dim=21, activation='sigmoid', kernel_initializer='normal'))

In [26]:
perceptron_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [27]:
perceptron_model.fit(X_train, y_train, epochs=30, batch_size=64)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x14b26dfce80>

#### Predictions

In [28]:
y_pred=perceptron_model.predict_classes(X_test)
y_train_pred=perceptron_model.predict_classes(X_train)

#### Getting evaluation metrics and evaluating model performance

In [29]:
print("Train data target \n", pd.value_counts(y_train))
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
print("\n Confusion matrix \n", confusion_matrix_train)

print("\n Test data target \n", pd.value_counts(y_test))
confusion_matrix_test = confusion_matrix(y_test, y_pred)
print("\n Confusion matrix \n", confusion_matrix_test)

Train data target 
 0    5372
1    1231
dtype: int64

 Confusion matrix 
 [[5301   71]
 [ 841  390]]

 Test data target 
 0    2260
1     570
dtype: int64

 Confusion matrix 
 [[2231   29]
 [ 393  177]]


#### Calculate Accuracy, True Positive Rate and True Negative Rates

In [30]:
Accuracy_Train=(confusion_matrix_train[0,0]+confusion_matrix_train[1,1])/(confusion_matrix_train[0,0]+
                                                                          confusion_matrix_train[0,1]+
                                                                          confusion_matrix_train[1,0]+
                                                                          confusion_matrix_train[1,1])
TNR_Train= confusion_matrix_train[0,0]/(confusion_matrix_train[0,0]+confusion_matrix_train[0,1])
TPR_Train= confusion_matrix_train[1,1]/(confusion_matrix_train[1,0]+confusion_matrix_train[1,1])

print("Train TNR: ",TNR_Train)
print("Train TPR: ",TPR_Train)
print("Train Accuracy: ",Accuracy_Train)

Train TNR:  0.986783320923306
Train TPR:  0.31681559707554835
Train Accuracy:  0.8618809631985461


In [100]:
Accuracy_Test=(confusion_matrix_test[0,0]+confusion_matrix_test[1,1])/(confusion_matrix_test[0,0]+confusion_matrix_test[0,1]+confusion_matrix_test[1,0]+confusion_matrix_test[1,1])
TNR_Test= confusion_matrix_test[0,0]/(confusion_matrix_test[0,0] +confusion_matrix_test[0,1])
TPR_Test= confusion_matrix_test[1,1]/(confusion_matrix_test[1,0] +confusion_matrix_test[1,1])

print("Test TNR: ",TNR_Test)
print("Test TPR: ",TPR_Test)
print("Test Accuracy: ",Accuracy_Test)

Test TNR:  0.959292035398
Test TPR:  0.519298245614
Test Accuracy:  0.870671378092
