##  Perception for Binary classification

#### Loading the required libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [1]:
import keras

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### Loading the data

In [5]:
data = pd.read_csv("CustomerData.csv",header=0)

#### Understand the Data

In [6]:
#Check number of rows and columns
data.shape

(3209, 14)

In [7]:
#Display columns
data.columns

Index(['CustomerID', 'City', 'NoOfChildren', 'MinAgeOfChild', 'MaxAgeOfChild',
       'Tenure', 'FrquncyOfPurchase', 'NoOfUnitsPurchased', 'FrequencyOFPlay',
       'NoOfGamesPlayed', 'NoOfGamesBought', 'FavoriteChannelOfTransaction',
       'FavoriteGame', 'TotalRevenueGenerated'],
      dtype='object')

In [8]:
#Display index
data.index

RangeIndex(start=0, stop=3209, step=1)

See the top rows of the data

In [9]:
# Check top 'n' rows
data[:3]

Unnamed: 0,CustomerID,City,NoOfChildren,MinAgeOfChild,MaxAgeOfChild,Tenure,FrquncyOfPurchase,NoOfUnitsPurchased,FrequencyOFPlay,NoOfGamesPlayed,NoOfGamesBought,FavoriteChannelOfTransaction,FavoriteGame,TotalRevenueGenerated
0,1001,1,2,3,8,210,11,11,2344,108,10,Uniform,Uniform,107.51
1,1002,1,2,3,6,442,20,20,245,22,7,Favorite,Uniform,382.4
2,1003,1,4,3,5,424,18,18,1059,130,18,Favorite,Uniform,135.01


In [10]:
#Summary statistics of each column
data.describe()

Unnamed: 0,CustomerID,City,NoOfChildren,MinAgeOfChild,MaxAgeOfChild,Tenure,FrquncyOfPurchase,NoOfUnitsPurchased,FrequencyOFPlay,NoOfGamesPlayed,NoOfGamesBought,TotalRevenueGenerated
count,3209.0,3209.0,3209.0,3209.0,3209.0,3209.0,3209.0,3209.0,3209.0,3209.0,3209.0,3209.0
mean,2605.0,1.114054,2.128389,4.960735,7.990651,347.520411,16.269554,14.684014,1568.207853,93.627921,14.761608,168.477183
std,926.502833,0.317927,1.035092,3.714191,8.784084,90.520118,8.44167,7.182029,1810.630464,88.936372,8.77391,81.798528
min,1001.0,1.0,1.0,0.0,3.0,100.0,1.0,1.0,0.0,0.0,0.0,100.0
25%,1803.0,1.0,1.0,4.0,6.0,301.0,11.0,10.0,446.0,37.0,10.0,116.64
50%,2605.0,1.0,2.0,5.0,7.0,368.0,14.0,13.0,1029.0,70.0,14.0,142.39
75%,3407.0,1.0,3.0,6.0,8.0,417.0,19.0,17.0,2029.0,119.0,19.0,191.25
max,4209.0,2.0,11.0,113.0,113.0,472.0,119.0,112.0,27829.0,1166.0,115.0,990.56


Display data type of each variable

In [11]:
# Check data type of each attribute
data.dtypes

CustomerID                        int64
City                              int64
NoOfChildren                      int64
MinAgeOfChild                     int64
MaxAgeOfChild                     int64
Tenure                            int64
FrquncyOfPurchase                 int64
NoOfUnitsPurchased                int64
FrequencyOFPlay                   int64
NoOfGamesPlayed                   int64
NoOfGamesBought                   int64
FavoriteChannelOfTransaction     object
FavoriteGame                     object
TotalRevenueGenerated           float64
dtype: object

#### Observations

    City is Categorical but is interpreted as int64 
    FavoriteChannelOfTransaction,FavoriteGame   interpreted as Object but we should convert to Categorical

#### Convert all the attributes to appropriate type

Data type conversion

    Using astype('category') to convert potential_issue, deck_risk, oe_constraint, ppap_risk, stop_auto_buy, rev_stop, and went_on_backorder attributes to categorical attributes.


In [12]:
for col in ['City', 'FavoriteChannelOfTransaction', 'FavoriteGame']:
    data[col] = data[col].astype('category')

Display data type of each variable

In [13]:
#Display data type of each variable
data.dtypes

CustomerID                         int64
City                            category
NoOfChildren                       int64
MinAgeOfChild                      int64
MaxAgeOfChild                      int64
Tenure                             int64
FrquncyOfPurchase                  int64
NoOfUnitsPurchased                 int64
FrequencyOFPlay                    int64
NoOfGamesPlayed                    int64
NoOfGamesBought                    int64
FavoriteChannelOfTransaction    category
FavoriteGame                    category
TotalRevenueGenerated            float64
dtype: object

#### Delete sku attribute

In [14]:
np.size(np.unique(data.CustomerID, return_counts=True)[0])

3209

In [15]:
data.drop('CustomerID', axis=1, inplace=True)

#### Missing Data

    Missing value analysis and dropping the records with missing values

In [16]:
data.isnull().sum()

City                            0
NoOfChildren                    0
MinAgeOfChild                   0
MaxAgeOfChild                   0
Tenure                          0
FrquncyOfPurchase               0
NoOfUnitsPurchased              0
FrequencyOFPlay                 0
NoOfGamesPlayed                 0
NoOfGamesBought                 0
FavoriteChannelOfTransaction    0
FavoriteGame                    0
TotalRevenueGenerated           0
dtype: int64

Observing the number of records before and after missing value records removal

In [17]:
print (data.shape)

(3209, 13)


#### Converting Categorical to Numeric

For some of the models all the independent attribute should be of type numeric and Linear Regression model is one among them.
But this data set has some categorial attributes.

'pandas.get_dummies' To convert convert categorical variable into dummy/indicator variables


In [18]:
print (data.columns)

Index(['City', 'NoOfChildren', 'MinAgeOfChild', 'MaxAgeOfChild', 'Tenure',
       'FrquncyOfPurchase', 'NoOfUnitsPurchased', 'FrequencyOFPlay',
       'NoOfGamesPlayed', 'NoOfGamesBought', 'FavoriteChannelOfTransaction',
       'FavoriteGame', 'TotalRevenueGenerated'],
      dtype='object')


Creating dummy variables.

    If we have k levels in a category, then we create k-1 dummy variables as the last one would be redundant. So we use the parameter drop_first in pd.get_dummies function that drops the first level in each of the category


In [19]:
categorical_Attributes = data.select_dtypes(include=['category']).columns

In [20]:
data = pd.get_dummies(columns=categorical_Attributes, data=data, prefix=categorical_Attributes, prefix_sep="_",
                      drop_first=True)

In [21]:
print (data.columns, data.shape)

Index(['NoOfChildren', 'MinAgeOfChild', 'MaxAgeOfChild', 'Tenure',
       'FrquncyOfPurchase', 'NoOfUnitsPurchased', 'FrequencyOFPlay',
       'NoOfGamesPlayed', 'NoOfGamesBought', 'TotalRevenueGenerated', 'City_2',
       'FavoriteChannelOfTransaction_Uniform', 'FavoriteGame_Uniform'],
      dtype='object') (3209, 13)


#### Target attribute distribution

In [22]:
pd.value_counts(data['TotalRevenueGenerated'])

100.00    85
105.00    74
110.00    61
102.50    57
120.00    53
107.50    46
117.50    41
132.50    39
145.00    39
115.00    38
130.00    38
112.50    37
125.00    34
137.50    31
122.50    31
140.00    30
150.00    29
152.50    27
142.50    27
127.50    25
135.00    24
180.00    23
170.00    21
155.00    18
172.50    17
165.00    14
160.00    13
195.00    13
157.50    12
147.50    11
          ..
100.13     1
155.03     1
120.26     1
171.97     1
312.51     1
268.23     1
114.23     1
100.68     1
139.79     1
129.88     1
125.32     1
123.91     1
281.25     1
122.25     1
141.75     1
218.50     1
255.00     1
452.50     1
100.50     1
139.50     1
185.89     1
252.50     1
111.50     1
317.64     1
249.00     1
122.75     1
105.09     1
106.43     1
366.79     1
210.71     1
Name: TotalRevenueGenerated, Length: 1672, dtype: int64

#### Split the data in to train and test

sklearn.model_selection.train_test_split

    Split arrays or matrices into random train and test subsets

In [23]:
#Performing train test split on the data
X, y = data.loc[:,data.columns!='TotalRevenueGenerated'].values, data.loc[:,'TotalRevenueGenerated'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)


In [24]:
#To get the distribution in the target in train and test
print(pd.value_counts(y_train))
print(pd.value_counts(y_test))

100.00    66
105.00    51
110.00    42
120.00    36
102.50    35
107.50    32
117.50    30
112.50    28
115.00    25
145.00    25
132.50    24
125.00    24
137.50    22
140.00    22
122.50    22
130.00    21
152.50    21
127.50    20
150.00    18
180.00    18
135.00    15
142.50    15
170.00    13
195.00    11
172.50    11
157.50    11
155.00    11
165.00     9
175.00     7
207.50     7
          ..
284.53     1
105.04     1
126.13     1
119.23     1
112.59     1
270.93     1
151.74     1
338.19     1
162.81     1
143.70     1
102.48     1
100.94     1
103.41     1
284.32     1
159.67     1
116.11     1
154.12     1
103.87     1
101.32     1
106.71     1
269.25     1
425.00     1
220.50     1
170.59     1
126.75     1
167.00     1
290.00     1
123.51     1
114.50     1
104.67     1
Length: 1277, dtype: int64
105.00    23
102.50    22
100.00    19
110.00    19
120.00    17
130.00    17
132.50    15
107.50    14
145.00    14
115.00    13
142.50    12
150.00    11
117.50    11
125.00    1

In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3209 entries, 0 to 3208
Data columns (total 13 columns):
NoOfChildren                            3209 non-null int64
MinAgeOfChild                           3209 non-null int64
MaxAgeOfChild                           3209 non-null int64
Tenure                                  3209 non-null int64
FrquncyOfPurchase                       3209 non-null int64
NoOfUnitsPurchased                      3209 non-null int64
FrequencyOFPlay                         3209 non-null int64
NoOfGamesPlayed                         3209 non-null int64
NoOfGamesBought                         3209 non-null int64
TotalRevenueGenerated                   3209 non-null float64
City_2                                  3209 non-null uint8
FavoriteChannelOfTransaction_Uniform    3209 non-null uint8
FavoriteGame_Uniform                    3209 non-null uint8
dtypes: float64(1), int64(9), uint8(3)
memory usage: 260.2 KB


#### Perceptron Model Building

In [38]:
perceptron_model = Sequential()

perceptron_model.add(Dense(10, input_dim=12, activation='relu', kernel_initializer='normal'))
perceptron_model.add(Dense(15, activation='relu', kernel_initializer='normal'))
perceptron_model.add(Dense(1, activation='linear', kernel_initializer='normal',kernel_regularizer=regularizers.l2(0.05)))

NameError: name 'regularizers' is not defined

In [None]:
perceptron_model.compile(loss='mse', optimizer='adam', metrics=['mse'])

In [None]:
perceptron_model.fit(X_train, y_train, epochs=50, batch_size=64)

#### Predictions

In [39]:
y_pred=perceptron_model.predict_classes(X_test)
y_train_pred=perceptron_model.predict_classes(X_train)

In [40]:
from sklearn import metrics

#### Getting evaluation metrics and evaluating model performance

In [41]:
print("Mean absolute Error:", metrics.mean_absolute_error(y_pred,y_test))
print("Mean Squared Error:", metrics.mean_squared_error(y_pred,y_test))
print("Root Mean Square error:", np.sqrt(metrics.mean_squared_error(y_pred,y_test)))

Mean absolute Error: 160.87406022845275
Mean Squared Error: 32889.77159065421
Root Mean Square error: 181.35537375731167


#### Calculate Accuracy, True Positive Rate and True Negative Rates