In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [4]:
df.duplicated().sum()

0

In [6]:
df['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [13]:
cat_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
for i in range(len(cat_cols)):
    print(cat_cols[i],'\n',df[cat_cols[i]].value_counts())

Geography 
 France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64
Gender 
 Male      5457
Female    4543
Name: Gender, dtype: int64
HasCrCard 
 1    7055
0    2945
Name: HasCrCard, dtype: int64
IsActiveMember 
 1    5151
0    4849
Name: IsActiveMember, dtype: int64


In [15]:
#dropping cols which are not required
cols_to_drop = ['RowNumber', 'CustomerId', 'Surname']

df.drop(cols_to_drop, axis = 1, inplace = True)
df.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')

In [21]:
#one hot encoding for cat cols
cat_cols_one_hot_encoding = ['Geography', 'Gender']
df = pd.get_dummies(df, columns = cat_cols_one_hot_encoding, drop_first = True)
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


In [22]:
#scaling is required when working with neural nets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop('Exited', axis=1),df['Exited'], 
                                                    test_size = 0.2, random_state = 1 )
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8000, 11), (2000, 11), (8000,), (2000,))

In [24]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

In [25]:
import tensorflow
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [28]:
model = Sequential()
#input layer
model.add(Dense(3, activation = 'sigmoid', input_dim = 11))
#hidden layer
model.add(Dense(3, activation = 'sigmoid'))
#output layer
model.add(Dense(1, activation = 'sigmoid'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 3)                 36        
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 12        
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 4         
Total params: 52
Trainable params: 52
Non-trainable params: 0
_________________________________________________________________


In [40]:
model.compile(loss = 'binary_crossentropy', optimizer = 'Adam', metrics = ['accuracy'])
model.fit(X_train_scaled, y_train, epochs = 10, validation_split = 0.2) 
# we can add , validation_split = 0.2 if
# want to do cross validation 

Train on 6400 samples, validate on 1600 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fbc9902b810>

In [41]:
model.layers[0].get_weights()

[array([[-1.15205131e-01, -6.38485104e-02,  2.94237167e-01],
        [ 2.59506845e+00, -2.28116250e+00, -4.45404291e-01],
        [ 4.31878120e-02,  7.80599415e-02,  9.29348990e-02],
        [ 2.63540268e-01, -1.14255957e-01, -4.18940187e-01],
        [ 2.53025471e-04,  1.39993817e-01, -1.14710014e-02],
        [ 6.30801767e-02, -5.45192510e-02,  2.92943478e-01],
        [ 7.07574841e-03,  4.02116358e-01,  1.62109625e+00],
        [-1.07408494e-01, -2.61882246e-01,  4.17003445e-02],
        [ 4.86289524e-02, -5.74788153e-01, -9.57670689e-01],
        [ 1.48854226e-01,  3.46765257e-02,  1.65455583e-02],
        [-7.63155669e-02,  3.86184752e-01,  7.60354459e-01]], dtype=float32),
 array([-0.15769751,  0.26581803,  0.41799808], dtype=float32)]

In [42]:
model.layers[1].get_weights()

[array([[-1.5591385 ,  0.08892732, -1.4209027 ],
        [ 0.8031588 , -1.7367874 ,  1.314514  ],
        [ 1.9630593 , -0.34137738,  1.156978  ]], dtype=float32),
 array([-0.1159832 ,  0.2817166 , -0.14915426], dtype=float32)]

In [43]:
model.layers[2].get_weights()

[array([[-1.8634802],
        [ 1.3480954],
        [-1.7015034]], dtype=float32), array([0.15066391], dtype=float32)]

In [44]:
y_pred = np.where(model.predict(X_test_scaled) > 0.5, 1, 0)

In [45]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.828

In [36]:
#ways to increase performance
# increase no of epochs
# increase no of nodes in the layers
# increase no of layers
# change activation function to 'relu'

In [46]:
# also if we stor model.fit part in a variable
# we can plot plt.plot(variable.history['loss']), plt.plot(variable.history['val_loss'])
#, plt.plot(variable.history['accuracy']), plt.plot(variable.history['val_accuracy'])
# this will give the curve of epochs vs loss