In [7]:
# I am going to apply ANN (Artificial Neural Network)

In [8]:
# tensorflow-gpu Version greater than 2.0 has Keras integrated with it
!pip install tensorflow-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
import tensorflow as tf

In [10]:
print(tf.__version__)

2.11.0


In [11]:
# import some basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [12]:
df=pd.read_csv('Churn_Modelling.csv')

In [13]:
df.head()
# this is a binary classification problem and we need to predict if a customer is going to exit a particular bank or not
# we will create a neural network for making prediction if a customer is goingt to exit a bank or not and then offer them a service/product accordingly

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# In Neural Network for binary classification problem has following properties
# 1)Activation function in the Hidden layer: ReLU or extension of ReLU
# 2) Activation function in the output layer: Sigmoid Function
# 3) Loss Function: Log loss (for binary classification)
# 4) Optimiser: Adam Optimiser (proven to be the best one) 

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [15]:
df.select_dtypes(include=object)

Unnamed: 0,Surname,Geography,Gender
0,Hargrave,France,Female
1,Hill,Spain,Female
2,Onio,France,Female
3,Boni,France,Female
4,Mitchell,Spain,Female
...,...,...,...
9995,Obijiaku,France,Male
9996,Johnstone,France,Male
9997,Liu,France,Female
9998,Sabbatini,Germany,Male


In [16]:
# splitting the dataset into dependent and independent features
X=df.iloc[:,3:-1]
y=df.iloc[:,-1:]

In [17]:
X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [18]:
y.head()

Unnamed: 0,Exited
0,1
1,0
2,1
3,0
4,0


In [19]:
print(X.shape,y.shape)

(10000, 10) (10000, 1)


# Feature Engineering
# we have nominal categorical variables and the the number of classes are not very high in those categorical features therefore, we can perform one hot encoding to convert those features into numerical features

In [20]:
# feature Engineering- Handling Categorical Features- One Hot Encoding
geography=pd.get_dummies(X['Geography'],drop_first=True)
gender=pd.get_dummies(X['Gender'],drop_first=True)

In [21]:
#Concatinating the above with the Dataframe and dropping their duplicate columns
X.drop(['Geography','Gender'],axis=1,inplace=True)

In [22]:
X=pd.concat([X,geography,gender],axis=1)

In [23]:
X.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Germany,Spain,Male
0,619,42,2,0.0,1,1,1,101348.88,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,1,0


In [24]:
X.shape

(10000, 11)

 Now we are going to perform train test split before we start training the ANN

In [25]:
!pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [26]:
#Splitting the dataset into train and test dataset
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

In [27]:
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(8000, 11) (8000, 1)
(2000, 11) (2000, 1)


## Feature Scaling:
we are required to do feature scaling in ANN as higher magnitude of values could slow down the process of finding global minima in Gradient Descent.

In [28]:
from sklearn.preprocessing import StandardScaler

In [29]:
scaler=StandardScaler().fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)
# we don't use .fit on the whole data set to avoid data leakage

In [30]:
X_test

array([[ 8.33260665e-01, -9.50812388e-02,  1.71512123e+00, ...,
        -5.84086250e-01, -5.71384156e-01, -1.08620280e+00],
       [-5.53096261e-02,  1.89309306e-01, -1.04946154e+00, ...,
         1.71207591e+00, -5.71384156e-01, -1.08620280e+00],
       [-3.34279369e-01,  1.61126203e+00,  6.78402693e-01, ...,
        -5.84086250e-01, -5.71384156e-01,  9.20638397e-01],
       ...,
       [ 8.74589516e-01, -9.50812388e-02, -1.39503438e+00, ...,
        -5.84086250e-01, -5.71384156e-01, -1.08620280e+00],
       [ 3.88975520e-01, -2.84390545e-04, -1.74060723e+00, ...,
        -5.84086250e-01, -5.71384156e-01, -1.08620280e+00],
       [ 7.29938538e-01,  6.63293547e-01, -1.39503438e+00, ...,
        -5.84086250e-01, -5.71384156e-01,  9.20638397e-01]])

In [31]:
print(X_train.shape,X_test.shape)


(8000, 11) (2000, 11)


# Part 2: Creating ANN:

In [32]:
# after the tensorflow version 2.0--> keras has been integrated with Tensorflow


In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LeakyReLU, PReLU,ELU,ReLU # importing all the activation functions for the hidden layer
from tensorflow.keras.layers import Dropout


In [34]:
# Initializing the ANN
classifier=Sequential()

In [35]:
#Adding the input layer: the number of neurons is the same as number of independent variables
classifier.add(Dense(units=11, activation='relu'))

In [36]:
#adding the 1st hidden layer
classifier.add(Dense(units=7, activation='relu'))

In [37]:
#Adding the 2nd hidden layer
classifier.add(Dense(units=6,activation='relu'))

In [38]:
#Adding the output layer
classifier.add(Dense(units=1,activation='sigmoid'))

We have constructed the entire neural network. we also have got our data set ready for training. Now we just need to train the neural network on the training data set as shown below:

In [39]:
classifier.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy']) # to compile the entire neural network
#here the default learning rate is 0.01

In [40]:
# to change the learning rate
import tensorflow
opt=tensorflow.keras.optimizers.Adam(learning_rate=0.02)

In [41]:
classifier.compile(optimizer=opt,loss='binary_crossentropy',metrics=['accuracy']) # to compile the entire neural network


In [42]:
# Early Stopping: it stops the training when the accuracy doesn't increase
import tensorflow as tf
early_stopping=tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0.0001,
    patience=20,
    verbose=1,
    mode="auto",
    baseline=None,
    restore_best_weights=False,
    start_from_epoch=0,
)

In [43]:
#training the neural network
model_history=classifier.fit(X_train,y_train,validation_split=0.33,batch_size=10,epochs=1000,callbacks=early_stopping)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 27: early stopping


In [44]:
model_history.history.keys()

dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])

In [45]:
y_pred=classifier.predict(X_test)
y_pred=(y_pred>=0.5)



In [48]:
# make the confusion matrix
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,y_pred)
cm

array([[1550,   67],
       [ 224,  159]])

In [49]:
# calculating the accuracy
from sklearn.metrics import accuracy_score
score=accuracy_score(y_pred,y_test)
score

0.8545

In [51]:
#get the weights
classifier.get_weights()

[array([[-0.0925027 ,  0.05640888, -0.4833607 , -0.6056124 , -1.3661486 ,
          1.1746433 ,  0.24372952, -0.04024388, -0.71013886, -0.43222734,
          0.16953176],
        [ 3.8760185 , -1.9523715 , -1.9949133 ,  3.925087  ,  3.0928884 ,
          2.14379   , -1.258194  ,  1.9672326 , -0.09701835, -3.9028656 ,
         -1.6482931 ],
        [ 0.7162193 , -0.28074074,  0.8627243 ,  0.38070804, -1.241643  ,
         -0.19011812,  0.29940608, -0.17077889,  0.54253715,  0.01396888,
         -0.88483554],
        [-0.41212702, -0.5660574 , -1.2269428 , -2.8667815 , -1.1969777 ,
         -0.4884944 , -3.7034862 ,  0.8904754 , -1.9967244 , -2.3884895 ,
         -2.2305763 ],
        [ 0.3808296 ,  1.1266475 , -0.49950904, -3.7738276 , -0.32053858,
         -0.634439  ,  3.3563082 ,  4.8138213 , -1.5143282 , -0.8649737 ,
         -0.01994565],
        [-0.09963858,  0.47954464, -0.26146597,  0.22195037, -0.26758954,
          0.8025394 ,  0.7925199 , -0.5469922 , -1.3571883 ,  0.5174984