# Churn modelling

In [4]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [6]:
import seaborn as sns
%matplotlib inline

In [7]:
dataset = pd.read_csv("Churn_Modelling.csv")


In [8]:
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [9]:
dataset.shape

(10000, 14)

In [10]:
dataset.Exited.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

# Data analysis

In [11]:
dataset[['Geography','Exited']].groupby(['Geography']).mean()

Unnamed: 0_level_0,Exited
Geography,Unnamed: 1_level_1
France,0.161548
Germany,0.324432
Spain,0.166734


churn rate of people from germany is more as compare to people from france and spain . Therefore geography have a significant affect on our output result . Therefore we will use this column in out model.

In [12]:
dataset[['Age','Exited']].groupby(['Exited']).mean()

Unnamed: 0_level_0,Age
Exited,Unnamed: 1_level_1
0,37.408389
1,44.837997


In [13]:
dataset[['Tenure','Exited']].groupby(['Exited']).mean()

Unnamed: 0_level_0,Tenure
Exited,Unnamed: 1_level_1
0,5.033279
1,4.932744


we will remove tenure because it doesn't add significance value to our model.

In [14]:
dataset[['NumOfProducts','Exited']].groupby(['NumOfProducts']).mean()

Unnamed: 0_level_0,Exited
NumOfProducts,Unnamed: 1_level_1
1,0.277144
2,0.075817
3,0.827068
4,1.0


In [15]:
dataset[['HasCrCard','Exited']].groupby(['Exited']).mean()

Unnamed: 0_level_0,HasCrCard
Exited,Unnamed: 1_level_1
0,0.707146
1,0.699067


remove HasCrCard column

In [16]:
dataset[['IsActiveMember','Exited']].groupby(['IsActiveMember']).mean()

Unnamed: 0_level_0,Exited
IsActiveMember,Unnamed: 1_level_1
0,0.268509
1,0.142691


the column that we need to drop are 1) RowNumber   2) Customerid   3) surname   4) tenure    5) hascrcard    

In [17]:
dataset.drop(['RowNumber','CustomerId','Surname','Tenure','HasCrCard'], axis=1, inplace=True)


In [18]:
dataset.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,0.0,1,1,101348.88,1
1,608,Spain,Female,41,83807.86,1,1,112542.58,0
2,502,France,Female,42,159660.8,3,0,113931.57,1
3,699,France,Female,39,0.0,2,0,93826.63,0
4,850,Spain,Female,43,125510.82,1,1,79084.1,0


In [19]:
dataset.columns.get_loc('Geography')

1

In [20]:
dataset

Unnamed: 0,CreditScore,Geography,Gender,Age,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,0.00,1,1,101348.88,1
1,608,Spain,Female,41,83807.86,1,1,112542.58,0
2,502,France,Female,42,159660.80,3,0,113931.57,1
3,699,France,Female,39,0.00,2,0,93826.63,0
4,850,Spain,Female,43,125510.82,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,0.00,2,0,96270.64,0
9996,516,France,Male,35,57369.61,1,1,101699.77,0
9997,709,France,Female,36,0.00,1,1,42085.58,1
9998,772,Germany,Male,42,75075.31,2,0,92888.52,1


#Data preprocessing

In [22]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler



In [23]:
features=['Geography','Gender']

In [24]:
dt=pd.get_dummies(dataset, columns=features, drop_first=True)


In [25]:
dt

Unnamed: 0,CreditScore,Age,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,0.00,1,1,101348.88,1,0,0,0
1,608,41,83807.86,1,1,112542.58,0,0,1,0
2,502,42,159660.80,3,0,113931.57,1,0,0,0
3,699,39,0.00,2,0,93826.63,0,0,0,0
4,850,43,125510.82,1,1,79084.10,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
9995,771,39,0.00,2,0,96270.64,0,0,0,1
9996,516,35,57369.61,1,1,101699.77,0,0,0,1
9997,709,36,0.00,1,1,42085.58,1,0,0,0
9998,772,42,75075.31,2,0,92888.52,1,1,0,1


In [26]:
sc = MinMaxScaler()
a = sc.fit_transform(dt[['CreditScore']])
b = sc.fit_transform(dt[['Age']])
c = sc.fit_transform(dt[['Balance']])
d = sc.fit_transform(dt[['EstimatedSalary']])

In [27]:
dt['CreditScore'] = a
dt['Age'] = b
dt['Balance'] = c
dt['EstimatedSalary'] = d

In [28]:
dt

Unnamed: 0,CreditScore,Age,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,0.538,0.324324,0.000000,1,1,0.506735,1,0,0,0
1,0.516,0.310811,0.334031,1,1,0.562709,0,0,1,0
2,0.304,0.324324,0.636357,3,0,0.569654,1,0,0,0
3,0.698,0.283784,0.000000,2,0,0.469120,0,0,0,0
4,1.000,0.337838,0.500246,1,1,0.395400,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
9995,0.842,0.283784,0.000000,2,0,0.481341,0,0,0,1
9996,0.332,0.229730,0.228657,1,1,0.508490,0,0,0,1
9997,0.718,0.243243,0.000000,1,1,0.210390,1,0,0,0
9998,0.844,0.324324,0.299226,2,0,0.464429,1,1,0,1


# Resampling

In [29]:
dt.Exited.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [30]:
dt_yes=dt[dt.Exited==1]
dt_no=dt[dt.Exited==0]


In [31]:
#Before resampling
print(len(dt_yes),len(dt_no))

2037 7963


In [32]:
dt_yes_upsampled = dt_yes.sample(n=len(dt_no), replace=True, random_state=21)
print(len(dt_yes_upsampled))

7963


In [33]:
dt_upsampled=dt_no.append(dt_yes_upsampled).reset_index(drop=True)

In [34]:
dt_upsampled

Unnamed: 0,CreditScore,Age,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,0.516,0.310811,0.334031,1,1,0.562709,0,0,1,0
1,0.698,0.283784,0.000000,2,0,0.469120,0,0,0,0
2,1.000,0.337838,0.500246,1,1,0.395400,0,0,1,0
3,0.944,0.432432,0.000000,2,1,0.050261,0,0,0,1
4,0.302,0.351351,0.566170,2,1,0.374680,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
15921,0.624,0.554054,0.416776,1,0,0.040243,1,1,0,1
15922,0.520,0.432432,0.454196,2,0,0.982669,1,1,0,1
15923,0.660,0.256757,0.493452,1,0,0.408865,1,0,0,0
15924,0.340,0.378378,0.339646,1,0,0.586846,1,0,0,0


In [35]:
dt_upsampled.Exited.value_counts()

1    7963
0    7963
Name: Exited, dtype: int64

# split data into test and training set

In [36]:
from sklearn.model_selection import train_test_split
x = dt_upsampled.drop(['Exited'], axis=1) 
y = dt_upsampled['Exited']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 21)

# ann model

In [37]:
import keras
from keras.models import Sequential
from keras.layers import Dense

In [39]:
# Initialising the ANN
classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(Dense(output_dim = 6, init = 'uniform', activation = 'relu', input_dim = 9))

# Adding the second hidden layer
classifier.add(Dense(output_dim = 6, init = 'uniform', activation = 'relu'))

# Adding the output layer
classifier.add(Dense(output_dim = 1, init = 'uniform', activation = 'sigmoid'))

# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Fitting the ANN to the Training set
classifier.fit(x_train, y_train, batch_size = 10, nb_epoch = 100)

  """
  
  # This is added back by InteractiveShellApp.init_path()


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100


Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x29b88ba5d68>

In [40]:
# Predicting the Test set results
y_pred = classifier.predict(x_test)
y_pred = (y_pred > 0.5)

In [41]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [42]:
cm

array([[1115,  530],
       [ 245, 1296]], dtype=int64)

In [44]:
from sklearn.metrics import accuracy_score

In [45]:
accuracy_score(y_test, y_pred)

0.7567482736974263

# random forest

In [46]:
from sklearn.ensemble import RandomForestClassifier

In [81]:
forest_obj = RandomForestClassifier(n_estimators=100, max_depth=10, criterion='entropy')

In [82]:
forest_obj.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', max_depth=10)

In [83]:
pred_train=forest_obj.predict(x_train)

In [84]:
accuracy_score(y_train, pred_train)

0.878806907378336

In [85]:
pred_test=forest_obj.predict(x_test)

In [86]:
accuracy_score(y_test, pred_test)

0.8449466415568111

When depth of trees is 12, the accuracy on the training set is 93.8% while that on the test set is 88.6%. The accuracy on training set is 5% higher than the accuracy on test set which indicates a slight overfitting. We can decrease the depth of a tree in the forest because as trees get deeper, they tend to be more specific which results in overfitting. However, reducing tree depth may also decrease the accuracy. So we need to be careful when optimizing the parameters. We can also increase the number of trees in the forest which will help the model to be more generalized and thus reduce overfitting. 

For reducing overfitting ,we will take depth of trees as 10.

Also for depth =10, the accuracy obtained on the training set by using Random forest classifier is 10% higher than the accuracy obtained in ANN. similarly on the test set also ,the accuracy in case of Random forest is 9% higher than that of ANN. 

This shows that Random forest performs a lot better than the ANN.