# Bank Churn Prediction 

Objective: 
Given a Bank customer, build a neural network based classifier that can determine whether they will leave or not in the next 6 months.  
 
Context:  
Businesses like banks which provide service have to worry about problem of 'Churn' i.e. customers leaving and joining another service provider. It is important to understand which aspects of the service influence a customer's decision in this regard. Management can concentrate efforts on improvement of service, keeping in mind these priorities.  
 
Data Description:  
The case study is from an open-source dataset from Kaggle.  The dataset contains 10,000 sample points with 14 distinct features such as CustomerId, CreditScore, Geography, Gender, Age, Tenure, Balance etc. 

In [262]:
#Load basic libraries

import numpy as np
import pandas as pd
import math
import numbers
import random
import operator
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import random
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from scipy.stats import chisquare
%matplotlib inline

#Load machine learning libraries

from sklearn import ensemble, metrics
from sklearn import model_selection
#from sklearn.model_selection import cross_validation
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

#Load neural network libraries

import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow import keras

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Loading and initial analysis

In [263]:
churn_df0 = pd.read_csv("bank.csv")
churn_df0.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [264]:
churn_df0.shape

(10000, 14)

In [265]:
churn_df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [266]:
churn_df0.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
RowNumber,10000.0,5000.5,2886.89568,1.0,2500.75,5000.5,7500.25,10000.0
CustomerId,10000.0,15690940.0,71936.186123,15565701.0,15628528.25,15690740.0,15753230.0,15815690.0
CreditScore,10000.0,650.5288,96.653299,350.0,584.0,652.0,718.0,850.0
Age,10000.0,38.9218,10.487806,18.0,32.0,37.0,44.0,92.0
Tenure,10000.0,5.0128,2.892174,0.0,3.0,5.0,7.0,10.0
Balance,10000.0,76485.89,62397.405202,0.0,0.0,97198.54,127644.2,250898.09
NumOfProducts,10000.0,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
HasCrCard,10000.0,0.7055,0.45584,0.0,0.0,1.0,1.0,1.0
IsActiveMember,10000.0,0.5151,0.499797,0.0,0.0,1.0,1.0,1.0
EstimatedSalary,10000.0,100090.2,57510.492818,11.58,51002.11,100193.9,149388.2,199992.48


In [267]:
#Check for missing values, incorrect data, duplicate data 

print ("\nMissing values :  ", churn_df0.isnull().sum().values.sum())

print ("\nUnique values :  \n",churn_df0.nunique())

print ("\nDuplicate values :  \n",churn_df0.duplicated().sum())


Missing values :   0

Unique values :  
 RowNumber          10000
CustomerId         10000
Surname             2932
CreditScore          460
Geography              3
Gender                 2
Age                   70
Tenure                11
Balance             6382
NumOfProducts          4
HasCrCard              2
IsActiveMember         2
EstimatedSalary     9999
Exited                 2
dtype: int64

Duplicate values :  
 0


In [327]:
#Dropping columns with unique values for all customers

churn_df3 = churn_df0.set_index('Exited')
churn_df3.drop('RowNumber', axis=1, inplace = True)
churn_df3.drop('CustomerId', axis=1, inplace = True)
churn_df3.drop('Surname', axis=1, inplace = True)
churn_df3.drop('Geography', axis=1, inplace = True)
churn_df3.drop('Gender', axis=1, inplace = True)

churn_df3

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
Exited,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,619,42,2,0.00,1,1,1,101348.88
0,608,41,1,83807.86,1,0,1,112542.58
1,502,42,8,159660.80,3,1,0,113931.57
0,699,39,1,0.00,2,0,0,93826.63
0,850,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...
0,771,39,5,0.00,2,1,0,96270.64
0,516,35,10,57369.61,1,1,1,101699.77
1,709,36,7,0.00,1,0,1,42085.58
1,772,42,3,75075.31,2,1,0,92888.52


In [328]:
#Since "Exited" is categorical and being our target column, lets get the size of data based on it

churn_df3.groupby(['Exited']).size()

Exited
0    7963
1    2037
dtype: int64

In [330]:
#Encoding categorical data

X4 = churn_df3.iloc[:, 0:7].values
y4 = churn_df3.iloc[:, 7].values

label_encoder_x_1 = LabelEncoder()
X4[:,0] = label_encoder_x_1.fit_transform(X4[:,0])
transformer = ColumnTransformer(
    transformers=[
       ("OneHot",        # Just a name
       OneHotEncoder(),  # The transformer class
      [0,6]            # The column(s) to be applied on.
     )
 ],
    #remainder='passthrough' # donot apply anything to the remaining columns
)
X4 = transformer.fit_transform(X4.tolist())
X4 = X4.astype('float64')

In [331]:
X4.shape

(10000, 462)

In [332]:
#Splitting the dataset into the Training set and Test set

X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size = 0.2, random_state = 0)

In [333]:
#Feature Scaling

sc = preprocessing.Normalizer()
X3_train1 = sc.fit_transform(X3_train)
X3_test1 = sc.transform(X3_test)

In [334]:
X3_train1.shape

(8000, 10001)

In [335]:
X3_test1.shape

(2000, 10001)

In [336]:
y3_train = tf.keras.utils.to_categorical(y3_train,num_classes=10)
y3_test = tf.keras.utils.to_categorical(y3_test,num_classes=10)

In [337]:
#Initialising the Artificial Neural Network

classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = 84))

# Adding the second hidden layer
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))

# Adding the output layer
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Fitting the ANN to the Training set
classifier.fit(X3_train1, y3_train, batch_size = 10, epochs = 100)

ValueError: Error when checking input: expected dense_66_input to have shape (84,) but got array with shape (10001,)

In [None]:
#Predicting the Test set results

y_pred = classifier.predict(X1_test)
y_pred = (y_pred > 0.5)

In [None]:
#Confusion Matrix

cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
#Accuracy score

accuracy_score(y_test,y_pred)