# Bank Churn No Churn Identification

The case study is from an open-source dataset from Kaggle. 

Link to the Kaggle project site:

https://www.kaggle.com/barelydedicated/bank-customer-churn-modeling

Given a Bank customer, can we build a classifier that can determine whether they will leave or not using Neural networks?

Let us see how the Machine learning Algo based model perform on this dataset.

### Read the data set

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("Churn_Modelling.csv")

In [3]:
data.head(2)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0


In [4]:
data.head(2).T

Unnamed: 0,0,1
RowNumber,1,2
CustomerId,15634602,15647311
Surname,Hargrave,Hill
CreditScore,619,608
Geography,France,Spain
Gender,Female,Female
Age,42,41
Tenure,2,1
Balance,0,83807.9
NumOfProducts,1,1


# Process non-numeric data

Separate data into train and test sets ( 70:30). Use random seed as 7 

# 1. Build best accuracy using Logistic Regression to find defaulters
1. Calculate train and test accuracies
2. Print 'Confusion Matrix'
3. Calculate Precision and Recall
4. Calculate ROC-AUC

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

#data cleaning
data=pd.get_dummies(data,drop_first=True)

# Decision tree in Python can take only numerical / categorical colums. It cannot take string / obeject types. 
# The following code loops through each column and checks if the column type is object then converts those columns
# into categorical with each distinct value becoming a category or code.

for feature in data.columns: # Loop through all columns in the dataframe
    if data[feature].dtype == 'object': # Only apply for columns with categorical strings
        data[feature] = pd.Categorical(data[feature]).codes # Replace strings with an integer

 # many columns are of type object i.e. strings. These need to be converted to ordinal type
x = data.drop(["Exited"], axis=1)
y = data[["Exited"]]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

##transform the data
scalar=StandardScaler()
scalar.fit(x_train) #Note: train data only used to fit;not test

x_train_std=scalar.transform(x_train)  # this will do (mean-m)/np.sqrt(v)
x_test_std=scalar.transform(x_test)

m1 = LogisticRegression()
m1.fit(x_train, y_train)
m1_score_train = m1.score(x_train, y_train) #test accuracy
m1_score = m1.score(x_test, y_test) #test accuracy
print('Model score :',m1_score_train,'m1_score_test=',m1_score)

  y = column_or_1d(y, warn=True)


Model score : 0.7985714285714286 m1_score_test= 0.791


In [6]:
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score, auc, roc_curve
# make predictions
predicted = m1.predict(x_test)
metrics.confusion_matrix(predicted, y_test)

array([[2373,  627],
       [   0,    0]], dtype=int64)

In [7]:
#Precision and Recall of test
tAccuracy = accuracy_score(y_test, predicted)
tRecall = recall_score(y_test, predicted)
tPrecision = precision_score(y_test, predicted)
print('tAccuracy=',tAccuracy,',tRecall=',tRecall,',tPrecision=',tPrecision)

tAccuracy= 0.791 ,tRecall= 0.0 ,tPrecision= 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
#rocauc
from sklearn.metrics import roc_curve, auc
y_predictProb = m1.predict_proba(x_test)
fpr, tpr, thresholds = roc_curve(y_test, y_predictProb[::,1])
roc_auc = auc(fpr, tpr)
roc_auc

0.5702369358633914

# 2. Build best accuracy using Naive Bayes to find defaulters
1. Calculate train and test accuracies
2. Print 'Confusion Matrix'
3. Calculate Precision and Recall
4. Calculate ROC-AUC

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

#data cleaning
#data cleaning
data=pd.get_dummies(data,drop_first=True)

# Decision tree in Python can take only numerical / categorical colums. It cannot take string / obeject types. 
# The following code loops through each column and checks if the column type is object then converts those columns
# into categorical with each distinct value becoming a category or code.

for feature in data.columns: # Loop through all columns in the dataframe
    if data[feature].dtype == 'object': # Only apply for columns with categorical strings
        data[feature] = pd.Categorical(data[feature]).codes # Replace strings with an integer

 # many columns are of type object i.e. strings. These need to be converted to ordinal type
x = data.drop(["Exited"], axis=1)
y = data[["Exited"]]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

##transform the data
scalar=StandardScaler()
scalar.fit(x_train) #Note: train data only used to fit;not test

x_train_std=scalar.transform(x_train)  # this will do (mean-m)/np.sqrt(v)
x_test_std=scalar.transform(x_test)

m2 = GaussianNB()
m2.fit(x_train, y_train)
m2_score_train = m2.score(x_train, y_train) #test accuracy
m2_score = m2.score(x_test, y_test) #test accuracy
print('Model score train :',m2_score_train,'m2_score_test=',m2_score)

  y = column_or_1d(y, warn=True)


Model score train : 0.7877142857142857 m2_score_test= 0.782


In [10]:
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score, auc, roc_curve
# make predictions
predicted = m2.predict(x_test)
metrics.confusion_matrix(predicted, y_test)

array([[2312,  593],
       [  61,   34]], dtype=int64)

In [11]:
#Precision and Recall of train
print('Precision and Recall of test')
tAccuracy = accuracy_score(y_test, predicted)
tRecall = recall_score(y_test, predicted)
tPrecision = precision_score(y_test, predicted)
print('tAccuracy=',tAccuracy,',tRecall=',tRecall,',tPrecision=',tPrecision)

Precision and Recall of test
tAccuracy= 0.782 ,tRecall= 0.05422647527910686 ,tPrecision= 0.35789473684210527


In [12]:
#rocauc
from sklearn.metrics import roc_curve, auc
y_predictProb = m2.predict_proba(x_test)
fpr, tpr, thresholds = roc_curve(y_test, y_predictProb[::,1])
roc_auc = auc(fpr, tpr)
roc_auc

0.751022098017906

# 3. Build best accuracy using KNN Classifier to find defaulters
1. Calculate train and test accuracies
2. Print 'Confusion Matrix'
3. Calculate Precision and Recall

In [13]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

#data cleaning
data=pd.get_dummies(data,drop_first=True)

# Decision tree in Python can take only numerical / categorical colums. It cannot take string / obeject types. 
# The following code loops through each column and checks if the column type is object then converts those columns
# into categorical with each distinct value becoming a category or code.

for feature in data.columns: # Loop through all columns in the dataframe
    if data[feature].dtype == 'object': # Only apply for columns with categorical strings
        data[feature] = pd.Categorical(data[feature]).codes # Replace strings with an integer

 # many columns are of type object i.e. strings. These need to be converted to ordinal type
x = data.drop(["Exited"], axis=1)
y = data[["Exited"]]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

##transform the data
scalar=StandardScaler()
scalar.fit(x_train) #Note: train data only used to fit;not test

x_train_std=scalar.transform(x_train)  # this will do (mean-m)/np.sqrt(v)
x_test_std=scalar.transform(x_test)

NNH = KNeighborsClassifier(n_neighbors=3)
#NNH = KNeighborsClassifier()
NNH.fit(x_train, y_train)
m2_score=NNH.score(x_test, y_test)
m2_score_train=NNH.score(x_train, y_train)
print('Model score train :',m2_score_train,'m2_score_test=',m2_score)




Model score train : 0.8348571428571429 m2_score_test= 0.7356666666666667
