# Bank churn prediction

In [1]:
# importing libraries
import os
import numpy as np
from scipy import stats
from scipy.stats import zscore
from scipy.spatial.distance import cdist
from scipy.spatial.distance import pdist
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering 
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from sklearn import metrics
from sklearn.metrics import silhouette_score
from os import system
import warnings
warnings.filterwarnings('ignore')

## I. Exploratory data analysis

In [34]:
'''Reading the data and showcasing it'''
# Load the bank customer table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../Data/bank_nn.csv')
# Keep the column labels of the freshly loaded frame under `columns`.
columns = df.columns
# Show the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [35]:
# Drop the per-customer identifier columns (row number, customer id, surname).
# `df` is rebound instead of mutated with inplace=True, and errors='ignore' skips
# labels that are already gone, so re-running this cell on the same kernel does not
# raise KeyError. Trade-off: a CSV that lacks these columns is accepted silently.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

In [36]:
# Dataframe information: dimensions, null check, column dtypes and summary statistics.
s1 = df.shape  # (rows, columns) tuple; stored once and reused for the print below
print(s1) # Number of rows and columns in the dataframe
print('\033[1m'+ 'Null values: ' , df.isnull().values.any()) # To check for null values in the dataframe and print in bold
print('\033[0m') # To remove bold formatting
# info() writes its report to stdout itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info() # To check the data type of the columns of the dataframe
print(df.describe().T) # To review the statistical summary of the dataframe
df.head() # Last expression of the cell: rendered as the cell output

(10000, 11)
[1mNull values:  False
[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB
None
                   count           mean           std     min       25%  \
CreditScore      10000.0     650.528800     96.653299  350.00    584.00   
Age              10000.0      38.9

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [37]:
# Encode the two text columns numerically:
#   * Geography -> integer codes taken from `rstruct` (France=0, Spain=1, Germany=2)
#   * Gender    -> one-hot indicator columns (Gender_Female / Gender_Male)
rstruct = {'Geography': {'France': 0, 'Spain': 1, 'Germany': 2}}
onehotcol = ['Gender']
# Newer pandas releases deprecate the implicit downcast of the object column that
# replace() returns, so the integer dtype is requested explicitly. The cast is a no-op
# where replace() already produced int64, and it raises if a country missing from
# `rstruct` was left unmapped instead of letting a stray string through.
df = df.replace(rstruct).astype({'Geography': 'int64'})
# dtype='uint8' pins 0/1 indicator values; pandas >= 2.0 would otherwise return booleans.
df = pd.get_dummies(df, columns=onehotcol, dtype='uint8')
df.head()

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Gender_Female,Gender_Male
0,619,0,42,2,0.0,1,1,1,101348.88,1,1,0
1,608,1,41,1,83807.86,1,0,1,112542.58,0,1,0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1,0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1,0
4,850,1,43,2,125510.82,1,1,1,79084.1,0,1,0


In [38]:
# Section marker for the bivariate analysis. The bare string is the cell's last
# expression, so the notebook echoes it back as the cell output.
'''Bivariate analysis'''

'Bivariate analysis'