# <center>Bank Churn</center>

# Importing data & libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
import time
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): with no category given this silences every warning, including
# deprecation warnings raised by the libraries used in the cells below.
warnings.filterwarnings('ignore')

In [None]:
# Read the churn CSV into the working DataFrame used by every later cell.
churn_csv = '../input/churn-for-bank-customers/churn.csv'
data = pd.read_csv(churn_csv)

In [None]:
# Preview the first rows of the loaded frame (head() shows five by default).
data.head()

In [None]:
# Use 'CustomerId' as the row index and drop the 'RowNumber' and 'Surname'
# columns.  Both steps are guarded so that running this cell a second time
# (when the columns are already gone) is a no-op instead of a KeyError.
if 'CustomerId' in data.columns:
    data = data.set_index('CustomerId')
data = data.drop(columns=['RowNumber', 'Surname'], errors='ignore')

# EDA

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
data.info()

In [None]:
# Count of customers per 'Exited' value, split by 'Geography'.
# The frame and column names are passed by keyword: recent seaborn releases
# treat the first positional argument as `data`, so passing the 'Exited'
# Series positionally no longer selects the x variable.
ax = sns.countplot(data=data, x='Exited', hue='Geography')
ax.set_title('Customers by exit status and geography');

In [None]:
# Box plot of 'EstimatedSalary' for each value of the 'Exited' column.
sns.boxplot(
    x='Exited',
    y='EstimatedSalary',
    data=data,
)

In [None]:
# Correlation of each numeric feature with 'Exited', sorted ascending, with the
# trivial self-correlation row removed.
# Only numeric columns are kept before calling corr(): pandas >= 2.0 raises on
# non-numeric columns instead of silently skipping them ('Geography' and
# 'Gender' are presumably still text here, since they are label-encoded in a
# later cell).
exited_corr = (
    data.select_dtypes(include='number')
    .corr()['Exited']
    .drop('Exited')
    .sort_values()
    .to_frame()
)

plt.figure(figsize=(5,5))
sns.heatmap(exited_corr, annot=True, cmap='CMRmap')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns whose values are replaced by integer codes.
to_be_encoded = ['Geography', 'Gender']
label_encoder = LabelEncoder()
dfs = []  # never filled in this cell
for column in to_be_encoded:
    # Build and show the value -> code lookup table before the column is
    # overwritten.  The encoder is fitted on the distinct values here and
    # refitted on the whole column afterwards.
    distinct_values = data[column].unique()
    lookup = pd.DataFrame({
        'Before Encoding': distinct_values,
        'After Encoding': label_encoder.fit_transform(distinct_values),
    })
    print(column, ':')
    display(lookup.sort_values(by=['After Encoding']))
    print('\n')
    # Replace the original values with their integer codes.
    data[column] = label_encoder.fit_transform(data[column])


In [None]:
# Target vector: the 'Exited' column.
Y = data['Exited']
# Feature matrix: every remaining column.
X = data.drop(columns='Exited')

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every feature against the target with the chi-squared statistic.
# k='all' keeps all features; only the fitted scores are used here.
fs = SelectKBest(score_func=chi2, k='all')
fs.fit(X, Y)

# Each feature's share of the total score, in percent (3 decimals).
# The total is computed once instead of on every loop iteration.
total_score = sum(fs.scores_)
per = [round((score / total_score) * 100, 3) for score in fs.scores_]

features_data = pd.DataFrame(
    {'Feature': X.columns, 'Scores': fs.scores_, 'Importance (%)': per}
).sort_values(by=['Scores'], ascending=False)

plt.figure(figsize=(9,4))
# x / y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form is rejected there.
sns.barplot(x='Importance (%)', y='Feature', orient='h', data=features_data, palette='CMRmap')

# Features contributing less than 0.005 % of the total score.
insignificant = features_data.loc[features_data['Importance (%)']<0.005]['Feature'].unique()
features_data = features_data.set_index('Feature')
features_data

# Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
split_options = dict(test_size=0.2, random_state=100)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)

In [None]:
from sklearn.metrics import accuracy_score,classification_report

# One seed shared by every stochastic estimator below, so that a fresh
# "Restart & Run All" reproduces the same fitted models and ranking.
SEED = 42

# Multi-layer perceptron
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(random_state=SEED)

# Bagging
from sklearn.ensemble import BaggingClassifier
bc = BaggingClassifier(random_state=SEED)

# Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(random_state=SEED)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(random_state=SEED)

# XGBoost.  `xgb` names the estimator instance (the model-fitting cell refers
# to it by that name), so the xgboost module itself is not aliased to `xgb`.
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state=SEED)

# Logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

# Random forest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=SEED)

# K-nearest neighbours.
# k is chosen by 5-fold cross-validation on the training split only, so the
# held-out test split is never used for tuning and the test accuracy reported
# later is not optimistically biased.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
accuracy = []  # [mean CV accuracy, k] for every candidate k
for k in range(1, 40):
    cv_scores = cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        X_train, Y_train, cv=5, scoring='accuracy',
    )
    accuracy.append([cv_scores.mean(), k])
# max() returns the first maximal entry, i.e. the smallest k on ties.
best_cv_accuracy, best_k = max(accuracy, key=lambda pair: pair[0])
knn = KNeighborsClassifier(n_neighbors=best_k)

# Support vector machine
from sklearn.svm import SVC
svc = SVC()

# Grid search over RBF-kernel SVM hyper-parameters (defined here, not fitted).
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1,1, 10, 100, 1000,2000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)

# Stacking: a small random forest and an SVM combined by logistic regression.
from sklearn.ensemble import StackingClassifier
estimators=[('rf', RandomForestClassifier(n_estimators=10, random_state=SEED)),
            ('svr',SVC(random_state=SEED))]
stc = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

print('Models Imported')

In [None]:
# Fit every candidate on the training split, recording the wall-clock fit
# time and the accuracy on the test split.
models = [stc, bc, gbc, ada, xgb, lr, rfc, knn, svc, mlp]
model_acc, model_time = [], []
for estimator in models:
    start = time.time()
    estimator.fit(X_train, Y_train)
    stop = time.time()
    # Only fitting is timed; prediction happens after `stop` is taken.
    model_time.append(stop - start)
    model_acc.append(accuracy_score(Y_test, estimator.predict(X_test)))

# One row per model; the 'Models' column holds the fitted estimator objects.
models = pd.DataFrame({
    'Models': models,
    'Accuracy': model_acc,
    'Runtime (s)': model_time,
})

In [None]:
# Rank the models from most to least accurate and renumber the rows from 0.
models = models.sort_values(by=['Accuracy'], ascending=False).reset_index(drop=True)
# Keep the top-ranked estimator object before the column is turned into text.
best = models['Models'][0]
# Show each model by the text that precedes the first "(" of its string form.
models['Models'] = models['Models'].astype(str).str.split("(", n=1).str[0]
models

Testing if churn can be reduced