In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.layers import Dense,Conv1D,Flatten
from tensorflow.keras.models import Sequential, Model
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from plotly import graph_objects as go
from sklearn.metrics import accuracy_score,confusion_matrix
import plotly.offline as py
import xgboost as xgb
%matplotlib inline

## Reading and Cleaning dataset

In [None]:
# Load the bank customer churn dataset into a DataFrame.
data = pd.read_csv(
    '../input/bank-customer-churn-modeling/Churn_Modelling.csv',
    engine='python',
)

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

In [None]:
# Column dtypes and non-null counts -- a quick check for missing values.
data.info()

## Exploratory Data Analysis

In [None]:
# Share of churned vs. retained customers.
# Order the counts by class value (0 first, then 1) instead of by frequency,
# so the fixed labels/colours below always line up with the right class even
# if the class balance of the data changes.
sizes = data['Exited'].value_counts().sort_index()
colors = ["grey", "purple"]
plt.rcParams['figure.figsize'] = 5, 5
plt.pie(
    sizes,
    explode=None,
    labels=['No', 'Yes'],  # 0 -> 'No' (stayed), 1 -> 'Yes' (churned)
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=270,
)
plt.title('Percentage of Churn in Dataset')
plt.show()

### Gender wise plotting 

In [None]:
# Mean of `Exited` per gender, shown as the churn rate for each group.
df_plot = data.groupby('Gender')['Exited'].mean().reset_index()

gender_bars = go.Bar(
    x=df_plot['Gender'],
    y=df_plot['Exited'],
    width=[0.5, 0.5],
    marker={'color': ['green', 'blue']},
)
plot_data = [gender_bars]

plot_layout = go.Layout(
    title='Gender',
    xaxis=dict(type="category"),
    yaxis=dict(title="Churn Rate"),
    plot_bgcolor='rgb(243,243,243)',
    paper_bgcolor='rgb(243,243,243)',
)

fig = go.Figure(data=plot_data, layout=plot_layout)
py.iplot(fig)

We can see that females have a higher churn rate than males.

### CreditScore plotting

In [None]:
# Mean of `Exited` for every distinct credit score, drawn as a scatter plot.
df_plot = data.groupby('CreditScore')['Exited'].mean().reset_index()

marker_style = {
    'size': 7,
    'line': {'width': 1},
    'color': 'blue',
    'opacity': 0.8,
}
credit_trace = go.Scatter(
    x=df_plot['CreditScore'],
    y=df_plot['Exited'],
    mode='markers',
    name='Low',
    marker=marker_style,
)
plot_data = [credit_trace]

plot_layout = go.Layout(
    title='Credit Score based Churn rate',
    xaxis=dict(title="Credit score"),
    yaxis=dict(title="Churn Rate"),
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)

fig = go.Figure(data=plot_data, layout=plot_layout)
py.iplot(fig)

Customers with lower credit scores have a higher chance of churning.

### Based on Tenure

In [None]:
# Mean of `Exited` for every tenure value, drawn as a scatter plot.
df_plot = data.groupby('Tenure')['Exited'].mean().reset_index()

marker_style = {
    'size': 6,
    'line': {'width': 1},
    'color': 'red',
    'opacity': 0.8,
}
tenure_trace = go.Scatter(
    x=df_plot['Tenure'],
    y=df_plot['Exited'],
    mode='markers',
    name='Low',
    marker=marker_style,
)
plot_data = [tenure_trace]

plot_layout = go.Layout(
    title='Tenure based Churn rate',
    xaxis=dict(title="Tenure"),
    yaxis=dict(title="Churn Rate"),
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)

fig = go.Figure(data=plot_data, layout=plot_layout)
py.iplot(fig)

Customers with a longer tenure are less susceptible to churning.

## Transforming and Splitting data

In [None]:
# Features: every column from position 3 up to (but excluding) the last one.
# NOTE(review): the first three columns are presumably identifiers -- confirm.
# Take an explicit copy so that the in-place label encoding applied to `X`
# afterwards neither raises SettingWithCopyWarning nor depends on whether
# pandas returned a view of `data`.
X = data.iloc[:, 3:-1].copy()
# Target: the last column, as a NumPy array.
y = data.iloc[:, -1].values

The columns of object category are transformed into numeric categorical labels.

In [None]:
# Replace every object-dtype column with integer labels; a fresh encoder is
# fitted for each column.
object_columns = X.select_dtypes(include=['object']).columns
for column_name in object_columns:
    X[column_name] = LabelEncoder().fit_transform(X[column_name])

In [None]:
# Preview the feature matrix after label encoding.
X.head()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Standardise the features so that each has mean 0 and standard deviation 1. The scaler is fitted on the training set only and then applied to the test set.

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

## Tensorflow model

In [None]:
# Feed-forward network with one hidden layer for binary churn classification.
model = Sequential()
# Take the input width from the training matrix instead of hard-coding 10, so
# the model keeps working if the feature set changes. An explicit Input also
# replaces the former Flatten(input_shape=...) layer, which did nothing for
# 2-D tabular input and relied on the deprecated `input_shape` layer argument.
model.add(tf.keras.Input(shape=(x_train.shape[1],)))
model.add(Dense(100, activation='relu'))
# A single sigmoid unit outputs a value in (0, 1), used as the churn probability.
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Binary cross-entropy loss with the Adam optimiser; accuracy is tracked per epoch.
model.compile(
    loss='BinaryCrossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 100 epochs in mini-batches of 64, using 10% of the training data
# for validation.
model.fit(
    x_train,
    y_train,
    epochs=100,
    batch_size=64,
    validation_split=0.1,
)

Taking 0.5 as the threshold, the predicted probabilities are converted to class labels 0 and 1.

In [None]:
# Sigmoid outputs for the test set ...
churn_probabilities = model.predict(x_test)
# ... turned into hard labels: 1 when the output exceeds 0.5, otherwise 0.
preds = np.where(churn_probabilities > 0.5, 1, 0)

In [None]:
# Fraction of test customers whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=preds)


The accuracy of the model is around <b>85.9%</b>

The confusion matrix for the model test prediction results is shown below

In [None]:
# Confusion matrix as a labelled table: rows are true classes, columns are
# predicted classes.
conf = pd.DataFrame(
    confusion_matrix(y_test, preds),
    index=['Actual_0', 'Actual_1'],
    columns=['Predicted_0', 'Predicted_1'],
)
conf

The model was able to predict most of class 0 (not churned) correctly, but did less well on class 1 (churned). The data is imbalanced, since most of the customers have not churned.

## Xgboost Model

In [None]:
# Gradient-boosted tree classifier trained on the same scaled features.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    max_depth=5,
    learning_rate=0.08,
    n_jobs=-1,
)
xgb_model.fit(x_train, y_train)

# Report accuracy on both splits so over-fitting is easy to spot.
train_accuracy = xgb_model.score(x_train, y_train)
test_accuracy = xgb_model.score(x_test, y_test)
print('Accuracy of XGB classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of XGB classifier on test set: {:.2f}'.format(test_accuracy))

The accuracy of this model improves to 87%.

The confusion matrix for the model test prediction results is shown below

In [None]:
# Class predictions of the XGBoost model on the test set.
# Note: this overwrites the `preds` produced earlier by the neural network.
preds=xgb_model.predict(x_test)

In [None]:
# Confusion matrix as a labelled table: rows are true classes, columns are
# predicted classes.
conf = pd.DataFrame(
    confusion_matrix(y_test, preds),
    index=['Actual_0', 'Actual_1'],
    columns=['Predicted_0', 'Predicted_1'],
)
conf

The prediction for category 1 (churned) is improved