In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sklearn 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

In [None]:
# Location of the churn dataset inside the Kaggle input mount.
DATA_PATH = '/kaggle/input/churn-modelling/Churn_Modelling.csv'

data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
data.shape

In [None]:
data.info()

Here we observe that we have 3 categorical features, namely Surname, Geography and Gender. All the remaining features are numerical, i.e. either int or float.

In [None]:
data.describe()

Using the describe function always provides you with interesting insights into your data. As you can see, we have the mean value for tenure, which helps us gauge how long a customer typically stays with the bank. There's also additional information such as the minimum and maximum balance, estimated salary, etc.

In [None]:
# Number of customers per value of the target column.
data['Exited'].value_counts()

We observe that amongst 10000 people, **2037** customers have stopped dealing with the organization. This makes up about **20%** of total customers. This is also a class imbalance problem that needs to be dealt with properly, so that our model doesn't overfit to the majority class (customers who stay). More on this later.

# Exploratory Data Analysis

In [None]:
# Bar chart of the number of rows per value of the target column.
sns.countplot(data=data, x='Exited')

In [None]:
# Bar chart of the number of rows per gender.
sns.countplot(data=data, x='Gender')

There's not much difference in the proportion of male and female genders as inferred from the countplot.

In [None]:
# Bar chart of the number of rows per country.
sns.countplot(data=data, x='Geography')

Most of our customers (approx. 50%) live in **France**, while Germany and Spain have almost the same number of customers.

In [None]:
# Correlation is only defined for numeric columns. At this point the frame
# still holds string columns (Surname, Geography, Gender), and pandas >= 2.0
# raises on them instead of silently skipping them, so select the numeric
# columns explicitly before computing the matrix.
numeric_corr = data.select_dtypes(include='number').corr()

plt.figure(figsize=(12, 8))
sns.heatmap(numeric_corr, annot=True, cmap='viridis')

Our target variable is **Exited** and from the correlation matrix we see that there are no strong correlations with any of the features corresponding to the target variable. **Age** is slightly correlated.

In [None]:
# Age plotted against the binary churn flag on a wide canvas.
plt.figure(figsize=(12, 6))
sns.scatterplot(data=data, x='Age', y='Exited')

From the scatterplot, observe that customers above the age of 73 have not exited, apart from an outlier around the age of 83. This suggests that older customers usually don't leave the organization.

In [None]:
# Balance against estimated salary, coloured by whether the customer exited.
plt.figure(figsize=(12, 8))
sns.scatterplot(data=data, x='Balance', y='EstimatedSalary', hue='Exited')

In [None]:
# Distribution of Age for each value of the target column.
plt.figure(figsize=(10, 8))
sns.boxplot(x='Exited', y='Age', data=data)

From the boxplot we can see that the median age of people who exit is 45 compared to 36 of the people who stay. We see a large number of outliers in the segment of people who stay.

In [None]:
# Distribution of Balance for each value of the target column.
plt.figure(figsize=(10, 8))
sns.boxplot(x='Exited', y='Balance', data=data)

We see that people who exit have a slightly larger median balance compared to those who stay.

In [None]:
# Counts of each Exited value within the inactive (0) and active (1) groups.
exited_by_activity = data.groupby('IsActiveMember')['Exited']
exited_by_activity.value_counts()

**45%** customers are inactive and have not exited from the organization. We can safely assume that these people either have forgotten about their account or else have kept their money in savings. The cause of concern is that **36%** customers who were active in using the services have **exited**.

In [None]:
# Mean account balance within each IsActiveMember group.
balance_by_activity = data.groupby('IsActiveMember')['Balance']
balance_by_activity.mean()

Not a major difference in the balance of active and inactive members.

# Data Cleaning

In [None]:
# One-hot encode Geography: build one indicator column per country, then
# swap the original text column for those indicators (appended at the end).
geography_dummies = pd.get_dummies(data['Geography'])
data = pd.concat([data.drop(columns='Geography'), geography_dummies], axis=1)

We convert the Geography feature into three separate features, one per country, and drop the original Geography feature.

In [None]:
# Binary-encode Gender: 1 where the value is exactly 'Female', 0 for anything else.
data['Gender'] = (data['Gender'] == 'Female').astype('int64')

We convert the Gender from categorical to numerical feature by assigning the tag of 1 to Female and 0 to Male. No sexism intended

You may wonder why we are converting all categorical features into numerical values. The reason is that most Machine Learning algorithms **expect numerical values** as input.
In the final step of cleaning the data we'll drop the unnecessary columns in our dataframe.

In [None]:
# Remove the row counter, customer id and surname columns from the feature frame.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

In [None]:
# Detach the label column: `pop` returns it and removes it from `data` in one step.
target = data.pop('Exited')

In [None]:
data.head()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature column. The scaler is fitted on all rows, i.e.
# before any train/test split happens later in the notebook.
# With scikit-learn's default output settings `data` is a NumPy array from here on.
ss = StandardScaler()
data = ss.fit(data).transform(data)

We standardize the data before running our model on it

In [None]:
from imblearn.over_sampling import SMOTE
# Number of neighbours handed to SMOTE's k_neighbors setting.
k = 1
# Seeded SMOTE instance; 'auto' is the sampling strategy passed through as-is.
sm = SMOTE(k_neighbors=k, random_state=42, sampling_strategy='auto')
# Resample the full (scaled) feature matrix together with its labels.
data_res, target_res = sm.fit_resample(X=data, y=target)

The class imbalance problem is addressed by creating synthetic samples using the SMOTE (Synthetic Minority Over-sampling Technique). We could've upsampled the minority class or downsampled the majority class but that could still result in overfitting/underfitting issues. 

# Modelling

We'll allocate **80%** of our data for training and the remaining **20%** as the test set.

In [None]:
from sklearn.model_selection import train_test_split
# Split the ORIGINAL rows first, keeping the churn ratio identical in both
# parts (stratify). Splitting data that was already oversampled would put
# synthetic rows, interpolated from rows that end up in training, into the
# test set and inflate every metric reported below.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=0, stratify=target
)

# Balance the classes in the training portion only, reusing the SMOTE
# instance `sm` configured earlier in the notebook. The test set keeps the
# real class distribution.
X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score,f1_score, roc_curve, classification_report,roc_auc_score
def evaluation(X_test, clf, y_test):
  """Print a classification report, the F1-score and the accuracy of ``clf``.

  Parameters
  ----------
  X_test : features passed to ``clf.predict``.
  clf : fitted estimator exposing ``predict``.
  y_test : true labels the predictions are compared against.

  Returns
  -------
  None. Everything is written to standard output; F1 and accuracy are
  shown as percentages rounded to two decimals.
  """
  predictions = clf.predict(X_test)

  print('CLASSIFICATION REPORT')
  report = classification_report(y_test, predictions)
  print(report)

  print('F1-Score')
  f1_percent = np.round(f1_score(y_test, predictions) * 100, 2)
  print(f1_percent)

  print('Accuracy')
  accuracy_percent = np.round(accuracy_score(y_test, predictions) * 100, 2)
  print(accuracy_percent, '%')

def plot_loss(model, X=None, y=None):
  """Plot the ROC curve of ``model`` and print its AUC-ROC as a percentage.

  Despite its name, this function does not plot a loss; it draws the ROC
  curve together with the diagonal chance line.

  Parameters
  ----------
  model : fitted classifier exposing ``predict_proba``; column 1 of its
      output is used as the positive-class score.
  X : features to score. Defaults to the notebook-level ``X_test``.
  y : true labels for ``X``. Defaults to the notebook-level ``y_test``.

  Returns
  -------
  None. The curve is drawn on the current matplotlib axes and the AUC
  (times 100, rounded to two decimals) is printed.
  """
  # Fall back to the notebook globals so existing `plot_loss(model)` calls
  # keep working unchanged.
  if X is None:
    X = X_test
  if y is None:
    y = y_test

  prob = model.predict_proba(X)[:, 1]
  fpr, tpr, _ = roc_curve(y, prob)
  plt.plot(fpr, tpr, linewidth=2)
  plt.plot([0, 1], [0, 1], 'k--')  # chance-level reference line
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  print('AUC-ROC')
  print(np.round(roc_auc_score(y, prob)*100, 2))

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline model: logistic regression with library defaults.
# `fit` returns the estimator itself, so construction and training chain.
lr = LogisticRegression().fit(X_train, y_train)
evaluation(X_test=X_test, clf=lr, y_test=y_test)
plot_loss(model=lr)

We got an accuracy of 71% and auc roc score of 77.4 using Logistic Regression. Let's see if other algorithm gives us better results.

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with library defaults. A fixed random_state makes the
# scores printed by this cell repeatable from run to run.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)
evaluation(X_test, dtc, y_test)
plot_loss(dtc)

Using Decision Tree Classifier we got an accuracy and auc roc score of 84.21. This is a lot better than Logistic Regression

In [None]:
from sklearn.svm import SVC
# Support vector classifier. probability=True is required because
# plot_loss calls predict_proba; a fixed random_state keeps the cell's
# reported scores repeatable.
svc_model = SVC(probability=True, random_state=42)
svc_model.fit(X_train, y_train)
evaluation(X_test, svc_model, y_test)
plot_loss(svc_model)

Using SVC we got an accuracy and roc-auc score of 79.69. This is better than Logistic Regression, but Decision Tree is the best amongst all three.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with library defaults. A fixed random_state makes the
# scores printed by this cell repeatable from run to run.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)
evaluation(X_test, gbc, y_test)
plot_loss(gbc)

Gradient Boosting gives an accuracy of 85.37% and auc roc score of 93. 

In [None]:
from xgboost import XGBClassifier
# XGBoost with a hand-picked learning rate, tree count and depth.
# The seed is set explicitly, in line with the other seeded models, so the
# reported scores do not depend on the library's default seed.
xgb_clf = XGBClassifier(
    learning_rate=0.3,
    n_estimators=180,
    max_depth=3,
    random_state=42,
)
xgb_clf.fit(X_train, y_train)
evaluation(X_test, xgb_clf, y_test)
plot_loss(xgb_clf)

XGB Classifier gives an accuracy of 90.2 and auc roc score of 96.1. The performance is strong and we get a good balance between accuracy and auc roc score.

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with library defaults. Bootstrapping and feature
# sub-sampling are random, so pin random_state to make the scores printed
# by this cell repeatable from run to run.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
evaluation(X_test, rfc, y_test)
plot_loss(rfc)

Random Forest Classifier gives an accuracy of 90.5 and auc roc score of 96.6. The performance is comparable to XGB Classifier, but slightly better.

Amongst all the Machine Learning algorithms we employed, Random Forest Classifier outperformed all the algorithms and gave us the best performance. Let's also try employing a deep neural network and see how it performs 

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

# Fully connected network: three ReLU hidden layers (20, 15, 10 units)
# followed by a single sigmoid output unit.
model = Sequential([
    Dense(20, activation='relu'),
    Dense(15, activation='relu'),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Adam optimiser with binary cross-entropy loss; accuracy is tracked during training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the training split for 50 epochs in mini-batches of 128 rows.
model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=50,
)

In [None]:
from sklearn.metrics import confusion_matrix

# Threshold the network outputs at 0.5 in one vectorised step instead of a
# Python loop, and flatten to a 1-D integer label vector so the predictions
# line up with y_test. The comparison stays strict (> 0.5), as before.
y_prob = model.predict(X_test)
y_pred = (y_prob > 0.5).astype(int).ravel()

cm = confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy as a percentage rounded to two decimals; reused by the heatmap title.
accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
print(accuracy)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Annotated heatmap of the confusion matrix `cm`, titled with the accuracy.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(
    cm,
    annot=True,
    fmt='.0f',
    linewidths=.1,
    square=True,
    cmap='viridis',
    ax=ax,
)
ax.set_xlabel('Prediction')
ax.set_title('Accuracy: {0}'.format(round(accuracy, 2)))
ax.set_ylabel('Actual')
plt.show()

The deep neural network gave an accuracy of 79% and fell short of Random Forest by a large margin.

**This is the first proper notebook that I've written and if it helped you in some way please upvote it as it would motivate me to write and post more. Thanks for reading.**