In [None]:
from IPython.display import Image

# Banner illustration for the notebook; the image is referenced by URL, not stored locally.
banner_url = 'https://atrium.ai/wp-content/uploads/elementor/thumbs/real_cost_retention-ooi87zuk2wbz6qkh6nglnfwpm2vkbw4t3idvdyf3bc.jpg'
Image(url=banner_url)

<h3>Summary</h3> Around 20% of the bank's customers in this dataset have exited. Two models for predicting customer attrition are devised, achieving ROC AUC (area under the curve) scores of 0.702 and 0.77 respectively. Age is the most important feature, being positively correlated with the probability that a customer will exit. Four customer groups are identified, one of which is 14% more likely to exit than the mean. This document provides a framework for classifying the customers. Further data has the potential to significantly improve the performance of the models.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the churn CSV, relative to the notebook's working directory
churn_csv_path = '../input/predicting-churn-for-bank-customers/Churn_Modelling.csv'
df = pd.read_csv(churn_csv_path)

In [None]:
# Preview the first rows to check that the file was parsed as expected
df.head()

In [None]:
# (number of rows, number of columns) of the raw table
df.shape

#### The dataset contains 10,000 bank customers, with 13 independent variables and 1 dependent variable: whether they exited the bank or not. In this document, predictive analysis is leveraged in order to predict future customer behaviour and inform decisions to arrest the attrition of bank clients.

## 1. Data cleaning

Before performing predictive analysis, data must be cleaned and formatted. 'RowNumber','CustomerId','Surname' are not relevant in predictive analysis, and are therefore dropped.

In [None]:
# Drop the identifier columns, as they are not relevant in predictive analysis.
# The frame is rebound instead of mutated in place, and errors='ignore' makes the
# cell idempotent: re-running it after the columns are gone no longer raises KeyError.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

In [None]:
# Column dtypes and non-null counts, used to spot mistyped or incomplete columns
df.info()

Some variables that should be categorical objects are integers: HasCrCard, IsActiveMember and Exited. 
Tenure and NumOfProducts are ordinal variables.

In [None]:
# Store the two flag columns as categorical objects instead of integers
flag_dtypes = {'HasCrCard': 'object', 'IsActiveMember': 'object'}
df = df.astype(flag_dtypes)
print(df.dtypes)

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Class balance of the target variable (Exited).
# The series is passed as x= explicitly: positional data vectors were deprecated in
# seaborn 0.11, and from 0.12 on the first positional argument is interpreted as `data`.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x=df.Exited, palette=['#D7263D', '#27FB6B'], ax=ax)
plt.show()

In [None]:
# Fraction of all customers whose Exited flag equals 1
exit_ratio = df.Exited.eq(1).sum() / len(df)
print(f'Ratio of customers who exited: {exit_ratio:.3f}')

In [None]:
fig_data = ['Geography', 'Gender', 'Tenure', 'NumOfProducts', 'HasCrCard',
            'IsActiveMember']

# 2x3 grid with one count plot per categorical variable, split by exit status.
# x= is passed by keyword: seaborn >= 0.12 treats the first positional argument
# as `data`, so the original positional call no longer plots the column.
fig, axes = plt.subplots(2, 3, figsize=(16, 12))
for ax, j in zip(axes.ravel(), fig_data):
    sns.countplot(x=df[j], hue=df.Exited, palette=['#D7263D', '#27FB6B'], ax=ax)
    ax.set_xlabel(j)
plt.show()

German citizens have the highest rates of attrition among the 3 countries. Although there are more males than females in the dataset, the latter were responsible for more exits.

People who purchased 2 products were less likely to exit than those who bought only one. Interestingly, the rates of churn of people with 3+ products are very high.

Non-active members were much more likely to exit than active ones.

In [None]:
# Heat map of the pairwise correlations between all variables.

# One-hot encode the non-binary categorical variables, dropping the first level of each
geography_dummies = pd.get_dummies(df.Geography, drop_first=True, dtype='int32')
gender_dummies = pd.get_dummies(df.Gender, drop_first=True, dtype='int32')

# Collect the numeric representation of every column, in the original column order,
# and assemble the frame with a single concat.
encoded_parts = []
for name in df.columns:
    column = df[name]
    if column.dtype == 'float64':
        # Floats are already numeric and are kept as they are
        encoded_parts.append(column)
    elif name == 'Gender':
        # The single remaining dummy column is labelled 'Male'
        encoded_parts.append(gender_dummies.set_axis(['Male'], axis=1))
    elif name == 'Geography':
        encoded_parts.append(geography_dummies)
    else:
        # Integer and 0/1 object columns are stored as int32
        encoded_parts.append(column.astype('int32'))
df_new = pd.concat(encoded_parts, axis=1)

# Get the correlation matrix and plot it
corr = df_new.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr)
plt.show()

Age is the feature most positively correlated with churn status (~0.25).

Being a customer in Germany appears to be noticeably positively correlated with a higher balance (~0.5).

## 3. Predictive Analysis on all Customers

Categorical data is one-hot encoded, while continuous and ordinal data are standardised.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# Column transformer that standardises the numerical/ordinal columns and one-hot
# encodes the categorical ones (binary categories collapse to a single 0/1 column).
#
# Dense output is requested through sparse_threshold=0 on the ColumnTransformer
# rather than OneHotEncoder(sparse=False): the `sparse` argument was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, whereas sparse_threshold
# works on both old and new versions. The hierarchical clustering further down
# needs a dense array.
#
# NOTE(review): 'HasCrCard' and 'IsActiveMember' are not listed in any transformer,
# so with the default remainder='drop' they are excluded from the model input.
# Confirm this is intended.
ct = ColumnTransformer(
    [
        ("scaling", StandardScaler(), ['CreditScore', 'Age', 'Balance', 'EstimatedSalary',
                                       'Tenure', 'NumOfProducts']),
        ("onehot", OneHotEncoder(drop='if_binary'), ['Gender', 'Geography']),
    ],
    sparse_threshold=0,
)

In [None]:
# Feature table: every column of df except the target variable 'Exited'
data_features = df.drop('Exited', axis=1)

The data is split into train and test sets, using the default shuffling and 3:1 ratio.

In [None]:
from sklearn.model_selection import train_test_split

# Split data into training and test datasets (default 3:1 ratio, shuffled).
# random_state makes the split - and every number quoted below - reproducible;
# stratify keeps the share of exited customers equal in both parts, which matters
# because the target is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    data_features, df.Exited, stratify=df.Exited, random_state=42
)

Then, GridSearchCV is performed on a Random Forest Classifier, in order to find the optimal hyperparameters.

In [None]:
# Grid search to find the best hyper-parameters for RandomForestClassifier.
# The classifier is wrapped in a pipeline together with the column transformer,
# so the same preprocessing is applied whenever the pipeline is fitted or used.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Define the parameter grid ('clf__' routes each parameter to the classifier step)
param_grid = {
    'clf__max_features': ['sqrt', 'log2'],
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [3, 5, None],
}

# Set up the pipeline; the forest is seeded so repeated runs select the same model
pipe = Pipeline([
    ('preprocess', ct),
    ('clf', RandomForestClassifier(random_state=42))
])

# 10-fold grid search maximising macro-averaged recall, i.e. the unweighted mean
# of the recall on each class - not the recall on exited customers alone.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=10,
                    scoring='recall_macro', return_train_score=True,
                    verbose=0)

In [None]:
# Fit the grid search on the training data: every parameter combination is
# cross-validated and the best one is made available as grid.best_estimator_
grid.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best cross-validated score
grid.best_params_

The best model has no maximum depth, considers sqrt(n_features) features when looking for the best split, and uses 200 estimators (decision trees). For more info on the RFC: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
# Mean cross-validated score of the best parameter combination
# (macro-averaged recall, as set by the scoring argument of the grid search)
grid.best_score_

The cross-validated macro-averaged recall (the mean of the recall on each class) of the best model is 70.2%

The next step is to assess the model's performance on the test set.

In [None]:
# Predict the exit label of the held-out test customers with the best pipeline
# found by the grid search
test_predictions = grid.best_estimator_.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Threshold-based metrics are computed from the predicted labels
test_accuracy = accuracy_score(y_test, test_predictions)
test_precision = precision_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_f1_score = f1_score(y_test, test_predictions)

# ROC AUC is a ranking metric and needs continuous scores. Feeding it hard 0/1
# predictions collapses the ROC curve to a single operating point, so the
# predicted probability of the positive class (second column) is used instead.
test_exit_proba = grid.best_estimator_.predict_proba(X_test)[:, 1]
test_roc_auc_score = roc_auc_score(y_test, test_exit_proba)

print("Accuracy on test data: {:.3f}%".format(test_accuracy*100))
print("Precision on test data: {:.3f}%".format(test_precision*100))
print("Recall on test data: {:.3f}%".format(test_recall*100))
print("F1 Score on test data: {:.3f}%".format(test_f1_score*100))
print("AUC Score on test data: {:.3f}".format(test_roc_auc_score))

Even though the accuracy is relatively high (84.16%), recall/sensitivity is fairly low, meaning that out of the total number of customers who exited, only 41.602% are identified.

In [None]:
from sklearn.metrics import confusion_matrix

def get_conf_matrix(y_test, y_pred):
    """Plot the confusion matrix of observed vs. predicted labels as a heatmap.

    Parameters
    ----------
    y_test : array-like
        Observed (true) labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.

    Returns
    -------
    None
        The annotated heatmap is drawn and shown; nothing is returned.
    """
    # Take the label set from BOTH vectors and hand it to confusion_matrix, so the
    # row/column labels always match the matrix shape - even when a class shows up
    # only in the predictions or only in the observations.
    labels = np.union1d(y_test, y_pred)
    data = confusion_matrix(y_test, y_pred, labels=labels)
    # Build the confusion matrix as a dataframe table (rows: observed, columns: predicted)
    cm = pd.DataFrame(data, columns=labels, index=labels)
    cm.index.name = 'Observed'
    cm.columns.name = 'Predicted'
    plt.figure(figsize = (10,7))
    # Plot a heatmap with the integer counts written into each cell
    sns.heatmap(cm, annot=True, fmt="d", annot_kws={"size": 12})
    plt.title("Confusion Matrix")
    plt.show()
get_conf_matrix(y_test, test_predictions)

## 4. Feature Importance

Let's look at the feature importance, which variables are the best independent predictors:

In [None]:
from sklearn.inspection import permutation_importance

# Obtain feature importance on held-out customers only. Scoring rows the model
# was trained on tends to overstate the importance of features it has overfitted.
# The rows are taken from df (not X_test) so that the column layout - and thus
# the positions in r.importances_mean - still lines up with df.columns.
held_out_rows = df.loc[X_test.index]
r = permutation_importance(grid.best_estimator_, held_out_rows, y_test.astype('int32'),
                           n_repeats=10, random_state=0)

In [None]:
# Report every variable's mean importance and its spread over the repeats,
# ordered from most to least important
ranking = np.argsort(r.importances_mean)[::-1]
for idx in ranking:
    name = df.columns[idx]
    mean_importance = r.importances_mean[idx]
    std_importance = r.importances_std[idx]
    print(f"{name:<28}{mean_importance:.3f} +/- {std_importance:.3f}")

Age is the best predictor, followed by Number of Products.

## 5. Customer Clusters

In this section, unsupervised machine learning is leveraged in order to cluster the customers into different categories.

In [None]:
# Fit the previously defined column transformer on the training data and
# transform that data in the same step
X_train_trans = ct.fit_transform(X_train)

Getting a linkage matrix of the transformed data

In [None]:
from scipy.cluster.hierarchy import linkage

# Agglomerative clustering of the transformed training rows: complete linkage
# on Euclidean distances. Each row of the result describes one merge; column 2
# (the merge distance) is used by the elbow plot below.
linkage_matrix = linkage(X_train_trans, method='complete', metric='euclidean')

Hierarchical Clustering Dendrogram

In [None]:
from scipy.cluster.hierarchy import dendrogram, set_link_color_palette

def fancy_dendrogram(*args, **kwargs):
    """Draw a scipy dendrogram with annotated merge heights and an optional cut line.

    All positional and keyword arguments are forwarded to
    ``scipy.cluster.hierarchy.dendrogram``, except for two extra keywords that
    are consumed here:

    max_d : number, optional
        Height at which a horizontal black line is drawn. Unless the caller
        passes ``color_threshold`` explicitly, it is also used as the colour
        threshold of the dendrogram.
    annotate_above : number, default 0
        Only merges whose height is greater than this value are marked with a
        dot and labelled with their height.

    Returns the dictionary produced by ``dendrogram``. When ``no_plot=True`` is
    passed, nothing is drawn by this function.
    """
    cut_height = kwargs.pop('max_d', None)
    label_floor = kwargs.pop('annotate_above', 0)
    if cut_height:
        # Respect an explicit color_threshold, otherwise colour by the cut height
        kwargs.setdefault('color_threshold', cut_height)

    ddata = dendrogram(*args, **kwargs)

    if kwargs.get('no_plot', False):
        return ddata

    plt.title('Hierarchical Clustering Dendrogram (truncated)')
    plt.xlabel('sample index or (cluster size)')
    plt.ylabel('distance')
    links = zip(ddata['icoord'], ddata['dcoord'], ddata['color_list'])
    for xs, ys, colour in links:
        merge_height = ys[1]
        if not merge_height > label_floor:
            continue
        # Horizontal midpoint of the bar that joins the two merged branches
        x_mid = 0.5 * sum(xs[1:3])
        plt.plot(x_mid, merge_height, 'o', c=colour)
        plt.annotate("%.3g" % merge_height, (x_mid, merge_height), xytext=(0, -5),
                     textcoords='offset points',
                     va='top', ha='center')
    if cut_height:
        plt.axhline(y=cut_height, c='k')
    return ddata

Reference: [SciPy Hierarchical Clustering and Dendrogram Tutorial](https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/)

In [None]:
# Draw the dendrogram of the training customers, truncated so it stays readable
plt.figure(figsize=(9,7))
fancy_dendrogram(
    linkage_matrix,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=8,
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,  # mark the merges hidden inside each truncated branch
    annotate_above=6,  # label only merges whose height is greater than 6
)
plt.show() 

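The graph above shows hierarchically clustered groups of customers, based on the distance between them. Because the tree is truncated to the last 8 merged clusters, each leaf stands for a whole group of customers (its size is given in brackets); the black marks along a truncated branch show the heights of the merges contracted into that leaf, not individual customers.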

Elbow Plot is drawn to determine the optimal number of clusters

In [None]:
# Merge distances of the final ten merges, also in reverse order (largest first)
last = linkage_matrix[-10:, 2]
last_rev = np.flip(last)
idxs = np.arange(1, last.size + 1)

# 2nd derivative of the distances, reversed in the same way
acceleration = np.diff(last, n=2)
acceleration_rev = np.flip(acceleration)

plt.plot(idxs, last_rev)
plt.plot(idxs[:-2] + 1, acceleration_rev)
plt.show()

# The position of the largest acceleration marks the elbow;
# position 0 corresponds to 2 clusters
k = np.argmax(acceleration_rev) + 2
print ("clusters:", k)

Reference: [SciPy Hierarchical Clustering and Dendrogram Tutorial](https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/)

Creating 4 flat clusters

In [None]:
from scipy.cluster.hierarchy import fcluster

# Cut the hierarchy so that at most this many flat clusters are formed
n_flat_clusters = 4
clusters_smaller = fcluster(linkage_matrix, t=n_flat_clusters, criterion='maxclust')
# Show the cluster labels together with the number of customers in each
np.unique(clusters_smaller, return_counts=True)

## 6. Customer Profiles Analysis

Add back the dependent variable, and separate the four customer profiles

In [None]:
# Training features with the target column attached again
df_clusters = pd.concat([X_train, y_train], axis=1)

# One frame per flat cluster label; the labels are in the same row order as X_train
Group1, Group2, Group3, Group4 = (
    df_clusters[clusters_smaller == label] for label in (1, 2, 3, 4)
)

In [None]:
# Preview a few customers assigned to the second cluster
Group2.head()

In [None]:
# Count plots of customers who stayed (0) / exited (1), one panel per group.
# - x= is passed by keyword: seaborn >= 0.12 treats the first positional argument
#   as `data`, so the original positional calls no longer plot the column.
# - order=[0, 1] fixes bar position and colour across panels, so a group that
#   contains only one class is not drawn with the other class's colour.
f, axes = plt.subplots(1, 4, figsize=(9, 6), sharey=True)
for ax, group_no, group in zip(axes, range(1, 5), [Group1, Group2, Group3, Group4]):
    sns.countplot(x=group.Exited, order=[0, 1], ax=ax,
                  palette=['#D7263D', '#27FB6B'])
    ax.set_title(f'Group {group_no}')
plt.show()

In [None]:
# Share of exited customers within each group, as a percentage
for group_no, members in enumerate([Group1, Group2, Group3, Group4], start=1):
    attrition_pct = members.Exited.sum() / len(members) * 100
    print(f'Group {group_no} customer attrition rate :', f'{attrition_pct:.2f}%')

Clustering has identified a particular group of 150 customers who are 90.67% likely to leave. Aggregating this model on top of the supervised learning model may improve its performance and provide additional insights into what makes clients exit, and how churn rates can be decreased. 

In [None]:
categorical_data = ['Geography', 'Gender', 'Tenure', 'NumOfProducts', 'HasCrCard',
                    'IsActiveMember']

# One count plot per categorical variable of the training customers, coloured by
# cluster label. x= is passed by keyword because seaborn >= 0.12 treats the first
# positional argument as `data`.
plt.figure(figsize=(20, 20))
for q, j in enumerate(categorical_data, start=1):
    plt.subplot(3, 3, q)
    ax = sns.countplot(x=X_train[j], hue=clusters_smaller)
    plt.xlabel(j)
plt.show()

The number of products seems to be the strongest determinant for categorising customers into the second group. However, for clients with 3 products, there is an unclear distinction being made between the second and third clusters.

In [None]:
numerical_data = ['Age', 'CreditScore', 'Balance', 'EstimatedSalary']

# Attach the cluster labels to the training rows by position. The original call
# passed pd.Series(clusters_smaller), whose fresh 0..n-1 index does not match the
# shuffled index of X_train, so values and groups were paired up incorrectly.
cluster_plot_df = X_train[numerical_data].assign(Group=clusters_smaller)

# 2x2 grid with one box plot per numerical variable, one box per cluster
fig, axes = plt.subplots(2, 2, figsize=(14, 14))
for ax, col in zip(axes.ravel(), numerical_data):
    # hue repeats x only to colour the boxes; dodge=False keeps them centred
    sns.boxplot(data=cluster_plot_df, x='Group', y=col, hue='Group',
                dodge=False, ax=ax)
    ax.set_xlabel('Group')
plt.show()

The four groups are similar in terms of the numerical variables 

### Alternative Model

Because the recall of the first model was so low, an alternative model is fitted, which aims to improve recall by reducing the number of customers who did not exit in the data. Levelling the categories of the target variable should decrease the model's incentive to make negative predictions.

In [None]:
# Balance the two classes by undersampling the customers who did not exit.
exited_customers = df[df.Exited == 1]
retained_customers = df[df.Exited != 1]
# Draw a seeded random sample as large as the exited class. Taking the first
# 2000 rows would depend on the row order of the file and leave the classes
# slightly unequal.
n_keep = min(len(exited_customers), len(retained_customers))
others = retained_customers.sample(n=n_keep, random_state=42)
new_df = pd.concat([exited_customers, others], axis=0)

X = new_df.drop('Exited', axis=1)

# Same pipeline and parameter grid as before, now selected by the F1 score
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5,
                    scoring='f1', return_train_score=True,
                    verbose=0)

# NOTE: this rebinds X_train/X_test/y_train/y_test (and grid), so the clustering
# cells above must not be re-run after this cell without re-running the first split.
# NOTE(review): the test set is drawn from the balanced data too, so its metrics
# are not directly comparable with those of the first model.
X_train, X_test, y_train, y_test = train_test_split(
    X, new_df.Exited, stratify=new_df.Exited, random_state=42
)

grid.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best parameter combination
# (F1, as set by the scoring argument of this grid search)
grid.best_score_

In [None]:
# Predict the exit label of the balanced test customers with the re-fitted best pipeline
test_predictions = grid.best_estimator_.predict(X_test)

In [None]:
# Threshold-based metrics are computed from the predicted labels
test_accuracy = accuracy_score(y_test, test_predictions)
test_precision = precision_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_f1_score = f1_score(y_test, test_predictions)

# ROC AUC is a ranking metric and needs continuous scores. Feeding it hard 0/1
# predictions collapses the ROC curve to a single operating point, so the
# predicted probability of the positive class (second column) is used instead.
test_exit_proba = grid.best_estimator_.predict_proba(X_test)[:, 1]
test_roc_auc_score = roc_auc_score(y_test, test_exit_proba)

print("Accuracy on test data: {:.3f}%".format(test_accuracy*100))
print("Precision on test data: {:.3f}%".format(test_precision*100))
print("Recall on test data: {:.3f}%".format(test_recall*100))
print("F1 Score on test data: {:.3f}%".format(test_f1_score*100))
print("AUC Score on test data: {:.3f}".format(test_roc_auc_score))

In [None]:
# get_conf_matrix is already defined in the confusion-matrix cell of section 3;
# it is reused here rather than redefined, so the notebook keeps a single
# definition of the helper.
get_conf_matrix(y_test, test_predictions)

This model achieves lower accuracy, but higher recall, F1 and AUC scores. While following it may lead to targeting some customers at small risk of exiting, it misses significantly fewer of the customers at risk of leaving, and can therefore be a better option.

<h2>7. Conclusion</h2>
Two models are designed to predict whether a customer will exit or not, depending on whether accuracy or recall is the main focus. Customers were agglomeratively clustered into 4 groups, one of which is of particular interest. While these models could be leveraged to identify customers at risk of leaving, additional data could significantly improve the performance of the models.