In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found
input_paths = (
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Predicting Customer Attrition using Machine Learning Techniques

---

Customer attrition can have serious financial consequences for a business. Product and service offerings have to be more innovative than those of competitors to entice customers to buy and remain loyal. Retaining satisfied customers is generally cheaper than acquiring new ones. 
Therefore, a customer retention strategy can be employed if we know which customer is likely to attrite.

This Notebook presents a solution to the problem of identifying churned customers using Machine Learning techniques.
The dataset is retrieved from https://www.kaggle.com/sakshigoyal7/credit-card-customers and consists of 10,000 bank customers. For each customer, the dataset includes the following variables:

**Demographic**
* Age
* Gender

**Socioeconomic**
* Salary
* Education level
* Marital status

**Business related**
* Credit card category
* Credit card limit
* Duration of relationship with a customer
* Total number of products held
* Months inactive in the last 12 months
* Number of contacts in the last 12 months
* Total revolving balance on the credit card
* Open to buy credit line (Average over the last 12 months)
* Change in transaction amount (Q4 to Q1)
* Total transaction amount in the last 12 months
* Total transaction count in the last 12 months
* Average card utilization ratio

---

# Table of Contents


* [1. Libraries](#libraries)
* [2. Data Extraction and Filtering](#data)
* [3. Data Analysis](#descrana)
    * [3.1 Reading the Graphs](#info)
    * [3.2 Analysis](#ana)
    * [3.3 Correlations](#corr)
* [4. Modeling](#model)

---

# 1. Libraries  <a class="anchor" id="libraries"></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.base import TransformerMixin, BaseEstimator

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler

from category_encoders.leave_one_out import LeaveOneOutEncoder

# Notebook-wide configuration: RNG seed, display options, plot style, warnings
np.random.seed(40)  # seed NumPy's global random generator

pd.options.display.max_columns = None  # never truncate columns when a frame is displayed
plt.style.use('ggplot')
warnings.simplefilter("ignore")  # silence every warning category for the whole session

---

# 2. Data Extraction and Filtering  <a class="anchor" id="data"></a>

In [None]:
# read the BankChurners CSV from the Kaggle input directory and show the raw table
DATA_PATH = '/kaggle/input/credit-card-customers/BankChurners.csv'
data = pd.read_csv(DATA_PATH)
display(data)

<div align=center>
    The variables CLIENTNUM and the two Naive_Bayes_Classifiers are removed as they are not useful in this analysis.
</div>

In [None]:
# Columns without analytical value: the two pre-computed Naive Bayes classifier
# outputs and the customer identifier
variables_to_delete = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'CLIENTNUM',
]

# Rebind `data` instead of dropping in place, and skip columns that are already
# gone, so that re-running this cell does not fail with a KeyError
data = data.drop(columns=variables_to_delete, errors='ignore')

In [None]:
# visualise missing data: one row per record, one column per variable
missing_mask = data.isnull()

fig, ax = plt.subplots(figsize=(20, 6))
sns.heatmap(missing_mask, ax=ax, cmap='magma', cbar=False, yticklabels=False)
ax.set_title('Missing Values')

plt.show()

<div align=center>
Since there are no white dots, which would otherwise indicate a missing value, there are no missing values in the dataset.   
</div>


---

# 3. Data Analysis <a class="anchor" id="descrana"></a>

<div align=center>
    The focus of the analysis in this section is on inspecting the distributions of the attrited and existing customers.
</div>

## 3.1 Reading the graphs <a class="anchor" id="info"></a>

<div align=center>
The graphs used in this analysis are explained below:
</div>

In [None]:
# stacked histogram of months inactive, split by attrition status
inactive_months = data['Months_Inactive_12_mon']
is_attrited = data['Attrition_Flag'] == 'Attrited Customer'

plt.hist(
    [inactive_months.loc[is_attrited], inactive_months.loc[~is_attrited]],
    bins=100,
    stacked=True,
    color=['tab:orange', 'tab:blue'],
)
plt.xlabel('Months_Inactive_12_mon')
plt.ylabel('Count')
plt.show()

Given the variable **Months_Inactive_12_mon**, with bins 0,1,...,6, the above graph represents a stacked bar plot.
The horizontal axis represents the bins. The vertical axis represents the number of individuals.
The height of an orange-colored bar represents the number of attrited customers in a bin. The height of a blue bar represents the number of existing customers.  The total height of both bars combined represents the total number of individuals in a bin.
Note that to retrieve the number of existing customers, we have to subtract the number of attrited customers from the total number of individuals. 

To illustrate: the total number of individuals belonging to category 3 is roughly 3800. The amount of attrited customers is roughly 800. Consequently, the amount of existing customers is around 3000.

In [None]:
# Proportion of attrited customers per bin of Months_Inactive_12_mon.
# Bins whose proportion is estimated from few customers are drawn faded.
feature = data['Months_Inactive_12_mon']
attrited = data['Attrition_Flag'] == 'Attrited Customer'

# a bar is faded once its 95% confidence interval is at least this wide
MAX_CI_WIDTH = 0.12

# cut the feature into 100 bins; the last edge is nudged upwards so that the
# maximum still falls inside the final half-open interval used by pd.cut
amount, edges = np.histogram(feature, bins=100)
edges[-1] += 1e-5

# number of attrited customers per bin, in bin order
zz = pd.cut(feature.loc[attrited], edges, right=False)
zz = zz.value_counts().sort_index()

# Empty bins give 0/0 = NaN. NaN fails both comparisons below, so empty bins
# end up in neither mask and are simply not drawn.
with np.errstate(divide='ignore', invalid='ignore'):
    # proportion of attrited customers in each bin
    pp = zz.values / amount
    # standard error of that proportion (normal approximation)
    stDev = np.sqrt(pp * (1 - pp) / amount)

# full width of the 95% confidence interval decides between solid and faded
ci_width = stDev * 1.96 * 2
mm = ci_width < MAX_CI_WIDTH
mm2 = ci_width >= MAX_CI_WIDTH

left_edges = edges[:-1]
bin_widths = np.diff(edges)

fig, ax = plt.subplots()

# solid bars first, faded bars second; within each pass the blue bar of height 1
# is the backdrop and the orange bar (attrited share) is drawn on top of it.
# align='edge' makes every bar span its bin instead of being centred on the
# left bin edge.
for mask, alpha in ((mm, 1.0), (mm2, 0.3)):
    ax.bar(left_edges[mask], np.ones(mask.sum()), width=bin_widths[mask],
           align='edge', alpha=alpha, color='tab:blue')
    ax.bar(left_edges[mask], pp[mask], width=bin_widths[mask],
           align='edge', alpha=alpha, color='tab:orange')

ax.set_xlabel('Months_Inactive_12_mon')
ax.set_ylabel('Proportion')

# make fancy: legend built from proxy rectangles in the two class colours
colors = {
    'Existing Customer': 'tab:blue',
    'Attrited Customer': 'tab:orange'
}
labels = list(colors.keys())
handles = [plt.Rectangle((0, 0), 1, 1, color=colors[label]) for label in labels]
ax.legend(handles, labels)
plt.show()

The above figure is another example of a stacked bar plot. In this plot, the vertical axis represents the proportion of attrited or existing customers.

To illustrate: the proportion of attrited customers in category 3 is around 0.21 (or 21%).

Note that some bars are faded out. This is done to indicate that there are relatively few individuals in a bin.
When there are only a few individuals observed, there is more uncertainty involved in assessing whether the proportion of attrited customers in that bin is consistent with that of the true population.
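A bar is faded out when the width of the 95% confidence interval of the proportion of attrited customers ($p$) in that bin is 0.12 or more. Here, the confidence interval is derived using the normal approximation to the binomial distribution, $\hat{p} \pm 1.96\sqrt{\hat{p}(1-\hat{p})/n}$, where $n$ is the number of customers in the bin.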

In [None]:
# horizontal boxplot of Months_on_book, one box per attrition group
tenure_by_flag = data[['Months_on_book', 'Attrition_Flag']].copy()

box_ax = tenure_by_flag.boxplot(by='Attrition_Flag', vert=False, figsize=(16, 4))
box_ax.set_title('')
box_ax.set_xlabel('Months_on_book')
plt.show()

The above graph is a boxplot. The black circles indicate outliers. The vertical black bar at the far left of the line segment is the minimum value (excluding outliers). The first quartile is represented by the left edge of the box. The median is represented by the vertical blue line. The third quartile is represented by the right edge of the box. The maximum value (excluding outliers), which closes the fourth quarter of the data, is represented by the vertical black bar at the far right of the line segment.

## 3.2 Analysis <a class="anchor" id="ana"></a>

In [None]:
# share of each attrition class in the dataset
class_counts = data['Attrition_Flag'].value_counts()
attrition_flag = class_counts / class_counts.sum()

fig, ax = plt.subplots(figsize=(9, 6))
attrition_flag.plot(kind='bar', color='tab:blue', ax=ax)
ax.set_ylabel('Proportion')
plt.show()

<div align=center>
The proportion of attrited customers in the dataset is around 16%. This indicates that for every 100 new customers, we can expect that on average around 16 customers will attrite.
</div>

In [None]:
names = list(data.columns)
categorical_names = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
numerical_names = [c for c in names if c not in categorical_names + ['Attrition_Flag']]

# 1.0 marks an attrited customer, 0.0 an existing one
target = 1.0 * (data['Attrition_Flag'] != 'Existing Customer')

# One figure per feature: counts (top left), proportions (top right) and,
# for numerical features only, a boxplot across the bottom row.
for name in names:
    if name == 'Attrition_Flag':
        continue

    fig = plt.figure(figsize=(16, 9))

    ax11 = fig.add_subplot(2, 2, 1)
    ax11.set_ylabel('Count')

    ax21 = fig.add_subplot(2, 2, 2)
    ax21.set_ylabel('Proportion')

    if name in categorical_names:

        # stacked bar categorical for counting
        q = data.groupby([name, 'Attrition_Flag']).size().unstack()
        q.plot(kind='bar', ax=ax11, stacked=True, color=['tab:orange', 'tab:blue'])
        ax11.set_ylabel('Count')

        # stacked bar categorical proportion: every row divided by its row total
        q.div(q.sum(axis=1), axis=0).plot(kind='bar', ax=ax21, stacked=True, color=['tab:orange', 'tab:blue'])

    else:
        is_attrited = target == 1

        # create stacked bar plot for counting (100 bins are used)
        ax11.hist([data[name].loc[is_attrited], data[name].loc[target == 0]],
                  bins=100, stacked=True, color=['tab:orange', 'tab:blue'])

        # create the boxplot figure; it shares its x-axis with the count histogram
        ax2 = fig.add_subplot(2, 1, 2, sharex=ax11)
        data[[name, 'Attrition_Flag']].boxplot(by='Attrition_Flag', ax=ax2, vert=False)
        ax2.set_xlabel(name)
        ax2.set_title('')

        # divide a continuous feature into 100 bins; the last edge is nudged
        # upwards so the maximum falls inside the final half-open interval
        amount, edges = np.histogram(data[name], bins=100)
        edges[-1] += 1e-5

        # number of attrited customers per bin, in bin order
        zz = pd.cut(data[name].loc[is_attrited], edges, right=False).value_counts().sort_index()

        # Empty bins give 0/0 = NaN. NaN fails both comparisons below, so
        # empty bins end up in neither mask and are simply not drawn.
        with np.errstate(divide='ignore', invalid='ignore'):
            # proportion of attrited customers in each bin
            pp = zz.values / amount
            # standard error of that proportion (normal approximation)
            stDev = np.sqrt(pp * (1 - pp) / amount)

        # full width of the 95% confidence interval decides solid vs. faded
        ci_width = stDev * 1.96 * 2
        mm = ci_width < 0.12
        mm2 = ci_width >= 0.12

        left_edges = edges[:-1]
        bin_widths = np.diff(edges)

        # solid bins first, faded bins second; the blue bar of height 1 is the
        # backdrop and the orange bar (attrited share) is drawn on top of it.
        # align='edge' makes every bar span its bin instead of being centred
        # on the left bin edge.
        for mask, alpha in ((mm, 1.0), (mm2, 0.3)):
            ax21.bar(left_edges[mask], np.ones(mask.sum()), width=bin_widths[mask],
                     align='edge', alpha=alpha, color='tab:blue')
            ax21.bar(left_edges[mask], pp[mask], width=bin_widths[mask],
                     align='edge', alpha=alpha, color='tab:orange')

        # make it fancy: legend built from proxy rectangles in the class colours
        colors = {'Existing Customer': 'tab:blue', 'Attrited Customer': 'tab:orange'}
        labels = list(colors.keys())
        handles = [plt.Rectangle((0, 0), 1, 1, color=colors[label]) for label in labels]
        ax11.legend(handles, labels)
        ax21.legend(handles, labels)

    # make it even more fancy
    fig.suptitle(name, size=20)
    ax11.set_xlabel(name)
    ax21.set_xlabel(name)
    fig.tight_layout()
    plt.show()
    # release the figure so that the loop does not accumulate open figures
    plt.close(fig)

Several observations can be made from the above figures:
    
* The distribution of most continuous features appears to have heavy tails (high skewness). This might distort our model's learning process.
* In some bins there are relatively few observations. This adds to the uncertainty that our sample might not represent the attributes of the true population. Therefore, our model might have more trouble generalizing to unobserved data. A larger dataset might resolve this.
* Some features appear to have multimodal distributions (Total_Trans_Ct, Total_Trans_Amt). This might indicate heterogeneity and the presence of subgroups. These features can be particularly interesting for modeling.

## 3.3 Correlations <a class="anchor" id="corr"></a>

In [None]:
# scatter matrix (lower triangle only) of the numerical features, coloured by attrition status
pair_columns = numerical_names + ['Attrition_Flag']
flag_palette = {
    'Attrited Customer': 'tab:orange',
    'Existing Customer': 'tab:blue',
}

sns.pairplot(
    data[pair_columns],
    hue='Attrition_Flag',
    palette=flag_palette,
    kind='scatter',
    corner=True,
)
plt.show()

From the correlation figure above, we can infer that the correlation between Avg_Open_To_Buy and Credit_Limit is close to 1. One of these features can be dropped since highly correlated features generally don't improve model performance.
By eyeballing I find the following interaction effects interesting enough to be included in the dataset:
* Total_Trans_Ct x Total_Ct_Chng_Q4_Q1
* Total_Trans_Amt x Total_Trans_Ct
* Total_Amt_Chng_Q4_Q1 x Total_Ct_Chng_Q4_Q1
* Total_Amt_Chng_Q4_Q1 x Total_Trans_Amt
* Total_Revolving_Bal x Avg_Utilization_Ratio

---

# 4. Modeling <a class="anchor" id="model"></a>

1. The numerical features will be log-transformed to reduce the skewness.
2. The categorical features (Gender, Marital_Status, etc...) are transformed using Leave-One-Out Target Encoding (LOOE). The reasons for choosing LOOE over One-Hot Encoding are (1) the number of dimensions needed to encode the data is kept to a minimum (2) the encoded features represent numerical values between [0,1].

3. The numerical features are then bounded between [0,1] using the min-max scaler. Hence, all features will be between [0,1].

4. Borderline SMOTE is used to oversample the minority class (attrited customers).

5. The majority class (existing customers) is then undersampled such that the ratio of attrited to existing customers is 1:2 (`sampling_strategy=0.5`). Hence, the training data will be far more balanced than the original dataset.

6. Each model is evaluated using a stratified 10-fold cross-validation. The recall score is used as an evaluation metric. The model that has the highest mean test score is then selected as the final model.

In [None]:
# binary target: 1 = attrited customer, 0 = existing customer
y = (data['Attrition_Flag'] == 'Attrited Customer').astype(int)

# feature matrix: everything except the target and Avg_Open_To_Buy.
# drop() returns a new frame, so `data` itself is left untouched.
X = data.drop(columns=['Attrition_Flag', 'Avg_Open_To_Buy'])

# Interaction effects. Every new column is named '<left>X<right>' after the two
# factors it multiplies, so the label always matches the computation.
interaction_pairs = [
    ('Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1'),
    ('Total_Trans_Amt', 'Total_Trans_Ct'),
    ('Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1'),
    ('Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt'),
    ('Total_Revolving_Bal', 'Avg_Utilization_Ratio'),
]
for left, right in interaction_pairs:
    X[left + 'X' + right] = X[left] * X[right]

# all non-categorical feature columns, including the interaction terms
numerical_names2 = [x for x in X.columns if x not in categorical_names]

In [None]:
# candidate classifiers, keyed by a short name (all with default settings)
models = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'SGD': SGDClassifier(),
    'RF': RandomForestClassifier(),
    'GB': GradientBoostingClassifier(),
    'NN': MLPClassifier(),
    'AB': AdaBoostClassifier(),
}

# number of feature columns; used as max_features for the random forest
n_features = X.shape[1]

# grid-search space per model; the 'clf__' prefix addresses the pipeline
# step named 'clf'. An empty grid means the defaults are evaluated once.
parameters = {
    'SVM': {'clf__kernel': ['rbf', 'linear', 'sigmoid']},
    'KNN': {'clf__n_neighbors': [1, 2, 3, 4, 5, 10, 20, 40, 80]},
    'SGD': {},
    'RF': {'clf__n_estimators': [2000], 'clf__max_features': [n_features]},
    'GB': {'clf__n_estimators': [2000]},
    'NN': {'clf__hidden_layer_sizes': [100, 250, 500]},
    'AB': {'clf__n_estimators': [50, 100, 500]},
}

In [None]:
# create a Leave-One-Out Encoder class that can be passed
# to the pipeline
class LOOEncoder(BaseEstimator, TransformerMixin):
    """Thin scikit-learn wrapper around category_encoders' LeaveOneOutEncoder.

    The wrapped encoder is stored in ``self.transformer``.
    """

    def __init__(self):
        self.transformer = LeaveOneOutEncoder()

    def fit(self, X, y):
        """Fit the wrapped encoder on features ``X`` and target ``y``; return self."""
        self.transformer.fit(X, y)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``(X, y)`` and return the leave-one-out encoding of ``X``.

        The wrapped encoder only excludes a row's own target value when the
        target is handed to ``transform``. The ``fit_transform`` inherited from
        TransformerMixin calls ``transform(X)`` without ``y``, which would give
        the training rows a plain per-category target mean, so it is
        overridden here.
        """
        self.fit(X, y)
        return self.transformer.transform(X, y)

    def transform(self, X):
        """Encode unseen data ``X`` without target information."""
        return self.transformer.transform(X)

# stateless log(x + 1) transformer that can be used as a pipeline step
class LogEncoder(BaseEstimator, TransformerMixin):
    """Element-wise ``log(x + 1)`` transform; nothing is learned from the data."""

    def __init__(self):
        """The transformer has no hyper-parameters."""

    def fit(self, X, y=None):
        """Nothing to estimate: return the instance unchanged."""
        return self

    def transform(self, X):
        """Return the natural logarithm of ``X + 1``."""
        shifted = X + 1
        return np.log(shifted)
    
    
# column-wise preprocessing: the categorical columns go through LOOEncoder,
# the numerical columns through LogEncoder; any other column is passed through
column_steps = [
    ('cat', LOOEncoder(), categorical_names),
    ('num', LogEncoder(), numerical_names2),
]
col_prepro = ColumnTransformer(transformers=column_steps, remainder='passthrough')


In [None]:
# seed for the CV splitter and the resamplers (same value as np.random.seed)
SEED = 40

# One splitter shared by all models. With a fixed random_state every model is
# scored on exactly the same ten folds, so the mean test scores are comparable.
# (An unseeded shuffling splitter draws new folds for every model.)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

best_model = None
best_mean_test_score = -np.inf

# evaluate each model
for model_name, model in models.items():
    params = parameters[model_name]

    # the final pipeline: preprocessing -> scaling -> resampling -> classifier.
    # The resamplers are seeded for reproducibility; the classifiers are used
    # exactly as configured in `models`.
    pipe = Pipeline(
        steps=[
            ('col_prepro', col_prepro),
            ('scaler', MinMaxScaler()),
            ('over', BorderlineSMOTE(sampling_strategy=0.3, random_state=SEED)),
            ('under', RandomUnderSampler(sampling_strategy=0.5, random_state=SEED)),
            ('clf', model),
        ]
    )

    # apply a gridsearch based on the pre-defined hyperparameters
    gs_clf = GridSearchCV(
        pipe,
        params,
        n_jobs=-1,
        cv=cv,
        scoring='recall'
    )

    gs_clf.fit(X, y)

    # retrieve the results
    cv_results = pd.DataFrame(gs_clf.cv_results_)

    # best_score_ is the mean test score of the top-ranked parameter setting
    if gs_clf.best_score_ > best_mean_test_score:
        best_model = gs_clf.best_estimator_
        best_mean_test_score = gs_clf.best_score_

    # show the three best-ranked settings of this model
    display(cv_results.loc[cv_results['rank_test_score'] <= 3])

# fail with a clear message instead of an AttributeError on None.predict
if best_model is None:
    raise RuntimeError('No model produced a valid cross-validation score')

# in-sample predictions of the winning pipeline (refitted on all data by GridSearchCV)
y_pred = best_model.predict(X)

The highest mean test score of 0.923 is achieved with GradientBoosting. GradientBoosting also appears to be among the most robust models. Its standard deviation across the test sets is approximately 0.018, while most other models have a standard deviation of 0.025 or greater.

The below results are obtained from training the model on the entire dataset and predicting the corresponding attrited customers (in-sample predictions).

In [None]:
# final classifier of the selected pipeline
clf = best_model.named_steps['clf']

# not every classifier exposes feature_importances_, so check explicitly
# instead of hiding every possible error behind a bare except
if hasattr(clf, 'feature_importances_'):
    # The ColumnTransformer emits the categorical columns first and the
    # numerical columns second. The importances follow that order, which is
    # not necessarily the column order of X.
    feature_names = categorical_names + numerical_names2

    # retrieve the feature importance
    feat_importance = pd.DataFrame(
        clf.feature_importances_,
        index=feature_names,
        columns=['Feature Importance'],
    )
    feat_importance = feat_importance.sort_values(by='Feature Importance')
    feat_importance.plot(kind='barh', figsize=(9, 7))
    plt.show()
else:
    print('Feature importane could not be evaluated')

The above figure visualizes the relative importance of each feature, based on the Gini Impurity Index. The higher the value, the better a predictor. The top 5 predictors are

    (1) Total_Trans_Ct
    (2) Total_Trans_Amt
    (3) Total_Trans_Ct x Total_Ct_Chng_Q4_Q1
    (4) Total_Revolving_Bal
    (5) Total_Relationship_Count

As expected, the variables with a multimodal distribution are among the best predictors. The third best predictor is the interaction effect between Total_Trans_Ct and Total_Ct_Chng_Q4_Q1. The other interaction effects are less important predictors. For a follow-up study, it might be interesting to analyze customer segmentation.

In [None]:
def confusion_matrix2(y_pred, y, names):
    """Display a binary confusion matrix extended with row and column totals.

    Parameters
    ----------
    y_pred : array-like
        Predicted class labels.
    y : array-like
        True class labels.
    names : sequence of two str
        Display names of the two classes, in the order in which
        ``confusion_matrix`` arranges the labels.

    Returns
    -------
    None
        The table is rendered with ``display``; rows are actual classes,
        columns are predicted classes.
    """
    multi_index = [np.array(['Actual', 'Actual', 'Actual']),
                   np.array([names[0], names[1], 'Total'])]

    multi_column = [np.array(['Predicted', 'Predicted', 'Predicted']),
                    np.array([names[0], names[1], 'Total'])]

    # counts are integers, so keep them as integers instead of floats
    conf = np.zeros((3, 3), dtype=int)
    conf[:2, :2] = confusion_matrix(y, y_pred)
    # last row = column totals, last column = row totals
    conf[-1, :] = conf.sum(0)
    conf[:, -1] = conf.sum(1)

    # Convert to object dtype first: the bottom-right cell is blanked below and
    # writing a string into a numeric column relies on silent upcasting, which
    # pandas has deprecated.
    conf = pd.DataFrame(conf,
                        index=multi_index,
                        columns=multi_column).astype(object)

    conf.iloc[-1, -1] = ''

    display(conf)

# in-sample evaluation of the selected model: confusion matrix and recall
class_names = ['Existing Customer', 'Attrited Customer']
confusion_matrix2(y_pred, y, class_names)

in_sample_recall = recall_score(y, y_pred)
print('Recall = {:.2f}'.format(in_sample_recall))

The recall score of the in-sample predictions is 1. This is slightly higher than the mean test score of 0.923.