In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the raw customer table from the Kaggle input directory
data = pd.read_csv('../input/credit-card-customers/BankChurners.csv')

# Shape of the raw table, recorded before any column is removed
m, n = data.shape

# Remove the two Naive_Bayes_Classifier_* columns and the CLIENTNUM column
data = data.drop(columns=[
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    'CLIENTNUM',
])

# Names of the columns that remain after the drop
columns = data.columns.values

# Data doesn't have any NaN (can be verified with data.count())

Categorical features are converted to numerical codes, preserving ordinality wherever the categories have a natural order (e.g., card category "Gold" is a higher tier than "Silver", so the map $M : \text{Categorical} \rightarrow \mathbb{N}$ should satisfy $M(\text{Gold}) > M(\text{Silver})$). Unknown values are labelled $0$ for convenience.

In [None]:
# Transforming variables (categorical -> numerical)
# Each mapping assigns increasing integers along the natural order of the
# categories where one exists; 'Unknown' is coded 0.
CATEGORY_CODES = {
    'Attrition_Flag': {'Existing Customer': 0,
                       'Attrited Customer': 1},
    'Gender': {'M': 1,
               'F': 0},
    'Education_Level': {'Unknown': 0,
                        'Uneducated': 1,
                        'High School': 2,
                        'College': 3,
                        'Graduate': 4,
                        'Post-Graduate': 5,
                        'Doctorate': 6},
    'Marital_Status': {'Unknown': 0,
                       'Divorced': 1,
                       'Married': 2,
                       'Single': 3},
    'Income_Category': {'Unknown': 0,
                        'Less than $40K': 1,
                        '$40K - $60K': 2,
                        '$60K - $80K': 3,
                        '$80K - $120K': 4,
                        '$120K +': 5},
    # Card tiers in ascending order: Silver ranks below Gold
    'Card_Category': {'Blue': 1,
                      'Silver': 2,
                      'Gold': 3,
                      'Platinum': 4},
}

for column, codes in CATEGORY_CODES.items():
    # Already numeric means the cell was run before: skip so a re-run is harmless
    if pd.api.types.is_numeric_dtype(data[column]):
        continue
    # Assign the result back to the frame instead of mutating a column
    # selection in place, which is silently ignored under pandas Copy-on-Write
    encoded = data[column].map(codes)
    # map() yields NaN for any value missing from the mapping; fail loudly here
    # rather than letting an unencoded category reach the models
    unmapped = data.loc[encoded.isna(), column].unique().tolist()
    if unmapped:
        raise ValueError(f"Unexpected values in column '{column}': {unmapped}")
    data[column] = encoded.astype(int)

No particularly strong correlations have been found among the features.

In [None]:
# Correlations
# Pairwise correlation between all columns (pandas default: Pearson)
corr = data.corr()

# Take the tick labels from the matrix itself so they always match the
# rows/columns that were actually plotted
corr_labels = corr.columns

fig, ax = plt.subplots(figsize=(8,8))
# Fix the colour scale to [-1, 1] with a diverging map so that zero
# correlation sits at the neutral midpoint
im = ax.imshow(corr, cmap="coolwarm", vmin=-1, vmax=1)
ax.set_xticks(np.arange(len(corr_labels)))
ax.set_yticks(np.arange(len(corr_labels)))
ax.set_xticklabels(corr_labels)
ax.set_yticklabels(corr_labels)
ax.set_title("Correlation matrix")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
# Without a colorbar the heatmap colours cannot be read as values
fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="Correlation")

plt.show()

Because the dataset is imbalanced with respect to the target feature ("Attrition_Flag"), a stratified strategy has been chosen for the train/test split.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

# Separate the label column from the feature matrix
target = data['Attrition_Flag'].to_numpy()
train  = data.drop(columns=['Attrition_Flag']).to_numpy(dtype='float')

TEST_SIZE = 0.2
sss = StratifiedShuffleSplit(n_splits=1, test_size=TEST_SIZE, random_state=232)

# n_splits=1, so the splitter yields exactly one (train, test) pair of index arrays
train_index, test_index = next(sss.split(train, target))
x_train, y_train = train[train_index], target[train_index]
x_test,  y_test  = train[test_index],  target[test_index]

In [None]:
# Data scaling
from sklearn.preprocessing import StandardScaler

scaler  = StandardScaler()
# Learn the mean/variance on the training set only ...
x_train = scaler.fit_transform(x_train)
# ... and reuse those same statistics for the test set. Re-fitting on the test
# set would scale the two sets differently and leak test information.
x_test  = scaler.transform(x_test)

Several classification models are tested. The business rule puts the focus on finding churners, so the models are tuned using recall as the scoring metric.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model    import LogisticRegression
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.naive_bayes     import BernoulliNB
from sklearn.ensemble        import AdaBoostClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer

def recall(y, y_pred, pos_label=1):
    """Return the recall (true-positive rate) of the positive class.

    Recall is TP / (TP + FN): the fraction of samples whose true label is
    ``pos_label`` that were also predicted as ``pos_label``.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted labels, same length as ``y``.
    pos_label : scalar, default 1
        Label treated as the positive class.

    Returns
    -------
    float
        Recall in [0, 1]; 0.0 when ``y`` contains no positive sample.

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` do not have the same number of elements.
    """
    y = np.asarray(y).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y.shape != y_pred.shape:
        raise ValueError(
            f"y and y_pred must have the same length, got {y.size} and {y_pred.size}")

    # Count directly instead of indexing a confusion matrix: the matrix
    # collapses to 1x1 when only one class is present, which breaks [1, 1]
    is_positive = y == pos_label
    n_positive = int(is_positive.sum())
    if n_positive == 0:
        # Recall is undefined without positives; report 0.0 instead of 0/0
        return 0.0
    true_positive = int((is_positive & (y_pred == pos_label)).sum())
    return true_positive / n_positive

def accuracy(y, y_pred):
    """Return the fraction of predictions that match the true labels.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted labels, same length as ``y``.

    Returns
    -------
    float
        Accuracy in [0, 1]; 0.0 for empty input.

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` do not have the same number of elements.
    """
    y = np.asarray(y).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y.shape != y_pred.shape:
        raise ValueError(
            f"y and y_pred must have the same length, got {y.size} and {y_pred.size}")
    if y.size == 0:
        return 0.0
    # Compare element-wise instead of summing two confusion-matrix cells, so
    # the result is still correct when only one class is present
    return float(np.mean(y == y_pred))

# Candidate models. For each one:
#   'func'   - the unfitted estimator instance
#   'params' - the hyper-parameter grid explored by the grid search
#   'rec'    - test-set recall, filled in once the model has been evaluated
estimators = {'LogisticRegression': {'func': LogisticRegression(),
                                     'params': {'C': [0.1, 0.5, 1, 1.2, 1.5]},
                                     'rec': None},
              'KNeighborsClassifier': {'func': KNeighborsClassifier(),
                                     'params': {'n_neighbors': [5, 6, 7, 8, 9, 10, 11]},
                                      'rec': None},
              'BernoulliNB'        : {'func': BernoulliNB(),
                                     'params': {'alpha': [0.1, 0.5, 1, 1.2, 1.5]},
                                     'rec': None},
              # Fixed seed so that repeated runs give the same boosted ensemble
              'AdaBoostClassifier' : {'func': AdaBoostClassifier(random_state=232),
                                      'params': {'learning_rate': [0.1, 0.5, 1, 1.2, 1.5]},
                                      'rec': None}
             }

# Names of the models to evaluate; trim this list to run only a subset.
# A plain list (not a live dict view) stays fixed if `estimators` changes later.
models_to_test = list(estimators)

# Tune each selected estimator with a grid search scored on recall, then
# report its recall and accuracy on the held-out test set.
recall_scorer = make_scorer(recall)

for name, estimator in estimators.items():
    # Leave out any model that was not selected for this run
    if name not in models_to_test:
        continue

    model = GridSearchCV(estimator=estimator['func'],
                         param_grid=estimator['params'],
                         scoring=recall_scorer)
    model.fit(x_train, y_train)

    # Evaluate the refitted best configuration on the test set
    preds = model.predict(x_test)
    rec, acc = recall(y_test, preds), accuracy(y_test, preds)

    # Keep the test recall so the models can be compared afterwards
    estimator["rec"] = rec
    print(f"{name} Recall: {rec}, Accuracy: {acc} Best Params: {model.best_params_}")

# Pick the model with the highest test-set recall.
# Models that were never evaluated still have 'rec' set to None; they are
# left out so the comparison cannot fail on a None value.
evaluated_recalls = {name: estimator["rec"]
                     for name, estimator in estimators.items()
                     if estimator["rec"] is not None}

if evaluated_recalls:
    # max() returns the first model on ties, i.e. the earliest one in `estimators`
    best_model = max(evaluated_recalls, key=evaluated_recalls.get)
    best_spec  = evaluated_recalls[best_model]  # recall of the selected model
else:
    # Nothing was evaluated: keep the initial "no winner" values
    best_model = None
    best_spec  = 0

print(f'Best model: {best_model}')