## My Credit Card Customers

This is my first Kaggle notebook, so I hope it is not too bad ;-)

In [None]:
import os

# Collect the full path of every file found below the input directory,
# then print them one per line in walk order.
input_paths = [os.path.join(folder, name)
               for folder, _subdirs, names in os.walk('/kaggle/input')
               for name in names]
for path in input_paths:
    print(path)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so use whichever one is installed.
_white_style = ('seaborn-white' if 'seaborn-white' in plt.style.available
                else 'seaborn-v0_8-white')
plt.style.use(_white_style)

# Global font and label sizes shared by every figure in the notebook.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Ubuntu',
    'font.monospace': 'Ubuntu Mono',
    'font.size': 12,
    'axes.labelsize': 12,
    'axes.labelweight': 'bold',
    'axes.titlesize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 10,
    'figure.titlesize': 12,
})

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.datasets import make_classification

In [None]:
# Read the BankChurners CSV into the working frame.
data_path = '../input/credit-card-customers/BankChurners.csv'
df = pd.read_csv(data_path)

In [None]:
# Remove the last two columns of the file, then the client identifier.
df = df.drop(columns=df.columns[-2:])
df = df.drop(columns=['CLIENTNUM'])

## Short data preprocessing

In [None]:
# Preview the first rows once the unused columns have been dropped.
df.head()

In [None]:
# Binary target: 0 for 'Existing Customer', 1 for every other value.
df['Attrition_Flag'] = (df['Attrition_Flag'] != 'Existing Customer').astype(int)

In [None]:
# List the raw income labels and how often each occurs before relabelling them.
df['Income_Category'].value_counts()

In [None]:
def process_eduaction_level(entry):
    """Shorten a raw income-bracket label.

    Despite its name, this helper is applied to the ``Income_Category``
    column. Known labels are mapped to a compact form (for example
    ``'$40K - $60K'`` becomes ``'40k-60k'``); any other value yields ``None``.
    """
    short_labels = {
        'Less than $40K': '<40k',
        '$40K - $60K': '40k-60k',
        '$60K - $80K': '60k-80k',
        '$80K - $120K': '80k-120k',
        '$120K +': '>120k',
        'Unknown': 'Unknown',
    }
    return short_labels.get(entry)

In [None]:
# Replace the raw income labels with their short form. The helper returns None
# for any label it does not recognise, so such rows end up with a missing value.
df['Income_Category'] = df['Income_Category'].map(process_eduaction_level)

## Initial Data exploration

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
df.info()

In [None]:
# Columns treated as categorical (one-hot encoded before modelling).
cat_feat = [
    'Gender',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
    'Months_Inactive_12_mon',
]

In [None]:
# Columns treated as numeric; the last three are the small-integer
# "semi-categorical" counts.
cont_feat = [
    'Customer_Age',
    'Months_on_book',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
    'Dependent_count',
    'Total_Relationship_Count',
    'Contacts_Count_12_mon',
]

In [None]:
# NOTE(review): this repeats the cat_feat definition from two cells earlier
# with identical contents; the cell is redundant and can be removed.
cat_feat = ['Gender','Education_Level','Marital_Status','Income_Category',
           'Card_Category','Months_Inactive_12_mon']

In [None]:
# Small-integer count columns. They are also listed in cont_feat, and this list
# is not referenced again in the notebook.
semi_cat = ['Dependent_count','Total_Relationship_Count','Contacts_Count_12_mon']

In [None]:
# Sanity check: every column is either a listed feature or the target (+1).
df.shape[1] == len(cat_feat) + len(cont_feat) + 1

The data has been manually explored and divided into three categories: categorical, semicategorical and continuous features. Semicategorical refers to numerical features consisting of a relatively small number of integers.

We will start by taking a look at the correlation of the continuous features with the feature that we want to predict - 'Attrition_Flag'. First of all, we can see that there are significantly more 'Existing', i.e. non-attrited, customers than attrited ones.

In [None]:
# Class balance of the target, with each bar annotated by its share of all rows.
fig, ax = plt.subplots(figsize=(6,6))
sns.countplot(x = 'Attrition_Flag', data=df, ax=ax)

n_customers = float(len(df))
for p in ax.patches:
    # Centre the percentage label horizontally and vertically inside the bar.
    ax.text(p.get_x()+p.get_width()/2., p.get_height()/2,
            '{:2.2f}%'.format(p.get_height()/n_customers*100), ha="center",
            fontsize=20, color = 'white')

# plt.show() instead of fig.show(): the latter only warns on the non-GUI
# inline backend used by notebooks.
plt.show()

## Continuous features 

In [None]:
# Numeric view of the data: everything except the categorical columns
# (the target column is kept).
df_cont = df.drop(columns=cat_feat)

In [None]:
# Overlaid histograms of each numeric feature, split by target value:
# green = flag 0 (existing customers), red = flag 1.
# The grid height is derived from the number of features so that no empty row
# of axes is reserved at the bottom of the figure.
n_cols = 3
n_rows = -(-len(cont_feat) // n_cols)  # ceiling division

fig = plt.figure(figsize=(16,16),dpi=300)
for i, feat in enumerate(cont_feat, start=1):
    ax1 = fig.add_subplot(n_rows, n_cols, i)
    stayed = df_cont.loc[df_cont['Attrition_Flag'] == 0, feat]
    left = df_cont.loc[df_cont['Attrition_Flag'] == 1, feat]
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # it is kept here so the binning stays exactly as before.
    h1 = sns.distplot(a = stayed, ax = ax1,
                      color = 'green', hist_kws={'alpha':0.3}, kde=False)
    # Reuse the bin count of the first histogram for the second group.
    h2 = sns.distplot(a = left, ax = ax1,
                      color = 'red', hist_kws={'alpha':0.3}, bins = len(h1.patches), kde=False)
plt.tight_layout()
# plt.show() instead of fig.show(): the latter only warns on the inline backend.
plt.show()

## Categorical features 

In [None]:
# Categorical view of the data: everything except the numeric columns
# (the target column is kept).
df_cat = df.drop(columns=cont_feat)

In [None]:
# One panel per categorical feature: per-level standard deviation of the
# target flag, annotated on each bar.
# NOTE(review): the bars show the *standard deviation* of a 0/1 flag but are
# labelled as percentages; if the attrition rate was intended, the group mean
# would be the matching statistic — confirm with the author.
fig = plt.figure(figsize=(15,15), dpi = 600)
for position, feat in enumerate(cat_feat, start=1):
    ax1 = fig.add_subplot(3,2,position)
    flag_std = df_cat.groupby(feat)['Attrition_Flag'].std()
    sns.barplot(x = flag_std.index, y = flag_std.values,
                color = 'royalblue', alpha = 0.5, ax = ax1)
    for bar in ax1.patches:
        ax1.text(bar.get_x()+bar.get_width()/2., bar.get_height()/2,
                 '{:2.2f}%'.format(bar.get_height()*100),
                 ha="center", fontsize=14, color = 'white')
    ax1.set_ylim(0,1)
plt.tight_layout()

In [None]:
# One panel per categorical feature: point estimate of the target flag for
# each level of the feature.
# The original call also passed kind='point'; that is a catplot() argument,
# not a pointplot() one (ignored by older seaborn, rejected by newer), so it
# has been dropped.
fig = plt.figure(figsize=(15,15), dpi = 600)
for i, feat in enumerate(cat_feat, start=1):
    ax1 = fig.add_subplot(3,2,i)
    sns.pointplot(x = feat, y = 'Attrition_Flag', data = df_cat, ax = ax1)
    ax1.set_ylim(0,1)
plt.tight_layout()

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
# NOTE(review): this overwrites df, so the cell cannot be re-run on its own.
df = pd.get_dummies(data = df, columns = cat_feat, drop_first = True)

## Model selection and training

In [None]:
# Preview the frame after one-hot encoding.
df.head()

In [None]:
# Separate the target column from the feature matrix.
target_col = 'Attrition_Flag'
y = df[target_col]
X = df.drop(columns=target_col)

In [None]:
# Hold out a test set; the fixed seed makes the split reproducible.
# NOTE(review): the split is not stratified although the target classes are
# imbalanced (see the count plot above) — consider stratify=y; confirm first.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Scaler that is fitted on the training split and then reused for the test split.
scl = StandardScaler()

In [None]:
# Fit the scaler on the training split only, then apply it to both splits.
# The original row index is carried over so the scaled frames stay aligned
# with y_train / y_test (rebuilding the frame without it resets the index).
X_train = pd.DataFrame(scl.fit_transform(X_train), columns=X_train.columns,
                       index=X_train.index)
X_test  = pd.DataFrame(scl.transform(X_test), columns=X_test.columns,
                       index=X_test.index)

In [None]:
# Display names, in the same order as the `models` list below.
classifiers = ['Linear SVM', 'Radial SVM', 'LogisticRegression',
               'RandomForestClassifier', 'AdaBoostClassifier',
               'XGBoostClassifier', 'KNeighborsClassifier','GradientBoostingClassifier']

scoring = ['precision', 'recall']

# One row per (estimator, metric) pair, one column per CV fold.
index = [(clf, scr) for clf in classifiers for scr in scoring]
KFold_Score = pd.DataFrame(
    index = pd.MultiIndex.from_tuples(index, names=['Estimator','Scoring']),
    columns = ['f1','f2','f3','f4','f5'],
)

models = [svm.SVC(kernel='linear'),
          svm.SVC(kernel='rbf'),
          LogisticRegression(max_iter = 1000),
          RandomForestClassifier(n_estimators=200, random_state=42),
          AdaBoostClassifier(random_state = 42),
          xgb.XGBClassifier(n_estimators=100),
          KNeighborsClassifier(),
          GradientBoostingClassifier(random_state=42)
         ]

# 5-fold cross-validation of every model, once per metric, on the training split.
for name, model in zip(classifiers, models):
    cv = KFold(n_splits=5, random_state=42, shuffle=True)
    print('Running {:s}...'.format(name), end = " ")
    for scr in scoring:
        KFold_Score.loc[name, scr] = cross_val_score(
            model, X_train.values, y_train.values, scoring = scr, cv=cv, n_jobs = 5)
    print(' finished!')

In [None]:
# Summarise the five fold scores per (estimator, metric) row.
# Both statistics are computed from the fold columns only: previously 'std' was
# taken after 'mean' had been added, so the mean itself was counted as a sixth
# observation (and re-running the cell compounded the error).
fold_cols = ['f1','f2','f3','f4','f5']
fold_scores = KFold_Score[fold_cols].astype(float)
KFold_Score['mean'] = fold_scores.mean(axis = 1)
KFold_Score['std'] = fold_scores.std(axis = 1)

In [None]:
# Flatten the (Estimator, Scoring) MultiIndex into ordinary columns.
KFold_Score_r = KFold_Score.reset_index()

In [None]:
# Put the mean precision and mean recall of each estimator side by side.
# Each piece is [index, Estimator, mean]; with ignore_index=True the six
# resulting columns are simply numbered 0..5.
# The former keys='Estimator' argument was dropped: it had no effect together
# with ignore_index=True, and a keys sequence whose length differs from the
# number of frames is deprecated in recent pandas.
mean_by_metric = [
    KFold_Score_r.loc[KFold_Score_r['Scoring'] == metric, ['Estimator','mean']].reset_index()
    for metric in ('precision', 'recall')
]
KFold_Score_r = pd.concat(mean_by_metric, ignore_index = True, axis = 1)

In [None]:
# Discard the two old-index columns (0, 3) and the repeated estimator name (4),
# then give the three remaining columns readable names.
KFold_Score_r = KFold_Score_r.drop(columns = [0,3,4])
KFold_Score_r.columns = ['Estimator','precision','recall']

In [None]:
# F1 = harmonic mean of the cross-validated mean precision and recall.
# Computed column-wise instead of with a row-wise apply; a row where
# precision + recall is 0 gives NaN rather than a division error.
precision = KFold_Score_r['precision'].astype(float)
recall = KFold_Score_r['recall'].astype(float)
KFold_Score_r['f1-score'] = 2 * precision * recall / (precision + recall)

In [None]:
# Summary table: one row per estimator with mean precision, recall and F1.
KFold_Score_r

In [None]:
# Compare the three summary metrics across estimators on a single axes.
fig, ax = plt.subplots(figsize = (8,8))
for metric, colour in (('precision', 'blue'), ('recall', 'red'), ('f1-score', 'black')):
    sns.lineplot(x = 'Estimator', y = metric, linewidth = 3, data = KFold_Score_r,
                 label = metric, color = colour, ax = ax)
# Draw the third line (f1-score) dashed.
ax.lines[2].set_linestyle("--")

# Tilt the estimator names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)

ax.tick_params(which='both', width=2)
ax.tick_params(which='major', length=7)
ax.grid(color='lightgray', linestyle='-', linewidth=1)

The XGBoostClassifier has the best recall, while the GradientBoostingClassifier has the highest precision. Both estimators' scores lie very close to each other; therefore, we will attempt to optimize the parameters for both of them.

In [None]:
# Show only the two boosting models singled out above.
top_estimators = ['XGBoostClassifier', 'GradientBoostingClassifier']
KFold_Score_r[KFold_Score_r['Estimator'].isin(top_estimators)]

## Parameter optimization for the XGBoostClassifier

In [None]:
# Grid for the number of boosting rounds: 500, 600, ..., 1500.
param_test0 = dict(n_estimators=range(500, 1600, 100))

In [None]:
# Baseline XGBoost configuration used as the template for the first grid search.
xgb_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 3,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 42,
    'use_label_encoder': False,
}
xgb_clf = xgb.XGBClassifier(**xgb_params)

In [None]:
# Grid search over n_estimators: 5-fold CV, optimising recall.
gsearch0 = GridSearchCV(estimator = xgb_clf, param_grid=param_test0, 
                        scoring = "recall", cv = 5, n_jobs=5, verbose = 2)

In [None]:
# Run the first search on the scaled training split (passed as plain arrays).
gsearch0.fit(X_train.values, y_train.values)

In [None]:
# Best n_estimators found by the first search.
gsearch0.best_params_

In [None]:
# Grid for tree shape: max_depth in 1, 3, 5, 7 and min_child_weight in 1, 3, 5.
param_test1 = dict(
    max_depth=range(1, 8, 2),
    min_child_weight=range(1, 6, 2),
)

In [None]:
# Template for the second grid search, with n_estimators fixed at 700.
# `silent` is no longer an XGBoost parameter (newer releases only warn that it
# "might not be used"); `verbosity=0` is the supported way to silence output.
xgb_clf = xgb.XGBClassifier( 
                         verbosity = 0,
                         learning_rate =0.1,
                         n_estimators=700,
                         max_depth=3,
                         min_child_weight=1,
                         gamma=0,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         objective= 'binary:logistic',
                         nthread=4,
                         scale_pos_weight=1,
                         seed=42,
                         use_label_encoder=False)

In [None]:
# Grid search over max_depth / min_child_weight: 5-fold CV, optimising recall.
gsearch1 = GridSearchCV(estimator = xgb_clf, param_grid=param_test1, 
                        scoring = "recall", cv = 5, n_jobs = 5, verbose = 0)

In [None]:
# Run the second search on the scaled training split (passed as plain arrays).
gsearch1.fit(X_train.values, y_train.values)

In [None]:
# Best cross-validated recall of the second search.
# NOTE(review): best_params_ is not displayed here, so the winning
# max_depth / min_child_weight are not visible in the notebook output.
gsearch1.best_score_

In [None]:
# Grid for gamma: twenty values from 0.0 up to 1.9 in steps of 0.1.
gamma_grid = [0.1 * step for step in range(20)]
param_test2 = {'gamma': gamma_grid}

In [None]:
# Template for the third grid search (gamma), with n_estimators fixed at 700.
xgb_params = {
    'learning_rate': 0.1,
    'n_estimators': 700,
    'max_depth': 3,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 42,
    'use_label_encoder': False,
}
xgb_clf = xgb.XGBClassifier(**xgb_params)

In [None]:
# Grid search over gamma: 5-fold CV, optimising recall.
gsearch2 = GridSearchCV(estimator = xgb_clf, param_grid=param_test2, 
                        scoring = "recall", cv = 5, n_jobs = 5, verbose = 2)

In [None]:
# Run the third search on the scaled training split (passed as plain arrays).
gsearch2.fit(X_train.values, y_train.values)

In [None]:
# Best cross-validated recall of the third search.
gsearch2.best_score_

## Results and feature importance

In [None]:
from xgboost import plot_importance

In [None]:
# Use the tuned model from the last grid search. `gsearch2.estimator` is only
# the template that was handed to GridSearchCV (gamma=0), so taking it would
# silently discard the result of the search; `best_estimator_` is the winning
# configuration, refitted on the full training split.
xgb_clf = gsearch2.best_estimator_

In [None]:
# Map positional feature names ('f0', 'f1', ...) back to the column names of
# the training frame.
feat_dict = {f'f{position}': column
             for position, column in enumerate(X_train.columns)}

In [None]:
# Fit the final model on the whole training split. Plain arrays are passed, so
# the model only knows the columns by position.
xgb_clf.fit(X_train.values,y_train.values)

In [None]:
# Feature-importance chart with the positional labels replaced by column names.
fig,ax = plt.subplots(figsize=(6,10))

plot_importance(booster = xgb_clf, ax = ax, height = 0.6)

# The model was fitted on a bare array, so the tick labels are 'f<i>'; translate
# them through feat_dict and leave any label that is not in the mapping as is.
new_labels = [feat_dict.get(tick.get_text(), tick.get_text())
              for tick in ax.get_yticklabels()]
ax.set_yticklabels(new_labels)

# The call parentheses were missing before, so the layout was never applied.
plt.tight_layout()

In [None]:
# Predict the held-out test split (already scaled with the training scaler).
y_pred = xgb_clf.predict(X_test.values)

In [None]:
# classification_report expects (y_true, y_pred). With the arguments the other
# way round, precision and recall are reported swapped for each class.
print(classification_report(y_test, y_pred))

<div class="alert alert-block alert-success">
    <br> 
    <br> 
    <h3> 97% accuracy with 94% recall  </h3>
    <br> 
</div>

This is my first go at it. I will continue to do some feature engineering and data preprocessing to try and improve this result. Nevertheless, it's a pretty good score to start with, I think.