In [None]:
#================================================================#
### Importing all the relevant libraries ###
#================================================================#

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.metrics import jaccard_score

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_curve, plot_roc_curve, auc, classification_report, accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, ShuffleSplit

from sklearn.linear_model import LogisticRegression

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree

from imblearn.ensemble import RUSBoostClassifier
from imblearn.over_sampling import RandomOverSampler

import statistics

In [None]:
# Read the churn extract, discard its last two columns and keep the customer
# id as text so it is never treated as a number.
# NOTE(review): the two dropped columns are not inspected here - confirm they
# are not needed downstream.
data = (
    pd.read_csv('../input/credit-card-customers/BankChurners.csv')
    .iloc[:, :-2]
    .astype({'CLIENTNUM': 'str'})
)

print(data.shape)
data.head()

In [None]:
# One distinct CLIENTNUM per row means every row describes exactly one customer.
n_customers = data['CLIENTNUM'].nunique()
n_rows = len(data)

if n_customers == n_rows:
  print('Level of Data is Customer ID ')

## Exploratory Data Analysis - EDA

In [None]:
#===========================================================================#
### Spread of Target Variable ###
#===========================================================================#

# Number of rows per value of the target column.
data.loc[:, 'Attrition_Flag'].value_counts()

In [None]:
#===========================================================================#
### Gender vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8,5))

# Count customers per (Gender, Attrition_Flag) pair once and reuse the table
# for every plot argument instead of repeating the same groupby three times.
gender_counts = data.groupby(['Gender', 'Attrition_Flag']).agg({'CLIENTNUM':'count'}).reset_index()

# CLIENTNUM holds the row count of each group after the aggregation above.
sns.barplot(data=gender_counts, x='Gender', y='CLIENTNUM', hue='Attrition_Flag', ax=ax)
ax.set(title='Gender vs Attrition_Flag', ylabel='Number of customers')

plt.show()

In [None]:
#===========================================================================#
### Age vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (15,8))

# Overlay one histogram per customer group (existing first, attrited on top);
# the labels feed the legend so the two groups can be told apart.
for flag, colour in (('Existing Customer', 'g'), ('Attrited Customer', 'b')):
  ax.hist(data.loc[data['Attrition_Flag'] == flag, 'Customer_Age'], bins=15, color=colour, label=flag)

ax.set(title='Customer_Age by Attrition_Flag', xlabel='Customer_Age', ylabel='Number of customers')
ax.legend(title='Attrition_Flag')

plt.show()

### Here we notice that age follows a somewhat Normal Distribution ###
### We can use this information to create Buckets ###

In [None]:
#===========================================================================#
### Education Level vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Count customers per (Education_Level, Attrition_Flag) pair once and reuse
# the table instead of repeating the same groupby three times.
education_counts = data.groupby(['Education_Level', 'Attrition_Flag']).agg({'CLIENTNUM':'count'}).reset_index()

# CLIENTNUM holds the row count of each group after the aggregation above.
sns.barplot(data=education_counts, x='Education_Level', y='CLIENTNUM', hue='Attrition_Flag', ax=ax)
ax.set(title='Education_Level vs Attrition_Flag', ylabel='Number of customers')

plt.show()

### A good chunk of Data has Unknown which can be considered as NAN ###
### We can run an Imputation but it will increase complexity of the model ###
### We can treat it as a category ###

In [None]:
#===========================================================================#
### Marital Status vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Count customers per (Marital_Status, Attrition_Flag) pair once and reuse
# the table instead of repeating the same groupby three times.
marital_counts = data.groupby(['Marital_Status', 'Attrition_Flag']).agg({'CLIENTNUM':'count'}).reset_index()

# CLIENTNUM holds the row count of each group after the aggregation above.
sns.barplot(data=marital_counts, x='Marital_Status', y='CLIENTNUM', hue='Attrition_Flag', ax=ax)
ax.set(title='Marital_Status vs Attrition_Flag', ylabel='Number of customers')

plt.show()

### Similar case of unknowns, treating them as a separate category ###

In [None]:
#===========================================================================#
### Income Category vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Count customers per (Income_Category, Attrition_Flag) pair once and reuse
# the table instead of repeating the same groupby three times.
income_counts = data.groupby(['Income_Category', 'Attrition_Flag']).agg({'CLIENTNUM':'count'}).reset_index()

# CLIENTNUM holds the row count of each group after the aggregation above.
sns.barplot(data=income_counts, x='Income_Category', y='CLIENTNUM', hue='Attrition_Flag', ax=ax)
ax.set(title='Income_Category vs Attrition_Flag', ylabel='Number of customers')

plt.show()

### Majority are from Low income group ###
### For rest of the categories the distribution remains almost constant ###

In [None]:
#===========================================================================#
### Card Category vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Count customers per (Card_Category, Attrition_Flag) pair once and reuse
# the table instead of repeating the same groupby three times.
card_counts = data.groupby(['Card_Category', 'Attrition_Flag']).agg({'CLIENTNUM':'count'}).reset_index()

# CLIENTNUM holds the row count of each group after the aggregation above.
sns.barplot(data=card_counts, x='Card_Category', y='CLIENTNUM', hue='Attrition_Flag', ax=ax)
ax.set(title='Card_Category vs Attrition_Flag', ylabel='Number of customers')

plt.show()

### Almost all of them have a Blue Card, we can either leave this variable or ###
### Roll Up into 2 categories: Blue Card and Others ###

In [None]:
#===========================================================================#
### Dependents vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Count customers per (Dependent_count, Attrition_Flag) pair once and reuse
# the table instead of repeating the same groupby three times.
dependent_counts = data.groupby(['Dependent_count', 'Attrition_Flag']).agg({'CLIENTNUM':'count'}).reset_index()

# CLIENTNUM holds the row count of each group after the aggregation above.
sns.barplot(data=dependent_counts, x='Dependent_count', y='CLIENTNUM', hue='Attrition_Flag', ax=ax)
ax.set(title='Dependent_count vs Attrition_Flag', ylabel='Number of customers')

plt.show()

### The distribution looks like normal distribution ###

In [None]:
#===========================================================================#
### Months on Book vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Count customers per (Months_on_book, Attrition_Flag) pair once and reuse
# the table instead of repeating the same groupby three times.
tenure_counts = data.groupby(['Months_on_book', 'Attrition_Flag']).agg({'CLIENTNUM':'count'}).reset_index()

# CLIENTNUM holds the row count of each group after the aggregation above.
sns.barplot(data=tenure_counts, x='Months_on_book', y='CLIENTNUM', hue='Attrition_Flag', ax=ax)
ax.set(title='Months_on_book vs Attrition_Flag', ylabel='Number of customers')

plt.show()

### No. of months people have been with the company, seems highly skewed at 36 ###

In [None]:
#===========================================================================#
### Total Relationship Count vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Count customers per (Total_Relationship_Count, Attrition_Flag) pair once and
# reuse the table instead of repeating the same groupby three times.
relationship_counts = data.groupby(['Total_Relationship_Count', 'Attrition_Flag']).agg({'CLIENTNUM':'count'}).reset_index()

# CLIENTNUM holds the row count of each group after the aggregation above.
sns.barplot(data=relationship_counts, x='Total_Relationship_Count', y='CLIENTNUM', hue='Attrition_Flag', ax=ax)
ax.set(title='Total_Relationship_Count vs Attrition_Flag', ylabel='Number of customers')

plt.show()

### No. of bank products held by customer, can be Account, Debit Card ###
### Credit Card, Insurance etc. ###
### Most of them have 3-6 Products ###

In [None]:
#===========================================================================#
### Months Inactive vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Count customers per (Months_Inactive_12_mon, Attrition_Flag) pair once and
# reuse the table instead of repeating the same groupby three times.
inactive_counts = data.groupby(['Months_Inactive_12_mon', 'Attrition_Flag']).agg({'CLIENTNUM':'count'}).reset_index()

# CLIENTNUM holds the row count of each group after the aggregation above.
sns.barplot(data=inactive_counts, x='Months_Inactive_12_mon', y='CLIENTNUM', hue='Attrition_Flag', ax=ax)
ax.set(title='Months_Inactive_12_mon vs Attrition_Flag', ylabel='Number of customers')

plt.show()

### No. of months inactive in past 12 months, mostly between 1-3 ###
### We can club the customers if required ###

In [None]:
#===========================================================================#
### Contacts vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Count customers per (Contacts_Count_12_mon, Attrition_Flag) pair once and
# reuse the table instead of repeating the same groupby three times.
contact_counts = data.groupby(['Contacts_Count_12_mon', 'Attrition_Flag']).agg({'CLIENTNUM':'count'}).reset_index()

# CLIENTNUM holds the row count of each group after the aggregation above.
sns.barplot(data=contact_counts, x='Contacts_Count_12_mon', y='CLIENTNUM', hue='Attrition_Flag', ax=ax)
ax.set(title='Contacts_Count_12_mon vs Attrition_Flag', ylabel='Number of customers')

plt.show()

### No. of contacts in past 12 months ###

In [None]:
#===========================================================================#
### Credit Limit vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Overlay one histogram per customer group (existing first, attrited on top)
# using the default colour cycle; the labels feed the legend.
for flag in ('Existing Customer', 'Attrited Customer'):
  ax.hist(data.loc[data['Attrition_Flag'] == flag, 'Credit_Limit'], bins=15, label=flag)

ax.set(title='Credit_Limit by Attrition_Flag', xlabel='Credit_Limit', ylabel='Number of customers')
ax.legend(title='Attrition_Flag')

plt.show()

### Most of the customers have credit limit less than 5K ###

In [None]:
#===========================================================================#
### Avg Open to Buy vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Overlay one histogram per customer group (existing first, attrited on top);
# the labels feed the legend so the two groups can be told apart.
for flag, colour in (('Existing Customer', 'y'), ('Attrited Customer', 'grey')):
  ax.hist(data.loc[data['Attrition_Flag'] == flag, 'Avg_Open_To_Buy'], bins=15, color=colour, label=flag)

ax.set(title='Avg_Open_To_Buy by Attrition_Flag', xlabel='Avg_Open_To_Buy', ylabel='Number of customers')
ax.legend(title='Attrition_Flag')

plt.show()

### Credit Limit - Purchased Goods ###
### Shows people are using the card for Purchases ###

In [None]:
#===========================================================================#
### Revolving Balance vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Overlay one histogram per customer group (existing first, attrited on top);
# the labels feed the legend so the two groups can be told apart.
for flag, colour in (('Existing Customer', 'g'), ('Attrited Customer', 'b')):
  ax.hist(data.loc[data['Attrition_Flag'] == flag, 'Total_Revolving_Bal'], bins=15, color=colour, label=flag)

ax.set(title='Total_Revolving_Bal by Attrition_Flag', xlabel='Total_Revolving_Bal', ylabel='Number of customers')
ax.legend(title='Attrition_Flag')

plt.show()

### a revolving balance is the portion of credit card spending that goes unpaid
### at the end of a billing cycle ###
### Graph shows that a lot of customers who churn out have revolving balance close to 0 ###

In [None]:
#===========================================================================#
### Total Transaction Amount vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Overlay one histogram per customer group (existing first, attrited on top);
# the labels feed the legend so the two groups can be told apart.
for flag, colour in (('Existing Customer', 'sienna'), ('Attrited Customer', 'b')):
  ax.hist(data.loc[data['Attrition_Flag'] == flag, 'Total_Trans_Amt'], bins=15, color=colour, label=flag)

ax.set(title='Total_Trans_Amt by Attrition_Flag', xlabel='Total_Trans_Amt', ylabel='Number of customers')
ax.legend(title='Attrition_Flag')

plt.show()

### Max Attrition is if the total transaction amount is less than 2500 ###

In [None]:
#===========================================================================#
### No. of Transactions vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Overlay one histogram per customer group (existing first, attrited on top);
# the labels feed the legend so the two groups can be told apart.
for flag, colour in (('Existing Customer', 'yellow'), ('Attrited Customer', 'orange')):
  ax.hist(data.loc[data['Attrition_Flag'] == flag, 'Total_Trans_Ct'], bins=15, color=colour, label=flag)

ax.set(title='Total_Trans_Ct by Attrition_Flag', xlabel='Total_Trans_Ct', ylabel='Number of customers')
ax.legend(title='Attrition_Flag')

plt.show()

### People with 40 +- 20 transactions are on the risk of churning ###
### Distribution follows somewhat normal Distribution ###

In [None]:
#===========================================================================#
### Amount Change in Timeframe vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Overlay one histogram per customer group (existing first, attrited on top);
# the labels feed the legend so the two groups can be told apart.
for flag, colour in (('Existing Customer', 'orange'), ('Attrited Customer', 'b')):
  ax.hist(data.loc[data['Attrition_Flag'] == flag, 'Total_Amt_Chng_Q4_Q1'], bins=15, color=colour, label=flag)

ax.set(title='Total_Amt_Chng_Q4_Q1 by Attrition_Flag', xlabel='Total_Amt_Chng_Q4_Q1', ylabel='Number of customers')
ax.legend(title='Attrition_Flag')

plt.show()

In [None]:
#===========================================================================#
### Transaction Changes in Timeframe vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Overlay one histogram per customer group (existing first, attrited on top);
# the labels feed the legend so the two groups can be told apart.
for flag, colour in (('Existing Customer', 'pink'), ('Attrited Customer', 'brown')):
  ax.hist(data.loc[data['Attrition_Flag'] == flag, 'Total_Ct_Chng_Q4_Q1'], bins=15, color=colour, label=flag)

ax.set(title='Total_Ct_Chng_Q4_Q1 by Attrition_Flag', xlabel='Total_Ct_Chng_Q4_Q1', ylabel='Number of customers')
ax.legend(title='Attrition_Flag')

plt.show()

In [None]:
#===========================================================================#
### Utilization Ratio in Timeframe vs Target Variable ###
#===========================================================================#

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))

# Overlay one histogram per customer group (existing first, attrited on top);
# the labels feed the legend so the two groups can be told apart.
for flag, colour in (('Existing Customer', 'g'), ('Attrited Customer', 'orange')):
  ax.hist(data.loc[data['Attrition_Flag'] == flag, 'Avg_Utilization_Ratio'], bins=15, color=colour, label=flag)

ax.set(title='Avg_Utilization_Ratio by Attrition_Flag', xlabel='Avg_Utilization_Ratio', ylabel='Number of customers')
ax.legend(title='Attrition_Flag')

plt.show()

In [None]:
#===========================================================================#
### Correlation Matrix ###
#===========================================================================#

# Correlation is only defined for numeric columns. Select them explicitly:
# `data` also holds text columns (e.g. CLIENTNUM, Attrition_Flag), and recent
# pandas versions raise instead of silently dropping them in `.corr()`.
correlation = data.select_dtypes(include='number').corr()

# Hide the upper triangle (and diagonal) - it only mirrors the lower one.
mask = np.triu(np.ones_like(correlation, dtype=bool))

f, ax = plt.subplots(figsize=(11, 9))

cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio on the axes created
# above. vmax=.3 saturates the colour scale for correlations above 0.3.
sns.heatmap(correlation, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title('Correlation between numeric columns')

plt.show()

## Analytical Data set

In [None]:
#===========================================================================#
### Encoding Relevant Columns ###
#===========================================================================#

# Start the analytical dataset from a copy of `data` without the customer id.
ads_1 = data.drop(columns=['CLIENTNUM'])

# Collapse every non-Blue card type into a single 'Other' bucket.
is_other_card = ads_1['Card_Category'] != 'Blue'
ads_1['Card_Category'] = np.where(is_other_card, 'Other', 'Blue')
print(ads_1['Card_Category'].value_counts(), '\n')

# Binary target: 0 = 'Existing Customer', 1 = any other value.
has_left = ads_1['Attrition_Flag'] != 'Existing Customer'
ads_1['Attrition_Flag'] = np.where(has_left, 1, 0)
print(ads_1['Attrition_Flag'].value_counts(), '\n')

ads_1.head()

In [None]:
#===========================================================================#
### Encoding Age Column ###
#===========================================================================#

# Summary statistics of the raw ages, used to judge the bucket boundaries below.
age_mean = statistics.mean(data['Customer_Age'])
age_std = statistics.stdev(data['Customer_Age'])

print('Mean of Customer Age: ', age_mean, '\n')
print('Std Dev of Customer Age: ', age_std, '\n')

def ageTransformer(x):
  """Map a customer age to its age-bucket label.

  Buckets are contiguous: (-inf, 30] -> 'LT_30', (30, 40] -> 'Bet_31_40',
  (40, 50] -> 'Bet_41_50', (50, 60] -> 'Bet_51_60', (60, inf) -> 'GT_60'.
  Integer ages map exactly as the labels suggest; fractional ages fall into
  the bucket above the boundary they exceed instead of into a gap.

  Args:
    x: numeric age.

  Returns:
    The bucket label as a string, or None when `x` compares false against
    every threshold (e.g. NaN).
  """
  # Each test only needs an upper bound because the earlier ones already
  # excluded everything below it.
  if x <= 30:
    return 'LT_30'
  if x <= 40:
    return 'Bet_31_40'
  if x <= 50:
    return 'Bet_41_50'
  if x <= 60:
    return 'Bet_51_60'
  if x > 60:
    return 'GT_60'
  # Only reachable for values that are not ordered against numbers (NaN).
  return None

# Bucket the untouched ages from `data` (same row index as `ads_1`, which was
# built from it) rather than overwriting ads_1['Customer_Age'] from itself:
# re-running this cell then stays valid instead of failing on the int cast of
# already-bucketed labels.
ads_1['Customer_Age'] = data['Customer_Age'].astype('int').apply(ageTransformer)
print(ads_1['Customer_Age'].value_counts())

In [None]:
# Positional split of the columns: the first 12 go to `ads_1a`, the rest to
# `ads_1b`.
# NOTE(review): this relies on the column order of the input file - presumably
# the first 12 are the target/categorical/count columns and the rest the
# continuous ones; confirm against the displayed head.
n_leading_cols = 12

ads_1a = ads_1.iloc[:, :n_leading_cols]
ads_1b = ads_1.iloc[:, n_leading_cols:]
ads_1b.head()

In [None]:
#===========================================================================#
### PCA for all the Cont. Numeric Variables ###
#===========================================================================#

# Compress the columns of `ads_1b` into two whitened principal components.
# NOTE(review): the inputs are not standardised before PCA, so columns with
# the largest raw variance will dominate the components - confirm this is
# intended.
pcaObj = PCA(n_components=2, whiten=True, random_state=3)
principal_components = pcaObj.fit_transform(ads_1b)

ads_2b = pd.DataFrame(principal_components, columns=['pc_1', 'pc_2'])

# Share of the total variance captured by each of the two components.
print(pcaObj.explained_variance_ratio_)
ads_2b.head()

In [None]:
#===========================================================================#
### Creating one hot encoded Columns for all Encoded Columns ###
#===========================================================================#

# Report the frame size before and after expanding the text columns of
# `ads_1a` into indicator columns.
shape_before = ads_1a.shape
print('shape of DF before transformation', shape_before, '\n')

ads_2a = pd.get_dummies(ads_1a)

shape_after = ads_2a.shape
print('shape of DF After transformation', shape_after, '\n')

ads_2a.head()

In [None]:
#===========================================================================#
### The Final Analytical Dataset ###
#===========================================================================#

# Place the one-hot encoded columns and the two principal components side by
# side; rows are matched on the index.
ads = pd.concat((ads_2a, ads_2b), axis='columns')

print(ads.shape)
ads.head()

## Modelling

In [None]:
### If we look at the distribution of target variable ###
### It looks like the case of Class Imbalance ###
### We can try using random oversampler to magnify the minority Class ###

# Rows per class of the encoded target (0 = existing, 1 = attrited).
ads.loc[:, 'Attrition_Flag'].value_counts()

In [None]:
### Random Over Sampling to Magnify the minority class and handle data imbalance ###

# Randomly duplicate minority-class rows until both classes have equal counts.
# The whole frame (target column included) is passed as X, so the target is
# duplicated together with the features.
# NOTE(review): duplicated rows created here must not be divided between a
# training and a test set afterwards, otherwise identical rows can appear on
# both sides of the split.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X=ads, y=ads['Attrition_Flag'])

ads_resampled = pd.DataFrame(data=X_resampled, columns=ads.columns)

resampled_class_counts = ads_resampled['Attrition_Flag'].value_counts()
print(ads_resampled.shape, '\n')
print(resampled_class_counts, '\n')
ads_resampled.head()

In [None]:
### Test Train Split ###

# Split the ORIGINAL (not oversampled) rows first, and oversample the training
# part only. Splitting an already-oversampled frame puts copies of the same
# minority row on both sides of the split, which leaks test rows into training
# and inflates the test scores.

# Select the features by name rather than by position so the target can never
# slip into X if the column order changes.
features = ads.drop(columns=['Attrition_Flag'])
target = ads['Attrition_Flag']

# Stratify so the held-out 10% keeps the real class proportions.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    features, target, test_size=0.10, random_state=42, stratify=target)

# Balance the classes in the training data only; the test set stays untouched.
X_train, y_train = RandomOverSampler(random_state=0).fit_resample(X_train_raw, y_train_raw)

print('Train class counts after oversampling:', '\n', y_train.value_counts(), '\n')
print('Test class counts (original distribution):', '\n', y_test.value_counts(), '\n')

### Decision Tree

- Initially we do not set any pruning criteria such as min_samples_split.
- We notice that the model Overfits but still performs well on test set
- We can try to manipulate the pruning criteria but need to assess if it actually drives a change or not
- Here, although the model overfits, we still get good results; in other (non-ideal) cases this may not be true

In [None]:
#===========================================================================#
### Decision Tree Model ###
#===========================================================================#

# Fit an unpruned decision tree (default hyper-parameters, fixed seed).
modelDt = DecisionTreeClassifier(random_state=42)
modelDt.fit(X_train, y_train)

pred_dt_train = modelDt.predict(X_train)
pred_dt_test = modelDt.predict(X_test)

# Print the same two reports for the training and the held-out data.
dt_reports = (
    (('### Train Summary ###', '\n'), y_train, pred_dt_train),
    (('\n', '### Test Summary ###', '\n'), y_test, pred_dt_test),
)

for heading, actual, predicted in dt_reports:
  print(*heading)
  print(classification_report(actual, predicted), '\n')
  print(confusion_matrix(actual, predicted), '\n')


In [None]:
# Label every importance with its feature name and list the strongest first;
# the bare array gives no way to tell which value belongs to which column.
pd.Series(modelDt.feature_importances_, index=X_train.columns).sort_values(ascending=False)

### The 2 PCA components together amount to ~58% of feature Importance ###

In [None]:
### Visualize the Tree ###

# fig = plt.figure(figsize=(75,50))
# _ = plot_tree(modelDt, filled=True)

### Random Forest Classifier

- Again the same case as DT Classifier above
- The RF model overfits, but its performance carries over to the test set as well
- This may work now but ideally we need to use regularization in RF

In [None]:
#===========================================================================#
### Random Forest Model ###
#===========================================================================#

# Fix the seed (same value as the decision tree above) so the forest, its OOB
# score and the reports below are reproducible across runs.
modelRf = RandomForestClassifier(n_estimators=300, oob_score=True, random_state=42)

modelRf.fit(X_train, y_train)

# Out-of-bag accuracy: each tree is scored on the training rows it did not see.
print('OOB Score: ', modelRf.oob_score_, '\n')

pred_rf_train = modelRf.predict(X_train)
pred_rf_test = modelRf.predict(X_test)

print('### Train Summary ###', '\n')
print(classification_report(y_train, pred_rf_train), '\n')
print(confusion_matrix(y_train, pred_rf_train), '\n')

print('\n', '### Test Summary ###', '\n')
print(classification_report(y_test, pred_rf_test), '\n')
print(confusion_matrix(y_test, pred_rf_test), '\n')

In [None]:
### From the Feature Importance Array, we notice that the PCA Comps ###
### Contribute hugely in deciding if the customer will be Attrited or Not ###
# Label every importance with its feature name and list the strongest first;
# the bare array gives no way to tell which value belongs to which column.
pd.Series(modelRf.feature_importances_, index=X_train.columns).sort_values(ascending=False)