In [None]:
!pip install seaborn==0.11.1

# Import Libraries needed

In [None]:
#Basic Libraries
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
from sklearn.model_selection import train_test_split

#Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

#Ensemble Model Library
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

#Evaluation Library
from sklearn import metrics

#Imbalanced Libraries
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE

# Load and Find Insight From Dataset

Let's load the dataset, drop the two Naive_Bayes columns because they aren't used, and look at a small sample of the data.

In [None]:
# Columns holding precomputed Naive Bayes outputs; they are not used in this notebook.
unused_columns = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]

# Read the CSV and discard the unused columns in a single expression.
dataset = pd.read_csv('../input/credit-card-customers/BankChurners.csv').drop(columns=unused_columns)

In [None]:
# Preview the first five rows of the loaded frame.
dataset.head(n=5)

Let's look at the descriptive statistics of our dataset.

In [None]:
# Summary statistics of the numeric columns (quartiles spelled out explicitly).
dataset.describe(percentiles=[0.25, 0.5, 0.75])

From the descriptive statistics we can see that the features are not all on the same scale. That is not a concern here, because the models we are going to train are tree-based (rule-based) and therefore not sensitive to the scale of their inputs. Another thing worth noticing is that for features involving currency, the standard deviation tends to be large (sometimes even exceeding the mean).

Now let's visualize the distribution of each feature with respect to our target, i.e. Attrited Customer vs. Existing Customer. For continuous features I plot the KDE (kernel density estimate), while for discrete features I use a count plot (a histogram over the categories).

In [None]:
# Density of Customer_Age, one curve per Attrition_Flag value.
sns.kdeplot(x="Customer_Age", hue="Attrition_Flag", data=dataset)

In [None]:
# Number of customers per Gender, split by Attrition_Flag.
sns.countplot(x="Gender", hue="Attrition_Flag", data=dataset)

In [None]:
# Number of customers per Dependent_count, split by Attrition_Flag.
sns.countplot(x="Dependent_count", hue="Attrition_Flag", data=dataset)

In [None]:
# Set the x tick label rotation to 315 degrees, then draw the Education_Level
# counts split by Attrition_Flag on the same axes.
plt.xticks(rotation=315)
sns.countplot(x="Education_Level", hue="Attrition_Flag", data=dataset)

In [None]:
# Number of customers per Marital_Status, split by Attrition_Flag.
sns.countplot(x="Marital_Status", hue="Attrition_Flag", data=dataset)

In [None]:
# Set the x tick label rotation to 315 degrees, then draw the Income_Category
# counts split by Attrition_Flag on the same axes.
plt.xticks(rotation=315)
sns.countplot(x="Income_Category", hue="Attrition_Flag", data=dataset)

In [None]:
# Number of customers per Card_Category, split by Attrition_Flag.
sns.countplot(x="Card_Category", hue="Attrition_Flag", data=dataset)

In [None]:
# Density of Months_on_book, one curve per Attrition_Flag value.
sns.kdeplot(x="Months_on_book", hue="Attrition_Flag", data=dataset)

In [None]:
# Number of customers per Total_Relationship_Count, split by Attrition_Flag.
sns.countplot(x="Total_Relationship_Count", hue="Attrition_Flag", data=dataset)

In [None]:
# Number of customers per Months_Inactive_12_mon, split by Attrition_Flag.
sns.countplot(x="Months_Inactive_12_mon", hue="Attrition_Flag", data=dataset)

In [None]:
# Number of customers per Contacts_Count_12_mon, split by Attrition_Flag.
sns.countplot(x="Contacts_Count_12_mon", hue="Attrition_Flag", data=dataset)

In [None]:
# Density of Credit_Limit, one curve per Attrition_Flag value.
sns.kdeplot(x="Credit_Limit", hue="Attrition_Flag", data=dataset)

In [None]:
# Density of Total_Revolving_Bal, one curve per Attrition_Flag value.
sns.kdeplot(x="Total_Revolving_Bal", hue="Attrition_Flag", data=dataset)

In [None]:
# Density of Avg_Open_To_Buy, one curve per Attrition_Flag value.
sns.kdeplot(x="Avg_Open_To_Buy", hue="Attrition_Flag", data=dataset)

In [None]:
# Density of Total_Amt_Chng_Q4_Q1, one curve per Attrition_Flag value.
sns.kdeplot(x="Total_Amt_Chng_Q4_Q1", hue="Attrition_Flag", data=dataset)

In [None]:
# Density of Total_Trans_Amt, one curve per Attrition_Flag value.
sns.kdeplot(x="Total_Trans_Amt", hue="Attrition_Flag", data=dataset)

In [None]:
# Density of Total_Trans_Ct, one curve per Attrition_Flag value.
sns.kdeplot(x="Total_Trans_Ct", hue="Attrition_Flag", data=dataset)

In [None]:
# Density of Total_Ct_Chng_Q4_Q1, one curve per Attrition_Flag value.
sns.kdeplot(x="Total_Ct_Chng_Q4_Q1", hue="Attrition_Flag", data=dataset)

In [None]:
# Density of Avg_Utilization_Ratio, one curve per Attrition_Flag value.
sns.kdeplot(x="Avg_Utilization_Ratio", hue="Attrition_Flag", data=dataset)

As we can see, some features have an (almost) normal distribution, like *Total_Ct_Chng_Q4_Q1* and *Total_Trans_Amt*, some look Poisson-distributed, and some follow other types of distribution.

Now let's plot the correlation matrix to gain further insight.

In [None]:
# Pairwise correlation of the numeric features. CLIENTNUM (just an id) is
# dropped first. The text columns are excluded explicitly: since pandas 2.0,
# DataFrame.corr() raises on non-numeric columns instead of silently skipping
# them, so selecting the numeric columns keeps this cell working on both old
# and new pandas releases.
corr = dataset.drop('CLIENTNUM', axis=1).select_dtypes(include='number').corr()

In [None]:
# Boolean mask covering the upper triangle (diagonal included), so that each
# pair of features is annotated only once in the heatmap.
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Draw the annotated heatmap on an explicitly created 11x9 inch axes.
f, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(corr, annot=True, cmap='BrBG', vmin=-1, vmax=1, mask=mask, ax=ax)

From the correlation matrix we can see that some features are correlated with each other. For example, *Avg_Open_To_Buy* is highly correlated with *Credit_Limit*, which is not surprising given its definition. It does surprise me that *Months_on_book* is quite strongly correlated with *Customer_Age*. We could drop one feature from each correlated pair and keep the other, because correlated features carry redundant information. For this notebook, however, I will keep all of these features and leave the handling of correlated features for further investigation.

# Preprocess Stage

Now, let's one-hot encode the categorical features using *get_dummies* from *pandas*.

In [None]:
# Columns to one-hot encode, in the order their dummy columns are appended.
categorical_cols = ['Dependent_count', 'Marital_Status', 'Income_Category', 'Card_Category',
                    'Education_Level']

# One indicator frame per categorical column (no prefix, so the dummy columns
# are named after the raw category values).
dummy_frames = [pd.get_dummies(dataset[col]) for col in categorical_cols]

# Replace the original categorical columns with their indicator columns.
dataset = pd.concat([dataset.drop(categorical_cols, axis=1)] + dummy_frames, axis=1)

In [None]:
# Integer codes for the two binary text columns; the attrited class is the
# positive class (1).
gender_codes = {'M': 1, 'F': 0}
attrition_codes = {'Existing Customer': 0, 'Attrited Customer': 1}

dataset['Gender'] = dataset['Gender'].map(gender_codes)
dataset['Attrition_Flag'] = dataset['Attrition_Flag'].map(attrition_codes)

Separate the features from the target, and drop unused features such as *CLIENTNUM*, which is just an ID in the database.

In [None]:
# Target vector: the encoded attrition flag.
y = dataset['Attrition_Flag']
# Feature matrix: everything except the target and the CLIENTNUM id column.
X = dataset.drop(columns=['Attrition_Flag', 'CLIENTNUM'])

Let's rename the feature columns. This is necessary because the one-hot encoding produced identical names for several features.

In [None]:
# Give every feature a unique *string* name, in the positional order produced
# by the preprocessing above. The dependent-count dummies used to be named with
# the bare integers 0..5; a frame whose column names mix int and str is
# rejected with a TypeError by scikit-learn >= 1.2, so they get string names.
# The three 'Unknown' dummies are disambiguated by their source feature.
X.columns = ['Customer_Age', 'Gender', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
             'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1',
             'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
             'Dependent_count_0', 'Dependent_count_1', 'Dependent_count_2',
             'Dependent_count_3', 'Dependent_count_4', 'Dependent_count_5',
             'Divorced', 'Married', 'Single', 'Marriage_Unknown', '$120K +', '$40K - $60K', '$60K - $80K', '$80K - $120K',
             'Less than $40K', 'Income_Category_Unknown', 'Blue', 'Gold', 'Platinum', 'Silver', 'College', 'Doctorate',
             'Graduate', 'High School', 'Post-Graduate', 'Uneducated', 'Education_Unknown']

Split the dataset into a training set (70%) and a test set (30%).

In [None]:
# Hold out 30% of the rows for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=68,
)

Now train a simple Random Forest classifier and evaluate its performance. I leave the hyperparameters at their defaults except for max_depth.

In [None]:
# Baseline random forest on the original (imbalanced) training set.
# random_state is fixed so the reported metrics are reproducible on re-run,
# consistent with the random forests trained on the resampled sets below.
clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(X_train, y_train)
# Probability of the positive class (column 1) for each test row.
y_pred_proba = clf.predict_proba(X_test)[:,1]

In [None]:
# ROC AUC from the positive-class probabilities.
print(metrics.roc_auc_score(y_test, y_pred_proba))
# metrics.plot_confusion_matrix was removed in scikit-learn 1.2. Building the
# display from the confusion matrix works on old and new releases alike.
conf_matrix = metrics.confusion_matrix(y_test, clf.predict(X_test))
metrics.ConfusionMatrixDisplay(conf_matrix, display_labels=clf.classes_).plot()

Because we are working with imbalanced data, I prefer to use the ROC AUC score instead of accuracy. We are also interested in the True Positive Rate (TPR), because we want to avoid false negatives. From the confusion matrix, we get:

$$\mathrm{TPR} = \frac{TP}{TP + FN} = \frac{357}{357 + 151} \approx 0.70276$$

This result isn't good enough. Now let's try AdaBoost. AdaBoost is an ensemble learning method: it combines many weak classifiers to produce a strong classifier. In this example we will use our random forest model as the weak classifier, and AdaBoost will try to combine this model to give a better result.

In [None]:
# Boost the random forest defined above. The base learner is passed
# positionally: its keyword was renamed from `base_estimator` to `estimator`
# in scikit-learn 1.2 (and the old name was later removed), while the first
# positional parameter is the base learner in every release.
clf_adaboost = AdaBoostClassifier(clf, n_estimators=100, random_state=0)
clf_adaboost.fit(X_train, y_train)
# Class probabilities for the test rows (both columns are kept here).
y_pred_proba = clf_adaboost.predict_proba(X_test)

In [None]:
# ROC AUC from the positive-class column of the probability matrix.
print(metrics.roc_auc_score(y_test, y_pred_proba[:,1]))
# metrics.plot_confusion_matrix was removed in scikit-learn 1.2. Building the
# display from the confusion matrix works on old and new releases alike.
conf_matrix = metrics.confusion_matrix(y_test, clf_adaboost.predict(X_test))
metrics.ConfusionMatrixDisplay(conf_matrix, display_labels=clf_adaboost.classes_).plot()

The AdaBoost model has succeeded in producing a stronger learner: the ROC AUC score is better, and the TPR has also improved compared to the random forest classifier, reaching 0.80315. In the next section, I will try to do something about the imbalanced data.

# Undersampling the Majority Class

This time I will use the NearMiss algorithm to undersample the *Existing Customer* class. First, let's take a look at the distribution of our target values: Attrited Customers make up only 15.7% of the training set.

In [None]:
# Share of each class in the training target.
y_train.value_counts().div(len(y_train))

In [None]:
# Undersample the training set with NearMiss (version 1 is the library default).
nm = NearMiss(version=1)
undersampled = nm.fit_resample(X_train, y_train)
X_under, y_under = undersampled

In [None]:
# Share of each class after undersampling.
y_under.value_counts().div(len(y_under))

After applying the NearMiss algorithm, the data is balanced between Attrited Customer and Existing Customer. Let's train Random Forest and AdaBoost again on this undersampled dataset.

In [None]:
# Random forest fitted on the undersampled training set (fit returns the estimator).
clf = RandomForestClassifier(random_state=0, max_depth=10).fit(X_under, y_under)
# Positive-class probability for every test row.
test_proba = clf.predict_proba(X_test)
y_pred_proba = test_proba[:, 1]

In [None]:
# ROC AUC from the positive-class probabilities.
print(metrics.roc_auc_score(y_test, y_pred_proba))
# metrics.plot_confusion_matrix was removed in scikit-learn 1.2. Building the
# display from the confusion matrix works on old and new releases alike.
conf_matrix = metrics.confusion_matrix(y_test, clf.predict(X_test))
metrics.ConfusionMatrixDisplay(conf_matrix, display_labels=clf.classes_).plot()

We got a worse ROC AUC score, but on the other hand we got a better TPR, which is what the task asks for: we achieve a TPR of 0.87992.

Now, let's try the AdaBoost method.

In [None]:
# Boost the random forest on the undersampled training set. The base learner
# is passed positionally: its keyword was renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2 (and the old name was later removed), while
# the first positional parameter is the base learner in every release.
clf_adaboost = AdaBoostClassifier(clf, n_estimators=200, random_state=0)
clf_adaboost.fit(X_under, y_under)
# Probability of the positive class (column 1) for each test row.
y_pred_proba = clf_adaboost.predict_proba(X_test)[:,1]

In [None]:
# ROC AUC from the positive-class probabilities.
print(metrics.roc_auc_score(y_test, y_pred_proba))
# metrics.plot_confusion_matrix was removed in scikit-learn 1.2. Building the
# display from the confusion matrix works on old and new releases alike.
conf_matrix = metrics.confusion_matrix(y_test, clf_adaboost.predict(X_test))
metrics.ConfusionMatrixDisplay(conf_matrix, display_labels=clf_adaboost.classes_).plot()

We achieved a better ROC AUC score and also the highest TPR so far, 0.9035. So far, undersampling works well for this dataset. In the next section I will try oversampling the minority class.

# Oversampling the Minority Class

This time, I will use the SMOTE algorithm to oversample the minority class, train the Random Forest and AdaBoost classifiers again, and see whether this gives a better result than NearMiss undersampling.

In [None]:
# Share of each class in the (unresampled) training target.
y_train.value_counts().div(len(y_train))

In [None]:
# Oversample the training set with SMOTE; the seed makes the synthetic
# samples repeatable.
sm = SMOTE(random_state=1234, sampling_strategy='auto')
oversampled = sm.fit_resample(X_train, y_train)
X_over, y_over = oversampled

In [None]:
# Share of each class after oversampling.
y_over.value_counts().div(len(y_over))

SMOTE gives us a balanced training distribution, just like NearMiss did. Now train the Random Forest classifier on the SMOTE dataset.

In [None]:
# Random forest fitted on the SMOTE-oversampled training set (fit returns the estimator).
clf = RandomForestClassifier(random_state=0, max_depth=10).fit(X_over, y_over)
# Positive-class probability for every test row.
test_proba = clf.predict_proba(X_test)
y_pred_proba = test_proba[:, 1]

In [None]:
# ROC AUC from the positive-class probabilities.
print(metrics.roc_auc_score(y_test, y_pred_proba))
# metrics.plot_confusion_matrix was removed in scikit-learn 1.2. Building the
# display from the confusion matrix works on old and new releases alike.
conf_matrix = metrics.confusion_matrix(y_test, clf.predict(X_test))
metrics.ConfusionMatrixDisplay(conf_matrix, display_labels=clf.classes_).plot()


Compared to the undersampled dataset, we achieved a better ROC AUC score but a worse TPR of 0.8307.

Now let's train the AdaBoost model.

In [None]:
# Boost the random forest on the SMOTE-oversampled training set. The base
# learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name was
# later removed), while the first positional parameter is the base learner in
# every release.
clf_adaboost = AdaBoostClassifier(clf, n_estimators=200, random_state=0)
clf_adaboost.fit(X_over, y_over)
# Probability of the positive class (column 1) for each test row.
y_pred_proba = clf_adaboost.predict_proba(X_test)[:,1]

In [None]:
# ROC AUC from the positive-class probabilities.
print(metrics.roc_auc_score(y_test, y_pred_proba))
# metrics.plot_confusion_matrix was removed in scikit-learn 1.2. Building the
# display from the confusion matrix works on old and new releases alike.
conf_matrix = metrics.confusion_matrix(y_test, clf_adaboost.predict(X_test))
metrics.ConfusionMatrixDisplay(conf_matrix, display_labels=clf_adaboost.classes_).plot()

We achieved a TPR of 0.84843. Overall, comparing SMOTE oversampling with NearMiss undersampling, SMOTE gives a better AUC score. But if our goal is a higher TPR, I would choose the NearMiss dataset with the AdaBoost classifier, which gives the best TPR.