# 1. Dataset fetching and basic preparation

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.utils import resample

In [None]:
# Location of the Kaggle "Credit Card Customers" CSV.
file_path = '/kaggle/input/credit-card-customers/BankChurners.csv'

# The two Naive_Bayes_Classifier_* columns are removed right after loading.
naive_bayes_columns = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
customers = pd.read_csv(file_path).drop(columns=naive_bayes_columns)

We split the dataset into a training set and a test set to avoid data snooping during visualization.  
The target value (`Attrition_Flag`) is also mapped from category labels to numbers.

In [None]:
# Fraction used to compute the positional split point below.
test_percentage = 0.2
test_number = int(len(customers) * test_percentage)

# Encode the target: attrited customers -> 0, existing customers -> 1.
attrition_flag = customers['Attrition_Flag'].map({
    'Attrited Customer': 0,
    'Existing Customer': 1,
})

# NOTE(review): the first `test_number` rows (~20%) go to y_train and the
# remaining rows (~80%) go to y_test; confirm this direction is intended.
y_train = attrition_flag[:test_number]
y_test = attrition_flag[test_number:]

In [None]:
# CLIENTNUM is dropped from the feature table.
customers.drop('CLIENTNUM', axis=1, inplace=True)
# Keep the original string labels for plotting and remove the target from the features.
attrition_flag_names = customers.pop('Attrition_Flag')

# The first `test_number` rows (test_percentage of the data) form the test set
# and the remaining rows form the training set. The previous assignment had the
# two sides swapped, which left only ~20% of the rows for training.
# The target is split here together with the features so both always use the
# same boundary. `.copy()` detaches the splits from `customers`, so later column
# drops do not operate on a view of the full table.
X_test, X_train = customers[:test_number].copy(), customers[test_number:].copy()
y_test, y_train = attrition_flag[:test_number], attrition_flag[test_number:]

# 2. Data analysis

In [None]:
# Class balance of the encoded target over the whole dataset (0 = attrited, 1 = existing).
attrition_flag.value_counts()

In [None]:
# Peek at the first rows of the training features.
X_train.head()

In [None]:
# Column dtypes and non-null counts of the training features.
X_train.info()

In [None]:
# Plotting copy of the features with the numeric target attached.
vis_customers = customers.copy()
vis_customers['Attrition_Flag'] = attrition_flag

# Correlate numeric columns only: the categorical (object) columns cannot be
# correlated and make `DataFrame.corr()` fail on recent pandas versions.
# The matrix is computed once and reused for both the mask and the plot.
corr_matrix = vis_customers.select_dtypes(include='number').corr()

# Hide the upper triangle (including the diagonal); the matrix is symmetric.
# The builtin `bool` replaces the removed `np.bool` alias.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(16, 6))
heatmap = sns.heatmap(corr_matrix, mask=mask, cmap="viridis", annot=True)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=16);

It may be reasonable to drop some of the least meaningful features, such as `Avg_Open_To_Buy` (it overlaps with another feature and shows no interesting correlations with the remaining ones).

In [None]:
# Correlation of every numeric feature with the encoded target, strongest first.
# Restricting to numeric columns keeps `corr()` working on pandas versions that
# no longer silently skip object columns.
target_corr = (
    vis_customers.select_dtypes(include='number')
    .corr()[['Attrition_Flag']]
    .sort_values(by='Attrition_Flag', ascending=False)
)

plt.figure(figsize=(8, 12))
heatmap = sns.heatmap(target_corr, vmin=-0.5, vmax=0.5, annot=True, cmap='BrBG')
heatmap.set_title('Features Correlating with client attrition', fontdict={'fontsize':18}, pad=16);

In [None]:
# Switch the plotting frame back to the readable string labels for the legend.
vis_customers['Attrition_Flag'] = attrition_flag_names

# Each tuple is one figure row; every feature is plotted against Total_Trans_Ct.
scatter_rows = [
    ("Total_Ct_Chng_Q4_Q1", "Total_Revolving_Bal", "Customer_Age"),
    ("Months_on_book", "Months_Inactive_12_mon", "Contacts_Count_12_mon"),
]

for row_features in scatter_rows:
    fig, ax = plt.subplots(ncols=3,figsize=(20,5))
    for panel, feature in zip(ax, row_features):
        sns.scatterplot(data=vis_customers, x=feature, y="Total_Trans_Ct", hue="Attrition_Flag", ax=panel)

We can deduce a few things from the diagrams:
* The more transactions a client makes, the higher the chance that they will stay
* People usually churn after 2-4 months of inactivity
* More contacts over the 12 months usually signal an intention to churn

In [None]:
# One bar chart per categorical feature, counting customers by attrition label.
fig, axes = plt.subplots(2, 3,figsize=(30,20))

# (column, upper y-limit) pairs, laid out row by row across the 2x3 grid.
crosstab_panels = [
    ('Gender', 5000),
    ('Education_Level', 3000),
    ('Marital_Status', 4500),
    ('Income_Category', 3200),
    ('Card_Category', 9000),
    ('Months_Inactive_12_mon', 3500),
]

for panel, (column, y_max) in zip(axes.flat, crosstab_panels):
    pd.crosstab(attrition_flag_names, vis_customers[column]).plot(kind='bar', ax=panel, rot=0, ylim=[0, y_max])

We can discard the columns that are least correlated with our target value.

In [None]:
# Columns judged least useful for predicting the target.
low_correlation_columns = ['Credit_Limit', 'Avg_Open_To_Buy', 'Months_on_book', 'Customer_Age', 'Dependent_count']

# Re-bind instead of dropping in place: X_train / X_test may be slices of
# `customers`, and in-place mutation of a slice can trigger pandas'
# chained-assignment warning. One shared list keeps both splits in sync.
X_train = X_train.drop(columns=low_correlation_columns)
X_test = X_test.drop(columns=low_correlation_columns)

# 3. Correcting the imbalance

In [None]:
# Share of each attrition label over the whole dataset, as a percentage pie chart.
attrition_flag_names.value_counts().plot.pie(ylabel='', autopct='%1.1f%%', figsize=(8,8));

Attrited customers are heavily under-represented in the data. We need to even out this imbalance to avoid issues with low precision.

In [None]:
# Re-attach the target so features and labels are resampled together.
df_full = X_train.copy()
df_full['Attrition_flag'] = y_train

attrited_customer = df_full[df_full['Attrition_flag'] == 0]
existing_customer = df_full[df_full['Attrition_flag'] == 1]

# Draw (with replacement) exactly as many extra attrited rows as are needed to
# match the number of existing-customer rows.
# NOTE(review): the duplicated rows later take part in cross-validation, so
# copies of one customer may appear in both the fitting and the validation
# folds; confirm this is acceptable for model selection.
attrited_upsampled = resample(attrited_customer,
                              replace=True,
                              n_samples=len(existing_customer) - len(attrited_customer),
                              random_state=41)

X_train = pd.concat([df_full, attrited_upsampled])
y_train = X_train.pop('Attrition_flag')

# Kept as the last expression so the notebook displays the new class balance
# (previously the result was computed mid-cell and discarded).
y_train.value_counts()

# 4. Data preparation

In [None]:
# Names of the categorical (object-dtype) columns.
# The builtin `object` replaces the `np.object` alias, which NumPy has removed.
cat_attribs = X_train.columns[X_train.dtypes == object]
cat_attribs

In [None]:
# Every column that is not categorical is treated as numeric
# (symmetric difference between all column names and the categorical ones).
num_attribs = np.setxor1d(X_train.columns.to_numpy(), cat_attribs)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric columns are standardised.
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

# Numeric columns go through the scaler, categorical columns are one-hot encoded.
# handle_unknown='ignore' encodes a category that was not seen while fitting as
# all zeros instead of raising, so transforming new data cannot abort the run.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, np.array(num_attribs)),
    ("cat", OneHotEncoder(handle_unknown='ignore'), np.array(cat_attribs)),
])

In [None]:
# Learn the scaling statistics and the one-hot categories from the training data.
X_train_prepared = full_pipeline.fit_transform(X_train)
X_prepared = pd.DataFrame.from_records(X_train_prepared)

# Only *apply* the fitted transformation to the test data. Re-fitting on the
# test set would leak test statistics into preprocessing and could produce a
# different column layout than the one the models were trained on.
X_test_prepared = full_pipeline.transform(X_test)

# 5. Implementation

We will try out several different classification algorithms and choose the most promising ones for further hyperparameter tuning.

In [None]:
def get_scores(clf, X, y):
    """Print precision, recall, F1 and ROC AUC for a fitted classifier and plot its confusion matrix.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict`` (and optionally
        ``predict_proba`` or ``decision_function``).
    X : feature matrix to evaluate on.
    y : true binary labels for ``X``.

    Precision, recall and F1 use the metric defaults, so they presumably
    describe class 1 ('Existing Customer' in this notebook's encoding).

    Returns
    -------
    None. The metrics are printed and the confusion matrix is drawn.
    """
    y_predictions = clf.predict(X)

    # ROC AUC needs a ranking score. Hard 0/1 predictions collapse the ROC
    # curve to a single operating point, so prefer probabilities, then the
    # decision function, and only fall back to labels as a last resort.
    # Column 1 is taken as the score of the second class (label 1 here).
    if hasattr(clf, "predict_proba"):
        y_scores = clf.predict_proba(X)[:, 1]
    elif hasattr(clf, "decision_function"):
        y_scores = clf.decision_function(X)
    else:
        y_scores = y_predictions

    print("Precision: ", precision_score(y, y_predictions))
    print("Recall: ", recall_score(y, y_predictions))
    print("F1 score: ", f1_score(y, y_predictions))
    print("AUC: ", roc_auc_score(y, y_scores))
    # NOTE(review): plot_confusion_matrix is not available in recent
    # scikit-learn releases; confirm the target version before upgrading.
    plot_confusion_matrix(clf, X, y)

## 5.1 k-Nearest Neighbors

In [None]:
# Baseline k-NN: 5 neighbours with the Minkowski metric, fitted on the prepared training data.
nbrs = KNeighborsClassifier(n_neighbors=5,metric='minkowski')
nbrs.fit(X_train_prepared, y_train)

In [None]:
# Baseline k-NN metrics on the prepared test split.
get_scores(nbrs, X_test_prepared, y_test)

## 5.2 Support Vector Classification

In [None]:
# Baseline support vector classifier with library-default settings.
svc_clf = SVC()
svc_clf.fit(X_train_prepared, y_train)

In [None]:
# Baseline SVC metrics on the prepared test split.
get_scores(svc_clf, X_test_prepared, y_test)

## 5.3 Random Forest

In [None]:
# Baseline random forest with default hyperparameters.
# A fixed seed makes the reported scores reproducible across runs; 41 matches
# the seed already used for upsampling in this notebook.
ran_for = RandomForestClassifier(random_state=41)
ran_for.fit(X_train_prepared, y_train)

In [None]:
# Baseline random forest metrics on the prepared test split.
get_scores(ran_for, X_test_prepared, y_test)

## 5.4 Gradient boosting Classifier

In [None]:
# Baseline gradient boosting classifier with default hyperparameters.
# A fixed seed makes the reported scores reproducible across runs; 41 matches
# the seed already used for upsampling in this notebook.
gb_clf = GradientBoostingClassifier(random_state=41)
gb_clf.fit(X_train_prepared, y_train)

In [None]:
# Baseline gradient boosting metrics on the prepared test split.
get_scores(gb_clf, X_test_prepared, y_test)

# 6. Hyperparameter tuning

Next, we try to improve the scores of the algorithms. Grid search is used for this, because the number of hyperparameters to tune is usually not that high.

## 6.1 k-Nearest Neighbors

In [None]:
# Search space for k-NN: number of neighbours crossed with the vote weighting.
param_grid = [{
    'n_neighbors': [1, 2, 3],
    'weights': ['uniform', 'distance'],
}]

# 5-fold cross-validated exhaustive search starting from the baseline estimator.
grid_search = GridSearchCV(estimator=nbrs,
                           param_grid=param_grid,
                           cv=5,
                           return_train_score=True)

# fit() returns the fitted search object, so the winning combination can be read off directly.
nbrs_params = grid_search.fit(X_train_prepared, y_train).best_params_
print(nbrs_params)

In [None]:
# Rebuild k-NN with the best grid-search parameters, fit it, and score it on the test split.
nbrs = KNeighborsClassifier(**nbrs_params).fit(X_train_prepared, y_train)
get_scores(nbrs, X_test_prepared, y_test)

## 6.2 Support Vector Classification

In [None]:
# Search space for the SVC, split by kernel. `gamma` is a kernel coefficient
# that the linear kernel does not use, so crossing it with 'linear' only
# repeated identical (and slow) fits. The linear sub-grid therefore varies C
# only, while the RBF sub-grid varies both C and gamma.
c_values = [1, 10, 100, 1000]
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': ['scale', 'auto']},
]

# 5-fold cross-validated exhaustive search, parallelised over all cores.
grid_search = GridSearchCV(svc_clf,
                           param_grid,
                           cv=5,
                           return_train_score=True,
                           n_jobs=-1)

grid_search.fit(X_train_prepared, y_train)

# Best combination; it has no 'gamma' entry when a linear kernel wins.
svc_params = grid_search.best_params_

print(svc_params)

In [None]:
# Rebuild the SVC with the best grid-search parameters, fit it, and score it on the test split.
svc_clf = SVC(**svc_params).fit(X_train_prepared, y_train)
get_scores(svc_clf, X_test_prepared, y_test)

## 6.3 Random Forest

In [None]:
# Search space for the random forest.
# 'auto' was dropped from max_features: for classifiers it is an alias of
# 'sqrt' (so it only duplicated candidates) and newer scikit-learn releases
# reject it.
param_grid = {
    'n_estimators': [50, 75, 100, 125, 150],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [5, 7, 11, 13],
    'criterion': ['gini', 'entropy'],
}

# 5-fold cross-validated exhaustive search, parallelised over all cores.
grid_search = GridSearchCV(ran_for,
                           param_grid,
                           cv=5,
                           return_train_score=True,
                           n_jobs=-1)

grid_search.fit(X_train_prepared, y_train)

forest_params = grid_search.best_params_

print(forest_params)

In [None]:
# Rebuild the forest with the best grid-search parameters and score it on the test split.
# The seed (same value as used for upsampling) makes the reported result reproducible.
ran_for = RandomForestClassifier(random_state=41, **forest_params)
ran_for.fit(X_train_prepared, y_train)
get_scores(ran_for, X_test_prepared, y_test)

## 6.4 Gradient boosting Classifier

In [None]:
# Search space for gradient boosting.
# - "loss" is left at the estimator default (the log-loss / deviance objective)
#   instead of being pinned to the string "deviance", which newer scikit-learn
#   releases no longer accept.
# - "mae" was dropped from "criterion" for the same reason.
# NOTE(review): this is still 7*5*5*3*2*5 = 5250 candidates, i.e. 26250 fits
# with cv=5; consider a randomized search if the runtime is a problem.
# NOTE(review): min_samples_leaf fractions up to 0.5 force every leaf to hold
# half of the samples; confirm such shallow trees are intended.
param_grid = {
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 5),
    "min_samples_leaf": np.linspace(0.1, 0.5, 5),
    "max_depth": [3, 5, 8],
    "max_features": ["log2", "sqrt"],
    "criterion": ["friedman_mse"],
    "subsample": [0.5, 0.7, 0.8, 0.9, 1.0],
    "n_estimators": [10],
}

# 5-fold cross-validated exhaustive search, parallelised over all cores.
grid_search = GridSearchCV(gb_clf,
                           param_grid,
                           cv=5,
                           return_train_score=True,
                           n_jobs=-1)

grid_search.fit(X_train_prepared, y_train)

gb_params = grid_search.best_params_

print(gb_params)

In [None]:
# Rebuild gradient boosting with the best grid-search parameters and score it on the test split.
# The seed (same value as used for upsampling) makes the reported result reproducible.
gb_clf = GradientBoostingClassifier(random_state=41, **gb_params)
gb_clf.fit(X_train_prepared, y_train)
get_scores(gb_clf, X_test_prepared, y_test)

# 7. Conclusion

After hyperparameter tuning, random forest is the best model, reaching an area under the ROC curve (AUC) of 0.84 on the test set.