### Import data exploration/cleansing library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
# Silence every warning for the rest of the session so library notices do not clutter cell output.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn/scikit-learn.
warnings.filterwarnings('ignore')

In [None]:
# Raise pandas' display limits so long and wide frames are rendered without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 100

### Data exploration

In [None]:
# Read the raw dataset from the Kaggle input directory and show its (rows, columns) size.
data_location = r'/kaggle/input/credit-card-customers/'
csv_path = data_location + 'BankChurners.csv'
df = pd.read_csv(csv_path)
df.shape

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Remove the two trailing classifier-output columns before any analysis.
# They are selected by name instead of `iloc[:, :-2]` so that re-running this cell is a
# no-op rather than silently stripping two more (real) columns each time.
# NOTE(review): the 'Naive_Bayes_Classifier' prefix comes from the public BankChurners.csv
# schema — confirm against df.columns that exactly the last two columns match.
leak_cols = [c for c in df.columns if c.startswith('Naive_Bayes_Classifier')]
df = df.drop(columns=leak_cols)
df.head()

In [None]:
def plot_category(df, col):
    """Show a count plot for one categorical column, labelling each bar with its count.

    Parameters
    ----------
    df : DataFrame containing the column to plot.
    col : name of the column whose category frequencies are drawn.

    The column name is printed first, then the figure is displayed.
    """
    print(col)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax = sns.countplot(data=df, x=col, ax=ax)
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        # Anchor just above the bar top, then shift the label up by 9 points.
        ax.annotate(
            text=format(bar_height, '.0f'),
            xy=(bar_center, bar_height + 0.1),
            xytext=(0, 9),
            textcoords='offset points',
            ha='center',
            va='center',
        )
    ax.set_title(col + '_count')
    plt.show()

In [None]:
def plot_num(df, col):
    """Show histogram, KDE and box plot of one numeric column side by side.

    Parameters
    ----------
    df : DataFrame containing the column to plot.
    col : name of the numeric column.

    The column name is printed first, then the three-panel figure is displayed.
    """
    print(col)
    fig, (hist_ax, kde_ax, box_ax) = plt.subplots(1, 3, figsize=(18, 5))

    sns.histplot(data=df, x=col, ax=hist_ax)
    hist_ax.set_title(col + '_hist')

    sns.kdeplot(data=df, x=col, ax=kde_ax)
    kde_ax.set_title(col + '_kde')

    sns.boxplot(data=df, x=col, ax=box_ax)
    box_ax.set_title(col + '_box')

    plt.show()

In [None]:
# Split the columns by dtype once, then draw the matching plot for each of them.
categorical_cols = df.select_dtypes(include='object').columns
numeric_cols = df.select_dtypes(exclude='object').columns

print('Data Visualization of Object Type')
for col in categorical_cols:
    plot_category(df, col)

print('Data Visualization of Numeric Type')
for col in numeric_cols:
    plot_num(df, col)

In [None]:
# Annotated correlation heatmap over every non-object column, colour scale fixed to [-1, 1].
df_corr = df.select_dtypes(exclude='object')
fig, ax = plt.subplots(figsize=(18, 12))
sns.heatmap(df_corr.corr(), vmin=-1, vmax=1, annot=True, square=True, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# List every pair of numeric columns once, sorted by correlation.
# The strict upper triangle (k=1) is selected explicitly: it removes the diagonal and the
# mirrored duplicates by position, instead of de-duplicating on the correlation *value*,
# which could also discard distinct pairs that happen to share the same coefficient.
corr_matrix = df_corr.corr()
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
corr = (
    corr_matrix.where(upper_triangle)
    .stack()
    .dropna()  # masked cells are NaN; drop them regardless of the pandas stack() behaviour
    .sort_values(ascending=False)
    .reset_index()
)
# keep only the strongly correlated pairs (r >= .7 or r <= -.7)
corr[corr[0].abs() >= .7]

We see high correlation between Credit_Limit and Avg_Open_To_Buy <br>
We see high correlation between Total_Trans_Ct and Total_Trans_Amt <br>
We see high correlation between Customer_Age and Months_on_book

### Data Cleansing

In [None]:
# Work on a copy so the explored frame `df` is not modified by the cleansing steps below.
trainset = df.copy()
trainset.shape

In [None]:
# Build the binary target and the feature frame.
# Both are derived from `df` without in-place mutation, so this cell can be re-run safely
# (dropping the columns from `trainset` in place raises KeyError on a second run).
index_map = {'Existing Customer': 0, 'Attrited Customer': 1}
y = df['Attrition_Flag'].map(index_map)

# .map() turns any label missing from index_map into NaN; fail loudly instead of training on it.
if y.isna().any():
    unknown_labels = df.loc[y.isna(), 'Attrition_Flag'].unique().tolist()
    raise ValueError(f'Unmapped Attrition_Flag values: {unknown_labels}')

# CLIENTNUM is a client identifier and should have no relationship with the target.
trainset = df.drop(columns=['Attrition_Flag', 'CLIENTNUM'])
print(f'Training shape: {trainset.shape}\nTarget shape: {y.shape}')

In [None]:
# Preview the feature frame after the target and the client id were removed.
trainset.head()

In [None]:
# One-hot encode every object column in a single concat.
# Dummy groups are placed in reverse column order ahead of the untouched columns, which
# reproduces the layout obtained by prepending one encoded column at a time.
object_cols = list(trainset.select_dtypes(include='object').columns)
dummy_frames = [
    pd.get_dummies(trainset[name], prefix=name)
    for name in reversed(object_cols)
]
trainset = pd.concat(dummy_frames + [trainset.drop(columns=object_cols)], axis=1)

trainset.info()

### Data Modeling

In [None]:
# import library
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, mean_squared_error
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

try:
    # plot_confusion_matrix was removed in scikit-learn 1.2; keep it importable on older
    # versions without breaking this cell on newer ones.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    pass

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(trainset, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts
print(f'Training size: {X_train.shape}\nTesting size: {X_test.shape}')

In [None]:
# Candidate classifiers to compare. Every estimator gets the same explicit seed so the
# comparison is reproducible (XGBoost was previously the only one left unseeded).
d_tree = RandomForestClassifier(random_state=1)  # note: a random forest despite the name
xgb = XGBClassifier(random_state=1)
cat = CatBoostClassifier(random_state=1, iterations=500, verbose=0)

algos = [d_tree, xgb, cat]

In [None]:
# Fit each candidate on the training split and report its hold-out metrics.
# Requires scikit-learn >= 1.0 for ConfusionMatrixDisplay.from_predictions
# (plot_confusion_matrix was removed in scikit-learn 1.2).
for algo in algos:
    model = algo.fit(X_train, y_train)
    model_acc = model.score(X_train, y_train)  # accuracy on the data the model was fitted on
    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
    recall = recall_score(y_true=y_test, y_pred=y_pred)
    precision = precision_score(y_true=y_test, y_pred=y_pred)
    # with 0/1 labels the MSE equals the misclassification rate (1 - accuracy)
    mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

    print(f'Model: {algo.__class__.__name__}')
    print('-'*50)
    print(f'Model Accuracy: {model_acc:.4f}, Testing Accuracy: {test_acc:.4f}')
    print(f'Recall: {recall:.4f}, Precision: {precision:.4f}, MSE: {mse:.4f}')
    print('-'*50)

    # Reuse the predictions computed above instead of predicting a second time.
    # normalize='true' scales each row (true class) to sum to 1.
    disp = ConfusionMatrixDisplay.from_predictions(
        y_test, y_pred,
        display_labels=model.classes_,
        normalize='true',
        cmap=plt.cm.Blues,
    )
    disp.ax_.set_title(f'{algo.__class__.__name__} normalized confusion matrix')
    plt.show()

    print(classification_report(y_test, y_pred))
    print('-'*50)

Comparing Random Forest, XGBoost and CatBoost, CatBoost achieves the highest accuracy (97.38%) and the lowest MSE (0.0262) on the testing dataset.

### Model Refinement

In [None]:
# import library
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [None]:
#fine-tuning parameters
iterations = [int(x) for x in np.arange(start=300, stop=500, step=100)]
learning_rate = [float(x) for x in np.arange(start=0.01, stop=0.05, step=0.01)]
max_depth = [int(x) for x in np.arange(start=4, stop=10, step=2)]


parameter_grid = {'iterations':iterations,
                 'learning_rate':learning_rate,
                 'max_depth':max_depth,
                 }

scoreFunction = {'accuracy':make_scorer(accuracy_score)}

In [None]:
# Exhaustive grid search over parameter_grid using all CPU cores; once the search is done
# the estimator is refitted with the combination that scored best on 'accuracy'.
# (Despite its name, `random_cat` is a GridSearchCV, not a randomized search.)
cat = CatBoostClassifier(random_state=1, verbose=0)
random_cat = GridSearchCV(
    estimator=cat,
    param_grid=parameter_grid,
    scoring=scoreFunction,
    refit='accuracy',
    n_jobs=-1,
)

In [None]:
# Run the search on the training split; the result is queried for best_params_ and
# best_estimator_ in the next cell.
random_model = random_cat.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and score the refitted model on the test split.
print(f'The best parameters: {random_model.best_params_}')
cat = random_model.best_estimator_
print(cat.get_params())

y_pred = cat.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)

print(classification_report(y_test, y_pred))
print('Accuracy:', f'{acc * 100:.2f}')
print('Recall:', f'{recall * 100:.2f}')

In [None]:
# Rank the features of the tuned CatBoost model by importance and plot the ten largest.
feature_list = pd.Series(cat.feature_importances_, index=X_train.columns).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(8, 6))
# sorted ascending so the most important feature ends up at the top of the horizontal bars
feature_list.nlargest(10).sort_values(ascending=True).plot(kind='barh', ax=ax)
ax.set_title('Top 10 feature importances (CatBoost)')
ax.set_xlabel('Importance')
plt.show()

In [None]:
### data visualization on cat boost
from catboost import Pool

In [None]:
# Plot the first tree of the tuned CatBoost model.
# `cat` (the grid-search best estimator) is used explicitly; `model` is only a variable
# leaked from the baseline comparison loop and refers to the untuned classifier.
is_category = X_train.dtypes == 'object'
cat_feature_index = np.where(is_category)[0]
# After one-hot encoding no object columns should remain, so this list is expected to be empty.
pool = Pool(X_train, y_train,
            cat_features=cat_feature_index.tolist(),
            feature_names=list(X_train.columns))
cat.plot_tree(tree_idx=0,
              pool=pool)