# Introduction

In [None]:
# import libraries
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, so deprecation or convergence warnings raised by later cells will
# not be shown.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw churn data from the Kaggle input directory into `df`.
DATA_PATH = '/kaggle/input/credit-card-customers/BankChurners.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# inspect dataset
# Use the fully qualified option name: the short form 'max_columns' is matched
# as a pattern and is ambiguous on pandas versions that also define
# 'styler.render.max_columns', where it raises OptionError.
pd.set_option('display.max_columns', None)
# Author's note: the raw frame had shape (10127, 23) when this was written.
df.head(10)

In [None]:
# drop irrelevant data (the two Naive_Bayes_Classifier_* columns and CLIENTNUM)
irrelevant_columns = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    'CLIENTNUM',
]
# errors='ignore' makes the cell safe to re-run: once the columns are gone,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=irrelevant_columns, errors='ignore')
df.shape

In [None]:
# list the column labels currently in df
df.columns

In [None]:
# show the dtype of each column (string columns appear as object)
df.dtypes

In [None]:
# summary statistics; with default arguments describe() covers only the numeric columns
df.describe()

# Data Visualization



Let's take a look at the histograms of all our features.

In [None]:
# data visualization with plotly: one histogram per feature, laid out on a grid

from plotly.subplots import make_subplots

# (column in df, legend label), listed in the order the panels are filled:
# left to right, then top to bottom.
histogram_specs = [
    ('Attrition_Flag', 'Existing Customer or not'),
    ('Customer_Age', 'Age'),
    ('Gender', 'Sex'),
    ('Dependent_count', 'Dependent Count'),
    ('Education_Level', 'Education Level'),
    ('Marital_Status', 'Marital Status'),
    ('Card_Category', 'Type of Card'),
    ('Months_on_book', 'period of relationship with bank'),
    ('Total_Relationship_Count', 'number of products held'),
    ('Months_Inactive_12_mon', 'months inactive'),
    ('Contacts_Count_12_mon', 'number of contacts'),
    ('Income_Category', 'Income Category'),
    ('Credit_Limit', 'Credit Limit'),
    ('Total_Revolving_Bal', 'Revolving Balance'),
    ('Avg_Open_To_Buy', 'Open to Buy Credit Line'),
    ('Total_Amt_Chng_Q4_Q1', 'Change in Transaction Amount'),
    ('Total_Trans_Amt', 'Transaction amount'),
    ('Total_Trans_Ct', 'Transaction Count'),
    ('Total_Ct_Chng_Q4_Q1', 'change in transaction count'),
    ('Avg_Utilization_Ratio', 'Card Utilization Ratio'),
]

grid_cols = 4
# Ceiling division so the grid always has room for every histogram.
grid_rows = -(-len(histogram_specs) // grid_cols)

fig = make_subplots(rows=grid_rows, cols=grid_cols)

for position, (column, label) in enumerate(histogram_specs):
    # divmod yields 0-based grid coordinates; plotly subplots are 1-based.
    row_idx, col_idx = divmod(position, grid_cols)
    # add_trace(row=, col=) is the supported replacement for the deprecated append_trace.
    fig.add_trace(
        go.Histogram(x=df[column], name=label),
        row=row_idx + 1,
        col=col_idx + 1,
    )

fig.update_layout(
    autosize=False,
    width=1000,
    height=800
)

fig.show()

In [None]:
# correlation heatmap with seaborn
import seaborn as sns

# df still contains string columns at this point. Restrict to numeric columns
# explicitly: newer pandas versions no longer drop non-numeric columns silently
# in corr() and raise instead. select_dtypes works on old and new versions.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr)

# Checking Missing Values

In [None]:
# Count missing values per column. (df.columns.isnull() only tests whether the
# column *labels* are null, not the data inside the columns.)
df.isnull().sum()

There are no missing values that we need to modify.

# Feature Selection


*   convert string columns to numeric codes (binary mapping and one-hot encoding)



In [None]:
# Encode the two binary string columns ('Attrition_Flag' and 'Gender') as 0/1.
# Work on a copy so the original df keeps its string labels.
cat_df = df.copy()

# label -> code lookup tables
customer = {'Existing Customer': 1, 'Attrited Customer': 0}
gender = {'M': 1, 'F': 0}

# A label missing from its lookup table raises KeyError rather than being
# silently turned into NaN.
for column, codes in (('Attrition_Flag', customer), ('Gender', gender)):
    cat_df[column] = [codes[label] for label in cat_df[column]]

In [None]:
# Now we one-hot encode the remaining string columns with pd.get_dummies.
# dtype=int pins the indicator columns to integers: pandas >= 2.0 defaults to
# bool, and a frame mixing bool with numeric columns turns into an object array
# when .values is taken for modeling.
prep_df = pd.get_dummies(cat_df, dtype=int)
prep_df.head(10)

# And we get a dataframe with no string columns left

# Modeling


In [None]:
# import the train/test split helper
from sklearn.model_selection import train_test_split

# Separate dependent and independent variables.
# Select the target by name rather than by position so the split stays correct
# if the column order of prep_df ever changes.
target_column = 'Attrition_Flag'
X = prep_df.drop(columns=target_column).values
y = prep_df[target_column].values

# Hold out 25% of the rows for testing. stratify=y keeps the class ratio of the
# target the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [None]:
# Import Classification libraries

import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [None]:
# One instance of each candidate classifier. Every estimator that is given a
# seed uses the same one, and every ensemble uses the same number of trees.
seed_kwargs = {'random_state': 42}
ensemble_kwargs = {'n_estimators': 100, **seed_kwargs}

xgb_clf = xgb.XGBClassifier(**ensemble_kwargs)
lr_clf = LogisticRegression()
RF_clf = RandomForestClassifier(**ensemble_kwargs)
GB_clf = GradientBoostingClassifier(**ensemble_kwargs)
SVC_clf = SVC(**seed_kwargs)
gnb_clf = GaussianNB()
mlp_clf = MLPClassifier(**seed_kwargs)


In [None]:
# Collect the candidate models and fit each one on the training split.
estimators = [
    xgb_clf,
    lr_clf,
    RF_clf,
    GB_clf,
    SVC_clf,
    gnb_clf,
    mlp_clf,
]
for model in estimators:
    model.fit(X_train, y_train)

In [None]:
# Accuracy of each fitted model on the training split, printed as a percentage.
report_line = '{} - {:.1f}%'
for model in estimators:
    train_accuracy_pct = model.score(X_train, y_train) * 100
    print(report_line.format(type(model).__name__, train_accuracy_pct))

In [None]:
# Accuracy of each fitted model on the held-out test split, printed as a percentage.
report_line = '{} - {:.1f}%'
for model in estimators:
    test_accuracy_pct = model.score(X_test, y_test) * 100
    print(report_line.format(type(model).__name__, test_accuracy_pct))

We can see here that our XGBClassifier performs better than all other classifiers.

In [None]:
# Let's fine-tune the hyperparameters of the XGB classifier.
# code is from https://www.kaggle.com/tilii7/hyperparameter-grid-search-with-xgboost
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV

# Candidate values sampled by the randomized search.
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

folds = 5        # number of cross-validation folds
param_comb = 5   # number of parameter combinations to sample

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 42)

# Pass the splitter object itself, not skf.split(X, y). The latter is a
# one-shot generator that is consumed by the first fit(), so re-running the
# fit cell would find no folds. With the splitter, folds are derived from
# whatever data is passed to fit(), and the fixed random_state keeps them
# reproducible.
random_search = RandomizedSearchCV(
    xgb_clf,
    param_distributions=params,
    n_iter=param_comb,
    scoring='roc_auc',
    n_jobs=4,
    cv=skf,
    verbose=3,
    random_state=42,
)

In [None]:
# Tune on the training split only, so the rows in X_test / y_test stay unseen
# and can still serve as an unbiased final check of the tuned model.
# The splitter object is attached here so the folds are computed from the data
# given to fit() and the cell can be re-run safely.
random_search.set_params(cv=skf)
random_search.fit(X_train, y_train)

In [None]:
# parameter combination that achieved the best mean cross-validated score
print(random_search.best_params_)

In [None]:
# mean cross-validated score (scoring='roc_auc') of the best parameter combination
random_search.best_score_