Let's start with some imports:

In [None]:
import math
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# warnings suppression
import warnings
warnings.filterwarnings('ignore')

Now let's read the data

In [None]:
# Load the customer table from the Kaggle input directory.
csv_path = "/kaggle/input/credit-card-customers/BankChurners.csv"
data = pd.read_csv(csv_path)

# Report the (rows, columns) shape, then show the first rows as the cell output.
shape_message = "The data shape is : {} ".format(data.shape)
print(shape_message)
data.head()

# Arranging the data
The last 2 columns seem to be predictions. Rename them to something more manageable.

In [None]:
# Full names of the two Naive Bayes columns as they appear in the CSV header.
str1 = "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1"
str2 = "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"

# Give both columns short aliases, then list the resulting columns and dtypes.
short_names = {str1: "pred1", str2: "pred2"}
data = data.rename(columns=short_names)
data.info()

Most of the columns are numerical but 6 are categorical.
Let's start by taking a better look at categorical data


In [None]:
# Restrict to the object-dtype columns and count the distinct values in each one;
# dropna=False makes a missing value count as a value of its own.
cat_columns = data.select_dtypes(include='object')
unique_values = cat_columns.nunique(dropna=False)
print(unique_values)

Attrition flag (our label) - has 2 unique values

In [None]:
# Show the distinct label values before encoding.
col = np.unique(data['Attrition_Flag'].values)
print(col)

# Encode the label: "Attrited Customer" -> 0, "Existing Customer" -> 1.
attrition_codes = {'Attrited Customer': 0, 'Existing Customer': 1}
for raw_label, numeric_code in attrition_codes.items():
    data.loc[data['Attrition_Flag'] == raw_label, 'Attrition_Flag'] = numeric_code

# The column is still object-typed after the assignments; make it integer.
data['Attrition_Flag'] = data['Attrition_Flag'].astype(int)

Next - gender also has 2 unique values

In [None]:
# Show the distinct gender values before encoding.
col = np.unique(data['Gender'].values)
print(col)

# Encode gender: "M" -> 0, "F" -> 1.
gender_codes = {'M': 0, 'F': 1}
for raw_label, numeric_code in gender_codes.items():
    data.loc[data['Gender'] == raw_label, 'Gender'] = numeric_code

# The column is still object-typed after the assignments; make it integer.
data['Gender'] = data['Gender'].astype(int)

Education_Level - 7 unique values

In [None]:
# Show the distinct education levels before encoding.
col = np.unique(data['Education_Level'].values)
print(col)

# Ordinal codes per level (they look like approximate years of schooling - TODO confirm).
# 'Unknown' gets the placeholder 0 here; a later cell replaces it with the column mode.
education_codes = {
    'College': 14,
    'Doctorate': 21,
    'Graduate': 16,
    'High School': 12,
    'Post-Graduate': 18,
    'Uneducated': 8,
    'Unknown': 0,  # Will be fixed later
}
for raw_label, numeric_code in education_codes.items():
    data.loc[data['Education_Level'] == raw_label, 'Education_Level'] = numeric_code

# The column is still object-typed after the assignments; make it integer.
data['Education_Level'] = data['Education_Level'].astype(int)

Income category - 6 unique values. Take a representative value of each category, and the mode for "Unknown"

In [None]:
# Show the distinct income brackets before encoding.
col = np.unique(data['Income_Category'].values)
print(col)

# One representative value per bracket.
# 'Unknown' gets the placeholder 0 here; a later cell replaces it with the column mode.
income_codes = {
    'Less than $40K': 30,
    '$40K - $60K': 50,
    '$60K - $80K': 70,
    '$80K - $120K': 100,
    '$120K +': 150,
    'Unknown': 0,  # Will be fixed later
}
for raw_label, numeric_code in income_codes.items():
    data.loc[data['Income_Category'] == raw_label, 'Income_Category'] = numeric_code

# The column is still object-typed after the assignments; make it integer.
data['Income_Category'] = data['Income_Category'].astype(int)

Card Category - 4 unique values. There seems to be a clear scale

In [None]:
# Show the distinct card tiers before encoding.
col = np.unique(data['Card_Category'].values)
print(col)

# Ordinal scale for the card tier: Blue < Silver < Gold < Platinum.
card_codes = {'Blue': 1, 'Silver': 2, 'Gold': 3, 'Platinum': 4}
for raw_label, numeric_code in card_codes.items():
    data.loc[data['Card_Category'] == raw_label, 'Card_Category'] = numeric_code

# The column is still object-typed after the assignments; make it integer.
data['Card_Category'] = data['Card_Category'].astype(int)

Finally, marital status - 4 unique values but no clear order - use one-hot encoding

In [None]:
# One-hot encode whatever non-numeric columns are left (Marital_Status is expected
# to be the only one at this point - verify with data.info() below).
# drop_first=True drops one indicator per encoded column to avoid a redundant column.
# dtype=int keeps the indicators as 0/1 integers; recent pandas versions default to
# boolean indicators, which would leave the frame with non-numeric columns.
data = pd.get_dummies(data, drop_first=True, dtype=int)

# Confirm the dtypes and check for missing values per column.
data.info()
print(data.isnull().sum())

Good - all fields are now numeric and we have no missing values.
We just need to fix the unknown income category and education levels according to the mode

In [None]:
# Replace the placeholder 0 (written for 'Unknown' when the columns were encoded)
# with the most frequent *known* value of the same column.
for column_name in ('Income_Category', 'Education_Level'):
    col = data[column_name]
    is_unknown = col == 0

    # The mode must be taken over known values only: if 'Unknown' happened to be
    # the most common level, a mode over the whole column would be the placeholder
    # itself and nothing would be fixed.
    known_values = col[~is_unknown]
    if known_values.empty:
        # Every row is unknown - there is nothing to impute from.
        continue

    mode = known_values.mode()[0]
    data.loc[is_unknown, column_name] = mode

# Initial analysis
Generate a correlation map


In [None]:
# Pairwise correlation between all columns of the encoded frame.
corrmat = data.corr()

# Draw the matrix as a heatmap on an explicitly created 12x9 figure;
# the colour scale is capped at 0.9.
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.9, square=True, ax=ax)
plt.show()

Not a lot of outstanding correlations: 
The two predictors are anti-correlated with each other, and one of them has a good correlation with the label.
Marital status categories are obviously anti-correlated.
Other than that, "Avg_Open_To_Buy" is correlated with "Credit_Limit"

We will remove the client number and the two predictors and create our X and y matrices


In [None]:
# Feature frame: everything except the client id, the two precomputed
# predictions and the label itself.
excluded_columns = ['CLIENTNUM', 'pred1', 'pred2', 'Attrition_Flag']
used_data = data.drop(columns=excluded_columns)

# Ask for a float matrix explicitly. A frame that mixes integer, float and
# boolean columns can otherwise come back as an object-dtype array, which is
# slow and fragile for the numeric code that follows.
X = used_data.to_numpy(dtype=float)

# Label, plus the two precomputed prediction columns kept for comparison.
y = data['Attrition_Flag'].to_numpy()
y2 = data['pred1'].to_numpy()
y3 = data['pred2'].to_numpy()

Is the data balanced?

In [None]:
# Mean of the 0/1 label = share of rows labelled 1 (printed as a fraction, not x100).
positive_share = y.mean()
print("% of 1s in label:", positive_share)

84% existing customers, only 16% attrited. This is not good enough. Let's use SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE, seeded so the resampled set is the same on every run.
# NOTE(review): the resampling is applied to the full X / y, and the train/test
# split in this notebook is taken from the resampled arrays afterwards. Test rows
# can therefore be synthetic points derived from training rows - consider
# resampling the training portion only.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X, y)
X_res, y_res = resampled

balanced_share = y_res.mean()
print("After SMOTE: % of 1s in label:", balanced_share)

Next, let's normalize the data:

In [None]:
# Min-max scale every feature column of the resampled matrix into [0, 1].
# Done column-wise in one vectorised step; the names deliberately avoid `min` /
# `max` so the Python builtins are not shadowed for the rest of the session.
feature_min = X_res.min(axis=0)
feature_range = X_res.max(axis=0) - feature_min

# A constant column has zero range; divide by 1 instead so it maps to 0
# rather than producing NaN from 0/0.
feature_range[feature_range == 0] = 1

X_res = (X_res - feature_min) / feature_range

Split into training and test:

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the resampled, normalised rows for testing (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42
)

# Report the (rows, columns) shape of each part.
for size_template, part in (
    ("The training data size is : {} ", X_train),
    ("The test data size is : {} ", X_test),
):
    print(size_template.format(part.shape))

# Modeling
We are ready to model! Let's start with a decision tree classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Unbounded tree as a baseline. The seed is fixed because scikit-learn permutes
# the candidate features at every split, so tie-breaking (and therefore the tree
# and its scores) can change from run to run without it. 42 matches the seed used
# for the resampling and the train/test split.
dct = DecisionTreeClassifier(max_depth=None, random_state=42)
dct.fit(X_train, y_train)

# Size of the fully grown tree.
print('Tree Depth:', dct.get_depth())
print('Tree Leaves:', dct.get_n_leaves())

# Accuracy in percent on both parts of the split.
dct_training_score = 100 * dct.score(X_train, y_train)
dct_test_score = 100 * dct.score(X_test, y_test)
print("Decision Tree accuracy. Train : {:.2f}%, Test: {:.2f}%. ".format(dct_training_score, dct_test_score))

This obviously includes some overfitting. Let's see what is the optimal depth

In [None]:
# Sweep max_depth from 1 up to the depth of the tree fitted before this cell and
# record train / test accuracy (in percent) for each depth.
max_d = dct.get_depth()
depths = list(range(1, max_d + 1))
dct_training_score, dct_test_score = np.zeros(max_d), np.zeros(max_d)

for i, depth in enumerate(depths):
    # Use a separate variable for the sweep so `dct` keeps referring to the tree
    # fitted before this cell; the seed makes the curve reproducible.
    depth_tree = DecisionTreeClassifier(max_depth=depth, random_state=42)
    depth_tree.fit(X_train, y_train)
    dct_training_score[i] = 100 * depth_tree.score(X_train, y_train)
    dct_test_score[i] = 100 * depth_tree.score(X_test, y_test)

print(np.around(dct_training_score, decimals=2))
print(np.around(dct_test_score, decimals=2))

# Plot against the real depth values (1..max_d) rather than the 0-based array
# index, so the depth read off the chart is not off by one.
fig, ax = plt.subplots()
ax.plot(depths, dct_training_score, label="Train")
ax.plot(depths, dct_test_score, label="Test")
ax.set(xlabel="max_depth", ylabel="Accuracy (%)", title="Decision tree accuracy vs. depth")
ax.legend()
plt.show()

The optimal depth is around 7. Let's get most important tree features

In [None]:
# Rank the features by the importance the fitted tree assigns to them.
# NOTE(review): `dct` is whichever tree was fitted last in the preceding cells,
# which is not necessarily a tree limited to the depth chosen above - confirm
# this is the model you want to inspect.
features = used_data.columns
importances = dct.feature_importances_

# Indices of all features, highest importance first. The count comes from the
# data itself, so the cell works for any number of features.
leading_indices = (-importances).argsort()

print("Leading features sorted by importance:")
for rank, feature_idx in enumerate(leading_indices, start=1):
    print(rank, features[feature_idx], round(100 * importances[feature_idx], 2), '%')