In [None]:
import math
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

#warnings supression
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the BankChurners CSV and preview its first rows
data = pd.read_csv('../input/credit-card-customers/BankChurners.csv')
data.head()


# Exploring the dataset
1. Figure out the columns and their types.
2. Verify whether there is any missing data.
3. Check the dataset dimensions.

In [None]:
# Column names, non-null counts and dtypes
data.info()


In [None]:
# Number of missing values per column
data.isnull().sum()

In [None]:
# data.shape is (number of rows, number of columns)
print("The data shape: ", data.shape)

We can see that most of the columns are numerical, but some are categorical.

Let's take a closer look at the categorical columns:

In [None]:
# Keep only the object-dtype columns and count the distinct values in each
cat_col = data.select_dtypes(include=['object'])
values = cat_col.nunique()
print(values)

# **Dealing with categorical columns**

In [None]:
# List the distinct values of every categorical column
for column in cat_col.columns:
    unique_values = data[column].unique()
    print('Feature: ', column)
    print('Feature Values: ', unique_values)

The label is the attrition flag, which has 2 categories; we will encode them as 1 ('Existing Customer') and 0 ('Attrited Customer').

In [None]:
# Encode the target: 1 = 'Existing Customer', 0 = 'Attrited Customer'.
# The replacement is scoped to the target column; a frame-wide replace would
# also rewrite any other column that happened to contain these strings.
data['Attrition_Flag'] = (
    data['Attrition_Flag']
    .replace({'Existing Customer': 1, 'Attrited Customer': 0})
    .astype(int)  # fail loudly if an unexpected label is left over
)

The gender has 2 unique values as well

In [None]:
# Encode gender: 1 = 'M', 0 = 'F'.
# Scoped to the Gender column so the one-letter strings are not replaced
# anywhere else in the frame.
data['Gender'] = data['Gender'].replace({'M': 1, 'F': 0}).astype(int)
data.head()

**All the categorical columns (except marital status) have a logical order, so we can map them to ordered numerical values.**

Education level has 7 unique values - I will replace every education level with the approximate number of years of education it represents.

In [None]:
# Map each education level to an approximate number of years of education:
# 'Uneducated'=8, 'High School'=12, 'College'=15, 'Graduate'=16,
# 'Post-Graduate'=18, 'Doctorate'=20.
# 'Unknown' is imputed with the mode (most common value) of the known levels.
education_years = {
    'Uneducated': 8,
    'High School': 12,
    'College': 15,
    'Graduate': 16,
    'Post-Graduate': 18,
    'Doctorate': 20,
}
education = data['Education_Level'].replace(education_years)
# Exclude 'Unknown' when taking the mode: otherwise, if 'Unknown' were the most
# frequent value, it would be "imputed" with itself and astype(int) would fail.
known_education = education[education != 'Unknown']
education = education.replace('Unknown', known_education.mode()[0])
data['Education_Level'] = education.astype(int)

* Feature:  Card_Category - depends on its importance
* Feature Values:  Blue = 1 Silver = 2 Gold = 3 Platinum = 4

In [None]:
# Ordinal encoding of the card tier: Blue=1, Silver=2, Gold=3, Platinum=4.
# Scoped to the Card_Category column instead of replacing across the whole frame.
card_rank = {'Blue': 1, 'Silver': 2, 'Gold': 3, 'Platinum': 4}
data['Card_Category'] = data['Card_Category'].replace(card_rank).astype(int)
data.head()

Income_Category: because we have a range for every value we will take a representative value for each one

In [None]:
# Feature:  Income_Category
# Each income band is replaced by a representative value (in $K):
#   'Less than $40K' = 20
#   '$40K - $60K'    = 50
#   '$60K - $80K'    = 70
#   '$80K - $120K'   = 100
#   '$120K +'        = 150
# 'Unknown' is imputed with the mode of the known bands.
income_value = {
    'Less than $40K': 20,
    '$40K - $60K': 50,
    '$60K - $80K': 70,
    '$80K - $120K': 100,
    '$120K +': 150,
}
# Scoped to the Income_Category column instead of replacing across the whole frame.
income = data['Income_Category'].replace(income_value)
# Exclude 'Unknown' when taking the mode so it can never be imputed with itself.
known_income = income[income != 'Unknown']
income = income.replace('Unknown', known_income.mode()[0])
data['Income_Category'] = income.astype(int)

* Marital status has no clear order, therefore I use one-hot encoding.

In [None]:
# One-hot encode Marital_Status; drop_first=True drops one indicator column.
# dtype=int keeps the indicators numeric (recent pandas versions default to
# bool), so the feature matrix built from this frame stays purely numeric.
data = pd.get_dummies(data, columns=['Marital_Status'], drop_first=True, dtype=int)
data.head()

# **Preprocessing**

The last two columns (the `Naive_Bayes_Classifier_...` ones) look like predictions rather than raw features, so we drop them, together with the first column (`CLIENTNUM`), which doesn't give us any information.

In [None]:
# Remove the two Naive_Bayes_Classifier_* columns and CLIENTNUM before modelling
A = 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1'
B = 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'
C = 'CLIENTNUM'
columns_to_drop = [A, B, C]
data = data.drop(columns=columns_to_drop)

In [None]:
# Pairwise correlation of all columns, drawn as a heatmap (colour scale capped at 0.9)
corrmat = data.corr()
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.9, square=True, ax=ax)
plt.show()

* We can see that some features are correlated, such as 'Avg_Open_To_Buy' with 'Credit_limit'.
* Additionally, the marital status categories are anti-correlated with each other.

* **Lets check if the data is balanced**

In [None]:
# split into X and y
# Separate the target column (y) from the feature columns (X)
target_column = 'Attrition_Flag'
y = data[target_column]
X = data.drop(target_column, axis=1)

In [None]:
# The mean of a 0/1 label is the fraction of 1s
positive_percent = y.mean() * 100
print("the percent of 1s in the label: {} %".format(positive_percent))

We can see that the data is imbalanced - we will use SMOTE to balance.

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE, then switch to plain NumPy arrays.
# NOTE(review): the resampling is applied to the full data set, before the
# train/test split further down, so resampled rows end up in both splits and
# the reported test accuracy may be optimistic - consider resampling the
# training split only.
oversample = SMOTE(random_state=69)
X_s, y_s = (resampled.to_numpy() for resampled in oversample.fit_resample(X, y))
positive_percent_smote = y_s.mean() * 100
print("the percent of 1s in the label after using SMOTE: {} %".format(positive_percent_smote))

* **Next - normalize data**

In [None]:
# Min-max scale every column of X_s to the [0, 1] range.
# Work on a float copy: writing fractions into an integer array would truncate them.
X_s = X_s.astype(float)
col_min = X_s.min(axis=0)
col_range = X_s.max(axis=0) - col_min
# A constant column has zero range; divide by 1 instead so it becomes all zeros
# rather than NaN (0 / 0).
col_range[col_range == 0] = 1.0
X_s = (X_s - col_min) / col_range

* **Split into train and test**

In [None]:
from sklearn.model_selection import train_test_split

# 75% / 25% split of the resampled data, with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X_s, y_s, train_size=0.75, random_state=69)
# Report the shapes of the arrays actually created above (x_train / x_test).
print("The training data size : {} ".format(x_train.shape))
print("The test data size : {} ".format(x_test.shape))

# **Modeling**

* # **Decision Tree model**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a tree with no depth limit; the seed makes tie-breaking reproducible.
dt = DecisionTreeClassifier(max_depth=None, random_state=69).fit(x_train, y_train)
y_prediction = dt.predict(x_test)

# Training accuracy is measured on the training split and test accuracy on the
# held-out split.
dt_training_score = dt.score(x_train, y_train)
dt_test_score = dt.score(x_test, y_test)

print ('Max Depth:', dt.get_depth())

print('Decision Tree Accuracy: ')
print('Train {}'.format(np.around(dt_training_score * 100, decimals=2)))
print('Test {}'.format(np.around(dt_test_score * 100, decimals=2)))



The tree reaches 100% accuracy on the data it was trained on, which suggests overfitting.

Let's plot the accuracy score against the tree depth to find out.

In [None]:
# Sweep max_depth from 1 up to the depth of the current tree and record the
# train / test accuracy (in percent) at every depth.
max_depth = dt.get_depth()
depths = range(1, max_depth + 1)
acc_train_list = []
acc_test_list = []

for i in depths:
    dt = DecisionTreeClassifier(max_depth=i, random_state=69).fit(x_train, y_train)
    y_prediction = dt.predict(x_test)

    # Training accuracy on the training split, test accuracy on the held-out split.
    dt_training_score = 100 * dt.score(x_train, y_train)
    dt_test_score = 100 * dt.score(x_test, y_test)
    acc_train_list.append(dt_training_score)
    acc_test_list.append(dt_test_score)

# Plot against the real depth values so the x-axis starts at 1, not at index 0.
plt.plot(depths, acc_train_list, label="Train")
plt.plot(depths, acc_test_list, label="Test")
plt.xlabel("Max Depth")
plt.ylabel("Accuracy Score")
plt.legend()
plt.show()

We can see that around max depth = 5 the train and test curves start to diverge.

* # **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# First, sweep the number of trees to see where the accuracy levels off.

estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
acc_train_list = []
acc_test_list = []

for i in estimators:
    # Fixed seed so differences between runs come from n_estimators only.
    rnf = RandomForestClassifier(n_estimators=i, random_state=69).fit(x_train, y_train)
    y_prediction = rnf.predict(x_test)

    train_accuracy = rnf.score(x_train, y_train)
    test_accuracy = rnf.score(x_test, y_test)
    acc_train_list.append(train_accuracy)
    acc_test_list.append(test_accuracy)

plt.plot(estimators, acc_train_list, label="Train")
plt.plot(estimators, acc_test_list, label="Test")
plt.xlabel("number of estimators")
plt.ylabel("Accuracy Score")
plt.legend()
plt.show()


We can see that from ~30 estimators onward the accuracy becomes stable.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Now sweep the maximum tree depth (1..32) to see where the accuracy levels off.

depths = list(range(1, 33))
acc_train_list = []
acc_test_list = []

for i in depths:
    # Fixed seed so differences between runs come from max_depth only.
    rnf = RandomForestClassifier(max_depth=i, random_state=69).fit(x_train, y_train)
    y_prediction = rnf.predict(x_test)

    train_accuracy = rnf.score(x_train, y_train)
    test_accuracy = rnf.score(x_test, y_test)
    acc_train_list.append(train_accuracy)
    acc_test_list.append(test_accuracy)

plt.plot(depths, acc_train_list, label="Train")
plt.plot(depths, acc_test_list, label="Test")
plt.xlabel("Max Depth")
plt.ylabel("Accuracy Score")
plt.legend()
plt.show()

We can see that the accuracy score becomes stable from max depth ~7.

In [None]:
# Evaluate a forest with the settings chosen from the two sweeps
# (30 trees, maximum depth 7)
rnf = RandomForestClassifier(n_estimators=30, max_depth=7)
rnf.fit(x_train, y_train)
y_prediction = rnf.predict(x_test)

train_accuracy = rnf.score(x_train, y_train)
test_accuracy = rnf.score(x_test, y_test)

# Report both accuracies as percentages rounded to two decimals
train_percent = np.around(train_accuracy * 100, decimals=2)
test_percent = np.around(test_accuracy * 100, decimals=2)

print('Random Forest Accuracy: ')
print('Train {}'.format(train_percent))
print('Test {}'.format(test_percent))
