In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', None)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from scipy.stats import zscore
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

In [4]:
# Alter the file path as you see fit
df = pd.read_csv('Cleaned Train.csv')
df.head()

Unnamed: 0,ID,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,1,23,12,19114.12,1824.843333,3,4,3,4,3,7,11.27,4,1,809.98,26.82262,1,49.574949,80.415295,3,312.494089,2
1,0x1603,2,23,12,19114.12,1824.843333,3,4,3,4,3,4,11.27,4,1,809.98,31.94496,1,49.574949,118.280222,4,284.629162,2
2,0x1604,3,23,12,19114.12,1824.843333,3,4,3,4,3,7,11.27,4,1,809.98,28.609352,1,49.574949,81.699521,5,331.209863,2
3,0x1605,4,23,12,19114.12,1824.843333,3,4,3,4,5,4,6.27,4,1,809.98,31.377862,1,49.574949,199.458074,6,223.45131,2
4,0x1606,5,23,12,19114.12,1824.843333,3,4,3,4,6,4,11.27,4,1,809.98,24.797347,1,49.574949,41.420153,2,341.489231,2


In [5]:
df.set_index('ID', drop=True, inplace=True)

X = df.drop('Credit_Score', axis=1)
y = df[['Credit_Score']]
y1 = y.copy()
y['Credit_Score'] = y['Credit_Score'].astype('category')

In [6]:
y.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 0x1602 to 0x25fed
Data columns (total 1 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   Credit_Score  100000 non-null  category
dtypes: category(1)
memory usage: 879.0+ KB


In [7]:
y.head()

Unnamed: 0_level_0,Credit_Score
ID,Unnamed: 1_level_1
0x1602,2
0x1603,2
0x1604,2
0x1605,2
0x1606,2


In [8]:
y['Credit_Score'].value_counts(normalize=True)*100

1    53.174
0    28.998
2    17.828
Name: Credit_Score, dtype: float64

In [9]:
# Converting Cateogry columns to 'category' data type
cat_col = ['Month', 'Occupation', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']
for i in cat_col:
    X[i] = X[i].astype('category')

In [10]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 0x1602 to 0x25fed
Data columns (total 21 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   Month                     100000 non-null  category
 1   Age                       100000 non-null  int64   
 2   Occupation                100000 non-null  category
 3   Annual_Income             100000 non-null  float64 
 4   Monthly_Inhand_Salary     100000 non-null  float64 
 5   Num_Bank_Accounts         100000 non-null  int64   
 6   Num_Credit_Card           100000 non-null  int64   
 7   Interest_Rate             100000 non-null  int64   
 8   Num_of_Loan               100000 non-null  int64   
 9   Delay_from_due_date       100000 non-null  int64   
 10  Num_of_Delayed_Payment    100000 non-null  int64   
 11  Changed_Credit_Limit      100000 non-null  float64 
 12  Num_Credit_Inquiries      100000 non-null  int64   
 13  Credit_Mix                10

In [11]:
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,100000.0,33.31634,10.764812,14.0,24.0,33.0,42.0,56.0
Annual_Income,100000.0,50505.123449,38299.422093,7005.93,19342.9725,36999.705,71683.47,179987.28
Monthly_Inhand_Salary,100000.0,4197.393268,3186.540184,303.645417,1626.594167,3095.905,5957.715,15204.633333
Num_Bank_Accounts,100000.0,5.36712,2.593856,-1.0,3.0,5.0,7.0,11.0
Num_Credit_Card,100000.0,5.53224,2.068324,0.0,4.0,5.0,7.0,11.0
Interest_Rate,100000.0,14.53208,8.74133,1.0,7.0,13.0,20.0,34.0
Num_of_Loan,100000.0,3.53288,2.446356,0.0,2.0,3.0,5.0,9.0
Delay_from_due_date,100000.0,21.09005,14.829336,-1.0,10.0,18.0,28.0,67.0
Num_of_Delayed_Payment,100000.0,13.31969,6.22476,0.0,9.0,14.0,18.0,25.0
Changed_Credit_Limit,100000.0,10.472003,6.657893,0.0,5.37,9.4,14.85,36.97


### Let's look at Cross Validation accuracy scores for various Classifiers without Hypertuning any parameters

In [12]:
print('Simple Decision Tree =',np.mean(cross_val_score(DecisionTreeClassifier(max_depth=7, min_samples_leaf=3), X, y, scoring='accuracy')))
print('Ada Boosting =',np.mean(cross_val_score(AdaBoostClassifier(n_estimators=100), X, y, scoring='accuracy')))

Simple Decision Tree = 0.7077500000000001
Ada Boosting = 0.6509


### As we can see, simple classifiers aren't giving good accuracies on data. Let's proceed to hypertune the parameters.

### Training the models below with all the 100,000 records was computationally very intensive so we have splitted this Train data again to train-test datasets. We will be hypertuning the models with this sub-train set.

In [13]:
# 85:15 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

## 1. Decision Tree

In [14]:
DecisionTreeClassifier().get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [15]:
param_grid = {'criterion': ['gini', 'log_loss', 'entropy'],
              'max_depth': [3, 5, 7],
              'max_features': ['sqrt', len(X.columns)],
              'min_samples_leaf': [6, 7, 8, 9, 10]}

grid_search1 = GridSearchCV(DecisionTreeClassifier(), param_grid, scoring='accuracy')
grid_search1.fit(X_train, y_train)

In [16]:
grid_search1.best_params_

{'criterion': 'gini',
 'max_depth': 7,
 'max_features': 21,
 'min_samples_leaf': 9}

In [17]:
# Defining a dataframe to add all accuracies of different models
a = pd.DataFrame(columns=['Model', 'Performance on Sub-Test set', 'Accuracy of CV on Entire Train set'])

# Let's look at performance on test set
model_dt = grid_search1.best_estimator_
model_dt.fit(X_train, y_train)
a1 = model_dt.score(X_test, y_test)

In [18]:
# Let's look at Cross Validation score for the entire Train set i.e., not only the sub-test but entire dataset
model_dt = grid_search1.best_estimator_
b1 = cross_val_score(model_dt, X, y, cv=5, scoring='accuracy')
b1

array([0.699  , 0.7028 , 0.7128 , 0.70365, 0.7215 ])

In [19]:
print('Mean Accuracy from Cross Validation =',np.mean(b1))
a.loc[len(a.index)] = ['Decision Tree', round(a1, 2), round(np.mean(b1), 2)]

Mean Accuracy from Cross Validation = 0.70795


## 2. Ada Boosting

In [20]:
AdaBoostClassifier().get_params()

{'algorithm': 'SAMME.R',
 'base_estimator': 'deprecated',
 'estimator': None,
 'learning_rate': 1.0,
 'n_estimators': 50,
 'random_state': None}

In [22]:
shuffle_split = ShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

param_grid = {'estimator': [DecisionTreeClassifier(max_depth=5, min_samples_leaf=3), DecisionTreeClassifier(max_depth=7, min_samples_leaf=5), model_dt],
              'n_estimators': [100, 150, 200, 250],
              'learning_rate': [1, 2]}

rand_search3 = RandomizedSearchCV(AdaBoostClassifier(), param_grid, scoring='accuracy', n_iter=30, cv=shuffle_split, random_state=42)
rand_search3.fit(X_train, y_train)

In [23]:
rand_search3.best_params_

{'n_estimators': 250,
 'learning_rate': 1,
 'estimator': DecisionTreeClassifier(max_depth=7, max_features=21, min_samples_leaf=9)}

In [24]:
# Let's look at performance on test set
model_ada = rand_search3.best_estimator_
model_ada.fit(X_train, y_train)
a3 = model_ada.score(X_test, y_test)

In [25]:
# Let's look at Cross Validation score
model_ada = rand_search3.best_estimator_
b3 = cross_val_score(model_ada, X, y, cv=5, scoring='accuracy')
b3

array([0.6355 , 0.64195, 0.6298 , 0.6287 , 0.63545])

In [26]:
print('Mean Accuracy from Cross Validation =',np.mean(b3))
a.loc[len(a.index)] = ['Ada Boosting', round(a3, 2), round(np.mean(b3), 2)]

Mean Accuracy from Cross Validation = 0.63428


In [27]:
a

Unnamed: 0,Model,Performance on Sub-Test set,Accuracy of CV on Entire Train set
0,Decision Tree,0.71,0.71
1,Ada Boosting,0.71,0.63


# THE END