In [7]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', None)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from scipy.stats import zscore
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, ShuffleSplit
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier

In [9]:
# Alter the file path as you see fit
df = pd.read_csv('Cleaned Train.csv')
df.head()

Unnamed: 0,ID,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,1,23,12,19114.12,1824.843333,3,4,3,4,3,7,11.27,4,1,809.98,26.82262,1,49.574949,80.415295,3,312.494089,2
1,0x1603,2,23,12,19114.12,1824.843333,3,4,3,4,3,4,11.27,4,1,809.98,31.94496,1,49.574949,118.280222,4,284.629162,2
2,0x1604,3,23,12,19114.12,1824.843333,3,4,3,4,3,7,11.27,4,1,809.98,28.609352,1,49.574949,81.699521,5,331.209863,2
3,0x1605,4,23,12,19114.12,1824.843333,3,4,3,4,5,4,6.27,4,1,809.98,31.377862,1,49.574949,199.458074,6,223.45131,2
4,0x1606,5,23,12,19114.12,1824.843333,3,4,3,4,6,4,11.27,4,1,809.98,24.797347,1,49.574949,41.420153,2,341.489231,2


In [10]:
df.set_index('ID', drop=True, inplace=True)

X = df.drop('Credit_Score', axis=1)
y = df[['Credit_Score']]
y1 = y.copy()
y['Credit_Score'] = y['Credit_Score'].astype('category')

In [11]:
y.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 0x1602 to 0x25fed
Data columns (total 1 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   Credit_Score  100000 non-null  category
dtypes: category(1)
memory usage: 879.0+ KB


In [12]:
y.head()

Unnamed: 0_level_0,Credit_Score
ID,Unnamed: 1_level_1
0x1602,2
0x1603,2
0x1604,2
0x1605,2
0x1606,2


In [13]:
y['Credit_Score'].value_counts(normalize=True)*100

1    53.174
0    28.998
2    17.828
Name: Credit_Score, dtype: float64

In [14]:
# Converting Cateogry columns to 'category' data type
cat_col = ['Month', 'Occupation', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']
for i in cat_col:
    X[i] = X[i].astype('category')

In [15]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 0x1602 to 0x25fed
Data columns (total 21 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   Month                     100000 non-null  category
 1   Age                       100000 non-null  int64   
 2   Occupation                100000 non-null  category
 3   Annual_Income             100000 non-null  float64 
 4   Monthly_Inhand_Salary     100000 non-null  float64 
 5   Num_Bank_Accounts         100000 non-null  int64   
 6   Num_Credit_Card           100000 non-null  int64   
 7   Interest_Rate             100000 non-null  int64   
 8   Num_of_Loan               100000 non-null  int64   
 9   Delay_from_due_date       100000 non-null  int64   
 10  Num_of_Delayed_Payment    100000 non-null  int64   
 11  Changed_Credit_Limit      100000 non-null  float64 
 12  Num_Credit_Inquiries      100000 non-null  int64   
 13  Credit_Mix                10

In [16]:
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,100000.0,33.31634,10.764812,14.0,24.0,33.0,42.0,56.0
Annual_Income,100000.0,50505.123449,38299.422093,7005.93,19342.9725,36999.705,71683.47,179987.28
Monthly_Inhand_Salary,100000.0,4197.393268,3186.540184,303.645417,1626.594167,3095.905,5957.715,15204.633333
Num_Bank_Accounts,100000.0,5.36712,2.593856,-1.0,3.0,5.0,7.0,11.0
Num_Credit_Card,100000.0,5.53224,2.068324,0.0,4.0,5.0,7.0,11.0
Interest_Rate,100000.0,14.53208,8.74133,1.0,7.0,13.0,20.0,34.0
Num_of_Loan,100000.0,3.53288,2.446356,0.0,2.0,3.0,5.0,9.0
Delay_from_due_date,100000.0,21.09005,14.829336,-1.0,10.0,18.0,28.0,67.0
Num_of_Delayed_Payment,100000.0,13.31969,6.22476,0.0,9.0,14.0,18.0,25.0
Changed_Credit_Limit,100000.0,10.472003,6.657893,0.0,5.37,9.4,14.85,36.97


### Let's look at Cross Validation accuracy scores for various Classifiers without Hypertuning any parameters

In [17]:
print('Random Forest =',np.mean(cross_val_score(RandomForestClassifier(max_depth=7, min_samples_leaf=3), X, y, scoring='accuracy',)))
print('Gradient Boosting =',np.mean(cross_val_score(GradientBoostingClassifier(max_depth=7, min_samples_leaf=3), X, y, cv=3, scoring='accuracy')))
print('XG Boosting =',np.mean(cross_val_score(XGBClassifier(enable_categorical=True), X, y1, scoring='accuracy')))

Random Forest = 0.69738
Gradient Boosting = 0.7045800625402907
XG Boosting = 0.6991299999999999


### As we can see, simple classifiers aren't giving good accuracies on data. Let's proceed to hypertune the parameters.

### Training the models below with all the 100,000 records was computationally very intensive so we have splitted this Train data again to train-test datasets. We will be hypertuning the models with this sub-train set.

In [18]:
# 85:15 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

## 1. Random Forest

In [19]:
RandomForestClassifier().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [20]:
shuffle_split = ShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

param_grid = {'n_estimators': [100, 150, 200],
              'criterion': ['gini', 'entropy'],
              'max_depth': [3, 5, 7],
              'max_features': ['sqrt', len(X.columns)],
              'min_samples_leaf': [6, 7, 8, 9, 10],
              'oob_score': [True, False]}

rand_search2 = RandomizedSearchCV(RandomForestClassifier(), param_grid, scoring='accuracy', n_iter=100, verbose=2, cv=shuffle_split, random_state=42)
rand_search2.fit(X_train, y_train)

Fitting 1 folds for each of 100 candidates, totalling 100 fits
[CV] END criterion=entropy, max_depth=3, max_features=21, min_samples_leaf=8, n_estimators=150, oob_score=True; total time=  42.1s
[CV] END criterion=gini, max_depth=3, max_features=21, min_samples_leaf=8, n_estimators=100, oob_score=True; total time=  24.7s
[CV] END criterion=entropy, max_depth=5, max_features=21, min_samples_leaf=8, n_estimators=150, oob_score=False; total time= 1.1min
[CV] END criterion=entropy, max_depth=7, max_features=sqrt, min_samples_leaf=6, n_estimators=150, oob_score=True; total time=  20.0s
[CV] END criterion=gini, max_depth=3, max_features=21, min_samples_leaf=10, n_estimators=150, oob_score=True; total time=  35.0s
[CV] END criterion=entropy, max_depth=5, max_features=21, min_samples_leaf=6, n_estimators=150, oob_score=True; total time= 1.1min
[CV] END criterion=gini, max_depth=3, max_features=sqrt, min_samples_leaf=8, n_estimators=150, oob_score=False; total time=   8.0s
[CV] END criterion=gin

In [21]:
rand_search2.best_params_

{'oob_score': False,
 'n_estimators': 100,
 'min_samples_leaf': 7,
 'max_features': 21,
 'max_depth': 7,
 'criterion': 'gini'}

In [22]:
# Defining a dataframe to add all accuracies of different models
a = pd.DataFrame(columns=['Model', 'Performance on Sub-Test set', 'Accuracy of CV on Entire Train set'])

# Let's look at performance on test set
model_rf = rand_search2.best_estimator_
model_rf.fit(X_train, y_train)
a2 = model_rf.score(X_test, y_test)

In [23]:
# Let's look at Cross Validation score
model_rf = rand_search2.best_estimator_
b2 = cross_val_score(model_rf, X, y, cv=5, scoring='accuracy')
b2

array([0.7039, 0.7051, 0.716 , 0.7038, 0.7246])

In [24]:
print('Mean Accuracy from Cross Validation =',np.mean(b2))
a.loc[len(a.index)] = ['Random Forest', round(a2, 2), round(np.mean(b2), 2)]

Mean Accuracy from Cross Validation = 0.7106800000000001


## 2. Gradient Boosting

In [25]:
GradientBoostingClassifier().get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [26]:
param_grid = {'n_estimators': [100, 150, 200],
              'criterion': ['log_loss', 'exponential'],
              'max_depth': [5, 7, 9],
              'max_features': ['sqrt', len(X.columns)],
              'min_samples_leaf': [7, 8, 9, 10],
              'criterion': ['friedman_mse', 'squared_error'],
              'validation_fraction': [0.1, 0.2, 0.3]}

rand_search4 = RandomizedSearchCV(GradientBoostingClassifier(), param_grid, scoring='accuracy', n_iter=100, cv=shuffle_split, verbose=2, random_state=42)
rand_search4.fit(X_train, y_train)

Fitting 1 folds for each of 100 candidates, totalling 100 fits
[CV] END criterion=squared_error, max_depth=9, max_features=21, min_samples_leaf=10, n_estimators=100, validation_fraction=0.2; total time= 4.6min
[CV] END criterion=friedman_mse, max_depth=7, max_features=sqrt, min_samples_leaf=7, n_estimators=150, validation_fraction=0.1; total time= 1.3min
[CV] END criterion=friedman_mse, max_depth=9, max_features=21, min_samples_leaf=7, n_estimators=100, validation_fraction=0.1; total time= 4.6min
[CV] END criterion=friedman_mse, max_depth=5, max_features=sqrt, min_samples_leaf=10, n_estimators=150, validation_fraction=0.1; total time=  56.7s
[CV] END criterion=squared_error, max_depth=9, max_features=sqrt, min_samples_leaf=10, n_estimators=150, validation_fraction=0.3; total time= 1.6min
[CV] END criterion=squared_error, max_depth=5, max_features=21, min_samples_leaf=9, n_estimators=150, validation_fraction=0.3; total time= 4.1min
[CV] END criterion=squared_error, max_depth=9, max_feat

In [27]:
rand_search4.best_params_

{'validation_fraction': 0.1,
 'n_estimators': 200,
 'min_samples_leaf': 8,
 'max_features': 21,
 'max_depth': 9,
 'criterion': 'friedman_mse'}

In [28]:
# Let's look at performance on test set
model_gb = rand_search4.best_estimator_
model_gb.fit(X_train, y_train)
a4 = model_gb.score(X_test, y_test)

In [29]:
# Let's look at Cross Validation score
model_gb = rand_search4.best_estimator_
b4 = cross_val_score(model_gb, X, y, cv=5, scoring='accuracy')
b4

array([0.6903 , 0.693  , 0.69855, 0.6938 , 0.7014 ])

In [30]:
print('Mean Accuracy from Cross Validation =',np.mean(b4))
a.loc[len(a.index)] = ['Gradient Boosting', round(a4, 2), round(np.mean(b4), 2)]

Mean Accuracy from Cross Validation = 0.6954100000000001


In [31]:
a

Unnamed: 0,Model,Performance on Sub-Test set,Accuracy of CV on Entire Train set
0,Random Forest,0.71,0.71
1,Gradient Boosting,0.81,0.7


# THE END