In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', None)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from scipy.stats import zscore
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [4]:
# Alter the file path as you see fit
df = pd.read_csv('Cleaned Train.csv')
df.head()

Unnamed: 0,ID,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,1,23,12,19114.12,1824.843333,3,4,3,4,3,7,11.27,4,1,809.98,26.82262,1,49.574949,80.415295,3,312.494089,2
1,0x1603,2,23,12,19114.12,1824.843333,3,4,3,4,3,4,11.27,4,1,809.98,31.94496,1,49.574949,118.280222,4,284.629162,2
2,0x1604,3,23,12,19114.12,1824.843333,3,4,3,4,3,7,11.27,4,1,809.98,28.609352,1,49.574949,81.699521,5,331.209863,2
3,0x1605,4,23,12,19114.12,1824.843333,3,4,3,4,5,4,6.27,4,1,809.98,31.377862,1,49.574949,199.458074,6,223.45131,2
4,0x1606,5,23,12,19114.12,1824.843333,3,4,3,4,6,4,11.27,4,1,809.98,24.797347,1,49.574949,41.420153,2,341.489231,2


In [5]:
df.set_index('ID', drop=True, inplace=True)

X = df.drop('Credit_Score', axis=1)
y = df[['Credit_Score']]
y1 = y.copy()
y['Credit_Score'] = y['Credit_Score'].astype('category')

In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 0x1602 to 0x25fed
Data columns (total 21 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Month                     100000 non-null  int64  
 1   Age                       100000 non-null  int64  
 2   Occupation                100000 non-null  int64  
 3   Annual_Income             100000 non-null  float64
 4   Monthly_Inhand_Salary     100000 non-null  float64
 5   Num_Bank_Accounts         100000 non-null  int64  
 6   Num_Credit_Card           100000 non-null  int64  
 7   Interest_Rate             100000 non-null  int64  
 8   Num_of_Loan               100000 non-null  int64  
 9   Delay_from_due_date       100000 non-null  int64  
 10  Num_of_Delayed_Payment    100000 non-null  int64  
 11  Changed_Credit_Limit      100000 non-null  float64
 12  Num_Credit_Inquiries      100000 non-null  int64  
 13  Credit_Mix                100000 non-null  

In [7]:
y.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 0x1602 to 0x25fed
Data columns (total 1 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   Credit_Score  100000 non-null  category
dtypes: category(1)
memory usage: 879.0+ KB


In [8]:
y['Credit_Score'].value_counts(normalize=True)*100

Credit_Score
1    53.174
0    28.998
2    17.828
Name: proportion, dtype: float64

In [9]:
# Scaling X
X_scaled = X.apply(zscore)

### Let's look at Cross Validation accuracy scores for various Classifiers without Hypertuning any parameters

In [10]:
print('KNN =',np.mean(cross_val_score(KNeighborsClassifier(), X_scaled, y, scoring='accuracy')))
print('XG Boosting =',np.mean(cross_val_score(XGBClassifier(enable_categorical=True), X, y1, scoring='accuracy')))

KNN = 0.64211
XG Boosting = 0.69965


### As we can see, simple classifiers aren't giving good accuracies on data. Let's proceed to hypertune the parameters.

## 1. KNN Classification

In [11]:
KNeighborsClassifier().get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [12]:
param_grid = {'n_neighbors': [3, 5, 7],
              'weights': ['uniform', 'distance'],
              'algorithm': ['auto', 'brute']}

grid_search1 = GridSearchCV(KNeighborsClassifier(), param_grid, scoring='accuracy')
grid_search1.fit(X_scaled, y)

In [13]:
grid_search1.best_params_

{'algorithm': 'auto', 'n_neighbors': 7, 'weights': 'distance'}

In [14]:
# Let's look at Cross Validation score now
model_dt = grid_search1.best_estimator_
a1 = cross_val_score(model_dt, X_scaled, y, cv=5, scoring='accuracy')
a1

array([0.655  , 0.65775, 0.65345, 0.6556 , 0.6597 ])

In [15]:
print('Mean Accuracy from Cross Validation =',np.mean(a1))

Mean Accuracy from Cross Validation = 0.6562999999999999


## 2. XG Boosting

In [16]:
#XGBClassifier().get_params()

In [17]:
param_grid = {
    'n_estimators': [100, 150, 200, 250, 300],  # Number of boosting rounds
    'learning_rate': [0.01, 0.1, 0.2],  # Rate at which the model's weights are updated
    'max_depth': [3, 5, 7],  # Maximum depth of each tree
    'min_child_weight': [1, 3, 5],  # Minimum sum of instance weight (hessian) needed in a child
    'subsample': [0.6, 0.8, 1.0],  # Fraction of samples used for fitting each tree
    'colsample_bytree': [0.6, 0.8, 1.0],  # Fraction of features used for fitting each tree
    'gamma': [0, 0.1, 0.2],  # Minimum loss reduction required to make a further partition on a leaf node
    'reg_alpha': [0, 0.1, 0.5],  # L1 regularization term on weights
    'reg_lambda': [0, 0.1, 0.5],  # L2 regularization term on weights
    'scale_pos_weight': [1, 2, 3]  # Control the balance of positive and negative weights
}

rand_search2 = RandomizedSearchCV(XGBClassifier(), param_grid, scoring='accuracy', n_iter=100, random_state=42)
rand_search2.fit(X, y1)

In [18]:
rand_search2.best_params_

{'subsample': 0.6,
 'scale_pos_weight': 1,
 'reg_lambda': 0,
 'reg_alpha': 0.1,
 'n_estimators': 150,
 'min_child_weight': 1,
 'max_depth': 7,
 'learning_rate': 0.01,
 'gamma': 0,
 'colsample_bytree': 1.0}

In [19]:
# Let's look at Cross Validation score now
model_xgb = rand_search2.best_estimator_
a2 = cross_val_score(model_xgb, X, y1, cv=5, scoring='accuracy')
a2

array([0.70305, 0.70525, 0.7169 , 0.70845, 0.7268 ])

In [20]:
print('Mean Accuracy from Cross Validation =',np.mean(a2))

Mean Accuracy from Cross Validation = 0.71209
