In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import scipy.stats as stats
from tqdm import tqdm
import math
from termcolor import cprint
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

import missingno as msno
import warnings
warnings.filterwarnings('ignore')

SEED = 42

In [2]:
df = pd.read_csv('credit_score_clean_data.csv')

## Handel Categorical Data

In [3]:
df.drop(columns=['ID', 'Customer_ID', 'Name', 'SSN', 'Credit_Mix'], inplace=True)

In [4]:
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
Occupation,150000,15,Lawyer,10644
Type_of_Loan,150000,6261,No Data,17112
Credit_History_Age,150000,408,17 Years and 10 Months,1100
Payment_of_Min_Amount,150000,3,Yes,78484
Payment_Behaviour,150000,6,Low_spent_Small_value_payments,42046
Credit_Score,100000,3,Standard,53174


## Handel Numerical Data

In [5]:
df.describe(exclude='object').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Month,150000.0,6.5,3.452064,1.0,3.75,6.5,9.25,12.0
Age,150000.0,33.47906,10.767158,14.0,25.0,33.0,42.0,56.0
Annual_Income,150000.0,50505.123449,38299.35826,7005.93,19342.9725,36999.705,71683.47,179987.28
Monthly_Inhand_Salary,150000.0,4192.525296,3182.728812,303.645417,1625.558333,3091.0,5951.373333,15204.633333
Num_Bank_Accounts,150000.0,5.368707,2.594388,-1.0,3.0,5.0,7.0,11.0
Num_Credit_Card,150000.0,5.53366,2.068672,0.0,4.0,5.0,7.0,11.0
Interest_Rate,150000.0,14.53208,8.741316,1.0,7.0,13.0,20.0,34.0
Num_of_Loan,150000.0,3.53288,2.446352,0.0,2.0,3.0,5.0,9.0
Delay_from_due_date,150000.0,21.070393,14.813531,-1.0,10.0,18.0,28.0,62.0
Num_of_Delayed_Payment,150000.0,13.301967,6.26306,-2.0,9.0,14.0,18.0,25.0


# Feature Engineering

### Credit History Age

In [6]:
df['Credit_History_Age'] = df['Credit_History_Age'].astype('category')

In [7]:
def split_year_month(date, is_month=None):
    date = date.replace('Years', '').replace('Months', '')
    date_list = date.split('and')
    
    if is_month:
        return int(date_list[1].strip())
    
    return int(date_list[0].strip())
    

In [8]:
df['Credit_History_Age_Year'] = df['Credit_History_Age'].apply(lambda x: split_year_month(x, is_month=False))

In [9]:
df['Credit_History_Age_Month'] = df['Credit_History_Age'].apply(lambda x: split_year_month(x, is_month=True))

In [10]:
df.drop(columns='Credit_History_Age', inplace=True)

## Split df train and test

In [11]:
df_train = df[df['Credit_Score'].notna() == True]
df_test = df[df['Credit_Score'].isna() == True]

In [12]:
X = df_train.drop(columns='Credit_Score')
y = df_train[['Credit_Score']]

## Encoding

In [13]:
def multiple_label_encoder(df):
    cat_col_list = df.select_dtypes(exclude=[np.number]).columns
    df[cat_col_list]= df[cat_col_list].apply(LabelEncoder().fit_transform)
    
    return df
    

In [14]:
X = multiple_label_encoder(X)
y = multiple_label_encoder(y)

## Scaling

In [15]:
sc = StandardScaler()
X_scaled = sc.fit_transform(X)

## Split X train and X test

In [16]:
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, random_state=SEED, test_size = 0.2, stratify=y)

## Modeling

In [17]:
### Inbalanced data
np.unique(y, return_counts=True)

(array([0, 1, 2]), array([17828, 28998, 53174]))

In [18]:
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', max_depth=22)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)

In [19]:
print(classification_report(y_val, y_pred_rf))

              precision    recall  f1-score   support

           0       0.65      0.82      0.72      3566
           1       0.79      0.82      0.81      5799
           2       0.86      0.76      0.81     10635

    accuracy                           0.79     20000
   macro avg       0.77      0.80      0.78     20000
weighted avg       0.80      0.79      0.79     20000



In [20]:
y_train_pred_rf =  rf.predict(X_train)
print(classification_report(y_train, y_train_pred_rf))

              precision    recall  f1-score   support

           0       0.81      1.00      0.89     14262
           1       0.93      0.97      0.95     23199
           2       1.00      0.89      0.94     42539

    accuracy                           0.94     80000
   macro avg       0.91      0.96      0.93     80000
weighted avg       0.94      0.94      0.94     80000



In [21]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 22,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### Hyperparameter Tuning

In [163]:
# params = {
#     'n_estimators' : range(50, 200, 50),
#     'max_depth': range(3, 21, 3),
#     'min_samples_leaf': range(1,5),
#     'criterion':['gini', 'entropy'],
#     'oob_score'   : [True]
# }

In [164]:
# gs_obj = GridSearchCV(rf, param_grid=params, scoring='f1',  cv=5)

In [166]:
# gs_obj.fit(X_train, y_train)
# y_pred_rf = gs_obj.predict(X_val)

## XGBoost

In [27]:
xgb_obj = xgb.XGBClassifier()
xgb_obj.fit(X_train, y_train)
y_pred_xgb = xgb_obj.predict(X_val)

In [28]:
print(classification_report(y_val, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.70      0.67      0.69      3566
           1       0.78      0.73      0.75      5799
           2       0.77      0.81      0.79     10635

    accuracy                           0.76     20000
   macro avg       0.75      0.74      0.74     20000
weighted avg       0.76      0.76      0.76     20000



In [29]:
y_train_pred_xgb = xgb_obj.predict(X_train)
print(classification_report(y_train, y_train_pred_xgb))

              precision    recall  f1-score   support

           0       0.80      0.76      0.78     14262
           1       0.81      0.79      0.80     23199
           2       0.82      0.85      0.84     42539

    accuracy                           0.82     80000
   macro avg       0.81      0.80      0.81     80000
weighted avg       0.82      0.82      0.82     80000



## GridSearch

In [30]:
param_grid = {
    'n_estimators' : [100],
    'learning_rate': np.linspace(0.01, 0.3, 2).round(3),
    'max_depth'    : [5, 6],    
}

In [31]:
xgb_obj_grs = xgb.XGBClassifier()
grs_obj_xgb = GridSearchCV(xgb_obj_grs, param_grid, scoring='f1', cv=5, n_jobs=-1 )

In [32]:
grs_obj_xgb.fit(X_train, y_train)
y_pred_grs = grs_obj_xgb.predict(X_val)

Traceback (most recent call last):
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 1123, in f1_score
    return fbeta_score(
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 1261, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_cl

Traceback (most recent call last):
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 1123, in f1_score
    return fbeta_score(
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 1261, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_cl

Traceback (most recent call last):
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 1123, in f1_score
    return fbeta_score(
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 1261, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_cl

Traceback (most recent call last):
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 1123, in f1_score
    return fbeta_score(
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 1261, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Users/andishetavakkoli/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_cl