This Python notebook explores credit risk classification without using the credit score metric.

In [1]:
import pickle
import os.path as path
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from pipeline_components import Stage1Classifier, Stage2Classifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.inspection import permutation_importance

In [2]:
# Load the prepared dataset. The path is assembled with os.path so the separator is
# correct on every OS (a literal '..\artifacts\...' only resolves on Windows).
# It is relative to the notebook's working directory.
# NOTE(security): pickle.load can execute arbitrary code; only load artifacts produced by this project.
data_path = path.abspath(path.join('..', 'artifacts', 'target_associated_data.pkl'))
with open(data_path, 'rb') as data_file:  # context manager guarantees the handle is closed
    df = pickle.load(data_file)

# Keep the fitted encoder so integer predictions can be mapped back to the original
# Approved_Flag labels later (label_encoder.classes_ / label_encoder.inverse_transform).
label_encoder = LabelEncoder()

# Features exclude the target, the PROSPECTID column and Credit_Score (the metric this notebook leaves out).
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['Approved_Flag', 'PROSPECTID', 'Credit_Score'], axis=1),
    label_encoder.fit_transform(df['Approved_Flag']),
    test_size=0.2,
    random_state=42,
)

In [3]:
# Hyper-parameter search space handed to the stage-2 classifier as `p1_p3_param_grid`.
xgb_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 1],
    reg_alpha=[0, 1],
    reg_lambda=[1, 2],
)

# Build each stage separately so the pipeline definition stays readable.
# NOTE(review): the exact role of each stage lives in pipeline_components — confirm there.
stage1_clf = Stage1Classifier(base_model=DecisionTreeClassifier(random_state=42))
stage2_clf = Stage2Classifier(
    p1_p3_model=XGBClassifier(random_state=42, n_jobs=-1),
    p2_p4_model=DecisionTreeClassifier(random_state=42),
    p1_p3_param_grid=xgb_param_grid,
)
pipeline = Pipeline(steps=[('stage1', stage1_clf), ('stage2', stage2_clf)])

# Fit on the training split, then evaluate on the held-out split.
pipeline.fit(X_train, y_train)
y_test_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))

p1-p3 model score: 0.9600030734138372
              precision    recall  f1-score   support

           0       0.74      0.75      0.75      1014
           1       0.82      0.81      0.81      5045
           2       0.34      0.34      0.34      1325
           3       0.64      0.67      0.65      1029

    accuracy                           0.71      8413
   macro avg       0.63      0.64      0.64      8413
weighted avg       0.71      0.71      0.71      8413

[[ 765  240    7    2]
 [ 231 4074  627  113]
 [  42  558  447  278]
 [   1   89  249  690]]


In [4]:
# Baseline: a single decision tree trained directly on all classes.
# The seed is fixed (matching the other estimators in this notebook) so the baseline
# metrics are reproducible across kernel restarts; an unseeded tree can differ between runs.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

In [5]:
# Encoded class predictions of the baseline tree on the held-out split.
dt_test_pred = dt.predict(X_test)
dt_test_pred

array([2, 3, 1, ..., 2, 1, 3])

In [6]:
# Confusion matrix of the baseline tree: rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=dt.predict(X_test))

array([[ 703,  256,   49,    6],
       [ 243, 4122,  589,   91],
       [  34,  574,  454,  263],
       [   0,   91,  254,  684]], dtype=int64)

In [7]:
# Per-class precision / recall / F1 of the baseline tree on the held-out split.
baseline_report = classification_report(y_true=y_test, y_pred=dt.predict(X_test))
print(baseline_report)

              precision    recall  f1-score   support

           0       0.72      0.69      0.71      1014
           1       0.82      0.82      0.82      5045
           2       0.34      0.34      0.34      1325
           3       0.66      0.66      0.66      1029

    accuracy                           0.71      8413
   macro avg       0.63      0.63      0.63      8413
weighted avg       0.71      0.71      0.71      8413



In [8]:
# Minimum and maximum Credit_Score observed within each Approved_Flag class.
credit_score_by_flag = df.groupby('Approved_Flag')['Credit_Score']
credit_score_by_flag.agg(["min", "max"])

Unnamed: 0_level_0,min,max
Approved_Flag,Unnamed: 1_level_1,Unnamed: 2_level_1
P1,701,809
P2,669,700
P3,489,776
P4,469,658


Although CIBIL agencies formulate credit scores from existing customer credit records, the classification model appears to have lost critical predictive power when this metric is excluded. This indicates that the credit score might be capturing nuanced risk factors beyond the raw credit record data.

The unexpected degradation of credit risk classification without the credit score suggests that these scores might encapsulate information beyond the credit record data available to us.