This python notebook explore formulation of credit score metric using the available credit customer record excluding the CIBIL-provided credit score.

In [1]:
import pickle
import os.path as path
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_pickle('../artifacts/full_data_nonull.pkl')
print(df.shape)

df.info()

(42064, 84)
<class 'pandas.core.frame.DataFrame'>
Index: 42064 entries, 0 to 51335
Data columns (total 84 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   PROSPECTID                    42064 non-null  int64  
 1   Total_TL                      42064 non-null  int64  
 2   Tot_Closed_TL                 42064 non-null  int64  
 3   Tot_Active_TL                 42064 non-null  int64  
 4   Total_TL_opened_L6M           42064 non-null  int64  
 5   Tot_TL_closed_L6M             42064 non-null  int64  
 6   pct_tl_open_L6M               42064 non-null  float64
 7   pct_tl_closed_L6M             42064 non-null  float64
 8   pct_active_tl                 42064 non-null  float64
 9   pct_closed_tl                 42064 non-null  float64
 10  Total_TL_opened_L12M          42064 non-null  int64  
 11  Tot_TL_closed_L12M            42064 non-null  int64  
 12  pct_tl_open_L12M              42064 non-null  float64

In [3]:
print('categorical columns: ', df.select_dtypes(include=['object']).columns, end='\n\n')

cat_col = df.select_dtypes(include='object').columns
for col in cat_col:
    print(col, ": ", df[col].unique())
    

categorical columns:  Index(['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2',
       'first_prod_enq2', 'Approved_Flag'],
      dtype='object')

MARITALSTATUS :  ['Married' 'Single']
EDUCATION :  ['12TH' 'GRADUATE' 'SSC' 'POST-GRADUATE' 'UNDER GRADUATE' 'OTHERS'
 'PROFESSIONAL']
GENDER :  ['M' 'F']
last_prod_enq2 :  ['PL' 'ConsumerLoan' 'AL' 'CC' 'others' 'HL']
first_prod_enq2 :  ['PL' 'ConsumerLoan' 'others' 'AL' 'HL' 'CC']
Approved_Flag :  ['P2' 'P1' 'P3' 'P4']


In [4]:
# one-hot encoding
df = pd.get_dummies(df, columns=['MARITALSTATUS', 'GENDER', 'last_prod_enq2', 'first_prod_enq2'])

#label encoding
encode_map = {'SSC':1, 'OTHERS': 1, '12TH': 2, 'GRADUATE': 3, 'UNDER GRADUATE': 3, 'POST-GRADUATE': 4, 'PROFESSIONAL': 4}
df['EDUCATION'] = df['EDUCATION'].map(encode_map)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42064 entries, 0 to 51335
Data columns (total 96 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   PROSPECTID                    42064 non-null  int64  
 1   Total_TL                      42064 non-null  int64  
 2   Tot_Closed_TL                 42064 non-null  int64  
 3   Tot_Active_TL                 42064 non-null  int64  
 4   Total_TL_opened_L6M           42064 non-null  int64  
 5   Tot_TL_closed_L6M             42064 non-null  int64  
 6   pct_tl_open_L6M               42064 non-null  float64
 7   pct_tl_closed_L6M             42064 non-null  float64
 8   pct_active_tl                 42064 non-null  float64
 9   pct_closed_tl                 42064 non-null  float64
 10  Total_TL_opened_L12M          42064 non-null  int64  
 11  Tot_TL_closed_L12M            42064 non-null  int64  
 12  pct_tl_open_L12M              42064 non-null  float64
 13  pct_tl

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Approved_Flag', 'PROSPECTID', 'Credit_Score'], axis=1), 
                 LabelEncoder().fit_transform(df['Credit_Score']),
                 test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)

(33651, 93) (8413, 93)


In [7]:
X_train.select_dtypes(include='object').columns

Index([], dtype='object')

In [8]:
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
print(dt.score(X_test, y_test))

0.7689581886618266


In [9]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))

0.8889391299908243


In [10]:
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
print(xgb.score(X_test, y_test))

0.9001474319612266


In [11]:
print('DCT RMSE: ', np.sqrt(mean_squared_error(y_train, dt.predict(X_train))))
print('RF RMSE: ', np.sqrt(mean_squared_error(y_train, rf.predict(X_train))))
print('XGB RMSE: ', np.sqrt(mean_squared_error(y_train, xgb.predict(X_train))))

DCT RMSE:  0.02437900691431946
RF RMSE:  2.628788089063488
XGB RMSE:  5.283890281072964


In [12]:
print('DCT RMSE: ', np.sqrt(mean_squared_error(y_test, dt.predict(X_test))))
print('RF RMSE: ', np.sqrt(mean_squared_error(y_test, rf.predict(X_test))))
print('XGB RMSE: ', np.sqrt(mean_squared_error(y_test, xgb.predict(X_test))))

DCT RMSE:  10.082578332522457
RF RMSE:  6.99047762608153
XGB RMSE:  6.628357648394856


In [13]:
print(xgb.get_booster().save_config())

{"learner":{"generic_param":{"device":"cpu","fail_on_invalid_gpu_id":"0","n_jobs":"0","nthread":"0","random_state":"0","seed":"0","seed_per_iteration":"0","validate_parameters":"1"},"gradient_booster":{"gbtree_model_param":{"num_parallel_tree":"1","num_trees":"100"},"gbtree_train_param":{"process_type":"default","tree_method":"auto","updater":"grow_quantile_histmaker","updater_seq":"grow_quantile_histmaker"},"name":"gbtree","specified_updater":false,"tree_train_param":{"alpha":"0","cache_opt":"1","colsample_bylevel":"1","colsample_bynode":"1","colsample_bytree":"1","eta":"0.300000012","gamma":"0","grow_policy":"depthwise","interaction_constraints":"","lambda":"1","learning_rate":"0.300000012","max_bin":"256","max_cat_threshold":"64","max_cat_to_onehot":"4","max_delta_step":"0","max_depth":"6","max_leaves":"0","min_child_weight":"1","min_split_loss":"0","monotone_constraints":"()","refresh_leaf":"1","reg_alpha":"0","reg_lambda":"1","sampling_method":"uniform","sketch_ratio":"2","sparse_

In [16]:
xgb_tuning = GridSearchCV(XGBRegressor(random_state=42), 
                          param_grid = {'n_estimators': [50, 100]
                                        , 'max_depth': [5, 8]
                                        , 'learning_rate': [0.2, 0.3, 0.4]
                                        , 'min_child_weight': [1, 3, 5]
                                        #, 'subsample': [0.8, 1.0]
                                        #, 'colsample_bytree': [0.8, 1.0]
                                        #, 'gamma': [0, 0.1, 0.2],  
                                    }, 
                          cv=3, n_jobs=-1, 
                          scoring='r2')

xgb_tuning.fit(X_train, y_train)

print(xgb_tuning.best_score_)
best_xgb = xgb_tuning.best_estimator_

0.9026601695950269


In [19]:
print('train XGB RMSE: ', np.sqrt(mean_squared_error(y_train, best_xgb.predict(X_train))))
print('test XGB RMSE: ', np.sqrt(mean_squared_error(y_test, best_xgb.predict(X_test))))

train XGB RMSE:  5.853729883553944
test XGB RMSE:  6.398063573259226


In [22]:
xgb_tuning.best_params_

{'learning_rate': 0.2,
 'max_depth': 5,
 'min_child_weight': 5,
 'n_estimators': 100}

In [25]:
# refining
xgb_refine = GridSearchCV(XGBRegressor(random_state=42), 
                          param_grid = {'n_estimators': [100, 150, 200]
                                        , 'max_depth': [5, 7]
                                        , 'learning_rate': [0.02, 0.05, 0.1]
                                        , 'min_child_weight': [5, 7]
                                        #, 'subsample': [0.8, 1.0]
                                        #, 'colsample_bytree': [0.8, 1.0]
                                        #, 'gamma': [0, 0.1, 0.2],  
                                    }, 
                          cv=3, n_jobs=-1, 
                          scoring='r2')

xgb_refine.fit(X_train, y_train)

print(xgb_refine.best_score_)
print(xgb_refine.best_params_)
best_xgb = xgb_refine.best_estimator_

0.9058513848828862
{'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 5, 'n_estimators': 200}


In [26]:
print('train XGB RMSE: ', np.sqrt(mean_squared_error(y_train, best_xgb.predict(X_train))))
print('test XGB RMSE: ', np.sqrt(mean_squared_error(y_test, best_xgb.predict(X_test))))

train XGB RMSE:  5.986243910385184
test XGB RMSE:  6.327345421394047


- The estimated credit score, generated through various modeling approaches, achieved a best RÂ² score of approximately 0.9 and an RMSE of around 6. Given that the credit score scale ranges from 300 to 900, this level of error is not severe. 

- The slight inaccuracy in estimation may indicate that CIBIL incorporates additional attributes in its credit score formulation, or that exploring different algorithms could improve generalization.

- Based on the achieved accuracy and manageable error margin, the formulated credit score demonstrates potential as a reliable alternative to the CIBIL-provided credit score for credit risk classification. Following has been explored in [`explore3_model.ipynb`](explore3_model.ipynb).