In [68]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame
from sklearn import metrics
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector, RFE, SelectFromModel
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [55]:
# Load the dataset from a CSV file in the working directory, then print its
# schema summary (column names, non-null counts and dtypes).
df = pd.read_csv("./dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93312 entries, 0 to 93311
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       93312 non-null  float64
 1   Occupation                93312 non-null  int64  
 2   Annual_Income             93312 non-null  float64
 3   Monthly_Inhand_Salary     93312 non-null  float64
 4   Num_Bank_Accounts         93312 non-null  float64
 5   Num_Credit_Card           93312 non-null  float64
 6   Interest_Rate             93312 non-null  float64
 7   Num_of_Loan               93312 non-null  int64  
 8   Delay_from_due_date       93312 non-null  float64
 9   Num_of_Delayed_Payment    93312 non-null  float64
 10  Changed_Credit_Limit      93312 non-null  float64
 11  Num_Credit_Inquiries      93312 non-null  float64
 12  Credit_Mix                93312 non-null  int64  
 13  Outstanding_Debt          93312 non-null  float64
 14  Credit

In [71]:
def cross_val_metrics(clf, X, y, cv=5, random_state=36):
    """Print stratified k-fold accuracy and per-class precision, recall and F1.

    Each fold is scored with a fresh clone of ``clf``, so the estimator passed
    in by the caller is never refitted or otherwise modified.

    Parameters
    ----------
    clf : estimator
        Estimator supporting ``fit``/``predict``; it must be clonable with
        ``sklearn.base.clone``. Only its hyper-parameters are used.
    X : array-like
        Feature matrix. Converted with ``np.asarray`` so that DataFrames are
        indexed by row position like plain arrays.
    y : array-like
        Class labels aligned with the rows of ``X``.
    cv : int, default 5
        Number of stratified folds.
    random_state : int, default 36
        Seed used when shuffling rows into folds.

    Returns
    -------
    None
        The fold-averaged metrics are printed: accuracy as a scalar, and
        precision/recall/F1 as one value per class in ascending label order.
    """
    # Positional fancy indexing below is only valid on arrays; on a DataFrame
    # ``X[train_idx]`` would be interpreted as a column selection.
    X = np.asarray(X)
    y = np.asarray(y)

    # Fix the label set so every fold yields score vectors of the same length
    # and order, which is what makes the axis-0 average below meaningful.
    class_labels = np.unique(y)
    splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)

    accuracy, precision, recall, f1 = [], [], [], []
    for train_idx, test_idx in splitter.split(X, y):
        # Fit a copy: fitting ``clf`` itself would leave the caller's model
        # trained on whichever rows happened to be in the last fold.
        fold_clf = clone(clf)
        fold_clf.fit(X[train_idx], y[train_idx])

        y_true = y[test_idx]
        y_pred = fold_clf.predict(X[test_idx])

        accuracy.append(metrics.accuracy_score(y_true, y_pred))
        precision.append(
            metrics.precision_score(y_true, y_pred, labels=class_labels, average=None)
        )
        recall.append(
            metrics.recall_score(y_true, y_pred, labels=class_labels, average=None)
        )
        f1.append(metrics.f1_score(y_true, y_pred, labels=class_labels, average=None))

    print("accuracy", np.mean(accuracy))
    print("precision", np.mean(precision, axis=0))
    print("recall", np.mean(recall, axis=0))
    print("f1", np.mean(f1, axis=0))

In [56]:
# Pairwise correlation between all columns of the dataset.
corr = df.corr()

# Strength (sign discarded) of each column's correlation with the target.
corr_target = corr["Credit_Score"].abs()
corr_target.describe()

count    31.000000
mean      0.244838
std       0.192501
min       0.007299
25%       0.144132
50%       0.164118
75%       0.369661
max       1.000000
Name: Credit_Score, dtype: float64

In [57]:
# Number of columns kept after ranking by |correlation with Credit_Score|.
# Credit_Score correlates perfectly with itself, so it always takes one of
# these slots and ends up last in the ranking.
N_TOP_COLUMNS = 16

# sort_values() returns a new Series, so corr_target keeps its original order
# and this cell can be re-run without side effects.
relevant_features = corr_target.sort_values().tail(N_TOP_COLUMNS)
relevant_features

Monthly_Balance           0.164118
Changed_Credit_Limit      0.171321
Monthly_Inhand_Salary     0.194045
Annual_Income             0.196990
Credit_Mix                0.281879
Payment_of_Min_Amount     0.287411
Num_of_Loan               0.352082
Num_of_Delayed_Payment    0.362101
Num_Bank_Accounts         0.377222
Outstanding_Debt          0.380709
Credit_History_Age        0.384106
Num_Credit_Card           0.396542
Delay_from_due_date       0.419952
Num_Credit_Inquiries      0.430619
Interest_Rate             0.479631
Credit_Score              1.000000
Name: Credit_Score, dtype: float64

In [58]:
# Absolute pairwise correlations among the shortlisted columns.
corr_features = df[relevant_features.index].corr().abs()

# Keep only the strict upper triangle so every unordered pair appears exactly
# once; the diagonal and lower triangle become NaN. Masking with where()
# avoids writing through `.values`, which is read-only under pandas
# Copy-on-Write, and `.abs()` replaces the deprecated `applymap`.
upper_triangle = np.triu(np.ones(corr_features.shape, dtype=bool), k=1)
corr_features = corr_features.where(upper_triangle)

In [59]:
# Summary statistics over the flattened pairwise |correlation| values; the NaN
# cells of the masked matrix are not counted by describe().
corr_features.unstack().describe()

count    120.000000
mean       0.370407
std        0.142214
min        0.153324
25%        0.265093
50%        0.372575
75%        0.465076
max        0.997799
dtype: float64

In [60]:
# Two features whose |correlation| exceeds this are treated as redundant.
CORR_THRESHOLD = 0.75

# The target must not take part in the redundancy check: otherwise a feature
# that predicts Credit_Score very well (or Credit_Score itself) could be
# flagged for removal.
candidate_features = [c for c in relevant_features.index if c != "Credit_Score"]

# Compute the correlation matrix once instead of one 2x2 matrix per pair.
pairwise_corr = df[candidate_features].corr().abs()

# Walk the features in their current order (weakest target correlation first)
# and flag a feature when it is highly correlated with any feature that has
# not been flagged itself, so the weaker member of a redundant pair is dropped.
corr_arr = []
for col in candidate_features:
    is_redundant = any(
        other != col
        and other not in corr_arr
        and pairwise_corr.loc[col, other] > CORR_THRESHOLD
        for other in candidate_features
    )
    if is_redundant:
        corr_arr.append(col)

In [61]:
# Remove the redundant columns from the shortlist. Reassigning (rather than
# dropping in place) and ignoring labels that are already gone lets this cell
# be re-run without raising KeyError.
relevant_features = relevant_features.drop(corr_arr, errors="ignore")

In [62]:
# Shortlist that remains after the entries in corr_arr have been dropped.
relevant_features

Monthly_Balance           0.164118
Changed_Credit_Limit      0.171321
Annual_Income             0.196990
Credit_Mix                0.281879
Payment_of_Min_Amount     0.287411
Num_of_Loan               0.352082
Num_of_Delayed_Payment    0.362101
Num_Bank_Accounts         0.377222
Outstanding_Debt          0.380709
Credit_History_Age        0.384106
Num_Credit_Card           0.396542
Delay_from_due_date       0.419952
Num_Credit_Inquiries      0.430619
Interest_Rate             0.479631
Credit_Score              1.000000
Name: Credit_Score, dtype: float64

In [63]:
# Columns that the pairwise-correlation filter flagged for removal.
corr_arr

['Monthly_Inhand_Salary']

In [64]:
# Display names for the Credit_Score classes, used to label the confusion
# matrix and classification report.
# NOTE(review): assumes the encoded class values sort ascending as
# Poor < Standard < Good - confirm against the dataset's encoding.
labels = ['Poor', 'Standard', 'Good']

In [65]:
# Model inputs: every shortlisted column except the target becomes a feature,
# and Credit_Score is the label. Both are converted to plain NumPy arrays.
feature_cols = relevant_features.index.drop('Credit_Score')
X = np.array(df[feature_cols])
y = np.array(df['Credit_Score'])

# Hold out 20% of the rows for testing, preserving class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=36,
)

print(X_train.shape, y_train.shape)

(74649, 14) (74649,)


In [66]:
# Baseline random forest with default hyper-parameters. The seed matches the
# one used for the train/test split so that re-running the notebook rebuilds
# the same forest and reproduces the same scores.
clf = RandomForestClassifier(random_state=36)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Confusion matrix: rows are true classes, columns are predicted classes, both
# in ascending order of the encoded class values found in y_test.
confusion = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred, labels=np.unique(y_test)),
    index=['True ' + x for x in labels],
    columns=['Predict ' + x for x in labels],
)
print(confusion)
print(metrics.classification_report(y_test, y_pred, target_names=labels))

               Predict Poor  Predict Standard  Predict Good
True Poor              4272               935            48
True Standard          1226              8101           753
True Good                14               871          2443
              precision    recall  f1-score   support

        Poor       0.78      0.81      0.79      5255
    Standard       0.82      0.80      0.81     10080
        Good       0.75      0.73      0.74      3328

    accuracy                           0.79     18663
   macro avg       0.78      0.78      0.78     18663
weighted avg       0.79      0.79      0.79     18663



In [72]:
# 5-fold stratified cross-validation of the random forest on the full X, y
# (all rows, not only the training split).
cross_val_metrics(clf, X, y, cv=5)

accuracy 0.8025442556014115
precision [0.79009059 0.82139378 0.76470499]
recall [0.81492632 0.81812768 0.73578573]
f1 [0.80231571 0.81974804 0.74989136]
