In [1]:
import pandas as pd
import pickle
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.feature_selection import SequentialFeatureSelector, RFE, SelectFromModel
import seaborn as sns

In [2]:
# Load the data set from the working directory and print its schema
# (column names, non-null counts and dtypes) as a quick sanity check.
df = pd.read_csv("./dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93312 entries, 0 to 93311
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       93312 non-null  float64
 1   Occupation                93312 non-null  int64  
 2   Annual_Income             93312 non-null  float64
 3   Monthly_Inhand_Salary     93312 non-null  float64
 4   Num_Bank_Accounts         93312 non-null  float64
 5   Num_Credit_Card           93312 non-null  float64
 6   Interest_Rate             93312 non-null  float64
 7   Num_of_Loan               93312 non-null  int64  
 8   Delay_from_due_date       93312 non-null  float64
 9   Num_of_Delayed_Payment    93312 non-null  float64
 10  Changed_Credit_Limit      93312 non-null  float64
 11  Num_Credit_Inquiries      93312 non-null  float64
 12  Credit_Mix                93312 non-null  int64  
 13  Outstanding_Debt          93312 non-null  float64
 14  Credit

In [3]:
# Display names for the Credit_Score classes, used to label the confusion matrix
# and the classification report further down.
# NOTE(review): assumed to follow the sorted order of the encoded class values — confirm.
labels = [
    "Poor",
    "Standard",
    "Good",
]

In [10]:
# Predictors are every column except the target; both are converted to NumPy arrays.
X = np.array(df.drop(columns=['Credit_Score']))
y = np.array(df['Credit_Score'])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=36,
)

print(X_train.shape, y_train.shape)

(74649, 30) (74649,)


In [5]:
# Recursive feature elimination with a random forest as the ranking estimator.
# The forest is seeded (same seed as the train/test split) so that the selected
# feature set, and every score computed from this classifier afterwards, can be
# reproduced on a fresh kernel.
clf = RandomForestClassifier(random_state=36)
# n_features_to_select is left at its default, which keeps half of the input columns.
selector = RFE(clf)
selector.fit(X_train, y_train)

In [7]:
# Boolean mask over the input columns: True marks a feature that RFE kept.
selector.support_

array([False, False,  True,  True, False,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True, False,  True, False, False, False, False, False, False,
       False, False, False])

In [11]:
# Reduce every feature matrix to the columns RFE kept. The assignments overwrite the
# 30-column arrays, so this cell is meant to run exactly once after the split cell;
# running it again would feed already-reduced arrays back into the selector.
X_train, X_test, X = (selector.transform(part) for part in (X_train, X_test, X))

# Names of the retained columns, in their original order.
selected = list(df.drop(columns='Credit_Score').columns[selector.support_])
selected

['Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Credit_Card',
 'Interest_Rate',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Credit_Mix',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Credit_History_Age',
 'Total_EMI_per_month',
 'Amount_invested_monthly',
 'Monthly_Balance']

In [12]:
# Fit the forest on the reduced training set and score it on the hold-out split.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Confusion matrix: rows are the true classes, columns the predicted ones.
# NOTE(review): the display names are paired with np.unique(y_test) by position,
# which assumes `labels` is listed in the sorted order of the class values — confirm.
print(
    pd.DataFrame(
        metrics.confusion_matrix(y_test, y_pred, labels=np.unique(y_test)),
        index=[f'True {name}' for name in labels],
        columns=[f'Predict {name}' for name in labels],
    )
)
print(metrics.classification_report(y_test, y_pred, target_names=labels))

               Predict Poor  Predict Standard  Predict Good
True Poor              4191              1012            52
True Standard          1198              8167           715
True Good                22               922          2384
              precision    recall  f1-score   support

        Poor       0.77      0.80      0.79      5255
    Standard       0.81      0.81      0.81     10080
        Good       0.76      0.72      0.74      3328

    accuracy                           0.79     18663
   macro avg       0.78      0.77      0.78     18663
weighted avg       0.79      0.79      0.79     18663



In [13]:
def cross_val_metrics(clf, X, y, cv=5, random_state=36):
    """Print stratified k-fold cross-validation scores for ``clf``.

    The rows are split with a shuffled ``StratifiedKFold``. On every fold ``clf``
    is fitted on the training part and evaluated on the held-out part. The mean
    accuracy and the per-class mean precision, recall and F1 are then printed.

    Parameters
    ----------
    clf : object with ``fit(X, y)`` and ``predict(X)``
        Fitted in place on every fold. After the call it holds the model from
        the last fold, not whatever it had been fitted on before.
    X : array-like
        Feature rows; converted with ``np.asarray`` so lists and DataFrames work.
    y : array-like
        One class label per row of ``X``.
    cv : int, default 5
        Number of folds.
    random_state : int, default 36
        Seed for the fold shuffling.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Fold indices are positional, which needs NumPy-style indexing; ndarrays
    # pass through unchanged.
    X = np.asarray(X)
    y = np.asarray(y)

    splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
    accuracy, precision, recall, f1 = [], [], [], []
    for train_idx, test_idx in splitter.split(X, y):
        clf.fit(X[train_idx], y[train_idx])
        y_true = y[test_idx]
        y_pred = clf.predict(X[test_idx])
        accuracy.append(metrics.accuracy_score(y_true, y_pred))
        # average=None keeps one score per class instead of a single aggregate.
        precision.append(metrics.precision_score(y_true, y_pred, average=None))
        recall.append(metrics.recall_score(y_true, y_pred, average=None))
        f1.append(metrics.f1_score(y_true, y_pred, average=None))

    print("accuracy", np.mean(accuracy))
    # axis=0 averages over the folds and leaves one value per class.
    print("precision", np.mean(precision, axis=0))
    print("recall", np.mean(recall, axis=0))
    print("f1", np.mean(f1, axis=0))

In [14]:
# Stratified cross-validation (default cv=5) over the full, RFE-reduced data set.
# NOTE(review): X includes the hold-out rows, and the feature selector was fitted
# once on X_train rather than inside each fold, so rows used to choose the features
# also serve as validation rows here — these scores may be somewhat optimistic.
cross_val_metrics(clf, X, y)

accuracy 0.8003258593599878
precision [0.78823336 0.81788258 0.76490404]
recall [0.81150091 0.82017147 0.72256295]
f1 [0.79969583 0.8190135  0.74305921]
