In [1]:
import pandas as pd
import pickle
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.feature_selection import SequentialFeatureSelector, RFE, SelectFromModel
import seaborn as sns
import phik

In [2]:
# Load the dataset from the working directory and print a summary of its
# columns, non-null counts and dtypes.
df = pd.read_csv("./dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93312 entries, 0 to 93311
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       93312 non-null  float64
 1   Occupation                93312 non-null  int64  
 2   Annual_Income             93312 non-null  float64
 3   Monthly_Inhand_Salary     93312 non-null  float64
 4   Num_Bank_Accounts         93312 non-null  float64
 5   Num_Credit_Card           93312 non-null  float64
 6   Interest_Rate             93312 non-null  float64
 7   Num_of_Loan               93312 non-null  int64  
 8   Delay_from_due_date       93312 non-null  float64
 9   Num_of_Delayed_Payment    93312 non-null  float64
 10  Changed_Credit_Limit      93312 non-null  float64
 11  Num_Credit_Inquiries      93312 non-null  float64
 12  Credit_Mix                93312 non-null  int64  
 13  Outstanding_Debt          93312 non-null  float64
 14  Credit

In [3]:
def cross_val_metrics(clf, X, y, cv=5):
    """Run stratified k-fold cross-validation and print the averaged metrics.

    For every fold a fresh, unfitted copy of ``clf`` is trained on the training
    part and scored on the held-out part. Overall accuracy and per-class
    precision, recall and F1 are collected and their means across folds are
    printed.

    Parameters
    ----------
    clf : estimator
        Classifier exposing ``fit`` / ``predict`` and cloneable with
        ``sklearn.base.clone``. It is used only as a template and is NOT
        fitted or otherwise modified by this function.
    X : array-like
        Feature matrix; converted with ``np.asarray`` so that NumPy arrays and
        pandas DataFrames are both indexed by row position.
    y : array-like
        Target labels; converted with ``np.asarray``.
    cv : int, default 5
        Number of stratified folds.

    Returns
    -------
    None
        Results are printed: one scalar for accuracy and one array (one entry
        per class, in sorted label order) for precision, recall and F1.
    """
    # Function-scope import so this cell does not depend on the import cell.
    from sklearn.base import clone

    # Integer-array indexing on a DataFrame/Series selects by label, not by row
    # position, so work on plain arrays.
    X = np.asarray(X)
    y = np.asarray(y)

    # Pin the label set so every fold yields per-class vectors of identical
    # length and order, which np.mean(axis=0) relies on.
    classes = np.unique(y)

    accuracy = []
    f1 = []
    precision = []
    recall = []
    kf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=36)
    for train_idx, test_idx in kf.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        # Fit a clone so the caller's estimator is left exactly as passed in.
        fold_clf = clone(clf)
        fold_clf.fit(X_train, y_train)
        y_pred = fold_clf.predict(X_test)
        accuracy.append(metrics.accuracy_score(y_test, y_pred))
        precision.append(metrics.precision_score(y_test, y_pred, labels=classes, average=None))
        recall.append(metrics.recall_score(y_test, y_pred, labels=classes, average=None))
        f1.append(metrics.f1_score(y_test, y_pred, labels=classes, average=None))
    print("accuracy", np.mean(accuracy))
    print("precision", np.mean(precision, axis=0))
    print("recall", np.mean(recall, axis=0))
    print("f1", np.mean(f1, axis=0))

In [4]:
# Display names for the three credit-score classes, used for confusion-matrix
# row/column headers and classification-report target names.
# NOTE(review): the order is assumed to match the sorted encoded values of
# Credit_Score -- confirm against the encoding step.
labels = [
    "Poor",
    "Standard",
    "Good",
]

In [5]:
# Columns fed to the model, in the exact order the feature matrix is built.
relevant_features = [
    "Debt Consolidation Loan",
    "Changed_Credit_Limit",
    "Annual_Income",
    "Num_of_Loan",
    "Credit_History_Age",
    "Num_of_Delayed_Payment",
    "Num_Bank_Accounts",
    "Delay_from_due_date",
    "Num_Credit_Inquiries",
    "Num_Credit_Card",
    "Outstanding_Debt",
    "Interest_Rate",
    "Credit_Mix",
]

In [6]:
# Feature matrix and target as plain NumPy arrays (copies of the frame data).
X = np.array(df[relevant_features])
y = np.array(df['Credit_Score'])

# Seeded 80/20 hold-out split, stratified on the target so both parts keep the
# same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=36,
)

print(X_train.shape, y_train.shape)

(74649, 13) (74649,)


In [7]:
# Instantiate a tree with no arguments and list its hyperparameters, i.e. the
# constructor defaults.
clf = DecisionTreeClassifier()
clf.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [8]:
# Exhaustive 4-fold grid search over the tree's split criterion, depth limit
# and leaf/split size constraints, fitted on the training split only.
# NOTE(review): scikit-learn documents 'log_loss' and 'entropy' as the same
# Shannon information-gain criterion, so one of the two looks redundant and
# costs a third of the fits -- confirm before trimming the grid.
clf = DecisionTreeClassifier(random_state=36)
params = dict(
    criterion=['gini', 'entropy', 'log_loss'],
    max_depth=[None, *range(10, 51, 5)],
    min_samples_leaf=[*range(1, 5), *range(5, 51, 5)],
    min_samples_split=[*range(2, 50, 5)],
)
grid = GridSearchCV(estimator=clf, param_grid=params, cv=4, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)
print(grid.best_params_)

Fitting 4 folds for each of 4200 candidates, totalling 16800 fits
{'criterion': 'entropy', 'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [13]:
# Rebuild the tuned tree and evaluate it on the held-out test split.
# grid.best_params_ holds only the searched hyperparameters, so the seed used
# during the search has to be passed again; without it the rebuilt tree is
# unseeded and its metrics can vary from run to run.
clf = DecisionTreeClassifier(random_state=36, **grid.best_params_)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Rows are true classes, columns are predicted classes, in sorted label order.
print(pd.DataFrame(metrics.confusion_matrix(y_test, y_pred, labels=np.unique(y_test)),
             index=['True ' + x for x in labels],
             columns=['Predict ' + x for x in labels]))
print(metrics.classification_report(y_test, y_pred, target_names=labels))

               Predict Poor  Predict Standard  Predict Good
True Poor              4175              1032            48
True Standard          1369              7798           913
True Good                77               885          2366
              precision    recall  f1-score   support

        Poor       0.74      0.79      0.77      5255
    Standard       0.80      0.77      0.79     10080
        Good       0.71      0.71      0.71      3328

    accuracy                           0.77     18663
   macro avg       0.75      0.76      0.76     18663
weighted avg       0.77      0.77      0.77     18663



In [10]:
# Depth of the tree currently bound to `clf`.
clf.get_depth()

25

In [11]:
# Stratified k-fold evaluation over the full dataset (X, y); prints mean
# accuracy and per-class precision / recall / F1.
cross_val_metrics(clf, X, y)

accuracy 0.7746485229707546
precision [0.75500243 0.80396624 0.72018604]
recall [0.78413744 0.78761079 0.72039857]
f1 [0.76925072 0.79568352 0.72022403]


In [15]:
# Score the tuned tree on its own training split; comparing these numbers with
# the test-split report shows how much the tree overfits.
# The seed is passed explicitly because grid.best_params_ does not include it,
# so without it the rebuilt tree would be unseeded.
clf = DecisionTreeClassifier(random_state=36, **grid.best_params_)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)
# Rows are true classes, columns are predicted classes, in sorted label order.
print(pd.DataFrame(metrics.confusion_matrix(y_train, y_pred, labels=np.unique(y_train)),
             index=['True ' + x for x in labels],
             columns=['Predict ' + x for x in labels]))
print(metrics.classification_report(y_train, y_pred, target_names=labels))

               Predict Poor  Predict Standard  Predict Good
True Poor             20680               340             1
True Standard          1547             38451           320
True Good                 9               433         12868
              precision    recall  f1-score   support

        Poor       0.93      0.98      0.96     21021
    Standard       0.98      0.95      0.97     40318
        Good       0.98      0.97      0.97     13310

    accuracy                           0.96     74649
   macro avg       0.96      0.97      0.96     74649
weighted avg       0.97      0.96      0.96     74649



In [12]:
# Final model: the tuned tree refitted on the complete dataset (train + test).
# The seed is passed explicitly because grid.best_params_ does not include it;
# this makes the persisted model reproducible from the notebook.
clf = DecisionTreeClassifier(random_state=36, **grid.best_params_)
clf = clf.fit(X, y)

# Attach the feature column order and class display names to the estimator so
# they are pickled together with it.
clf.features = relevant_features
clf.labels = labels

with open('model.pkl', 'wb') as file:
    pickle.dump(clf, file)