In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
# !pip install feature-engine
from feature_engine.outliers import Winsorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline as imbpipe
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import pickle
import json

# !pip install shap==0.44.0
import shap

import warnings
# NOTE(review): no category is given, so this hides every warning for the rest
# of the session -- including deprecation notices from the libraries above.
warnings.filterwarnings(action="ignore")

In [2]:
# Read the credit-risk dataset directly from its GitHub raw URL.
DATA_URL = r"https://raw.githubusercontent.com/rafifaditio/Data/main/credit_risk_dataset.csv"
df = pd.read_csv(DATA_URL)

# Preview the first rows to confirm the load worked.
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


# **Cleaning**

In [3]:
# Shorter column names; note that "loan_status" becomes the target "loan_default".
COLUMN_RENAMES = {
    "person_age": "age",
    "person_income": "income",
    "person_home_ownership": "home_ownership",
    "loan_status": "loan_default",
    "cb_person_default_on_file": "hist_default",
    "cb_person_cred_hist_length": "cred_hist_length",
}

# Rename first, then discard fully duplicated rows.
df = df.rename(columns=COLUMN_RENAMES).drop_duplicates()

In [4]:
# Separate the target column from the predictors.
y = df["loan_default"]
X = df.drop(columns="loan_default")

# 70/30 split, stratified on the target so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=0
)

# Sanity-check the shapes of the four pieces.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(22691, 11)
(9725, 11)
(22691,)
(9725,)


# **Preprocess - Pipeline**

In [5]:
# Numeric features: fill missing values with the median, then standardise.
num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")),
                     ("scaler", StandardScaler())])

# Ordered categoricals: each list fixes the integer code assigned to a category
# (position in the list), first for home_ownership, then for loan_grade.
ord_pipe = Pipeline([("ord_enc", OrdinalEncoder(categories=[["OTHER", "RENT", "OWN", "MORTGAGE"],
                                 ["A", "B", "C", "D", "E", "F", "G"]]))])

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old name
# was removed in 1.4, so use the new keyword to keep this cell runnable.
# NOTE(review): ohe_pipe is not wired into full_prep in the visible cells.
ohe_pipe = Pipeline([("ohe_enc", OneHotEncoder(sparse_output=False))])

# Output column order is: 3 numeric columns (positions 0-2) followed by the
# 2 ordinal-encoded columns (positions 3-4).
full_prep = ColumnTransformer([("num", num_pipe, ["loan_percent_income", "income", "loan_int_rate"]),
                               ("ord", ord_pipe, ["home_ownership", "loan_grade"])])

# Plain pipeline without resampling.
# NOTE(review): full_pipe is not used by any visible cell.
full_pipe = Pipeline([("prep", full_prep),
                      ("model", RandomForestClassifier(random_state=0))])

# Imbalance-aware pipeline: preprocess, oversample the minority class, then fit
# the forest. [3, 4] are the positions of the categorical (ordinal-encoded)
# columns in full_prep's output; sampling_strategy=0.9 targets a
# minority/majority ratio of 0.9 after resampling.
full_imbhandling = imbpipe([
            ("preprocess", full_prep),
            ("over_sampl", SMOTENC([3, 4], sampling_strategy=0.9, random_state=0)),
            ("model", RandomForestClassifier(random_state=0))
])

In [6]:
# Display the resampling pipeline so its steps can be inspected.
full_imbhandling

# **Hyperparameter Tuning**

In [7]:
# with imbalance handling
# Search grid for the forest inside the pipeline ("model__" addresses the
# "model" step): 1 x 2 x 2 = 4 candidate settings.
params = [{
    "model__n_estimators": [70],
    "model__max_features": [2, 5],
    "model__max_depth": [10, None],
}]

# NOTE(review): rfc is not referenced by any visible cell.
rfc = RandomForestClassifier(random_state=0)

# 5-fold cross-validated search scored on F1; train scores are kept as well.
grid_search = GridSearchCV(
    estimator=full_imbhandling,
    param_grid=params,
    scoring="f1",
    cv=5,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

# **Final Pipeline - Preprocess & Model**

In [8]:
# Preview the training features that were fed to the pipeline.
X_train.head()

Unnamed: 0,age,income,home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,hist_default,cred_hist_length
8224,22,56004,MORTGAGE,6.0,MEDICAL,B,6000,11.86,0.11,N,3
10004,25,82000,RENT,2.0,MEDICAL,A,9200,6.03,0.11,N,3
21434,29,48000,MORTGAGE,8.0,VENTURE,A,4500,5.42,0.09,N,6
14135,25,100000,MORTGAGE,9.0,EDUCATION,B,8000,10.25,0.08,N,3
13979,25,60000,RENT,5.0,HOMEIMPROVEMENT,B,14000,,0.23,N,2


In [9]:
# Show a single held-out row for comparison with the training preview.
X_test.head(1)

Unnamed: 0,age,income,home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,hist_default,cred_hist_length
13777,26,64900,RENT,4.0,MEDICAL,B,13625,,0.21,N,3


In [10]:
# with imbalance handling
# GridSearchCV was left at its default refit behaviour, so best_estimator_ is
# already fitted on the full training set -- no extra fit call is needed.
model = grid_search.best_estimator_

# Predict on both partitions.
model_train, model_test = (model.predict(features) for features in (X_train, X_test))

# Train report first, then test report, to compare for over-fitting.
for y_true, y_pred in ((y_train, model_train), (y_test, model_test)):
    print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94     17729
           1       0.78      0.77      0.78      4962

    accuracy                           0.90     22691
   macro avg       0.86      0.85      0.86     22691
weighted avg       0.90      0.90      0.90     22691

              precision    recall  f1-score   support

           0       0.93      0.93      0.93      7598
           1       0.76      0.74      0.75      2127

    accuracy                           0.89      9725
   macro avg       0.84      0.84      0.84      9725
weighted avg       0.89      0.89      0.89      9725



In [11]:
# Reference predictions for the first five test rows (compared again after the pickle round-trip).
model.predict(X_test.head(5))

array([0, 0, 0, 0, 1], dtype=int64)

# **Save Pipeline**

In [12]:
# Persist the tuned pipeline; the context manager guarantees the file is
# flushed and closed before it is read back.
with open("backend/model_final.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# Reload it to confirm the artefact round-trips.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("backend/model_final.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Display the bound method to confirm the reloaded object is the pipeline.
model.predict

<bound method Pipeline.predict of Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['loan_percent_income',
                                                   'income', 'loan_int_rate']),
                                                 ('ord',
                                                  Pipeline(steps=[('ord_enc',
                                                                   OrdinalEncoder(categories=[['OTHER',
                                                                                               'RENT',
                                     

In [13]:
# Same five rows as before saving; the reloaded pipeline should give the same predictions.
model.predict(X_test.head(5))

array([0, 0, 0, 0, 1], dtype=int64)

# **Serving Scenario**

In [14]:
# Reload the raw dataset (original column names) to imitate an incoming request.
df = pd.read_csv(r"https://raw.githubusercontent.com/rafifaditio/Data/main/credit_risk_dataset.csv")

# Use the row labelled 0 as a plain dict, the way a request payload would arrive.
first_row = df.loc[0]
real_data = first_row.to_dict()
real_data

{'person_age': 22,
 'person_income': 59000,
 'person_home_ownership': 'RENT',
 'person_emp_length': 123.0,
 'loan_intent': 'PERSONAL',
 'loan_grade': 'D',
 'loan_amnt': 35000,
 'loan_int_rate': 16.02,
 'loan_status': 1,
 'loan_percent_income': 0.59,
 'cb_person_default_on_file': 'Y',
 'cb_person_cred_hist_length': 3}

In [15]:
# Raw dataset key -> name the trained pipeline expects.
replacement = {
    "person_age": "age",
    "person_income": "income",
    "person_home_ownership": "home_ownership",
    "loan_status": "loan_default",
    "cb_person_default_on_file": "hist_default",
    "cb_person_cred_hist_length": "cred_hist_length",
}

# Rebuild the payload with translated keys; keys without a mapping are kept
# unchanged and the original key order is preserved.
real_data = {replacement.get(key, key): value for key, value in real_data.items()}

print(real_data)

{'age': 22, 'income': 59000, 'home_ownership': 'RENT', 'person_emp_length': 123.0, 'loan_intent': 'PERSONAL', 'loan_grade': 'D', 'loan_amnt': 35000, 'loan_int_rate': 16.02, 'loan_default': 1, 'loan_percent_income': 0.59, 'hist_default': 'Y', 'cred_hist_length': 3}


In [16]:
# Wrap the single record in a one-row DataFrame and take its only prediction.
# The payload still carries the 'loan_default' key at this point.
model.predict(pd.DataFrame(real_data,index=[0]))[0]

1

In [17]:
# test 2: inspect the first held-out row, used as the template for the next payload
X_test.iloc[0,:]

age                         26
income                   64900
home_ownership            RENT
person_emp_length          4.0
loan_intent            MEDICAL
loan_grade                   B
loan_amnt                13625
loan_int_rate              NaN
loan_percent_income       0.21
hist_default                 N
cred_hist_length             3
Name: 13777, dtype: object

In [18]:
# Print the distinct values of each categorical column in the raw dataset.
categorical_columns = (
    "person_home_ownership",
    "loan_intent",
    "loan_grade",
    "cb_person_default_on_file",
)
for column in categorical_columns:
    print(df[column].unique())

['RENT' 'OWN' 'MORTGAGE' 'OTHER']
['PERSONAL' 'EDUCATION' 'MEDICAL' 'VENTURE' 'HOMEIMPROVEMENT'
 'DEBTCONSOLIDATION']
['D' 'B' 'C' 'A' 'E' 'F' 'G']
['Y' 'N']


In [19]:
# Hand-written payload mirroring the first row of X_test, using the raw
# dataset's column names (no target key this time).
real_data = {
    "person_age": 26,
    "person_income": 64900,
    "person_home_ownership": "RENT",
    "person_emp_length": 4.0,
    "loan_intent": "MEDICAL",
    "loan_grade": "B",
    "loan_amnt": 13625,
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    "loan_int_rate": np.nan,
    "loan_percent_income": 0.21,
    "cb_person_default_on_file": "N",
    "cb_person_cred_hist_length": 3,
}

# Raw dataset key -> name the trained pipeline expects.
replacement = {"person_age":"age", "person_income": "income",
                "person_home_ownership":"home_ownership",
                "loan_status":"loan_default",
                "cb_person_default_on_file":"hist_default",
                "cb_person_cred_hist_length":"cred_hist_length"}

# Translate the keys in one pass; unmapped keys are kept and order is preserved.
real_data = {replacement.get(key, key): value for key, value in real_data.items()}

print(real_data)
# One-row DataFrame in, single prediction out.
print(model.predict(pd.DataFrame(real_data, index=[0]))[0])

{'age': 26, 'income': 64900, 'home_ownership': 'RENT', 'person_emp_length': 4.0, 'loan_intent': 'MEDICAL', 'loan_grade': 'B', 'loan_amnt': 13625, 'loan_int_rate': nan, 'loan_percent_income': 0.21, 'hist_default': 'N', 'cred_hist_length': 3}
0


In [20]:
# Record the versions of the packages used for training and serving.
import sklearn
import imblearn
import feature_engine
import uvicorn
import fastapi
import pandas
import streamlit
import numpy
import altair

# Printed in this fixed order, one version per line.
version_modules = (
    sklearn, imblearn, feature_engine, uvicorn, fastapi,
    pandas, streamlit, numpy, altair,
)
for module in version_modules:
    print(module.__version__)

1.3.2
0.12.0
1.2.0
0.30.1
0.111.0
1.5.0
1.10.0
1.22.4
4.1.0
