In [59]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [60]:
df = pd.read_csv("./CreditScoring.csv")
df.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [61]:
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [62]:
status_values = {
    1: "ok",
    2: "default",
    3: "unk"
}

df.status = df.status.map(status_values)
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,1,60,30,2,1,3,73,129,0,0,800,846
1,ok,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,default,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,ok,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,ok,0,1,36,26,1,1,1,46,107,0,0,310,910


In [63]:
home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'
}

df.home = df.home.map(home_values)

In [64]:
marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
}

df.marital = df.marital.map(marital_values)

In [65]:
records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'
}

df.records = df.records.map(records_values)

In [66]:
job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}

df.job = df.job.map(job_values)

In [67]:
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [68]:
df.describe().round().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
seniority,4455.0,8.0,8.0,0.0,2.0,5.0,12.0,48.0
time,4455.0,46.0,15.0,6.0,36.0,48.0,60.0,72.0
age,4455.0,37.0,11.0,18.0,28.0,36.0,45.0,68.0
expenses,4455.0,56.0,20.0,35.0,35.0,51.0,72.0,180.0
income,4455.0,763317.0,8703625.0,0.0,80.0,120.0,166.0,99999999.0
assets,4455.0,1060341.0,10217569.0,0.0,0.0,3500.0,6000.0,99999999.0
debt,4455.0,404382.0,6344253.0,0.0,0.0,0.0,0.0,99999999.0
amount,4455.0,1039.0,475.0,100.0,700.0,1000.0,1300.0,5000.0
price,4455.0,1463.0,628.0,105.0,1118.0,1400.0,1692.0,11140.0


In [69]:
for col in ["income", "assets", "debt"]:
    df[col] = df[col].replace(to_replace=99999999, value=np.nan)

In [70]:
df.isnull().sum()

status        1
seniority     0
home          0
time          0
age           0
marital       0
records       0
job           0
expenses      0
income       34
assets       47
debt         18
amount        0
price         0
dtype: int64

In [71]:
df.describe().round().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
seniority,4455.0,8.0,8.0,0.0,2.0,5.0,12.0,48.0
time,4455.0,46.0,15.0,6.0,36.0,48.0,60.0,72.0
age,4455.0,37.0,11.0,18.0,28.0,36.0,45.0,68.0
expenses,4455.0,56.0,20.0,35.0,35.0,51.0,72.0,180.0
income,4421.0,131.0,86.0,0.0,80.0,120.0,165.0,959.0
assets,4408.0,5403.0,11573.0,0.0,0.0,3000.0,6000.0,300000.0
debt,4437.0,343.0,1246.0,0.0,0.0,0.0,0.0,30000.0
amount,4455.0,1039.0,475.0,100.0,700.0,1000.0,1300.0,5000.0
price,4455.0,1463.0,628.0,105.0,1118.0,1400.0,1692.0,11140.0


In [72]:
df.status.value_counts()

ok         3200
default    1254
Name: status, dtype: int64

In [73]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=143)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=145)

In [74]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [75]:
y_train = (df_train.status=="default").astype("int").values
y_val = (df_val.status=="default").astype("int").values
y_test = (df_test.status=="default").astype("int").values

In [76]:
del df_train["status"]
del df_val["status"]
del df_test["status"]

In [77]:
df_train.head()

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,12,owner,24,36,married,no,fixed,60,243.0,6000.0,0.0,600,1376
1,1,rent,48,46,married,no,fixed,103,125.0,1500.0,0.0,1100,1408
2,19,owner,48,52,married,no,fixed,60,68.0,6000.0,0.0,850,1393
3,12,owner,60,28,single,yes,fixed,35,172.0,3000.0,2000.0,950,1184
4,5,rent,60,31,married,no,fixed,86,109.0,0.0,0.0,830,1700


In [78]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

In [79]:
train_dicts = df_train.fillna(0).to_dict(orient="records")

In [80]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [81]:
dv.get_feature_names_out()

array(['age', 'amount', 'assets', 'debt', 'expenses', 'home=ignore',
       'home=other', 'home=owner', 'home=parents', 'home=private',
       'home=rent', 'home=unk', 'income', 'job=fixed', 'job=freelance',
       'job=others', 'job=partime', 'marital=divorced', 'marital=married',
       'marital=separated', 'marital=single', 'marital=widow', 'price',
       'records=no', 'records=yes', 'seniority', 'time'], dtype=object)

In [82]:
val_dicts = df_val.fillna(0).to_dict(orient="records")
X_val = dv.transform(val_dicts)

### Gradient boosting and XGBoost

In [83]:
import xgboost as xgb

In [84]:
features = dv.get_feature_names_out()
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [85]:
# adding watchlist

watchlist = [(dtrain, "train"), (dval, "val")]

In [86]:
def parse_xgb_op(output):

    results = []
    for line in output.stdout.strip().split("\n"):
        it_line, train_line, val_line = line.split("\t")

        it = int(it_line.strip("[]"))
        train = float(train_line.split(":")[1])
        val = float(val_line.split(":")[1])

        results.append((it, train, val))
        columns = ['num_iter', "train_auc", "val_auc"]

    df_results = pd.DataFrame(results, columns=columns)
    return df_results

In [87]:
df_full_train = df_full_train.reset_index(drop=True)

In [88]:
y_full_train = (df_full_train.status=="default").astype("int").values
y_full_train

array([0, 1, 0, ..., 1, 0, 0])

In [89]:
del df_full_train["status"]

In [90]:
dict_full_train = df_full_train.to_dict(orient="records")

dv = DictVectorizer(sparse=False)

X_full_train = dv.fit_transform(dict_full_train)


dict_test = df_test.to_dict(orient="records")
X_test = dv.transform(dict_test)

In [91]:
dfulltrain = xgb.DMatrix(X_full_train, label=y_full_train,
                    feature_names=dv.get_feature_names_out())

dtest = xgb.DMatrix(X_test,
                    feature_names=dv.get_feature_names_out())

In [92]:
xgb_params = {
    'eta': 0.1, 
    'max_depth': 3,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dfulltrain, num_boost_round=175)

In [93]:
y_pred = model.predict(dtest)
y_pred[:10]

array([0.05018909, 0.86128134, 0.16283342, 0.15887548, 0.04413635,
       0.02989012, 0.15406555, 0.01076851, 0.08983512, 0.08721088],
      dtype=float32)

In [94]:
roc_auc_score(y_test, y_pred)

0.8431434208903688

## Building prediction Service with BentoML

In [99]:
import bentoml

bentoml.xgboost.save_model("credit_risk_xgb_model", model,
                            custom_objects={
                                "dictVectorizer":dv
                            })

Model(tag="credit_risk_xgb_model:r7inheuec2qn3paj", path="C:\Users\soumy\bentoml\models\credit_risk_xgb_model\r7inheuec2qn3paj\")