In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
data = 'CreditScoring.csv'

df = pd.read_csv(data)
df.columns = df.columns.str.lower()

### Preprocessing

In [3]:
status_values = {
    1: 'ok',
    2: 'default',
    0: 'unk'
}

df.status = df.status.map(status_values)

In [4]:
home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'
}

df.home = df.home.map(home_values)

marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
}

df.marital = df.marital.map(marital_values)

records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'
}

df.records = df.records.map(records_values)

job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}

df.job = df.job.map(job_values)

In [5]:
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [6]:
for c in ['income', 'assets', 'debt']:
    df[c] = df[c].replace(to_replace=99999999, value=np.nan)
    
df = df[df.status != 'unk'].reset_index(drop=True)

In [106]:
df_train.iloc[[0]]

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,10,owner,36,36,married,no,freelance,75,0.0,10000.0,0.0,1000,1400


In [38]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=11)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=11)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = (df_train.status == 'default').astype('int').values
y_val = (df_val.status == 'default').astype('int').values
y_test = (df_test.status == 'default').astype('int').values

del df_train['status']
del df_val['status']
del df_test['status']

### Seeing how the distribution of 'status' laid out on set threshold

In [14]:
T = 120.0
df_left = df[df.income <= T]
df_right = df[df.income > T]

display(df_left[['income','status']])
print(df_left.status.value_counts(normalize=True))
display(df_right[['income','status']])
print(df_left.status.value_counts(normalize=True))

Unnamed: 0,income,status
4,107.0,ok
7,80.0,ok
8,107.0,ok
9,80.0,default
14,50.0,default
...,...,...
4445,69.0,ok
4448,77.0,default
4449,92.0,default
4450,75.0,ok


status
ok         0.633131
default    0.366869
Name: proportion, dtype: float64


Unnamed: 0,income,status
0,129.0,ok
1,131.0,ok
2,200.0,default
3,182.0,ok
5,214.0,ok
...,...,...
4443,242.0,ok
4446,190.0,ok
4447,160.0,ok
4452,140.0,ok


status
ok         0.633131
default    0.366869
Name: proportion, dtype: float64


In [16]:
T = 500.0
df_left = df[df.assets <= T]
df_right = df[df.assets > T]

display(df_left[['assets','status']])
print(df_left.status.value_counts(normalize=True))
display(df_right[['assets','status']])
print(df_left.status.value_counts(normalize=True))

Unnamed: 0,assets,status
0,0.0,ok
1,0.0,ok
4,0.0,ok
7,0.0,ok
9,0.0,default
...,...,...
4435,0.0,default
4444,0.0,default
4448,0.0,default
4449,0.0,default


status
ok         0.599633
default    0.400367
Name: proportion, dtype: float64


Unnamed: 0,assets,status
2,3000.0,default
3,2500.0,ok
5,3500.0,ok
6,10000.0,ok
8,15000.0,ok
...,...,...
4446,3500.0,ok
4447,3000.0,ok
4450,3000.0,ok
4451,3500.0,default


status
ok         0.599633
default    0.400367
Name: proportion, dtype: float64


### Building dataset

In [39]:
from sklearn.feature_extraction import DictVectorizer

# Filling missing values with 0
dict_train = df_train.fillna(0).to_dict(orient='records')
dict_val = df_val.fillna(0).to_dict(orient='records')

In [47]:
dict_train[10]

{'seniority': 1,
 'home': 'owner',
 'time': 48,
 'age': 39,
 'marital': 'married',
 'records': 'yes',
 'job': 'freelance',
 'expenses': 60,
 'income': 0.0,
 'assets': 6000.0,
 'debt': 0.0,
 'amount': 1300,
 'price': 3626}

In [40]:
dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)

### Bringing Forth MLFLow and SHAP

In [18]:
import shap
import mlflow
from mlflow.models import infer_signature
import xgboost

In [60]:
list(dv.get_feature_names_out())

['age',
 'amount',
 'assets',
 'debt',
 'expenses',
 'home=ignore',
 'home=other',
 'home=owner',
 'home=parents',
 'home=private',
 'home=rent',
 'home=unk',
 'income',
 'job=fixed',
 'job=freelance',
 'job=others',
 'job=partime',
 'job=unk',
 'marital=divorced',
 'marital=married',
 'marital=separated',
 'marital=single',
 'marital=unk',
 'marital=widow',
 'price',
 'records=no',
 'records=yes',
 'seniority',
 'time']

In [61]:
model_train = pd.DataFrame(X_train, columns=list(dv.get_feature_names_out()))
model_train

Unnamed: 0,age,amount,assets,debt,expenses,home=ignore,home=other,home=owner,home=parents,home=private,...,marital=married,marital=separated,marital=single,marital=unk,marital=widow,price,records=no,records=yes,seniority,time
0,36.0,1000.0,10000.0,0.0,75.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1400.0,1.0,0.0,10.0,36.0
1,32.0,1100.0,0.0,0.0,35.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1330.0,0.0,1.0,6.0,48.0
2,40.0,1320.0,0.0,0.0,75.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1600.0,1.0,0.0,1.0,48.0
3,23.0,1078.0,0.0,0.0,35.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1079.0,1.0,0.0,1.0,48.0
4,46.0,1100.0,4000.0,0.0,60.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1897.0,1.0,0.0,5.0,36.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2667,45.0,800.0,20000.0,0.0,45.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1600.0,1.0,0.0,18.0,36.0
2668,29.0,1000.0,3500.0,500.0,60.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1290.0,1.0,0.0,7.0,60.0
2669,19.0,400.0,0.0,0.0,35.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,600.0,1.0,0.0,1.0,24.0
2670,43.0,2500.0,18000.0,0.0,60.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2976.0,1.0,0.0,15.0,48.0


In [84]:
model_val = pd.DataFrame(X_val, columns=list(dv.get_feature_names_out()))
model_val

Unnamed: 0,age,amount,assets,debt,expenses,home=ignore,home=other,home=owner,home=parents,home=private,...,marital=married,marital=separated,marital=single,marital=unk,marital=widow,price,records=no,records=yes,seniority,time
0,31.0,550.0,0.0,0.0,49.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,999.0,1.0,0.0,6.0,36.0
1,38.0,1000.0,0.0,0.0,56.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1307.0,1.0,0.0,18.0,60.0
2,40.0,700.0,0.0,0.0,58.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,982.0,0.0,1.0,17.0,24.0
3,29.0,1200.0,6000.0,3000.0,75.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1521.0,1.0,0.0,2.0,48.0
4,61.0,750.0,0.0,0.0,57.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,830.0,1.0,0.0,25.0,42.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,21.0,350.0,1500.0,0.0,35.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,350.0,1.0,0.0,0.0,12.0
887,45.0,1150.0,5000.0,0.0,75.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1452.0,1.0,0.0,12.0,60.0
888,36.0,3900.0,29000.0,0.0,60.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,4812.0,0.0,1.0,2.0,60.0
889,25.0,300.0,0.0,0.0,35.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1397.0,0.0,1.0,3.0,24.0


In [73]:
# Set the MLflow tracking URI to 'http' or 'https'
mlflow.set_tracking_uri('http://127.0.0.1:5000')  # Update the URI as needed

# Fit an XGBoost binary classifier on the training data split
model = xgboost.XGBClassifier().fit(model_train, y_train)

# Create a model signature
signature = infer_signature(model_val, model.predict(model_val))

# Build the Evaluation Dataset from the test set
eval_data = model_val.copy()
eval_data["label"] = y_val

with mlflow.start_run() as run:
    # Log the baseline model to MLflow
    mlflow.sklearn.log_model(model, "model", signature=signature)
    model_uri = mlflow.get_artifact_uri("model")

    # Evaluate the logged model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/11/12 20:47:21 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/12 20:47:21 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/12 20:47:24 INFO mlflow.models.evaluation.default_evaluator: Shap explainer TreeExplainer is used.
Unable to serialize underlying model using MLflow, will use SHAP serialization


### Optuna

In [75]:
import optuna
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

In [76]:
model

In [None]:
{'reg_alpha': 9.656363185071173e-06,
 'reg_lambda': 2.9644565348722744e-06,
 'num_leaves': 209,
 'learning_rate': 0.0504229854696161,
 'max_depth': 10,
 'colsample_bytree': 0.09612300634389437,
 'subsample': 0.4037860779670473,
 'subsample_freq': 6,
 'n_estimators': 300,
 'random_state': 23,
 'min_child_weight': 13}

In [98]:
dv.get_feature_names_out()

array(['age', 'amount', 'assets', 'debt', 'expenses', 'home=ignore',
       'home=other', 'home=owner', 'home=parents', 'home=private',
       'home=rent', 'home=unk', 'income', 'job=fixed', 'job=freelance',
       'job=others', 'job=partime', 'job=unk', 'marital=divorced',
       'marital=married', 'marital=separated', 'marital=single',
       'marital=unk', 'marital=widow', 'price', 'records=no',
       'records=yes', 'seniority', 'time'], dtype=object)

In [100]:
xgboost.DMatrix(X_train, label=y_train, feature_names=dv.get_feature_names_out())

<xgboost.core.DMatrix at 0x1eebfbb0ca0>

In [None]:
features = dv.get_feature_names_out()
dtrain = xgboost.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgboost.DMatrix(X_val, label=y_val, feature_names=features)

In [85]:
def objective(trial):
    params = {
        'random_state': 23,
        'n_estimators': 200,
        'reg_alpha': trial.suggest_float('reg_alpha', 1E-10, 1E-5),
        'reg_lambda': trial.suggest_float('reg_lambda', 1E-10, 1E-5),
        'num_leaves': trial.suggest_int('num_leaves', 150, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 10,50),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'min_child_weight': trial.suggest_int('min_child_samples', 1, 20),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 10)
    }
    
    # Create the pipeline with SMOTE and LGBMClassifier
    pipeline = Pipeline([
        ('classifier', xgboost.XGBClassifier(**params))
    ])
    
    pipeline.fit(model_train, y_train)
    
    # Make predictions on the validation set
    y_pred = pipeline.predict(model_val)

    # Calculate MAE as the evaluation metric
    roc_auc = roc_auc_score(y_val, y_pred)

    # Return the evaluation metric value as the objective value to be minimized
    return roc_auc

In [86]:
%%time

# Create an Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[I 2023-11-12 20:59:43,579] A new study created in memory with name: no-name-1481a088-0172-4890-b685-5da22266de72


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:44,339] Trial 0 finished with value: 0.5900529171005787 and parameters: {'reg_alpha': 8.531261917780023e-06, 'reg_lambda': 6.1133351792453475e-06, 'num_leaves': 297, 'learning_rate': 0.033460767283431954, 'max_depth': 28, 'colsample_bytree': 0.08096000939477681, 'subsample': 0.31760827174362793, 'min_child_samples': 16, 'subsample_freq': 5}. Best is trial 0 with value: 0.5900529171005787.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:44,872] Trial 1 finished with value: 0.594829987164273 and parameters: {'reg_alpha': 2.728848559118151e-06, 'reg_lambda': 9.807121972578338e-06, 'num_leaves': 267, 'learning_rate': 0.061261882818594994, 'max_depth': 19, 'colsample_bytree': 0.017548266542040118, 'subsample': 0.49192856439088395, 'min_child_samples': 13, 'subsample_freq': 4}. Best is trial 1 with value: 0.594829987164273.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:45,312] Trial 2 finished with value: 0.629132256423918 and parameters: {'reg_alpha': 7.820478949158031e-06, 'reg_lambda': 3.594049357472304e-06, 'num_leaves': 191, 'learning_rate': 0.09803083128273675, 'max_depth': 12, 'colsample_bytree': 0.04177143508682934, 'subsample': 0.3554522157760903, 'min_child_samples': 20, 'subsample_freq': 1}. Best is trial 2 with value: 0.629132256423918.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:45,773] Trial 3 finished with value: 0.5466839020609818 and parameters: {'reg_alpha': 8.941453787087396e-06, 'reg_lambda': 5.8038290576509195e-06, 'num_leaves': 238, 'learning_rate': 0.07930564974213065, 'max_depth': 36, 'colsample_bytree': 0.024524761941292997, 'subsample': 0.10675977107189616, 'min_child_samples': 15, 'subsample_freq': 6}. Best is trial 2 with value: 0.629132256423918.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:46,915] Trial 4 finished with value: 0.5487636530963164 and parameters: {'reg_alpha': 1.9709513789383777e-06, 'reg_lambda': 4.899284214820991e-06, 'num_leaves': 238, 'learning_rate': 0.03919658786161316, 'max_depth': 45, 'colsample_bytree': 0.016586299014186442, 'subsample': 0.864566326272419, 'min_child_samples': 4, 'subsample_freq': 8}. Best is trial 2 with value: 0.629132256423918.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:47,621] Trial 5 finished with value: 0.5487636530963164 and parameters: {'reg_alpha': 3.006321232821169e-06, 'reg_lambda': 7.458225133995206e-06, 'num_leaves': 230, 'learning_rate': 0.019648794645697618, 'max_depth': 13, 'colsample_bytree': 0.09617324983555015, 'subsample': 0.8853667956535631, 'min_child_samples': 17, 'subsample_freq': 8}. Best is trial 2 with value: 0.629132256423918.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:48,125] Trial 6 finished with value: 0.568214017582524 and parameters: {'reg_alpha': 3.857395110544448e-06, 'reg_lambda': 4.057427718467624e-06, 'num_leaves': 173, 'learning_rate': 0.05013873530211991, 'max_depth': 26, 'colsample_bytree': 0.06647597315171916, 'subsample': 0.1208028246253416, 'min_child_samples': 2, 'subsample_freq': 10}. Best is trial 2 with value: 0.629132256423918.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:48,519] Trial 7 finished with value: 0.5468625124119058 and parameters: {'reg_alpha': 7.602863048322271e-06, 'reg_lambda': 1.9011793760669821e-06, 'num_leaves': 282, 'learning_rate': 0.03942543713658494, 'max_depth': 21, 'colsample_bytree': 0.06590002335297122, 'subsample': 0.34615306839910875, 'min_child_samples': 8, 'subsample_freq': 10}. Best is trial 2 with value: 0.629132256423918.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:48,878] Trial 8 finished with value: 0.5501773994332905 and parameters: {'reg_alpha': 6.59126605309274e-06, 'reg_lambda': 2.7306954254159874e-06, 'num_leaves': 296, 'learning_rate': 0.05401503322772159, 'max_depth': 10, 'colsample_bytree': 0.06278164888576078, 'subsample': 0.24873453478853202, 'min_child_samples': 19, 'subsample_freq': 4}. Best is trial 2 with value: 0.629132256423918.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:49,335] Trial 9 finished with value: 0.621040299338839 and parameters: {'reg_alpha': 5.281197564974569e-06, 'reg_lambda': 3.427894470886566e-06, 'num_leaves': 167, 'learning_rate': 0.054173037665925544, 'max_depth': 39, 'colsample_bytree': 0.0989435057496756, 'subsample': 0.8832738071490953, 'min_child_samples': 13, 'subsample_freq': 5}. Best is trial 2 with value: 0.629132256423918.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:49,813] Trial 10 finished with value: 0.6104296335763242 and parameters: {'reg_alpha': 9.587994737240151e-06, 'reg_lambda': 7.707870209596483e-07, 'num_leaves': 195, 'learning_rate': 0.09984724683280236, 'max_depth': 50, 'colsample_bytree': 0.04499105403290943, 'subsample': 0.6403710857169993, 'min_child_samples': 20, 'subsample_freq': 1}. Best is trial 2 with value: 0.629132256423918.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:50,309] Trial 11 finished with value: 0.6256871957569445 and parameters: {'reg_alpha': 5.799756154240275e-06, 'reg_lambda': 2.8406356161045162e-06, 'num_leaves': 160, 'learning_rate': 0.07734672066278597, 'max_depth': 37, 'colsample_bytree': 0.043616958285018444, 'subsample': 0.9879557018603823, 'min_child_samples': 9, 'subsample_freq': 1}. Best is trial 2 with value: 0.629132256423918.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:50,876] Trial 12 finished with value: 0.6318295754522778 and parameters: {'reg_alpha': 3.2534873835752547e-07, 'reg_lambda': 2.7477015089281744e-07, 'num_leaves': 152, 'learning_rate': 0.09754220267970316, 'max_depth': 35, 'colsample_bytree': 0.03858089148633698, 'subsample': 0.9887426674728156, 'min_child_samples': 8, 'subsample_freq': 1}. Best is trial 12 with value: 0.6318295754522778.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:51,431] Trial 13 finished with value: 0.6264349373955583 and parameters: {'reg_alpha': 6.156817872636614e-07, 'reg_lambda': 9.901816255317676e-08, 'num_leaves': 196, 'learning_rate': 0.09552959869770128, 'max_depth': 32, 'colsample_bytree': 0.03658377674800263, 'subsample': 0.6768591464417962, 'min_child_samples': 6, 'subsample_freq': 2}. Best is trial 12 with value: 0.6318295754522778.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:52,012] Trial 14 finished with value: 0.61631166598048 and parameters: {'reg_alpha': 6.732287193810321e-08, 'reg_lambda': 1.2112810148218002e-06, 'num_leaves': 191, 'learning_rate': 0.08463406637317855, 'max_depth': 20, 'colsample_bytree': 0.031214159181456758, 'subsample': 0.459247952093329, 'min_child_samples': 11, 'subsample_freq': 3}. Best is trial 12 with value: 0.6318295754522778.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:52,749] Trial 15 finished with value: 0.6302372187643797 and parameters: {'reg_alpha': 4.722478789263596e-06, 'reg_lambda': 2.210517295763106e-06, 'num_leaves': 151, 'learning_rate': 0.09925252153775542, 'max_depth': 44, 'colsample_bytree': 0.04911495607202021, 'subsample': 0.6653868650496567, 'min_child_samples': 6, 'subsample_freq': 2}. Best is trial 12 with value: 0.6318295754522778.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:53,523] Trial 16 finished with value: 0.6072933569058634 and parameters: {'reg_alpha': 4.308263194758181e-06, 'reg_lambda': 6.683785970758882e-08, 'num_leaves': 150, 'learning_rate': 0.07004510109686926, 'max_depth': 43, 'colsample_bytree': 0.0525717468761433, 'subsample': 0.726619062017212, 'min_child_samples': 1, 'subsample_freq': 3}. Best is trial 12 with value: 0.6318295754522778.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:54,242] Trial 17 finished with value: 0.621836477682788 and parameters: {'reg_alpha': 1.3454653309682939e-06, 'reg_lambda': 1.7727670536584716e-06, 'num_leaves': 208, 'learning_rate': 0.08744653990408036, 'max_depth': 50, 'colsample_bytree': 0.027767366063413877, 'subsample': 0.7631549143036727, 'min_child_samples': 6, 'subsample_freq': 2}. Best is trial 12 with value: 0.6318295754522778.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:55,073] Trial 18 finished with value: 0.6223238720302244 and parameters: {'reg_alpha': 1.2190498116083095e-06, 'reg_lambda': 1.8107946474053013e-06, 'num_leaves': 174, 'learning_rate': 0.0871587669973146, 'max_depth': 43, 'colsample_bytree': 0.053254941996121206, 'subsample': 0.6313624342711079, 'min_child_samples': 6, 'subsample_freq': 2}. Best is trial 12 with value: 0.6318295754522778.


Parameters: { "num_leaves", "subsample_freq" } are not used.



[I 2023-11-12 20:59:55,734] Trial 19 finished with value: 0.6025647235475043 and parameters: {'reg_alpha': 4.073995854634189e-06, 'reg_lambda': 8.592864727984015e-07, 'num_leaves': 152, 'learning_rate': 0.06959365115775791, 'max_depth': 32, 'colsample_bytree': 0.010115959526361483, 'subsample': 0.9865084238601425, 'min_child_samples': 10, 'subsample_freq': 7}. Best is trial 12 with value: 0.6318295754522778.


CPU times: total: 1min 13s
Wall time: 12.2 s


In [87]:
optimized_params = study.best_trial.params
optimized_params

{'reg_alpha': 3.2534873835752547e-07,
 'reg_lambda': 2.7477015089281744e-07,
 'num_leaves': 152,
 'learning_rate': 0.09754220267970316,
 'max_depth': 35,
 'colsample_bytree': 0.03858089148633698,
 'subsample': 0.9887426674728156,
 'min_child_samples': 8,
 'subsample_freq': 1}

In [97]:
study.best_value

0.6318295754522778

In [88]:
# plot_optimization_history: shows the scores from all trials as well as the best score so far at each point.
optuna.visualization.plot_optimization_history(study)

In [89]:
optuna.visualization.plot_slice(study)

In [90]:
# Visualize parameter importances.
optuna.visualization.plot_param_importances(study)

### Running XGBClassifier with optimized parameters

In [92]:
optimized_params['random_state']= 23
optimized_params['n_estimators']= 200

In [93]:
eval_data.head()

Unnamed: 0,age,amount,assets,debt,expenses,home=ignore,home=other,home=owner,home=parents,home=private,...,marital=separated,marital=single,marital=unk,marital=widow,price,records=no,records=yes,seniority,time,label
0,31.0,550.0,0.0,0.0,49.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,999.0,1.0,0.0,6.0,36.0,0
1,38.0,1000.0,0.0,0.0,56.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1307.0,1.0,0.0,18.0,60.0,0
2,40.0,700.0,0.0,0.0,58.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,982.0,0.0,1.0,17.0,24.0,0
3,29.0,1200.0,6000.0,3000.0,75.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1521.0,1.0,0.0,2.0,48.0,1
4,61.0,750.0,0.0,0.0,57.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,830.0,1.0,0.0,25.0,42.0,0


In [96]:
optimized_params

{'reg_alpha': 3.2534873835752547e-07,
 'reg_lambda': 2.7477015089281744e-07,
 'num_leaves': 152,
 'learning_rate': 0.09754220267970316,
 'max_depth': 35,
 'colsample_bytree': 0.03858089148633698,
 'subsample': 0.9887426674728156,
 'min_child_samples': 8,
 'subsample_freq': 1,
 'random_state': 23,
 'n_estimators': 200}

In [94]:
# Fit an XGBoost binary classifier on the training data split
model_optimised = xgboost.XGBClassifier(**optimized_params).fit(model_train, y_train)

# Create a model_optimised signature
signature = infer_signature(model_val, model_optimised.predict(model_val))

# # Build the Evaluation Dataset from the test set
# eval_data = model_val.copy()
# eval_data["label"] = y_val

with mlflow.start_run() as run:
    # Log the baseline model_optimised to MLflow
    mlflow.sklearn.log_model(model_optimised, "model_optimised", signature=signature)
    model_uri = mlflow.get_artifact_uri("model_optimised")

    # Evaluate the logged model_optimised
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )

Parameters: { "min_child_samples", "num_leaves", "subsample_freq" } are not used.



 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/11/12 21:07:17 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/12 21:07:17 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/12 21:07:20 INFO mlflow.models.evaluation.default_evaluator: Shap explainer TreeExplainer is used.

Unable to serialize underlying model using MLflow, will use SHAP serialization



In [95]:
model_uri

'mlflow-artifacts:/0/d2e99fc57a2d42d2aa6a808a805ab564/artifacts/model_optimised'