In [1]:
import os
os.chdir('../../')
os.getcwd()

'/Users/henriquecosta/workspace/studies/modern-ml'

In [2]:
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.metrics import log_loss, average_precision_score
from sklearn.model_selection import train_test_split, KFold
import optuna

import mlflow

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df_train = pd.read_parquet('./case/data/processed/lending_club_case_train_dataset_v2.parquet')
df_valid = pd.read_parquet('./case/data/processed/lending_club_case_valid_dataset_v2.parquet')
df_test  = pd.read_parquet('./case/data/processed/lending_club_case_test_dataset_v2.parquet')

In [4]:
FEATURES = ['funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'annual_inc',
 'dti',
 'fico_range_high',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'revol_bal',
 'revol_util',
 'total_rev_hi_lim',
 'acc_open_past_24mths',
 'avg_cur_bal',
 'bc_util',
 'mo_sin_old_il_acct',
 'mo_sin_old_rev_tl_op',
 'mo_sin_rcnt_rev_tl_op',
 'mo_sin_rcnt_tl',
 'mort_acc',
 'mths_since_recent_bc',
 'mths_since_recent_inq',
 'num_bc_tl',
 'num_il_tl',
 'num_rev_tl_bal_gt_0',
 'percent_bc_gt_75',
 'tot_hi_cred_lim',
 'total_bc_limit',
 'total_il_high_credit_limit',
 'issue_d_elapse',
 'earliest_cr_line_since',
 'grade',
 'sub_grade',
 'emp_length',
 'home_ownership',
 'verification_status',
 'pymnt_plan',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'initial_list_status',
 'application_type',
 'verification_status_joint',
 'issue_d_month',
 'issue_d_day',
 'issue_d_dayofweek',
 'issue_d_dayofyear',
 'issue_d_quarter',
 'earliest_cr_line_month',
 'earliest_cr_line_day',
 'earliest_cr_line_dayofweek',
 'earliest_cr_line_dayofyear',
 'earliest_cr_line_quarter',
 'sec_app_earliest_cr_line_month',
 'sec_app_earliest_cr_line_day',
 'sec_app_earliest_cr_line_dayofweek',
 'sec_app_earliest_cr_line_dayofyear',
 'sec_app_earliest_cr_line_quarter'
]

CATEGORICAL_FEATURES = [c for c in df_train.select_dtypes(include='category').columns if c in FEATURES]

In [5]:
X_train = df_train[FEATURES]
y_train = df_train['default']

In [6]:
_, X_train, _, y_train = train_test_split(X_train, y_train, test_size=0.1, random_state=42, stratify=y_train)

In [7]:
mlflow.set_experiment('tunning_hyperparam')

<Experiment: artifact_location='file:///Users/henriquecosta/workspace/studies/modern-ml/mlruns/222273069811250698', creation_time=1738525229143, experiment_id='222273069811250698', last_update_time=1738525229143, lifecycle_stage='active', name='tunning_hyperparam', tags={}>

In [8]:
X_valid = df_valid[FEATURES]
y_valid = df_valid['default']

train_pool = cb.Pool(X_train, y_train, cat_features=CATEGORICAL_FEATURES)
valid_pool = cb.Pool(X_valid, y_valid, cat_features=CATEGORICAL_FEATURES)

In [9]:
with mlflow.start_run(run_name='vanilla-Catboost'):
    params = dict(
        iterations=1000,
        depth=6,
        auto_class_weights='Balanced',
        eval_metric='Logloss',
        verbose=10,
    )

    model = cb.CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=10)
    y_pred = model.predict_proba(X_valid)[:, 1]
    mlflow.log_params(params)
    mlflow.log_metric('log_loss', log_loss(y_valid, y_pred))
    mlflow.log_metric('avg_pr', average_precision_score(y_valid, y_pred))

Learning rate set to 0.107642
0:	learn: 0.6818394	test: 0.6830205	best: 0.6830205 (0)	total: 233ms	remaining: 3m 52s
10:	learn: 0.6334659	test: 0.6390599	best: 0.6390599 (10)	total: 2.26s	remaining: 3m 23s
20:	learn: 0.6228893	test: 0.6307599	best: 0.6307599 (20)	total: 4.1s	remaining: 3m 11s
30:	learn: 0.6185046	test: 0.6272774	best: 0.6272774 (30)	total: 5.95s	remaining: 3m 5s
40:	learn: 0.6157967	test: 0.6256245	best: 0.6255978 (38)	total: 7.75s	remaining: 3m 1s
50:	learn: 0.6139560	test: 0.6244991	best: 0.6244991 (50)	total: 9.5s	remaining: 2m 56s
60:	learn: 0.6125319	test: 0.6236351	best: 0.6236351 (60)	total: 11.3s	remaining: 2m 53s
70:	learn: 0.6115700	test: 0.6233340	best: 0.6233061 (69)	total: 12.9s	remaining: 2m 48s
80:	learn: 0.6106116	test: 0.6233535	best: 0.6230181 (71)	total: 14.5s	remaining: 2m 44s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.6230181253
bestIteration = 71

Shrink model to first 72 iterations.


In [10]:
def objective(trial: optuna.Trial):
    with mlflow.start_run(nested=True):
        params = dict(
            iterations=500,
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            depth=trial.suggest_int('depth', 4, 12),
            l2_leaf_reg=trial.suggest_int('l2_leaf_reg', 1, 20),
            colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.1, 1.0, step=0.1),
            subsample=trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
            eval_metric='Logloss',  
        )

        model = cb.CatBoostClassifier(**params)
        
        kf = KFold(n_splits=3, shuffle=True, random_state=42)
        scores = []
        for train_index, valid_index in kf.split(X_train):
            X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
            y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

            train_pool = cb.Pool(X_tr, y_tr, cat_features=CATEGORICAL_FEATURES)
            val_pool = cb.Pool(X_val, y_val, cat_features=CATEGORICAL_FEATURES)

            model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=10, verbose=False)
            y_pred = model.predict_proba(X_val)[:, 1]
            score = log_loss(y_val, y_pred)
            
            scores.append(score)
        avg_score = np.mean(scores)
        mlflow.log_metric('log_loss', avg_score)
        mlflow.log_params(params)
    return np.mean(avg_score)

In [11]:
optuna.logging.set_verbosity(optuna.logging.ERROR)


with mlflow.start_run():
    study = optuna.create_study(direction="minimize")

    study.optimize(objective, n_trials=10)

    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_avg_logloss", study.best_value)

    model = cb.CatBoostClassifier(**study.best_params)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=10)
    y_pred = model.predict_proba(X_valid)[:, 1]

    mlflow.log_metric('log_loss', log_loss(y_valid, y_pred))
    mlflow.log_metric('avg_pr', average_precision_score(y_valid, y_pred))

0:	learn: 0.6577137	test: 0.6630221	best: 0.6630221 (0)	total: 242ms	remaining: 4m 1s
1:	learn: 0.6324781	test: 0.6432432	best: 0.6432432 (1)	total: 369ms	remaining: 3m 3s
2:	learn: 0.6063934	test: 0.6206608	best: 0.6206608 (2)	total: 644ms	remaining: 3m 33s
3:	learn: 0.5832933	test: 0.6011078	best: 0.6011078 (3)	total: 793ms	remaining: 3m 17s
4:	learn: 0.5629174	test: 0.5836546	best: 0.5836546 (4)	total: 954ms	remaining: 3m 9s
5:	learn: 0.5464700	test: 0.5706763	best: 0.5706763 (5)	total: 1.14s	remaining: 3m 9s
6:	learn: 0.5321679	test: 0.5592792	best: 0.5592792 (6)	total: 1.34s	remaining: 3m 9s
7:	learn: 0.5201493	test: 0.5501186	best: 0.5501186 (7)	total: 1.42s	remaining: 2m 56s
8:	learn: 0.5107433	test: 0.5433591	best: 0.5433591 (8)	total: 1.58s	remaining: 2m 54s
9:	learn: 0.5018810	test: 0.5375184	best: 0.5375184 (9)	total: 1.75s	remaining: 2m 53s
10:	learn: 0.4944124	test: 0.5324823	best: 0.5324823 (10)	total: 1.88s	remaining: 2m 48s
11:	learn: 0.4883197	test: 0.5284274	best: 0.5