In [1]:
import os
os.chdir('../../')
os.getcwd()

'/Users/henriquecosta/workspace/studies/modern-ml'

In [24]:
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.metrics import log_loss, average_precision_score
from sklearn.model_selection import train_test_split, KFold
import optuna

import mlflow

In [3]:
df_train = pd.read_parquet('./case/data/processed/lending_club_case_train_dataset_v2.parquet')
df_valid = pd.read_parquet('./case/data/processed/lending_club_case_valid_dataset_v2.parquet')
df_test  = pd.read_parquet('./case/data/processed/lending_club_case_test_dataset_v2.parquet')

In [18]:
FEATURES = [
    'loan_amnt',
    'term',
    'int_rate',
    'installment',
    'annual_inc',
    'dti',
    'fico_range_low',
    'inq_last_6mths',
    'mths_since_last_delinq',
    'revol_bal',
    'total_rev_hi_lim',
    'acc_open_past_24mths',
    'bc_open_to_buy',
    'bc_util',
    'mo_sin_old_il_acct',
    'mo_sin_old_rev_tl_op',
    'mo_sin_rcnt_rev_tl_op',
    'mo_sin_rcnt_tl',
    'mort_acc',
    'mths_since_recent_bc',
    'mths_since_recent_inq',
    'num_bc_tl',
    'num_il_tl',
    'num_rev_tl_bal_gt_0',
    'percent_bc_gt_75',
    'tot_hi_cred_lim',
    'total_bc_limit',
    'total_il_high_credit_limit',
    'issue_d_elapse',
    'earliest_cr_line_since',
    'grade',
    'sub_grade',
    'emp_length',
    'home_ownership',
    'verification_status',
    'pymnt_plan',
    'purpose',
    'title',
    'zip_code',
    'addr_state',
    'initial_list_status',
    'application_type',
    'verification_status_joint',
    'issue_d_month',
    'issue_d_day',
    'issue_d_dayofweek',
    'issue_d_dayofyear',
    'issue_d_quarter',
    'earliest_cr_line_month',
    'earliest_cr_line_day',
    'earliest_cr_line_dayofweek',
    'earliest_cr_line_dayofyear',
    'earliest_cr_line_quarter',
    'sec_app_earliest_cr_line_month',
    'sec_app_earliest_cr_line_day',
    'sec_app_earliest_cr_line_dayofweek',
    'sec_app_earliest_cr_line_dayofyear',
    'sec_app_earliest_cr_line_quarter'
]

CATEGORICAL_FEATURES = [c for c in df_train.select_dtypes(include='category').columns if c in FEATURES]

In [20]:
X_train = df_train[FEATURES]
y_train = df_train['default']

In [21]:
_, X_train, _, y_train = train_test_split(X_train, y_train, test_size=0.3, random_state=42, stratify=y_train)

In [4]:
mlflow.set_experiment('tunning_hyperparam')

2025/02/02 16:40:29 INFO mlflow.tracking.fluent: Experiment with name 'tunning_hyperparam' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/henriquecosta/workspace/studies/modern-ml/mlruns/222273069811250698', creation_time=1738525229143, experiment_id='222273069811250698', last_update_time=1738525229143, lifecycle_stage='active', name='tunning_hyperparam', tags={}>

In [22]:
X_valid = df_valid[FEATURES]
y_valid = df_valid['default']

train_pool = cb.Pool(X_train, y_train, cat_features=CATEGORICAL_FEATURES)
valid_pool = cb.Pool(X_valid, y_valid, cat_features=CATEGORICAL_FEATURES)

In [25]:
with mlflow.start_run(run_name='vanilla-Catboost'):
    params = dict(
        iterations=1000,
        depth=6,
        auto_class_weights='Balanced',
        eval_metric='Logloss',
        verbose=10,
    )

    model = cb.CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=10)
    y_pred = model.predict_proba(X_valid)[:, 1]
    mlflow.log_params(params)
    mlflow.log_metric('log_loss', log_loss(y_valid, y_pred))
    mlflow.log_metric('avg_pr', average_precision_score(y_valid, y_pred))

Learning rate set to 0.141198
0:	learn: 0.6764790	test: 0.6767839	best: 0.6767839 (0)	total: 720ms	remaining: 11m 59s
10:	learn: 0.6260489	test: 0.6335844	best: 0.6335844 (10)	total: 5.93s	remaining: 8m 52s
20:	learn: 0.6187232	test: 0.6274173	best: 0.6274173 (20)	total: 10.7s	remaining: 8m 18s
30:	learn: 0.6156667	test: 0.6235768	best: 0.6235768 (30)	total: 15.3s	remaining: 7m 59s
40:	learn: 0.6138170	test: 0.6218669	best: 0.6218669 (40)	total: 19.9s	remaining: 7m 46s
50:	learn: 0.6127659	test: 0.6212987	best: 0.6212987 (50)	total: 25.3s	remaining: 7m 50s
60:	learn: 0.6117360	test: 0.6204772	best: 0.6204772 (60)	total: 30.7s	remaining: 7m 52s
70:	learn: 0.6109311	test: 0.6202244	best: 0.6200282 (63)	total: 36s	remaining: 7m 51s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.6200282114
bestIteration = 63

Shrink model to first 64 iterations.


In [29]:
def objective(trial: optuna.Trial):
    with mlflow.start_run(nested=True):
        params = dict(
            iterations=500,
            depth=trial.suggest_int('depth', 4, 10),
            learning_rate=trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
            l2_leaf_reg=trial.suggest_int('l2_leaf_reg', 1, 20),
            colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.7, 1.0, step=0.1),
            subsample=trial.suggest_float('colsample_bylevel', 0.5, 1.0, step=0.1),
            eval_metric='Logloss',  
        )

        model = cb.CatBoostClassifier(**params)
        
        kf = KFold(n_splits=3, shuffle=True, random_state=42)
        scores = []
        for train_index, valid_index in kf.split(X_train):
            X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
            y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

            train_pool = cb.Pool(X_tr, y_tr, cat_features=CATEGORICAL_FEATURES)
            val_pool = cb.Pool(X_val, y_val, cat_features=CATEGORICAL_FEATURES)

            model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=10, verbose=False)
            y_pred = model.predict_proba(X_val)[:, 1]
            score = log_loss(y_val, y_pred)
            
            scores.append(score)
        avg_score = np.mean(scores)
        mlflow.log_metric('log_loss', avg_score)
        mlflow.log_params(params)
    return np.mean(avg_score)

In [31]:
optuna.logging.set_verbosity(optuna.logging.ERROR)


with mlflow.start_run():
    study = optuna.create_study(direction="minimize")

    study.optimize(objective, n_trials=10)

    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_avg_logloss", study.best_value)

    model = cb.CatBoostClassifier(**study.best_params)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=10)
    y_pred = model.predict_proba(X_valid)[:, 1]

    mlflow.log_metric('log_loss', log_loss(y_valid, y_pred))
    mlflow.log_metric('avg_pr', average_precision_score(y_valid, y_pred))



0:	learn: 0.6421645	test: 0.6487584	best: 0.6487584 (0)	total: 3.47s	remaining: 57m 47s
1:	learn: 0.6007537	test: 0.6119180	best: 0.6119180 (1)	total: 10.2s	remaining: 1h 24m 34s
2:	learn: 0.5678102	test: 0.5851345	best: 0.5851345 (2)	total: 22.3s	remaining: 2h 3m 24s
3:	learn: 0.5425788	test: 0.5646542	best: 0.5646542 (3)	total: 25.7s	remaining: 1h 46m 46s
4:	learn: 0.5215553	test: 0.5486170	best: 0.5486170 (4)	total: 30.8s	remaining: 1h 42m 4s
5:	learn: 0.5055012	test: 0.5374531	best: 0.5374531 (5)	total: 48.6s	remaining: 2h 14m 7s
6:	learn: 0.4928793	test: 0.5288366	best: 0.5288366 (6)	total: 57s	remaining: 2h 14m 46s
7:	learn: 0.4824322	test: 0.5214940	best: 0.5214940 (7)	total: 1m 3s	remaining: 2h 11m 44s
8:	learn: 0.4738273	test: 0.5157163	best: 0.5157163 (8)	total: 1m 11s	remaining: 2h 11m 54s
9:	learn: 0.4667372	test: 0.5117830	best: 0.5117830 (9)	total: 1m 17s	remaining: 2h 7m 44s
10:	learn: 0.4614421	test: 0.5090383	best: 0.5090383 (10)	total: 1m 23s	remaining: 2h 5m 14s
11:	