# Imports

In [1]:
%run setup.py

In [3]:
import pandas as pd
import numpy as np
import joblib
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb

from sklearn.metrics import average_precision_score, roc_auc_score, f1_score, precision_recall_curve, auc

from typing import List, Tuple
import optuna

from thesis_code.data_analysis.ml_pipeline import (
    load_parquet,
    load_multiple_parquets,
    protein_level_split,
    preprocess,
    calculate_class_weights,
    train_model,
    evaluate_model,
    save_model,
    load_model
    )

RANDOM_STATE = 42

# Load Data

In [4]:
df_full = load_parquet("/home/user_stel/project_thesis/data/derived/df_feat_context_v1.parquet")
df_surface = load_parquet("/home/user_stel/project_thesis/data/derived/df_ml_context_v1.parquet")

print(df_full.shape)
print(df_surface.shape)


(183890, 43)
(106100, 43)


## Sanity Checks

In [5]:
def check_dataset(
    df: pd.DataFrame,
    target_col: str = "IBS",
    required_cols: List[str] | None = None
):
    """
    Basic sanity checks for ML datasets.

    Parameters
    ----------
    df : pd.DataFrame
    target_col : str
    required_cols : list, optional
    """

    if target_col not in df.columns:
        raise ValueError(f"Target column '{target_col}' not found")

    if required_cols is not None:
        missing = set(required_cols) - set(df.columns)
        if missing:
            raise ValueError(f"Missing required columns: {missing}")

    if df[target_col].isna().any():
        raise ValueError("Target column contains NaNs")

    print("Dataset check passed:")
    print(f"  Samples: {len(df):,}")
    print(f"  IBS positives: {df[target_col].sum():,}")
    print(f"  IBS ratio: {df[target_col].mean():.4f}")


In [6]:
print("Full-surface IBS ratio:", df_full["IBS"].mean())
print("Surface-only IBS ratio:", df_surface["IBS"].mean())

df_full["IBS"].value_counts()

Full-surface IBS ratio: 0.1516286910653108
Surface-only IBS ratio: 0.20555136663524975


IBS
False    156007
True      27883
Name: count, dtype: int64

# Split train/test/val 

In [7]:
# Step 1: inspect
for i, col in enumerate(df_full.columns):
    print(f"{i:2d}  {col}")

# Step 2: decide
DROP_COLS = [
    "IBS",
    "domain",
    "cathpdb",
    "pdb",
    "uniprot_acc",
    "uniprot_id",
    "domain",
    "residue_name",
    "chain_id",
    "residue_number",
    "Experimental Method",
    "origin",
    "location",
    "taxon",
    "data_type",
    "resolution",
    "S35", "S60", "S95", "S100",
    "uniref50", "uniref90", "uniref100",
    "neighboursList",
    "neighbors"
]

 0  domain
 1  cathpdb
 2  pdb
 3  uniprot_acc
 4  uniprot_id
 5  residue_name
 6  IBS
 7  chain_id
 8  residue_number
 9  b_factor
10  sec_struc
11  sec_struc_full
12  prot_block
13  data_type
14  Experimental Method
15  resolution
16  RSA_total_freesasa_tien
17  convhull_vertex
18  protrusion
19  is_hydrophobic_protrusion
20  is_co_insertable
21  neighboursList
22  density
23  exposed
24  S35
25  S60
26  S95
27  S100
28  uniref50
29  uniref90
30  uniref100
31  origin
32  location
33  taxon
34  neighbors
35  n_neighbors
36  neighbor_frac_exposed
37  neighbor_frac_hydrophobic
38  neighbor_frac_charged
39  neighbor_frac_polar
40  neighbor_frac_aromatic
41  neighbor_frac_small
42  neighbor_mean_RSA


In [8]:
df_full = df_full.reset_index(drop=True)
df_surface = df_surface.reset_index(drop=True)

In [9]:
train_prot, val_prot, test_prot = protein_level_split(df_full, "cathpdb")

df_train = df_full[df_full["cathpdb"].isin(train_prot)]
df_val   = df_full[df_full["cathpdb"].isin(val_prot)]
df_test  = df_full[df_full["cathpdb"].isin(test_prot)]

# Preprocessing

In [None]:
X_train, y_train = preprocess(
    df_train,
    target_col="IBS",
    drop_cols=DROP_COLS
)

X_val, y_val = preprocess(
    df_val,
    target_col="IBS",
    drop_cols=DROP_COLS
)

X_test, y_test = preprocess(
    df_test,
    target_col="IBS",
    drop_cols=DROP_COLS
)

# Hyperparameter Tuning

In [11]:
categorical_cols = X_train.select_dtypes(include="object").columns.tolist()
numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()

In [None]:
print(X_train.dtypes)

b_factor                     float64
sec_struc                     object
sec_struc_full                object
prot_block                    object
RSA_total_freesasa_tien      float64
convhull_vertex                 bool
protrusion                      bool
is_hydrophobic_protrusion       bool
is_co_insertable                bool
density                        int64
exposed                         bool
n_neighbors                  float64
neighbor_frac_exposed        float64
neighbor_frac_hydrophobic    float64
neighbor_frac_charged        float64
neighbor_frac_polar          float64
neighbor_frac_aromatic       float64
neighbor_frac_small          float64
neighbor_mean_RSA            float64
dtype: object


### Random Forest

In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_cols),
        ("cat", OneHotEncoder(
            handle_unknown="ignore",
            sparse_output=False
        ), categorical_cols)
    ]
)

rf_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", RandomForestClassifier(
            random_state=RANDOM_STATE,
            n_jobs=-1
        ))
    ]
)

In [14]:
def rf_objective(trial):

    params = {
        "model__n_estimators": trial.suggest_int("model__n_estimators", 300, 800),
        "model__max_depth": trial.suggest_int("model__max_depth", 5, 18),
        "model__min_samples_split": trial.suggest_int("model__min_samples_split", 5, 20),
        "model__min_samples_leaf": trial.suggest_int("model__min_samples_leaf", 5, 20),
        "model__max_features": trial.suggest_categorical(
            "model__max_features", ["sqrt", "log2"]
        )
    }

    model = rf_pipeline.set_params(**params)

    # Tune positive-class importance
    pos_scale = trial.suggest_float("pos_scale", 0.5, 2.0)
    class_weights = calculate_class_weights(y_train, pos_scale=pos_scale)
    model.set_params(model__class_weight=class_weights)

    model.fit(X_train, y_train)

    y_val_proba = model.predict_proba(X_val)[:, 1]
    return average_precision_score(y_val, y_val_proba)

In [16]:
study_rf = optuna.create_study(direction="maximize")
study_rf.optimize(rf_objective, n_trials=25)

[I 2026-02-10 20:38:24,890] A new study created in memory with name: no-name-12c4bf12-d325-4995-b4a2-1fc6dbb1f7af
[I 2026-02-10 20:39:02,200] Trial 0 finished with value: 0.3416985751541191 and parameters: {'model__n_estimators': 619, 'model__max_depth': 9, 'model__min_samples_split': 12, 'model__min_samples_leaf': 8, 'model__max_features': 'sqrt', 'pos_scale': 1.6544023636914618}. Best is trial 0 with value: 0.3416985751541191.
[I 2026-02-10 20:39:41,755] Trial 1 finished with value: 0.3763907837763151 and parameters: {'model__n_estimators': 624, 'model__max_depth': 15, 'model__min_samples_split': 6, 'model__min_samples_leaf': 9, 'model__max_features': 'log2', 'pos_scale': 1.9654197554955117}. Best is trial 1 with value: 0.3763907837763151.
[I 2026-02-10 20:40:07,671] Trial 2 finished with value: 0.29404364528311017 and parameters: {'model__n_estimators': 729, 'model__max_depth': 5, 'model__min_samples_split': 15, 'model__min_samples_leaf': 19, 'model__max_features': 'sqrt', 'pos_scal

In [17]:
import importlib
import thesis_code.data_analysis.ml_pipeline as ml_pipeline
importlib.reload(ml_pipeline)


<module 'thesis_code.data_analysis.ml_pipeline' from '/home/user_stel/project_thesis/thesis_code/data_analysis/ml_pipeline.py'>

In [26]:
best_params = study_rf.best_params.copy()
print("Best params:", best_params)

pos_scale = best_params.pop("pos_scale", None)

if pos_scale is not None:
    class_weights = {0: 1.0, 1: pos_scale}
else:
    class_weights = "balanced"


best_rf = rf_pipeline.set_params(
    **best_params,
    model__class_weight=class_weights
)


Best params: {'model__n_estimators': 508, 'model__max_depth': 17, 'model__min_samples_split': 19, 'model__min_samples_leaf': 5, 'model__max_features': 'log2', 'pos_scale': 0.9787537479949556}


In [27]:
ml_pipeline.save_model(best_rf, "rf_model")

Model saved to: /home/user_stel/project_thesis/thesis_code/data_analysis/models/rf_model.joblib


PosixPath('/home/user_stel/project_thesis/thesis_code/data_analysis/models/rf_model.joblib')

In [None]:
best_rf.get_params().keys() #sanity check

dict_keys(['memory', 'steps', 'transform_input', 'verbose', 'preprocess', 'model', 'preprocess__force_int_remainder_cols', 'preprocess__n_jobs', 'preprocess__remainder', 'preprocess__sparse_threshold', 'preprocess__transformer_weights', 'preprocess__transformers', 'preprocess__verbose', 'preprocess__verbose_feature_names_out', 'preprocess__num', 'preprocess__cat', 'preprocess__cat__categories', 'preprocess__cat__drop', 'preprocess__cat__dtype', 'preprocess__cat__feature_name_combiner', 'preprocess__cat__handle_unknown', 'preprocess__cat__max_categories', 'preprocess__cat__min_frequency', 'preprocess__cat__sparse_output', 'model__bootstrap', 'model__ccp_alpha', 'model__class_weight', 'model__criterion', 'model__max_depth', 'model__max_features', 'model__max_leaf_nodes', 'model__max_samples', 'model__min_impurity_decrease', 'model__min_samples_leaf', 'model__min_samples_split', 'model__min_weight_fraction_leaf', 'model__monotonic_cst', 'model__n_estimators', 'model__n_jobs', 'model__oob_

### Logistic Regression

In [30]:
lr_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", LogisticRegression(
            max_iter=1000,
            random_state=RANDOM_STATE
        ))
    ]
)

param_grid_lr = {
    "model__C": [0.01, 0.1, 1, 10],
    "model__penalty": ["l2"],
    "model__solver": ["lbfgs"]
}

class_weights_lr = calculate_class_weights(y_train)
lr_pipeline.set_params(model__class_weight=class_weights_lr)

In [31]:
grid_lr = GridSearchCV(
    estimator=lr_pipeline,     
    param_grid=param_grid_lr,
    scoring="average_precision",
    cv=3,
    n_jobs=-1,
    error_score="raise"        
)

grid_lr.fit(X_train, y_train)


In [32]:
best_lr = grid_lr.best_estimator_
print(grid_lr.best_params_)

{'model__C': 0.01, 'model__penalty': 'l2', 'model__solver': 'lbfgs'}


In [48]:
ml_pipeline.save_model(best_lr, "lr_model")

Model saved to: /home/user_stel/project_thesis/thesis_code/data_analysis/models/lr_model.joblib


PosixPath('/home/user_stel/project_thesis/thesis_code/data_analysis/models/lr_model.joblib')

### Support Vector Machines

In [57]:
preprocessor_svm = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols),
        ("num", StandardScaler(), numeric_cols),
    ],
    remainder="drop"
)

In [58]:
svm_pipeline = Pipeline([
    ("preprocess", preprocessor_svm),  # with StandardScaler
    ("model", SVC(class_weight="balanced", max_iter=5000))])

In [59]:
param_grid_svm = {
    "model__C": [0.1, 1, 10]
    # "model__gamma": ["scale", "auto"]
}

class_weights_svm = calculate_class_weights(y_train)

svm_pipeline.set_params(
    model__class_weight=class_weights_svm
)

In [60]:
grid_svm = GridSearchCV(
    estimator=svm_pipeline,        
    param_grid=param_grid_svm,
    scoring="average_precision",
    cv=3,
    n_jobs=-1,
    error_score="raise"
)

grid_svm.fit(X_train, y_train)



In [61]:
best_svm = grid_svm.best_estimator_
print(grid_svm.best_params_)

{'model__C': 0.1}


In [62]:
ml_pipeline.save_model(best_svm, "svm_model")

Model saved to: /home/user_stel/project_thesis/thesis_code/data_analysis/models/svm_model.joblib


PosixPath('/home/user_stel/project_thesis/thesis_code/data_analysis/models/svm_model.joblib')

### XGBoost

In [66]:
xgb_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            use_label_encoder=False,
            random_state=RANDOM_STATE,
            n_jobs=-1
        ))
    ]
)

In [70]:
def xgb_objective(trial):

    params = {
        "model__n_estimators": trial.suggest_int("model__n_estimators", 200, 800),
        "model__max_depth": trial.suggest_int("model__max_depth", 3, 10),
        "model__learning_rate": trial.suggest_float(
            "model__learning_rate", 0.01, 0.2, log=True
        ),
        "model__subsample": trial.suggest_float("model__subsample", 0.6, 1.0),
        "model__colsample_bytree": trial.suggest_float(
            "model__colsample_bytree", 0.6, 1.0
        ),
        "model__min_child_weight": trial.suggest_int(
            "model__min_child_weight", 1, 10
        ),
        "model__reg_lambda": trial.suggest_float(
            "model__reg_lambda", 1e-3, 10.0, log=True
        )
    }

    model = xgb_pipeline.set_params(**params)

    class_weights = calculate_class_weights(y_train)
    scale_pos_weight = class_weights[1] / class_weights[0]

    model.set_params(model__scale_pos_weight=scale_pos_weight)

    model.fit(X_train, y_train)

    y_val_proba = model.predict_proba(X_val)[:, 1]
    return average_precision_score(y_val, y_val_proba)


In [71]:
study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(xgb_objective, n_trials=50)

[I 2026-01-18 20:00:28,653] A new study created in memory with name: no-name-610eb82a-544b-4caf-8bc5-6063a44af79e
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2026-01-18 20:00:44,644] Trial 0 finished with value: 0.3856020086821153 and parameters: {'model__n_estimators': 251, 'model__max_depth': 10, 'model__learning_rate': 0.051889669620576284, 'model__subsample': 0.9112738881725808, 'model__colsample_bytree': 0.9080899900103306, 'model__min_child_weight': 6, 'model__reg_lambda': 0.017218037079039197}. Best is trial 0 with value: 0.3856020086821153.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2026-01-18 20:00:58,822] Trial 1 finished with value: 0.37413816419375345 and parameters: {'model__n_estimators': 704, 'model__max_depth': 4, 'model__learning_rate': 0.18118827541260477, 'model__subsample': 0.9981727970595841, 'model__colsample_bytree': 0.9732532230386381, 'model__min_child_w

In [72]:
best_xgb = xgb_pipeline.set_params(**study_xgb.best_params)
print(study_xgb.best_params)

{'model__n_estimators': 759, 'model__max_depth': 10, 'model__learning_rate': 0.01323084356551085, 'model__subsample': 0.8971857262596246, 'model__colsample_bytree': 0.8626199585459593, 'model__min_child_weight': 7, 'model__reg_lambda': 0.30385885542295415}


In [73]:
ml_pipeline.save_model(best_xgb, "xgb_model")

Model saved to: /home/user_stel/project_thesis/thesis_code/data_analysis/models/xgb_model.joblib


Parameters: { "use_label_encoder" } are not used.

  rv = reduce(self.proto)


PosixPath('/home/user_stel/project_thesis/thesis_code/data_analysis/models/xgb_model.joblib')

### LightGBM

In [76]:
lgb_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", LGBMClassifier(
            objective="binary",
            random_state=RANDOM_STATE,
            n_jobs=-1
        ))
    ]
)

In [77]:
def lgb_objective(trial):

    params = {
        "model__n_estimators": trial.suggest_int("model__n_estimators", 200, 800),
        "model__learning_rate": trial.suggest_float(
            "model__learning_rate", 0.01, 0.2, log=True
        ),
        "model__max_depth": trial.suggest_int("model__max_depth", 3, 12),
        "model__num_leaves": trial.suggest_int("model__num_leaves", 15, 255),
        "model__subsample": trial.suggest_float("model__subsample", 0.6, 1.0),
        "model__colsample_bytree": trial.suggest_float(
            "model__colsample_bytree", 0.6, 1.0
        ),
        "model__min_child_samples": trial.suggest_int(
            "model__min_child_samples", 10, 50
        )
    }

    model = lgb_pipeline.set_params(**params)

    # handle class imbalance via class_weight
    class_weights = calculate_class_weights(y_train)
    model.set_params(model__class_weight=class_weights)

    model.fit(X_train, y_train)

    y_val_proba = model.predict_proba(X_val)[:, 1]
    return average_precision_score(y_val, y_val_proba)

In [78]:
study_lgb = optuna.create_study(direction="maximize")
study_lgb.optimize(lgb_objective, n_trials=50)

print(study_lgb.best_params)

[I 2026-01-18 20:15:05,072] A new study created in memory with name: no-name-2d1ba957-0e59-4335-a022-955f87726ba3


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016204 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:15:33,852] Trial 0 finished with value: 0.38106566798480235 and parameters: {'model__n_estimators': 671, 'model__learning_rate': 0.041641667365313814, 'model__max_depth': 10, 'model__num_leaves': 199, 'model__subsample': 0.6972240621035143, 'model__colsample_bytree': 0.8160367831315182, 'model__min_child_samples': 11}. Best is trial 0 with value: 0.38106566798480235.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011431 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:15:39,767] Trial 1 finished with value: 0.3581786410168725 and parameters: {'model__n_estimators': 502, 'model__learning_rate': 0.014717423186071948, 'model__max_depth': 5, 'model__num_leaves': 48, 'model__subsample': 0.9598082644732918, 'model__colsample_bytree': 0.7459497098132004, 'model__min_child_samples': 19}. Best is trial 0 with value: 0.38106566798480235.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012812 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:15:46,014] Trial 2 finished with value: 0.37646721487730833 and parameters: {'model__n_estimators': 471, 'model__learning_rate': 0.13900898679742582, 'model__max_depth': 5, 'model__num_leaves': 85, 'model__subsample': 0.9854986046662615, 'model__colsample_bytree': 0.8037381783271358, 'model__min_child_samples': 15}. Best is trial 0 with value: 0.38106566798480235.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014607 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:15:51,458] Trial 3 finished with value: 0.3621356344535942 and parameters: {'model__n_estimators': 738, 'model__learning_rate': 0.1394110049516733, 'model__max_depth': 3, 'model__num_leaves': 18, 'model__subsample': 0.6673251508248738, 'model__colsample_bytree': 0.7691350829871129, 'model__min_child_samples': 30}. Best is trial 0 with value: 0.38106566798480235.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030614 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:15:59,660] Trial 4 finished with value: 0.3791302913458374 and parameters: {'model__n_estimators': 420, 'model__learning_rate': 0.017575389972329006, 'model__max_depth': 9, 'model__num_leaves': 94, 'model__subsample': 0.7634362176114986, 'model__colsample_bytree': 0.9519533409872922, 'model__min_child_samples': 40}. Best is trial 0 with value: 0.38106566798480235.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:16:09,968] Trial 5 finished with value: 0.3799662273621083 and parameters: {'model__n_estimators': 694, 'model__learning_rate': 0.0386151823635385, 'model__max_depth': 7, 'model__num_leaves': 118, 'model__subsample': 0.9725266707013661, 'model__colsample_bytree': 0.8238367947500036, 'model__min_child_samples': 41}. Best is trial 0 with value: 0.38106566798480235.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:16:23,796] Trial 6 finished with value: 0.35416696101803724 and parameters: {'model__n_estimators': 647, 'model__learning_rate': 0.13894489654229342, 'model__max_depth': 10, 'model__num_leaves': 212, 'model__subsample': 0.9663480100945566, 'model__colsample_bytree': 0.8116974050312786, 'model__min_child_samples': 30}. Best is trial 0 with value: 0.38106566798480235.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:16:29,441] Trial 7 finished with value: 0.3678864606241714 and parameters: {'model__n_estimators': 315, 'model__learning_rate': 0.01891055626838898, 'model__max_depth': 7, 'model__num_leaves': 237, 'model__subsample': 0.6455424889376083, 'model__colsample_bytree': 0.76110497770398, 'model__min_child_samples': 47}. Best is trial 0 with value: 0.38106566798480235.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003336 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:16:42,218] Trial 8 finished with value: 0.3832204936074863 and parameters: {'model__n_estimators': 626, 'model__learning_rate': 0.03700940033818609, 'model__max_depth': 12, 'model__num_leaves': 146, 'model__subsample': 0.6306578748363808, 'model__colsample_bytree': 0.752837659492171, 'model__min_child_samples': 41}. Best is trial 8 with value: 0.3832204936074863.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009519 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:16:51,492] Trial 9 finished with value: 0.3689101704154044 and parameters: {'model__n_estimators': 678, 'model__learning_rate': 0.1686970850920687, 'model__max_depth': 6, 'model__num_leaves': 152, 'model__subsample': 0.6988408048988417, 'model__colsample_bytree': 0.8201971023592292, 'model__min_child_samples': 45}. Best is trial 8 with value: 0.3832204936074863.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002779 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:16:56,486] Trial 10 finished with value: 0.3858811241847737 and parameters: {'model__n_estimators': 224, 'model__learning_rate': 0.06619418842852376, 'model__max_depth': 12, 'model__num_leaves': 158, 'model__subsample': 0.8457535871760364, 'model__colsample_bytree': 0.6072469284966204, 'model__min_child_samples': 35}. Best is trial 10 with value: 0.3858811241847737.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016598 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:17:02,695] Trial 11 finished with value: 0.38623016958391904 and parameters: {'model__n_estimators': 209, 'model__learning_rate': 0.06888364088182787, 'model__max_depth': 12, 'model__num_leaves': 162, 'model__subsample': 0.8625235680324861, 'model__colsample_bytree': 0.6019006131541262, 'model__min_child_samples': 36}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003062 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:17:10,017] Trial 12 finished with value: 0.3832071395122747 and parameters: {'model__n_estimators': 332, 'model__learning_rate': 0.07288940696072968, 'model__max_depth': 12, 'model__num_leaves': 177, 'model__subsample': 0.8596372851640689, 'model__colsample_bytree': 0.6041180761205684, 'model__min_child_samples': 35}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003254 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:17:15,764] Trial 13 finished with value: 0.38598659162443627 and parameters: {'model__n_estimators': 203, 'model__learning_rate': 0.07636368000852888, 'model__max_depth': 11, 'model__num_leaves': 167, 'model__subsample': 0.8680494235409799, 'model__colsample_bytree': 0.6066544291845586, 'model__min_child_samples': 25}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002885 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:17:20,608] Trial 14 finished with value: 0.3849514694605647 and parameters: {'model__n_estimators': 204, 'model__learning_rate': 0.07433517134461358, 'model__max_depth': 10, 'model__num_leaves': 255, 'model__subsample': 0.8876584911552238, 'model__colsample_bytree': 0.6692845064498485, 'model__min_child_samples': 23}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003645 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:17:28,592] Trial 15 finished with value: 0.3812030802194151 and parameters: {'model__n_estimators': 300, 'model__learning_rate': 0.09383971406531186, 'model__max_depth': 9, 'model__num_leaves': 189, 'model__subsample': 0.801058116752524, 'model__colsample_bytree': 0.6784723364316375, 'model__min_child_samples': 24}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001820 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:17:36,419] Trial 16 finished with value: 0.3847445469894311 and parameters: {'model__n_estimators': 389, 'model__learning_rate': 0.027408530882920085, 'model__max_depth': 11, 'model__num_leaves': 124, 'model__subsample': 0.9120665813752429, 'model__colsample_bytree': 0.6566241345968469, 'model__min_child_samples': 25}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001953 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:17:42,217] Trial 17 finished with value: 0.38512642335268266 and parameters: {'model__n_estimators': 260, 'model__learning_rate': 0.05798506214119692, 'model__max_depth': 11, 'model__num_leaves': 219, 'model__subsample': 0.775219398624525, 'model__colsample_bytree': 0.9080582776976984, 'model__min_child_samples': 35}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002499 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:17:51,139] Trial 18 finished with value: 0.3805328981381775 and parameters: {'model__n_estimators': 529, 'model__learning_rate': 0.09314803152101844, 'model__max_depth': 8, 'model__num_leaves': 180, 'model__subsample': 0.813425678040704, 'model__colsample_bytree': 0.7083825750486391, 'model__min_child_samples': 28}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002873 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:17:58,323] Trial 19 finished with value: 0.3826306976816905 and parameters: {'model__n_estimators': 361, 'model__learning_rate': 0.02476837300597333, 'model__max_depth': 11, 'model__num_leaves': 100, 'model__subsample': 0.9228503167141147, 'model__colsample_bytree': 0.6365086558983031, 'model__min_child_samples': 18}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002258 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:18:01,955] Trial 20 finished with value: 0.3634303079563487 and parameters: {'model__n_estimators': 261, 'model__learning_rate': 0.011134168972191919, 'model__max_depth': 9, 'model__num_leaves': 72, 'model__subsample': 0.7485056631469634, 'model__colsample_bytree': 0.8680886407455952, 'model__min_child_samples': 50}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002509 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:18:06,213] Trial 21 finished with value: 0.38506469042799274 and parameters: {'model__n_estimators': 201, 'model__learning_rate': 0.05541675437183934, 'model__max_depth': 12, 'model__num_leaves': 159, 'model__subsample': 0.8508779527633592, 'model__colsample_bytree': 0.609745692239812, 'model__min_child_samples': 36}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007479 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:18:12,747] Trial 22 finished with value: 0.3821091655694887 and parameters: {'model__n_estimators': 249, 'model__learning_rate': 0.09916419051257659, 'model__max_depth': 12, 'model__num_leaves': 158, 'model__subsample': 0.8397114227316435, 'model__colsample_bytree': 0.703639025254068, 'model__min_child_samples': 34}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001747 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:18:17,359] Trial 23 finished with value: 0.38437236529126595 and parameters: {'model__n_estimators': 244, 'model__learning_rate': 0.060253822885761595, 'model__max_depth': 11, 'model__num_leaves': 124, 'model__subsample': 0.8923470383624652, 'model__colsample_bytree': 0.6340849915987747, 'model__min_child_samples': 32}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011538 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:18:33,668] Trial 24 finished with value: 0.3855951264700211 and parameters: {'model__n_estimators': 204, 'model__learning_rate': 0.07588948082895804, 'model__max_depth': 12, 'model__num_leaves': 174, 'model__subsample': 0.8341504352851791, 'model__colsample_bytree': 0.6003650195309591, 'model__min_child_samples': 38}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001590 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:18:50,437] Trial 25 finished with value: 0.38587929554859035 and parameters: {'model__n_estimators': 573, 'model__learning_rate': 0.03177289620664807, 'model__max_depth': 10, 'model__num_leaves': 137, 'model__subsample': 0.8760013532910472, 'model__colsample_bytree': 0.7039913700938316, 'model__min_child_samples': 27}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012537 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:18:57,751] Trial 26 finished with value: 0.3822963123347014 and parameters: {'model__n_estimators': 296, 'model__learning_rate': 0.10602850862450698, 'model__max_depth': 11, 'model__num_leaves': 201, 'model__subsample': 0.9191685969669663, 'model__colsample_bytree': 0.6361047009524041, 'model__min_child_samples': 21}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:19:04,844] Trial 27 finished with value: 0.3824871185439378 and parameters: {'model__n_estimators': 438, 'model__learning_rate': 0.0453266875551371, 'model__max_depth': 8, 'model__num_leaves': 165, 'model__subsample': 0.8248813000171921, 'model__colsample_bytree': 0.6843402572680314, 'model__min_child_samples': 43}. Best is trial 11 with value: 0.38623016958391904.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:19:11,662] Trial 28 finished with value: 0.38706263100271326 and parameters: {'model__n_estimators': 338, 'model__learning_rate': 0.051556313156039695, 'model__max_depth': 12, 'model__num_leaves': 137, 'model__subsample': 0.9416295918768198, 'model__colsample_bytree': 0.6418034861398971, 'model__min_child_samples': 32}. Best is trial 28 with value: 0.38706263100271326.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004619 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:19:18,567] Trial 29 finished with value: 0.38394529082444445 and parameters: {'model__n_estimators': 371, 'model__learning_rate': 0.046126652930428315, 'model__max_depth': 10, 'model__num_leaves': 110, 'model__subsample': 0.9271383633099031, 'model__colsample_bytree': 0.649904539318114, 'model__min_child_samples': 11}. Best is trial 28 with value: 0.38706263100271326.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009533 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:19:24,488] Trial 30 finished with value: 0.3597278749331729 and parameters: {'model__n_estimators': 281, 'model__learning_rate': 0.1996530303563301, 'model__max_depth': 11, 'model__num_leaves': 192, 'model__subsample': 0.9470455217209012, 'model__colsample_bytree': 0.7275592517184961, 'model__min_child_samples': 28}. Best is trial 28 with value: 0.38706263100271326.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007556 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:19:30,239] Trial 31 finished with value: 0.38505602144321327 and parameters: {'model__n_estimators': 229, 'model__learning_rate': 0.052912902554419376, 'model__max_depth': 12, 'model__num_leaves': 139, 'model__subsample': 0.8687200918844008, 'model__colsample_bytree': 0.6229554745032934, 'model__min_child_samples': 32}. Best is trial 28 with value: 0.38706263100271326.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008999 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:19:36,325] Trial 32 finished with value: 0.3830111060424891 and parameters: {'model__n_estimators': 342, 'model__learning_rate': 0.0641118347698363, 'model__max_depth': 12, 'model__num_leaves': 133, 'model__subsample': 0.8935303010076144, 'model__colsample_bytree': 0.6561643808582852, 'model__min_child_samples': 38}. Best is trial 28 with value: 0.38706263100271326.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001558 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:19:43,006] Trial 33 finished with value: 0.385544150818384 and parameters: {'model__n_estimators': 232, 'model__learning_rate': 0.07557668144278035, 'model__max_depth': 11, 'model__num_leaves': 170, 'model__subsample': 0.9992023891944146, 'model__colsample_bytree': 0.625332482749281, 'model__min_child_samples': 33}. Best is trial 28 with value: 0.38706263100271326.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004644 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:19:44,800] Trial 34 finished with value: 0.35159407158014244 and parameters: {'model__n_estimators': 277, 'model__learning_rate': 0.11854459927469732, 'model__max_depth': 3, 'model__num_leaves': 208, 'model__subsample': 0.7894170907270549, 'model__colsample_bytree': 0.6045399237559955, 'model__min_child_samples': 38}. Best is trial 28 with value: 0.38706263100271326.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011506 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:19:53,740] Trial 35 finished with value: 0.38721251256450995 and parameters: {'model__n_estimators': 411, 'model__learning_rate': 0.04900847530563631, 'model__max_depth': 12, 'model__num_leaves': 152, 'model__subsample': 0.721455577062599, 'model__colsample_bytree': 0.6812661281747796, 'model__min_child_samples': 30}. Best is trial 35 with value: 0.38721251256450995.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003061 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:19:58,566] Trial 36 finished with value: 0.3813704267830282 and parameters: {'model__n_estimators': 441, 'model__learning_rate': 0.05030950206875688, 'model__max_depth': 10, 'model__num_leaves': 72, 'model__subsample': 0.719362122619149, 'model__colsample_bytree': 0.6843571821096895, 'model__min_child_samples': 26}. Best is trial 35 with value: 0.38721251256450995.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002611 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:20:07,251] Trial 37 finished with value: 0.38370494374948666 and parameters: {'model__n_estimators': 404, 'model__learning_rate': 0.040323175602297735, 'model__max_depth': 11, 'model__num_leaves': 147, 'model__subsample': 0.7262526479676015, 'model__colsample_bytree': 0.7810238934155106, 'model__min_child_samples': 31}. Best is trial 35 with value: 0.38721251256450995.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002474 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:20:10,093] Trial 38 finished with value: 0.35905819346597134 and parameters: {'model__n_estimators': 474, 'model__learning_rate': 0.03177532151342608, 'model__max_depth': 4, 'model__num_leaves': 106, 'model__subsample': 0.6013607437284906, 'model__colsample_bytree': 0.7347101761296765, 'model__min_child_samples': 21}. Best is trial 35 with value: 0.38721251256450995.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007679 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:20:20,380] Trial 39 finished with value: 0.37993920957840926 and parameters: {'model__n_estimators': 514, 'model__learning_rate': 0.08640475110871174, 'model__max_depth': 9, 'model__num_leaves': 223, 'model__subsample': 0.9588999887887089, 'model__colsample_bytree': 0.6619066240655034, 'model__min_child_samples': 29}. Best is trial 35 with value: 0.38721251256450995.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003828 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:20:23,125] Trial 40 finished with value: 0.3837768687771449 and parameters: {'model__n_estimators': 327, 'model__learning_rate': 0.12440103723027747, 'model__max_depth': 12, 'model__num_leaves': 48, 'model__subsample': 0.7484631627148348, 'model__colsample_bytree': 0.9826757262275001, 'model__min_child_samples': 16}. Best is trial 35 with value: 0.38721251256450995.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007428 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:20:33,181] Trial 41 finished with value: 0.3838050002368859 and parameters: {'model__n_estimators': 361, 'model__learning_rate': 0.06754591585125472, 'model__max_depth': 12, 'model__num_leaves': 186, 'model__subsample': 0.6748521057728585, 'model__colsample_bytree': 0.6192064820806135, 'model__min_child_samples': 30}. Best is trial 35 with value: 0.38721251256450995.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002348 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:20:39,083] Trial 42 finished with value: 0.3846186960948228 and parameters: {'model__n_estimators': 232, 'model__learning_rate': 0.04602596855913219, 'model__max_depth': 12, 'model__num_leaves': 149, 'model__subsample': 0.8100329363929214, 'model__colsample_bytree': 0.648484788482241, 'model__min_child_samples': 36}. Best is trial 35 with value: 0.38721251256450995.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:20:45,173] Trial 43 finished with value: 0.3798874066910465 and parameters: {'model__n_estimators': 313, 'model__learning_rate': 0.08132461750270882, 'model__max_depth': 11, 'model__num_leaves': 164, 'model__subsample': 0.8523484205786773, 'model__colsample_bytree': 0.6286492359042575, 'model__min_child_samples': 39}. Best is trial 35 with value: 0.38721251256450995.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007015 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:20:49,983] Trial 44 finished with value: 0.36712178270905005 and parameters: {'model__n_estimators': 279, 'model__learning_rate': 0.03655846595519564, 'model__max_depth': 6, 'model__num_leaves': 118, 'model__subsample': 0.9388741933851413, 'model__colsample_bytree': 0.6003282570096193, 'model__min_child_samples': 42}. Best is trial 35 with value: 0.38721251256450995.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001910 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:20:57,624] Trial 45 finished with value: 0.3812848069553763 and parameters: {'model__n_estimators': 474, 'model__learning_rate': 0.0652801872625144, 'model__max_depth': 12, 'model__num_leaves': 135, 'model__subsample': 0.8976121320695336, 'model__colsample_bytree': 0.8509034271575082, 'model__min_child_samples': 34}. Best is trial 35 with value: 0.38721251256450995.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004851 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:21:11,970] Trial 46 finished with value: 0.3802632490013145 and parameters: {'model__n_estimators': 722, 'model__learning_rate': 0.05248960524450197, 'model__max_depth': 10, 'model__num_leaves': 157, 'model__subsample': 0.7871343960626782, 'model__colsample_bytree': 0.6785119095680127, 'model__min_child_samples': 23}. Best is trial 35 with value: 0.38721251256450995.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003133 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:21:16,247] Trial 47 finished with value: 0.383313378717269 and parameters: {'model__n_estimators': 216, 'model__learning_rate': 0.1126660142360133, 'model__max_depth': 11, 'model__num_leaves': 179, 'model__subsample': 0.8670910745739189, 'model__colsample_bytree': 0.6436326134416739, 'model__min_child_samples': 31}. Best is trial 35 with value: 0.38721251256450995.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002268 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:21:26,040] Trial 48 finished with value: 0.3774563296371744 and parameters: {'model__n_estimators': 401, 'model__learning_rate': 0.08945457866679903, 'model__max_depth': 12, 'model__num_leaves': 197, 'model__subsample': 0.9815852894548082, 'model__colsample_bytree': 0.6153101032880338, 'model__min_child_samples': 36}. Best is trial 35 with value: 0.38721251256450995.


[LightGBM] [Info] Number of positive: 19415, number of negative: 108363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 127778, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2026-01-18 20:21:33,508] Trial 49 finished with value: 0.3821840945478919 and parameters: {'model__n_estimators': 347, 'model__learning_rate': 0.022820077888829567, 'model__max_depth': 10, 'model__num_leaves': 143, 'model__subsample': 0.8413059970700424, 'model__colsample_bytree': 0.6713384130746967, 'model__min_child_samples': 26}. Best is trial 35 with value: 0.38721251256450995.


{'model__n_estimators': 411, 'model__learning_rate': 0.04900847530563631, 'model__max_depth': 12, 'model__num_leaves': 152, 'model__subsample': 0.721455577062599, 'model__colsample_bytree': 0.6812661281747796, 'model__min_child_samples': 30}


In [79]:
best_lgb = lgb_pipeline.set_params(**study_lgb.best_params)
print(study_lgb.best_params)

{'model__n_estimators': 411, 'model__learning_rate': 0.04900847530563631, 'model__max_depth': 12, 'model__num_leaves': 152, 'model__subsample': 0.721455577062599, 'model__colsample_bytree': 0.6812661281747796, 'model__min_child_samples': 30}


In [80]:
ml_pipeline.save_model(best_lgb, "lgb_model")

Model saved to: /home/user_stel/project_thesis/thesis_code/data_analysis/models/lgb_model.joblib


PosixPath('/home/user_stel/project_thesis/thesis_code/data_analysis/models/lgb_model.joblib')

# Training on train+val 

In [12]:
# Load all models
models = {
    "rf":  load_model("rf_model"),
    "lr":  load_model("lr_model"),
    "svm": load_model("svm_model"),
    "xgb": load_model("xgb_model"),
    "lgb": load_model("lgb_model")
}

In [13]:
# Prepare combined train+val set
X_trainval = pd.concat([X_train, X_val])
y_trainval = pd.concat([y_train, y_val])

In [14]:
final_results = {}

for name, model in models.items():

    clf = model.named_steps["model"]

    # -----------------------------
    # Handle class imbalance
    # -----------------------------
    if hasattr(clf, "class_weight"):
        class_weights = calculate_class_weights(y_trainval)
        model.set_params(model__class_weight=class_weights)

    elif hasattr(clf, "scale_pos_weight"):
        class_weights = calculate_class_weights(y_trainval)
        scale_pos_weight = class_weights[1] / class_weights[0]
        model.set_params(model__scale_pos_weight=scale_pos_weight)

    # -----------------------------
    # Retrain on train + val
    # -----------------------------
    model.fit(X_trainval, y_trainval)

    # -----------------------------
    # Evaluate on test (ONCE)
    # -----------------------------
    metrics = evaluate_model(
        model,
        X_test,
        y_test,
        split_name=f"{name.upper()} – Final Test"
    )

    final_results[name] = metrics

    # -----------------------------
    # Save final model
    # -----------------------------
    save_model(model, f"{name}_final")


===== RF – Final Test evaluation =====
              precision    recall  f1-score   support

           0      0.922     0.757     0.832     23221
           1      0.330     0.651     0.438      4270

    accuracy                          0.741     27491
   macro avg      0.626     0.704     0.635     27491
weighted avg      0.830     0.741     0.770     27491

Confusion matrix:
[[17585  5636]
 [ 1489  2781]]
Model saved to: /home/user_stel/project_thesis/thesis_code/data_analysis/models/rf_final.joblib

===== LR – Final Test evaluation =====
              precision    recall  f1-score   support

           0      0.927     0.638     0.756     23221
           1      0.270     0.728     0.394      4270

    accuracy                          0.652     27491
   macro avg      0.599     0.683     0.575     27491
weighted avg      0.825     0.652     0.700     27491

Confusion matrix:
[[14816  8405]
 [ 1161  3109]]
Model saved to: /home/user_stel/project_thesis/thesis_code/data_analysis




===== SVM – Final Test evaluation =====
              precision    recall  f1-score   support

           0      0.845     1.000     0.916     23221
           1      0.000     0.000     0.000      4270

    accuracy                          0.845     27491
   macro avg      0.422     0.500     0.458     27491
weighted avg      0.713     0.845     0.774     27491

Confusion matrix:
[[23221     0]
 [ 4270     0]]
Model saved to: /home/user_stel/project_thesis/thesis_code/data_analysis/models/svm_final.joblib


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



===== XGB – Final Test evaluation =====
              precision    recall  f1-score   support

           0      0.933     0.681     0.788     23221
           1      0.298     0.734     0.423      4270

    accuracy                          0.690     27491
   macro avg      0.615     0.708     0.606     27491
weighted avg      0.834     0.690     0.731     27491

Confusion matrix:
[[15822  7399]
 [ 1136  3134]]
Model saved to: /home/user_stel/project_thesis/thesis_code/data_analysis/models/xgb_final.joblib
[LightGBM] [Info] Number of positive: 23613, number of negative: 132786
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012523 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 156399, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.




===== LGB – Final Test evaluation =====
              precision    recall  f1-score   support

           0      0.932     0.688     0.791     23221
           1      0.300     0.726     0.424      4270

    accuracy                          0.694     27491
   macro avg      0.616     0.707     0.608     27491
weighted avg      0.834     0.694     0.734     27491

Confusion matrix:
[[15975  7246]
 [ 1171  3099]]
Model saved to: /home/user_stel/project_thesis/thesis_code/data_analysis/models/lgb_final.joblib


In [16]:
results_df = (
    pd.DataFrame(final_results)
    .T
    .rename_axis("model")
    .reset_index()
)

results_df

Unnamed: 0,model,average_precision,mcc
0,rf,0.428502,0.321091
1,lr,0.311919,0.268812
2,svm,0.155324,0.0
3,xgb,0.418933,0.309441
4,lgb,0.414107,0.309321


In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

for name, model in models.items():
    y_score = model.predict_proba(X_test)[:, 1]
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    plt.plot(recall, precision, label=name)

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()
plt.title("Precision–Recall curves (test set)")
plt.show()

# Model Eavluation

In [None]:
models = {
    "RandomForest": best_rf,
    "LogisticRegression": best_lr,
    "SVM": best_svm,
    "XGBoost": best_xgb,
    "LightGBM": best_lgb
}

results = {}

for name, model in models.items():
    results[name] = evaluate_model(
        model,
        X_test,
        y_test,
        split_name=f"{name} – Test"
    )