# Modeling
Experiment different algorithms and pick best model

In [11]:
# Imports
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score
import joblib
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score

In [2]:
# load in split data
X_train = np.load("/Users/rohith/Desktop/fraud-detection-ml/Data/prep/creditcard_X_train.npy")
y_train = np.load("/Users/rohith/Desktop/fraud-detection-ml/Data/prep/creditcard_y_train.npy")
X_val   = np.load("/Users/rohith/Desktop/fraud-detection-ml/Data/prep/creditcard_X_val.npy")
y_val   = np.load("/Users/rohith/Desktop/fraud-detection-ml/Data/prep/creditcard_y_val.npy")
x_test = np.load("/Users/rohith/Desktop/fraud-detection-ml/Data/prep/creditcard_X_test.npy")
y_test   = np.load("/Users/rohith/Desktop/fraud-detection-ml/Data/prep/creditcard_y_test.npy")

print(f"Train shape: {X_train.shape},  Val shape: {X_val.shape}")

Train shape: (192964, 11),  Val shape: (41349, 11)


### Baseline runs

In [3]:
# Dummy
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train)
print("Dummy AUC:", roc_auc_score(y_val, dummy.predict_proba(X_val)[:,1]))

Dummy AUC: 0.5


In [4]:
# Shallow Tree
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(X_train, y_train)
print("Shallow Tree AUC:",
      roc_auc_score(y_val, tree.predict_proba(X_val)[:,1]))


Shallow Tree AUC: 0.6963779748210523


In [5]:
# RandomForrest Classifier
rForrest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=1)

rForrest.fit(X_train, y_train)

prob = rForrest.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, prob)
print("RandomForest (100 trees) ROC AUC:", round(auc, 4))


RandomForest (100 trees) ROC AUC: 0.9447


In [6]:
# LightGBM
lgbm = LGBMClassifier(
    is_unbalance=True,    
    n_estimators=100,     
    learning_rate=0.1, 
    random_state=42,
    n_jobs=-1
)

lgbm.fit(X_train, y_train)

proba = lgbm.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, proba)
print("LightGBM (is_unbalance=True) ROC AUC:", round(auc, 4))

[LightGBM] [Info] Number of positive: 365, number of negative: 192599
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000784 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 192964, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001892 -> initscore=-6.268468
[LightGBM] [Info] Start training from score -6.268468
LightGBM (is_unbalance=True) ROC AUC: 0.907




1. Dummy AUC: 0.500  
2. Shallow Tree AUC: 0.696  
3. RandomForest ROC AUC: 0.945  
4. LightGBM ROC AUC: 0.907

### Tuning Random Forrest Classifier

#### Randomized Search CV

In [7]:
# Parameters to tune
params = {
    "n_estimators": [50, 100, 250, 500, 1000],
    "max_depth": [None, 5, 10, 20],
    "max_features":    ["sqrt", "log2", 0.3, 0.5],
    "min_samples_split":[2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

In [8]:
# cross-validation strategy
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [9]:
# Search configs
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rs = RandomizedSearchCV(
    rf,
    param_distributions=params,
    n_iter=30,               
    scoring="roc_auc",
    cv=cv,
    verbose=1,
    random_state=42
)

In [10]:
# Finding best params
"""
rs.fit(X_train, y_train)
print("Best ROC-AUC (val):", rs.best_score_)
print("Best params:", rs.best_params_)
"""

'\nrs.fit(X_train, y_train)\nprint("Best ROC-AUC (val):", rs.best_score_)\nprint("Best params:", rs.best_params_)\n'

##### Randomized Search Results
Fitting 3 folds for each of 30 candidates, totalling 90 fits  
Best ROC-AUC (val): 0.9720197513738311  
Best params: {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 0.3, 'max_depth': 10}

#### Optuna

In [15]:
def rf_objective(trial):
    params = {
        "n_estimators":    trial.suggest_int("n_estimators", 100, 800),
        "max_depth":       trial.suggest_int("max_depth", 5, 30),
        "max_features":    trial.suggest_categorical("max_features", ["sqrt", "log2", 0.3, 0.5]),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf":  trial.suggest_int("min_samples_leaf", 1, 4),
    }
    clf = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
    # 3-fold CV AUC
    score = cross_val_score(clf, X_train, y_train, cv=3, scoring="roc_auc").mean()
    return score

In [None]:
# Run optuna study
'''
study = optuna.create_study(direction="maximize")
study.optimize(rf_objective, n_trials=50, show_progress_bar=True)
'''

[I 2025-07-03 17:00:05,164] A new study created in memory with name: no-name-da2d9200-5b90-40aa-ab7f-f01d14a69fb0
Best trial: 0. Best value: 0.958488:   2%|▏         | 1/50 [01:21<1:06:22, 81.27s/it]

[I 2025-07-03 17:01:26,433] Trial 0 finished with value: 0.9584877954973038 and parameters: {'n_estimators': 350, 'max_depth': 29, 'max_features': 'sqrt', 'min_samples_split': 9, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.9584877954973038.


Best trial: 1. Best value: 0.967414:   4%|▍         | 2/50 [08:03<3:35:49, 269.79s/it]

[I 2025-07-03 17:08:08,183] Trial 1 finished with value: 0.9674141407274811 and parameters: {'n_estimators': 760, 'max_depth': 13, 'max_features': 0.5, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.9674141407274811.


Best trial: 2. Best value: 0.969235:   6%|▌         | 3/50 [12:13<3:24:24, 260.94s/it]

[I 2025-07-03 17:12:18,591] Trial 2 finished with value: 0.969234923668672 and parameters: {'n_estimators': 514, 'max_depth': 10, 'max_features': 0.5, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 2 with value: 0.969234923668672.


Best trial: 2. Best value: 0.969235:   8%|▊         | 4/50 [13:35<2:26:01, 190.46s/it]

[I 2025-07-03 17:13:41,002] Trial 3 finished with value: 0.9649961845150798 and parameters: {'n_estimators': 274, 'max_depth': 17, 'max_features': 0.3, 'min_samples_split': 6, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.969234923668672.


Best trial: 2. Best value: 0.969235:  10%|█         | 5/50 [14:43<1:49:44, 146.33s/it]

[I 2025-07-03 17:14:49,090] Trial 4 finished with value: 0.946904706654387 and parameters: {'n_estimators': 121, 'max_depth': 29, 'max_features': 0.5, 'min_samples_split': 6, 'min_samples_leaf': 2}. Best is trial 2 with value: 0.969234923668672.


Best trial: 2. Best value: 0.969235:  12%|█▏        | 6/50 [16:15<1:33:44, 127.82s/it]

[I 2025-07-03 17:16:20,984] Trial 5 finished with value: 0.9561319766530157 and parameters: {'n_estimators': 307, 'max_depth': 23, 'max_features': 0.3, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 2 with value: 0.969234923668672.


Best trial: 2. Best value: 0.969235:  14%|█▍        | 7/50 [16:47<1:09:11, 96.55s/it] 

[I 2025-07-03 17:16:53,148] Trial 6 finished with value: 0.948520228926775 and parameters: {'n_estimators': 104, 'max_depth': 19, 'max_features': 'sqrt', 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 2 with value: 0.969234923668672.


Best trial: 2. Best value: 0.969235:  16%|█▌        | 8/50 [19:59<1:28:46, 126.81s/it]

[I 2025-07-03 17:20:04,756] Trial 7 finished with value: 0.9684939010594755 and parameters: {'n_estimators': 697, 'max_depth': 10, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 2 with value: 0.969234923668672.


Best trial: 2. Best value: 0.969235:  18%|█▊        | 9/50 [22:16<1:28:43, 129.84s/it]

[I 2025-07-03 17:22:21,265] Trial 8 finished with value: 0.9663437568932282 and parameters: {'n_estimators': 463, 'max_depth': 13, 'max_features': 'log2', 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 2 with value: 0.969234923668672.


Best trial: 2. Best value: 0.969235:  20%|██        | 10/50 [28:02<2:11:02, 196.56s/it]

[I 2025-07-03 17:28:07,210] Trial 9 finished with value: 0.9626787003014821 and parameters: {'n_estimators': 621, 'max_depth': 18, 'max_features': 0.5, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.969234923668672.


Best trial: 2. Best value: 0.969235:  22%|██▏       | 11/50 [30:15<1:55:13, 177.27s/it]

[I 2025-07-03 17:30:20,735] Trial 10 finished with value: 0.9664919231532721 and parameters: {'n_estimators': 550, 'max_depth': 5, 'max_features': 0.5, 'min_samples_split': 10, 'min_samples_leaf': 2}. Best is trial 2 with value: 0.969234923668672.


Best trial: 2. Best value: 0.969235:  24%|██▍       | 12/50 [32:03<1:38:58, 156.29s/it]

[I 2025-07-03 17:32:09,035] Trial 11 finished with value: 0.9683281887378697 and parameters: {'n_estimators': 763, 'max_depth': 5, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 2 with value: 0.969234923668672.


Best trial: 2. Best value: 0.969235:  26%|██▌       | 13/50 [34:32<1:34:56, 153.95s/it]

[I 2025-07-03 17:34:37,626] Trial 12 finished with value: 0.9688786736347091 and parameters: {'n_estimators': 631, 'max_depth': 10, 'max_features': 'log2', 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 2 with value: 0.969234923668672.


Best trial: 2. Best value: 0.969235:  28%|██▊       | 14/50 [36:42<1:28:02, 146.74s/it]

[I 2025-07-03 17:36:47,671] Trial 13 finished with value: 0.9681585712759455 and parameters: {'n_estimators': 536, 'max_depth': 10, 'max_features': 'log2', 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 2 with value: 0.969234923668672.


Best trial: 14. Best value: 0.969972:  30%|███       | 15/50 [41:19<1:48:31, 186.03s/it]

[I 2025-07-03 17:41:24,772] Trial 14 finished with value: 0.9699724933177231 and parameters: {'n_estimators': 622, 'max_depth': 9, 'max_features': 0.5, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 14 with value: 0.9699724933177231.


Best trial: 14. Best value: 0.969972:  32%|███▏      | 16/50 [44:19<1:44:26, 184.31s/it]

[I 2025-07-03 17:44:25,077] Trial 15 finished with value: 0.9699381078528996 and parameters: {'n_estimators': 445, 'max_depth': 8, 'max_features': 0.5, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 14 with value: 0.9699724933177231.


Best trial: 14. Best value: 0.969972:  34%|███▍      | 17/50 [46:47<1:35:17, 173.26s/it]

[I 2025-07-03 17:46:52,656] Trial 16 finished with value: 0.9693047536841545 and parameters: {'n_estimators': 415, 'max_depth': 7, 'max_features': 0.5, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 14 with value: 0.9699724933177231.


Best trial: 14. Best value: 0.969972:  36%|███▌      | 18/50 [48:39<1:22:32, 154.77s/it]

[I 2025-07-03 17:48:44,381] Trial 17 finished with value: 0.964116733008896 and parameters: {'n_estimators': 232, 'max_depth': 14, 'max_features': 0.5, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 14 with value: 0.9699724933177231.


Best trial: 14. Best value: 0.969972:  38%|███▊      | 19/50 [51:03<1:18:23, 151.73s/it]

[I 2025-07-03 17:51:09,034] Trial 18 finished with value: 0.9698840514128846 and parameters: {'n_estimators': 406, 'max_depth': 8, 'max_features': 0.5, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 14 with value: 0.9699724933177231.


Best trial: 14. Best value: 0.969972:  40%|████      | 20/50 [53:51<1:18:15, 156.50s/it]

[I 2025-07-03 17:53:56,656] Trial 19 finished with value: 0.9611361709969461 and parameters: {'n_estimators': 635, 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 14 with value: 0.9699724933177231.


Best trial: 14. Best value: 0.969972:  42%|████▏     | 21/50 [56:00<1:11:40, 148.31s/it]

[I 2025-07-03 17:56:05,864] Trial 20 finished with value: 0.9549217353175571 and parameters: {'n_estimators': 477, 'max_depth': 23, 'max_features': 0.3, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 14 with value: 0.9699724933177231.


Best trial: 14. Best value: 0.969972:  44%|████▍     | 22/50 [58:14<1:07:14, 144.09s/it]

[I 2025-07-03 17:58:20,104] Trial 21 finished with value: 0.9694953313737521 and parameters: {'n_estimators': 361, 'max_depth': 8, 'max_features': 0.5, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 14 with value: 0.9699724933177231.


Best trial: 14. Best value: 0.969972:  46%|████▌     | 23/50 [1:00:23<1:02:44, 139.42s/it]

[I 2025-07-03 18:00:28,614] Trial 22 finished with value: 0.9680939381399184 and parameters: {'n_estimators': 388, 'max_depth': 7, 'max_features': 0.5, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 14 with value: 0.9699724933177231.


Best trial: 14. Best value: 0.969972:  48%|████▊     | 24/50 [1:04:03<1:10:52, 163.57s/it]

[I 2025-07-03 18:04:08,523] Trial 23 finished with value: 0.9698598816196943 and parameters: {'n_estimators': 578, 'max_depth': 8, 'max_features': 0.5, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 14 with value: 0.9699724933177231.


Best trial: 14. Best value: 0.969972:  50%|█████     | 25/50 [1:17:48<2:30:50, 362.02s/it]

[I 2025-07-03 18:17:53,507] Trial 24 finished with value: 0.9676374283509065 and parameters: {'n_estimators': 445, 'max_depth': 11, 'max_features': 0.5, 'min_samples_split': 6, 'min_samples_leaf': 3}. Best is trial 14 with value: 0.9699724933177231.


Best trial: 14. Best value: 0.969972:  52%|█████▏    | 26/50 [1:21:56<2:11:08, 327.86s/it]

[I 2025-07-03 18:22:01,669] Trial 25 finished with value: 0.9670973488864952 and parameters: {'n_estimators': 247, 'max_depth': 5, 'max_features': 0.5, 'min_samples_split': 9, 'min_samples_leaf': 2}. Best is trial 14 with value: 0.9699724933177231.


Best trial: 14. Best value: 0.969972:  54%|█████▍    | 27/50 [1:25:29<1:52:29, 293.44s/it]

[I 2025-07-03 18:25:34,809] Trial 26 finished with value: 0.9581805810249134 and parameters: {'n_estimators': 182, 'max_depth': 16, 'max_features': 0.5, 'min_samples_split': 6, 'min_samples_leaf': 3}. Best is trial 14 with value: 0.9699724933177231.


Best trial: 14. Best value: 0.969972:  56%|█████▌    | 28/50 [1:28:37<1:36:00, 261.84s/it]

[I 2025-07-03 18:28:42,927] Trial 27 finished with value: 0.9680059837234646 and parameters: {'n_estimators': 497, 'max_depth': 12, 'max_features': 0.5, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 14 with value: 0.9699724933177231.


Best trial: 14. Best value: 0.969972:  58%|█████▊    | 29/50 [1:31:15<1:20:40, 230.48s/it]

[I 2025-07-03 18:31:20,239] Trial 28 finished with value: 0.9547354208356408 and parameters: {'n_estimators': 428, 'max_depth': 20, 'max_features': 0.3, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 14 with value: 0.9699724933177231.


Best trial: 29. Best value: 0.970515:  60%|██████    | 30/50 [1:32:46<1:02:55, 188.77s/it]

[I 2025-07-03 18:32:51,698] Trial 29 finished with value: 0.9705147823667463 and parameters: {'n_estimators': 333, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 29 with value: 0.9705147823667463.


Best trial: 30. Best value: 0.972271:  62%|██████▏   | 31/50 [1:34:15<50:15, 158.70s/it]  

[I 2025-07-03 18:34:20,227] Trial 30 finished with value: 0.9722707800097767 and parameters: {'n_estimators': 347, 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 30 with value: 0.9722707800097767.


Best trial: 30. Best value: 0.972271:  64%|██████▍   | 32/50 [1:35:24<39:37, 132.06s/it]

[I 2025-07-03 18:35:30,118] Trial 31 finished with value: 0.9703877487373423 and parameters: {'n_estimators': 313, 'max_depth': 6, 'max_features': 'sqrt', 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 30 with value: 0.9722707800097767.


Best trial: 30. Best value: 0.972271:  66%|██████▌   | 33/50 [1:36:34<32:05, 113.29s/it]

[I 2025-07-03 18:36:39,633] Trial 32 finished with value: 0.9668229578301105 and parameters: {'n_estimators': 322, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 3, 'min_samples_leaf': 4}. Best is trial 30 with value: 0.9722707800097767.


Best trial: 33. Best value: 0.972438:  68%|██████▊   | 34/50 [1:38:23<29:50, 111.92s/it]

[I 2025-07-03 18:38:28,335] Trial 33 finished with value: 0.9724378337787631 and parameters: {'n_estimators': 333, 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 33 with value: 0.9724378337787631.


Best trial: 33. Best value: 0.972438:  70%|███████   | 35/50 [1:39:43<25:36, 102.46s/it]

[I 2025-07-03 18:39:48,734] Trial 34 finished with value: 0.9702478222910615 and parameters: {'n_estimators': 345, 'max_depth': 6, 'max_features': 'sqrt', 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 33 with value: 0.9724378337787631.


Best trial: 33. Best value: 0.972438:  72%|███████▏  | 36/50 [1:41:07<22:37, 96.93s/it] 

[I 2025-07-03 18:41:12,768] Trial 35 finished with value: 0.9672399341709461 and parameters: {'n_estimators': 288, 'max_depth': 12, 'max_features': 'sqrt', 'min_samples_split': 3, 'min_samples_leaf': 4}. Best is trial 33 with value: 0.9724378337787631.


Best trial: 36. Best value: 0.972879:  74%|███████▍  | 37/50 [1:42:04<18:25, 85.01s/it]

[I 2025-07-03 18:42:09,967] Trial 36 finished with value: 0.9728786086958349 and parameters: {'n_estimators': 188, 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 36 with value: 0.9728786086958349.


Best trial: 36. Best value: 0.972879:  76%|███████▌  | 38/50 [1:42:57<15:04, 75.39s/it]

[I 2025-07-03 18:43:02,895] Trial 37 finished with value: 0.9715480104531881 and parameters: {'n_estimators': 164, 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 36 with value: 0.9728786086958349.


Best trial: 36. Best value: 0.972879:  78%|███████▊  | 39/50 [1:44:14<13:53, 75.74s/it]

[I 2025-07-03 18:44:19,463] Trial 38 finished with value: 0.9545646478326916 and parameters: {'n_estimators': 184, 'max_depth': 30, 'max_features': 'sqrt', 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 36 with value: 0.9728786086958349.


Best trial: 36. Best value: 0.972879:  80%|████████  | 40/50 [1:45:22<12:13, 73.37s/it]

[I 2025-07-03 18:45:27,310] Trial 39 finished with value: 0.9549622914458359 and parameters: {'n_estimators': 158, 'max_depth': 23, 'max_features': 'sqrt', 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 36 with value: 0.9728786086958349.


Best trial: 36. Best value: 0.972879:  82%|████████▏ | 41/50 [1:46:52<11:46, 78.47s/it]

[I 2025-07-03 18:46:57,664] Trial 40 finished with value: 0.966458863152866 and parameters: {'n_estimators': 247, 'max_depth': 11, 'max_features': 'sqrt', 'min_samples_split': 5, 'min_samples_leaf': 4}. Best is trial 36 with value: 0.9728786086958349.


Best trial: 36. Best value: 0.972879:  84%|████████▍ | 42/50 [1:47:46<09:28, 71.11s/it]

[I 2025-07-03 18:47:51,617] Trial 41 finished with value: 0.9686148315105271 and parameters: {'n_estimators': 145, 'max_depth': 9, 'max_features': 'sqrt', 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 36 with value: 0.9728786086958349.


Best trial: 36. Best value: 0.972879:  86%|████████▌ | 43/50 [1:48:43<07:48, 66.96s/it]

[I 2025-07-03 18:48:48,890] Trial 42 finished with value: 0.9727316682099527 and parameters: {'n_estimators': 205, 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 36 with value: 0.9728786086958349.


Best trial: 36. Best value: 0.972879:  88%|████████▊ | 44/50 [1:49:34<06:12, 62.16s/it]

[I 2025-07-03 18:49:39,849] Trial 43 finished with value: 0.9706806555436208 and parameters: {'n_estimators': 207, 'max_depth': 6, 'max_features': 'sqrt', 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 36 with value: 0.9728786086958349.


Best trial: 36. Best value: 0.972879:  90%|█████████ | 45/50 [1:50:04<04:22, 52.48s/it]

[I 2025-07-03 18:50:09,729] Trial 44 finished with value: 0.9723075012635517 and parameters: {'n_estimators': 108, 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 36 with value: 0.9728786086958349.


Best trial: 36. Best value: 0.972879:  92%|█████████▏| 46/50 [1:50:38<03:07, 46.97s/it]

[I 2025-07-03 18:50:43,866] Trial 45 finished with value: 0.9668789110231671 and parameters: {'n_estimators': 101, 'max_depth': 9, 'max_features': 'sqrt', 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 36 with value: 0.9728786086958349.


Best trial: 36. Best value: 0.972879:  94%|█████████▍| 47/50 [1:51:28<02:23, 47.79s/it]

[I 2025-07-03 18:51:33,565] Trial 46 finished with value: 0.9662903965943107 and parameters: {'n_estimators': 129, 'max_depth': 13, 'max_features': 'sqrt', 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 36 with value: 0.9728786086958349.


Best trial: 36. Best value: 0.972879:  96%|█████████▌| 48/50 [1:53:00<02:02, 61.15s/it]

[I 2025-07-03 18:53:05,897] Trial 47 finished with value: 0.9693337278808242 and parameters: {'n_estimators': 268, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 36 with value: 0.9728786086958349.


Best trial: 36. Best value: 0.972879:  98%|█████████▊| 49/50 [1:54:21<01:07, 67.13s/it]

[I 2025-07-03 18:54:26,956] Trial 48 finished with value: 0.9555777866905908 and parameters: {'n_estimators': 209, 'max_depth': 26, 'max_features': 'sqrt', 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 36 with value: 0.9728786086958349.


Best trial: 36. Best value: 0.972879: 100%|██████████| 50/50 [1:55:17<00:00, 138.35s/it]

[I 2025-07-03 18:55:22,548] Trial 49 finished with value: 0.9710923367988237 and parameters: {'n_estimators': 280, 'max_depth': 6, 'max_features': 'sqrt', 'min_samples_split': 3, 'min_samples_leaf': 4}. Best is trial 36 with value: 0.9728786086958349.





##### Optuna Results
[I 2025-07-03 18:55:22,548] Trial 49 finished with value: 0.9710923367988237 and parameters: {'n_estimators': 280, 'max_depth': 6, 'max_features': 'sqrt', 'min_samples_split': 3, 'min_samples_leaf': 4}.   
Best is trial 36 with value: 0.9728786086958349.

### Tuning LightGBM

#### Randomized Search CV

In [39]:
# Reload data
X_train = np.load("/Users/rohith/Desktop/fraud-detection-ml/Data/prep/creditcard_X_train.npy")
y_train = np.load("/Users/rohith/Desktop/fraud-detection-ml/Data/prep/creditcard_y_train.npy")

In [40]:
# Params
lgbm_param_dist = {
    "n_estimators":     [100, 300, 500, 800],
    "learning_rate":    [0.01, 0.05, 0.1],
    "max_depth":        [5, 10, 15, None],
    "num_leaves":       [31, 50, 100, 200],
    "scale_pos_weight": [1, 50, 100, 200]
}

In [41]:
# Set up CV
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [42]:
# Search configs
lgbm = LGBMClassifier(is_unbalance=True, random_state=42, n_jobs=-1, verbose=-1)
rs = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=lgbm_param_dist,
    n_iter=30,
    scoring="roc_auc",
    cv=cv,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

In [None]:
"""
# Find best params
rs.fit(X_train, y_train)

print("Best LGBM ROC-AUC (val):", rs.best_score_)
print("Best params:", rs.best_params_)
"""

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[LightGBM] [Fatal] Cannot set is_unbalance and scale_pos_weight at the same time
[LightGBM] [Fatal] Cannot set is_unbalance and scale_pos_weight at the same time
[LightGBM] [Fatal] Cannot set is_unbalance and scale_pos_weight at the same time
[LightGBM] [Fatal] Cannot set is_unbalance and scale_pos_weight at the same time
[LightGBM] [Fatal] Cannot set is_unbalance and scale_pos_weight at the same time
[LightGBM] [Fatal] Cannot set is_unbalance and scale_pos_weight at the same time
[LightGBM] [Fatal] Cannot set is_unbalance and scale_pos_weight at the same time
[LightGBM] [Fatal] Cannot set is_unbalance and scale_pos_weight at the same time
[LightGBM] [Fatal] Cannot set is_unbalance and scale_pos_weight at the same time
[LightGBM] [Fatal] Cannot set is_unbalance and scale_pos_weight at the same time
[LightGBM] [Fatal] Cannot set is_unbalance and scale_pos_weight at the same time
[LightGBM] [Fatal] Cannot set is_unbalance and scale_pos_weight at the same time
[LightGBM] [Fatal] Cannot se

Best LGBM ROC-AUC (val): 0.9600263136892746
Best params: {'scale_pos_weight': 1, 'num_leaves': 50, 'n_estimators': 300, 'max_depth': None, 'learning_rate': 0.01}


##### Randomized SearchCV Results
Best LGBM ROC-AUC (val): 0.9600263136892746  
Best params: {'scale_pos_weight': 1, 'num_leaves': 50, 'n_estimators': 300, 'max_depth': None, 'learning_rate': 0.01}

#### Optuna

In [50]:
# Optuna objective
def lgbm_objective(trial):
    params = {
        "n_estimators":     trial.suggest_int("n_estimators", 100, 800),
        "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),
        "max_depth":        trial.suggest_int("max_depth", 5, 20),
        "num_leaves":       trial.suggest_int("num_leaves", 31, 200),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 100, 1000, log=True)
    }
    clf = LGBMClassifier(**params, random_state=42, n_jobs=-1, verbose=-1)
    # 3-fold CV AUC
    auc = cross_val_score(clf, X_train, y_train, cv=3, scoring="roc_auc").mean()
    return auc

In [None]:
"""
# Run study
study = optuna.create_study(direction="maximize")
study.optimize(lgbm_objective, n_trials=50, show_progress_bar=True)

# Best trial
print("Best LGBM ROC-AUC (val):", study.best_value)
print("Best params:", study.best_params)
"""

[I 2025-07-03 21:38:04,842] A new study created in memory with name: no-name-3543431d-e1ac-4f22-b359-4b01aa3840e2
  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),
  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:38:42,497] Trial 0 finished with value: 0.9327846994635886 and parameters: {'n_estimators': 778, 'learning_rate': 0.020743938796379792, 'max_depth': 11, 'num_leaves': 180, 'scale_pos_weight': 453.827234469319}. Best is trial 0 with value: 0.9327846994635886.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:38:57,289] Trial 1 finished with value: 0.9500284613683273 and parameters: {'n_estimators': 260, 'learning_rate': 0.0016774851264175013, 'max_depth': 14, 'num_leaves': 153, 'scale_pos_weight': 483.08341339705095}. Best is trial 1 with value: 0.9500284613683273.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:39:03,505] Trial 2 finished with value: 0.953045118902223 and parameters: {'n_estimators': 220, 'learning_rate': 0.011965488983276682, 'max_depth': 7, 'num_leaves': 78, 'scale_pos_weight': 170.91043146709853}. Best is trial 2 with value: 0.953045118902223.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:39:08,213] Trial 3 finished with value: 0.881924716198355 and parameters: {'n_estimators': 728, 'learning_rate': 0.06616943794760337, 'max_depth': 20, 'num_leaves': 106, 'scale_pos_weight': 100.19219000736625}. Best is trial 2 with value: 0.953045118902223.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:39:25,827] Trial 4 finished with value: 0.948598627338538 and parameters: {'n_estimators': 548, 'learning_rate': 0.01443227332064528, 'max_depth': 9, 'num_leaves': 79, 'scale_pos_weight': 318.50170926278935}. Best is trial 2 with value: 0.953045118902223.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:39:45,221] Trial 5 finished with value: 0.9532521686272725 and parameters: {'n_estimators': 721, 'learning_rate': 0.001126544916142429, 'max_depth': 18, 'num_leaves': 61, 'scale_pos_weight': 151.5877160799825}. Best is trial 5 with value: 0.9532521686272725.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:40:07,519] Trial 6 finished with value: 0.9000596283191982 and parameters: {'n_estimators': 344, 'learning_rate': 0.032734510445586264, 'max_depth': 17, 'num_leaves': 188, 'scale_pos_weight': 230.72119517642685}. Best is trial 5 with value: 0.9532521686272725.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:40:12,601] Trial 7 finished with value: 0.8423182565455095 and parameters: {'n_estimators': 640, 'learning_rate': 0.07123819814714881, 'max_depth': 16, 'num_leaves': 134, 'scale_pos_weight': 371.37001547771706}. Best is trial 5 with value: 0.9532521686272725.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:40:22,474] Trial 8 finished with value: 0.950146853454406 and parameters: {'n_estimators': 458, 'learning_rate': 0.015594500984532631, 'max_depth': 7, 'num_leaves': 45, 'scale_pos_weight': 158.31365966353883}. Best is trial 5 with value: 0.9532521686272725.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:40:38,680] Trial 9 finished with value: 0.9531172296621255 and parameters: {'n_estimators': 275, 'learning_rate': 0.010662291790155787, 'max_depth': 15, 'num_leaves': 152, 'scale_pos_weight': 184.4458287291585}. Best is trial 5 with value: 0.9532521686272725.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:40:40,922] Trial 10 finished with value: 0.957907835141953 and parameters: {'n_estimators': 111, 'learning_rate': 0.0011596097374080422, 'max_depth': 19, 'num_leaves': 36, 'scale_pos_weight': 948.7252444888493}. Best is trial 10 with value: 0.957907835141953.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:40:49,246] Trial 11 finished with value: 0.9622246496959056 and parameters: {'n_estimators': 448, 'learning_rate': 0.0011750906153398957, 'max_depth': 20, 'num_leaves': 32, 'scale_pos_weight': 779.5824815387298}. Best is trial 11 with value: 0.9622246496959056.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:40:51,808] Trial 12 finished with value: 0.9500493205245689 and parameters: {'n_estimators': 100, 'learning_rate': 0.0030416702802987523, 'max_depth': 20, 'num_leaves': 43, 'scale_pos_weight': 845.5438426992071}. Best is trial 11 with value: 0.9622246496959056.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:40:53,972] Trial 13 finished with value: 0.9526520688159166 and parameters: {'n_estimators': 108, 'learning_rate': 0.004010514683289036, 'max_depth': 19, 'num_leaves': 36, 'scale_pos_weight': 908.6520657362369}. Best is trial 11 with value: 0.9622246496959056.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:41:09,253] Trial 14 finished with value: 0.9618376072206805 and parameters: {'n_estimators': 414, 'learning_rate': 0.0043763530002078705, 'max_depth': 12, 'num_leaves': 92, 'scale_pos_weight': 622.6916331989207}. Best is trial 11 with value: 0.9622246496959056.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:41:27,070] Trial 15 finished with value: 0.9626761188085157 and parameters: {'n_estimators': 435, 'learning_rate': 0.005286204952968872, 'max_depth': 12, 'num_leaves': 106, 'scale_pos_weight': 661.3544653872166}. Best is trial 15 with value: 0.9626761188085157.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:41:49,391] Trial 16 finished with value: 0.9616717531094668 and parameters: {'n_estimators': 504, 'learning_rate': 0.006104813814432296, 'max_depth': 13, 'num_leaves': 120, 'scale_pos_weight': 642.9099581883463}. Best is trial 15 with value: 0.9626761188085157.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:42:09,387] Trial 17 finished with value: 0.9631252744952592 and parameters: {'n_estimators': 620, 'learning_rate': 0.0023718662408862947, 'max_depth': 10, 'num_leaves': 62, 'scale_pos_weight': 669.0723561737112}. Best is trial 17 with value: 0.9631252744952592.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:42:19,698] Trial 18 finished with value: 0.9623481763513345 and parameters: {'n_estimators': 584, 'learning_rate': 0.0021254077256954848, 'max_depth': 5, 'num_leaves': 100, 'scale_pos_weight': 539.575947110111}. Best is trial 17 with value: 0.9631252744952592.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:42:47,130] Trial 19 finished with value: 0.9649950659843252 and parameters: {'n_estimators': 635, 'learning_rate': 0.005249255294023219, 'max_depth': 10, 'num_leaves': 63, 'scale_pos_weight': 378.4192033093121}. Best is trial 19 with value: 0.9649950659843252.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:43:10,361] Trial 20 finished with value: 0.9573374732865596 and parameters: {'n_estimators': 649, 'learning_rate': 0.002184542190835981, 'max_depth': 10, 'num_leaves': 66, 'scale_pos_weight': 256.9918591094902}. Best is trial 19 with value: 0.9649950659843252.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:43:30,849] Trial 21 finished with value: 0.961743109286905 and parameters: {'n_estimators': 628, 'learning_rate': 0.0070297502977730445, 'max_depth': 9, 'num_leaves': 61, 'scale_pos_weight': 402.1177403976674}. Best is trial 19 with value: 0.9649950659843252.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:43:52,642] Trial 22 finished with value: 0.9624314139245421 and parameters: {'n_estimators': 536, 'learning_rate': 0.006316867680569183, 'max_depth': 12, 'num_leaves': 84, 'scale_pos_weight': 664.8706331736834}. Best is trial 19 with value: 0.9649950659843252.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:44:10,188] Trial 23 finished with value: 0.9591302442914714 and parameters: {'n_estimators': 376, 'learning_rate': 0.003786429524246584, 'max_depth': 8, 'num_leaves': 121, 'scale_pos_weight': 533.9740445512067}. Best is trial 19 with value: 0.9649950659843252.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:44:30,007] Trial 24 finished with value: 0.9628831095775651 and parameters: {'n_estimators': 700, 'learning_rate': 0.0027701618294215316, 'max_depth': 11, 'num_leaves': 55, 'scale_pos_weight': 731.3932528746878}. Best is trial 19 with value: 0.9649950659843252.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:44:50,930] Trial 25 finished with value: 0.9610010304712103 and parameters: {'n_estimators': 800, 'learning_rate': 0.0027320742866039074, 'max_depth': 10, 'num_leaves': 51, 'scale_pos_weight': 341.22287812808827}. Best is trial 19 with value: 0.9649950659843252.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:45:03,186] Trial 26 finished with value: 0.9554890144134692 and parameters: {'n_estimators': 713, 'learning_rate': 0.008387593925119893, 'max_depth': 5, 'num_leaves': 69, 'scale_pos_weight': 770.0025661258259}. Best is trial 19 with value: 0.9649950659843252.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:45:21,901] Trial 27 finished with value: 0.9547511431899854 and parameters: {'n_estimators': 691, 'learning_rate': 0.001671012310734915, 'max_depth': 10, 'num_leaves': 55, 'scale_pos_weight': 250.86182435245215}. Best is trial 19 with value: 0.9649950659843252.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:45:41,075] Trial 28 finished with value: 0.9535345325588646 and parameters: {'n_estimators': 584, 'learning_rate': 0.0026927432462215666, 'max_depth': 14, 'num_leaves': 73, 'scale_pos_weight': 402.6028410715908}. Best is trial 19 with value: 0.9649950659843252.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:46:06,093] Trial 29 finished with value: 0.9296707648978049 and parameters: {'n_estimators': 762, 'learning_rate': 0.023644723068403983, 'max_depth': 11, 'num_leaves': 91, 'scale_pos_weight': 458.0665275965961}. Best is trial 19 with value: 0.9649950659843252.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:46:49,496] Trial 30 finished with value: 0.9617381935226303 and parameters: {'n_estimators': 669, 'learning_rate': 0.003343545579489682, 'max_depth': 11, 'num_leaves': 200, 'scale_pos_weight': 555.6042247600869}. Best is trial 19 with value: 0.9649950659843252.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:47:20,898] Trial 31 finished with value: 0.9638041738769539 and parameters: {'n_estimators': 606, 'learning_rate': 0.0051534180200777935, 'max_depth': 13, 'num_leaves': 141, 'scale_pos_weight': 735.1961589810062}. Best is trial 19 with value: 0.9649950659843252.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:47:54,963] Trial 32 finished with value: 0.960189105010274 and parameters: {'n_estimators': 600, 'learning_rate': 0.0017782232765025854, 'max_depth': 13, 'num_leaves': 161, 'scale_pos_weight': 763.2099047303543}. Best is trial 19 with value: 0.9649950659843252.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:48:28,679] Trial 33 finished with value: 0.9634062728542054 and parameters: {'n_estimators': 510, 'learning_rate': 0.00463323852041788, 'max_depth': 14, 'num_leaves': 145, 'scale_pos_weight': 978.2014903420148}. Best is trial 19 with value: 0.9649950659843252.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:49:02,595] Trial 34 finished with value: 0.9635945618484322 and parameters: {'n_estimators': 493, 'learning_rate': 0.004688591782485369, 'max_depth': 14, 'num_leaves': 144, 'scale_pos_weight': 958.8112366017272}. Best is trial 19 with value: 0.9649950659843252.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:49:33,083] Trial 35 finished with value: 0.9651582885633744 and parameters: {'n_estimators': 487, 'learning_rate': 0.009035849377469507, 'max_depth': 15, 'num_leaves': 138, 'scale_pos_weight': 990.3099112706714}. Best is trial 35 with value: 0.9651582885633744.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:50:06,419] Trial 36 finished with value: 0.9621919884113472 and parameters: {'n_estimators': 487, 'learning_rate': 0.008326815086990354, 'max_depth': 15, 'num_leaves': 166, 'scale_pos_weight': 793.0837862785527}. Best is trial 35 with value: 0.9651582885633744.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:50:38,512] Trial 37 finished with value: 0.9518214790901723 and parameters: {'n_estimators': 543, 'learning_rate': 0.013780658443398978, 'max_depth': 16, 'num_leaves': 133, 'scale_pos_weight': 990.9550835157659}. Best is trial 35 with value: 0.9651582885633744.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:51:01,226] Trial 38 finished with value: 0.9070305422906518 and parameters: {'n_estimators': 365, 'learning_rate': 0.02125288810747738, 'max_depth': 14, 'num_leaves': 135, 'scale_pos_weight': 280.7170349752781}. Best is trial 35 with value: 0.9651582885633744.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:51:11,033] Trial 39 finished with value: 0.8372299317704434 and parameters: {'n_estimators': 564, 'learning_rate': 0.04530078262428435, 'max_depth': 16, 'num_leaves': 175, 'scale_pos_weight': 132.6723168374825}. Best is trial 35 with value: 0.9651582885633744.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:51:32,716] Trial 40 finished with value: 0.9580141081333048 and parameters: {'n_estimators': 328, 'learning_rate': 0.009009314221326944, 'max_depth': 15, 'num_leaves': 143, 'scale_pos_weight': 860.2810683970948}. Best is trial 35 with value: 0.9651582885633744.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:52:03,111] Trial 41 finished with value: 0.961510376046068 and parameters: {'n_estimators': 517, 'learning_rate': 0.005149214053677003, 'max_depth': 13, 'num_leaves': 145, 'scale_pos_weight': 978.3081351923395}. Best is trial 35 with value: 0.9651582885633744.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:52:34,377] Trial 42 finished with value: 0.9647056617179759 and parameters: {'n_estimators': 483, 'learning_rate': 0.0048604414628581945, 'max_depth': 14, 'num_leaves': 156, 'scale_pos_weight': 883.1183987264568}. Best is trial 35 with value: 0.9651582885633744.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:53:07,110] Trial 43 finished with value: 0.9593386572516556 and parameters: {'n_estimators': 477, 'learning_rate': 0.011684731695781145, 'max_depth': 17, 'num_leaves': 159, 'scale_pos_weight': 866.7622446659677}. Best is trial 35 with value: 0.9651582885633744.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:53:29,708] Trial 44 finished with value: 0.9554006756698367 and parameters: {'n_estimators': 400, 'learning_rate': 0.0077320577466656855, 'max_depth': 15, 'num_leaves': 125, 'scale_pos_weight': 204.3279528860306}. Best is trial 35 with value: 0.9651582885633744.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:54:03,063] Trial 45 finished with value: 0.945501361183084 and parameters: {'n_estimators': 463, 'learning_rate': 0.005375851220104824, 'max_depth': 17, 'num_leaves': 172, 'scale_pos_weight': 102.64707291108648}. Best is trial 35 with value: 0.9651582885633744.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:54:07,978] Trial 46 finished with value: 0.8419616735642359 and parameters: {'n_estimators': 745, 'learning_rate': 0.08946222343519787, 'max_depth': 14, 'num_leaves': 152, 'scale_pos_weight': 722.1709855874157}. Best is trial 35 with value: 0.9651582885633744.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:54:24,111] Trial 47 finished with value: 0.9514397795399 and parameters: {'n_estimators': 295, 'learning_rate': 0.011091938168599965, 'max_depth': 13, 'num_leaves': 110, 'scale_pos_weight': 856.9684351050222}. Best is trial 35 with value: 0.9651582885633744.


  "learning_rate":    trial.suggest_loguniform("learning_rate", 1e-3, 0.1),


[I 2025-07-03 21:54:49,931] Trial 48 finished with value: 0.9305089988499918 and parameters: {'n_estimators': 430, 'learning_rate': 0.017065639285669856, 'max_depth': 12, 'num_leaves': 136, 'scale_pos_weight': 564.8791499991389}. Best is trial 35 with value: 0.9651582885633744.


Best trial: 35. Best value: 0.965158: 100%|██████████| 50/50 [16:54<00:00, 20.29s/it]

[I 2025-07-03 21:54:59,534] Trial 49 finished with value: 0.9534538918297936 and parameters: {'n_estimators': 162, 'learning_rate': 0.003565961745342314, 'max_depth': 16, 'num_leaves': 127, 'scale_pos_weight': 905.8246548748929}. Best is trial 35 with value: 0.9651582885633744.
Best LGBM ROC-AUC (val): 0.9651582885633744
Best params: {'n_estimators': 487, 'learning_rate': 0.009035849377469507, 'max_depth': 15, 'num_leaves': 138, 'scale_pos_weight': 990.3099112706714}





##### Optuna Results
Best LGBM ROC-AUC (val): 0.9651582885633744  
Best params: {'n_estimators': 487, 'learning_rate': 0.009035849377469507, 'max_depth': 15, 'num_leaves': 138, 'scale_pos_weight': 990.3099112706714}

## Model Performance Summary
| Model                                 | ROC-AUC  |
|---------------------------------------|--------:|
| **Dummy Classifier**                  | 0.500   |
| **Shallow Decision Tree**             | 0.696   |
| **Random Forest (baseline)**          | 0.945   |
| **LightGBM (baseline)**               | 0.907   |

---

### Random Forest Tuning

#### 1. RandomizedSearchCV  
- **Folds × Candidates:** 3 CV × 30 = 90 fits  
- **Best ROC-AUC (val):** 0.9720  
- **Best params:**  
  ```json
  {
    "n_estimators": 500,
    "min_samples_split": 10,
    "min_samples_leaf": 2,
    "max_features": 0.3,
    "max_depth": 10
  }
  ```

#### 2. Optuna
- **Trials:** 50
- **Example trial 49:**
  - value = 0.97109
  - params = { n_estimators: 280, max_depth: 6, max_features: "sqrt", min_samples_split: 3, min_samples_leaf: 4 }
- **Best trial (36):**
  - value = 0.97288
  - params = { n_estimators: 500, max_depth: 10, max_features: 0.3, min_samples_split: 10, min_samples_leaf: 2 }

### LightGBM Tuning

#### 1. RandomizedSearchCV
- **Best ROC-AUC (val):** 0.9600
- **Best params:**
  ```json
  {
    "scale_pos_weight": 1,
    "num_leaves": 50,
    "n_estimators": 300,
    "max_depth": null,
    "learning_rate": 0.01
  }
  ```

#### 2. Optuna
- **Best ROC-AUC (val):** 0.9652
- **Best params:**
  ```json
  {
    "n_estimators": 487,
    "learning_rate": 0.009035849377469507,
    "max_depth": 15,
    "num_leaves": 138,
    "scale_pos_weight": 990.3099112706714
  }
  ```