In [14]:
pip install lightgbm 

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from lightgbm import LGBMClassifier
from sklearn.metrics import make_scorer, f1_score
import optuna

In [16]:
# Load the dataset
df = pd.read_csv('https://drive.google.com/uc?export=download&id=1eYCKuqJda4bpzXBVnqXylg0qQwvpUuum')

## DATA CLEANSING

In [17]:

# As an approach, we're dropping the employment industry with huge number of missing values
df = df.drop(columns=['employment_industry'])
target = 'h1n1_vaccine'

In [18]:
#PREPPING for the X and y variables
X = df.drop(columns=[target])
y = df[target]

In [19]:
#Splitting for training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

### IMPUTING

In [20]:
# Split numerical and categorical features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Numeric transformer
numeric_transformer = SimpleImputer(strategy='mean')

### TARGET ENCODING

In [21]:
# Categorical transformer with Target Encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target_encode', TargetEncoder())
])

In [22]:
# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocess the data
X_train_preprocessed = preprocessor.fit_transform(X_train, y_train)
X_test_preprocessed = preprocessor.transform(X_test)

## FEATURE ENGINEERING

In [23]:
# Creating a function to loop the trial for Optuna
def objective(trial):
    param = {
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 31, 256),
        'max_depth': trial.suggest_int('max_depth', -1, 50),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0)
    }

    model = LGBMClassifier(**param)
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1 = cross_validate(model, X_train_preprocessed, y_train, cv=skf, scoring=make_scorer(f1_score, average='macro'))['test_score'].mean()
    return f1

In [24]:
# Run the optimization: 1000 trials cause... :) 
optimization = optuna.create_study(direction='maximize')
optimization.optimize(objective, n_trials=10)


[I 2024-07-14 18:49:56,157] A new study created in memory with name: no-name-38ce3307-5932-4124-82b7-319d6c94a418
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0)


[LightGBM] [Info] Number of positive: 3256, number of negative: 12126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000674 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211676 -> initscore=-1.314852
[LightGBM] [Info] Start training from score -1.314852
[LightGBM] [Info] Number of positive: 3257, number of negative: 12125
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000911 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [bin

[I 2024-07-14 18:49:56,509] Trial 0 finished with value: 0.7190462285889361 and parameters: {'num_leaves': 36, 'max_depth': 15, 'learning_rate': 0.032646667389121496, 'n_estimators': 59, 'min_child_weight': 3, 'subsample': 0.854698835427258, 'colsample_bytree': 0.56776343547927, 'reg_alpha': 0.00029693003844199735, 'reg_lambda': 6.479053865257585e-05}. Best is trial 0 with value: 0.7190462285889361.


[LightGBM] [Info] Number of positive: 3257, number of negative: 12126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000753 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15383, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211727 -> initscore=-1.314545
[LightGBM] [Info] Start training from score -1.314545
[LightGBM] [Info] Number of positive: 3257, number of negative: 12126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000681 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15383, number of used features: 34
[LightGBM] [Info] [bin

  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0)


[LightGBM] [Info] Number of positive: 3257, number of negative: 12125
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000855 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211741 -> initscore=-1.314463
[LightGBM] [Info] Start training from score -1.314463
[LightGBM] [Info] Number of positive: 3257, number of negative: 12125
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000782 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [bin

[I 2024-07-14 18:49:57,612] Trial 1 finished with value: 0.7440330695154318 and parameters: {'num_leaves': 155, 'max_depth': 40, 'learning_rate': 0.180025628515136, 'n_estimators': 91, 'min_child_weight': 3, 'subsample': 0.9257604308813965, 'colsample_bytree': 0.9305463060178839, 'reg_alpha': 0.9888687066286024, 'reg_lambda': 0.4160873897895492}. Best is trial 1 with value: 0.7440330695154318.


[LightGBM] [Info] Number of positive: 3256, number of negative: 12126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000913 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211676 -> initscore=-1.314852
[LightGBM] [Info] Start training from score -1.314852


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0)


[LightGBM] [Info] Number of positive: 3257, number of negative: 12125
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000900 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211741 -> initscore=-1.314463
[LightGBM] [Info] Start training from score -1.314463
[LightGBM] [Info] Number of positive: 3257, number of negative: 12125
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000846 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [bin

[I 2024-07-14 18:49:58,350] Trial 2 finished with value: 0.6759777715428341 and parameters: {'num_leaves': 158, 'max_depth': 13, 'learning_rate': 0.021490211734390904, 'n_estimators': 51, 'min_child_weight': 8, 'subsample': 0.6525084886070728, 'colsample_bytree': 0.6510355460193622, 'reg_alpha': 1.818925913172323e-08, 'reg_lambda': 6.233775620245502e-08}. Best is trial 1 with value: 0.7440330695154318.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0)


[LightGBM] [Info] Number of positive: 3256, number of negative: 12126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001095 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211676 -> initscore=-1.314852
[LightGBM] [Info] Start training from score -1.314852
[LightGBM] [Info] Number of positive: 3257, number of negative: 12125
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001059 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [bin

[I 2024-07-14 18:49:59,274] Trial 3 finished with value: 0.7483442392926583 and parameters: {'num_leaves': 109, 'max_depth': 32, 'learning_rate': 0.03611326998202903, 'n_estimators': 89, 'min_child_weight': 6, 'subsample': 0.8631948312479781, 'colsample_bytree': 0.8479016152927255, 'reg_alpha': 0.9591628764161337, 'reg_lambda': 1.385209928563482e-07}. Best is trial 3 with value: 0.7483442392926583.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0)


[LightGBM] [Info] Number of positive: 3256, number of negative: 12126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000947 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211676 -> initscore=-1.314852
[LightGBM] [Info] Start training from score -1.314852
[LightGBM] [Info] Number of positive: 3257, number of negative: 12125
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000764 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [bin

[I 2024-07-14 18:50:01,568] Trial 4 finished with value: 0.7441617164233181 and parameters: {'num_leaves': 232, 'max_depth': 49, 'learning_rate': 0.10337226939159416, 'n_estimators': 210, 'min_child_weight': 10, 'subsample': 0.8198952746901413, 'colsample_bytree': 0.5619526649900493, 'reg_alpha': 6.092537801029675e-05, 'reg_lambda': 7.17332014772335e-05}. Best is trial 3 with value: 0.7483442392926583.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0)


[LightGBM] [Info] Number of positive: 3256, number of negative: 12126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000762 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211676 -> initscore=-1.314852
[LightGBM] [Info] Start training from score -1.314852
[LightGBM] [Info] Number of positive: 3257, number of negative: 12125
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000759 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [bin

[I 2024-07-14 18:50:04,453] Trial 5 finished with value: 0.7512853912359375 and parameters: {'num_leaves': 177, 'max_depth': 29, 'learning_rate': 0.049808394474338664, 'n_estimators': 229, 'min_child_weight': 9, 'subsample': 0.6659848720589086, 'colsample_bytree': 0.536410027428686, 'reg_alpha': 1.2007563411908925e-07, 'reg_lambda': 0.002805789693454542}. Best is trial 5 with value: 0.7512853912359375.


[LightGBM] [Info] Number of positive: 3256, number of negative: 12126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000837 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211676 -> initscore=-1.314852
[LightGBM] [Info] Start training from score -1.314852


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0)


[LightGBM] [Info] Number of positive: 3257, number of negative: 12125
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000751 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211741 -> initscore=-1.314463
[LightGBM] [Info] Start training from score -1.314463
[LightGBM] [Info] Number of positive: 3257, number of negative: 12125
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000796 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [bin

[I 2024-07-14 18:50:06,731] Trial 6 finished with value: 0.7486062547744385 and parameters: {'num_leaves': 188, 'max_depth': 35, 'learning_rate': 0.022789810709113435, 'n_estimators': 156, 'min_child_weight': 6, 'subsample': 0.5211799369553782, 'colsample_bytree': 0.6740550481011915, 'reg_alpha': 0.0001797421022262494, 'reg_lambda': 7.162090676328028e-08}. Best is trial 5 with value: 0.7512853912359375.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0)


[LightGBM] [Info] Number of positive: 3256, number of negative: 12126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000941 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211676 -> initscore=-1.314852
[LightGBM] [Info] Start training from score -1.314852
[LightGBM] [Info] Number of positive: 3257, number of negative: 12125
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000719 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [bin

[I 2024-07-14 18:50:07,924] Trial 7 finished with value: 0.7542077463726597 and parameters: {'num_leaves': 140, 'max_depth': 49, 'learning_rate': 0.06581726559888942, 'n_estimators': 100, 'min_child_weight': 6, 'subsample': 0.7810620443765292, 'colsample_bytree': 0.8797255138054071, 'reg_alpha': 4.67846019368153e-08, 'reg_lambda': 2.7914652440951235e-06}. Best is trial 7 with value: 0.7542077463726597.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0)


[LightGBM] [Info] Number of positive: 3256, number of negative: 12126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211676 -> initscore=-1.314852
[LightGBM] [Info] Start training from score -1.314852
[LightGBM] [Info] Number of positive: 3257, number of negative: 12125
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000860 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [bin

[I 2024-07-14 18:50:12,397] Trial 8 finished with value: 0.7430388795464518 and parameters: {'num_leaves': 239, 'max_depth': 34, 'learning_rate': 0.010459430560112778, 'n_estimators': 279, 'min_child_weight': 7, 'subsample': 0.8125783071160262, 'colsample_bytree': 0.663227207162043, 'reg_alpha': 0.012839673497339039, 'reg_lambda': 2.6305360721670487e-05}. Best is trial 7 with value: 0.7542077463726597.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0)


[LightGBM] [Info] Number of positive: 3256, number of negative: 12126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000976 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211676 -> initscore=-1.314852
[LightGBM] [Info] Start training from score -1.314852
[LightGBM] [Info] Number of positive: 3257, number of negative: 12125
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000880 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [bin

[I 2024-07-14 18:50:15,110] Trial 9 finished with value: 0.7309828601477802 and parameters: {'num_leaves': 250, 'max_depth': 47, 'learning_rate': 0.011314542581329547, 'n_estimators': 159, 'min_child_weight': 2, 'subsample': 0.6593168113598727, 'colsample_bytree': 0.858314970187768, 'reg_alpha': 7.066704911908278e-06, 'reg_lambda': 1.8613925947666244e-06}. Best is trial 7 with value: 0.7542077463726597.


In [25]:
# Get the best parameters and train the final model
best_params = optimization.best_params
best_model = LGBMClassifier(**best_params)
best_model.fit(X_train_preprocessed, y_train)

[LightGBM] [Info] Number of positive: 4071, number of negative: 15157
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001110 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 19228, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211722 -> initscore=-1.314574
[LightGBM] [Info] Start training from score -1.314574


In [26]:

# Perform cross-validation and evaluate the model
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(best_model, X_train_preprocessed, y_train, cv=10, scoring=make_scorer(f1_score, average='macro'))

# Display the results
f1_scores = cv_results['test_score']

print(f"Best hyperparameters: {best_params}")
print(f"Cross-validated F1 score: {f1_scores.mean()} ± {f1_scores.std()}")

[LightGBM] [Info] Number of positive: 3663, number of negative: 13642
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001143 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 17305, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211673 -> initscore=-1.314871
[LightGBM] [Info] Start training from score -1.314871
[LightGBM] [Info] Number of positive: 3664, number of negative: 13641
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000995 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 17305, number of used features: 34
[LightGBM] [Info] [bin

In [28]:
best_params

{'num_leaves': 140,
 'max_depth': 49,
 'learning_rate': 0.06581726559888942,
 'n_estimators': 100,
 'min_child_weight': 6,
 'subsample': 0.7810620443765292,
 'colsample_bytree': 0.8797255138054071,
 'reg_alpha': 4.67846019368153e-08,
 'reg_lambda': 2.7914652440951235e-06}

In [29]:
best_model = LGBMClassifier(**best_params)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(best_model, X_train_preprocessed, y_train, cv=skf, scoring='f1_macro')

[LightGBM] [Info] Number of positive: 3256, number of negative: 12126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001323 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211676 -> initscore=-1.314852
[LightGBM] [Info] Start training from score -1.314852
[LightGBM] [Info] Number of positive: 3257, number of negative: 12125
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000811 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15382, number of used features: 34
[LightGBM] [Info] [bin

In [30]:
cv_results

{'fit_time': array([0.22944617, 0.22390962, 0.24863172, 0.23519468, 0.22514367]),
 'score_time': array([0.00679517, 0.00823689, 0.01207304, 0.00768471, 0.01930404]),
 'test_score': array([0.75488942, 0.75187824, 0.7580507 , 0.75940032, 0.74682004])}

## APPLY MODEL 

In [31]:
# Retrain the final model on the entire dataset using the best parameters
X_preprocessed = preprocessor.fit_transform(X, y)
final_model = LGBMClassifier(**best_params)
final_model.fit(X_preprocessed, y)

[LightGBM] [Info] Number of positive: 4544, number of negative: 16821
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001079 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 21365, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212684 -> initscore=-1.308820
[LightGBM] [Info] Start training from score -1.308820


In [32]:
y_pred = final_model.predict(X_test_preprocessed)

In [33]:
test_f1_score = f1_score(y_test, y_pred, average='macro')

In [34]:
test_f1_score

0.8451802257880405