# Gradient Boosted Tree Model with Light GBM
using Optuna

In [1]:
import pandas as pd
import numpy as np
import optuna
import optunahub
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

**Note:** Light GBM can't work with Polars

## ingesting data as categorical to LGBM in this section

In [2]:
X = pd.read_csv("Data/training_set_features.csv", index_col="respondent_id")

In [3]:
X=X[:].astype("category")

In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26707 entries, 0 to 26706
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   h1n1_concern                 26615 non-null  category
 1   h1n1_knowledge               26591 non-null  category
 2   behavioral_antiviral_meds    26636 non-null  category
 3   behavioral_avoidance         26499 non-null  category
 4   behavioral_face_mask         26688 non-null  category
 5   behavioral_wash_hands        26665 non-null  category
 6   behavioral_large_gatherings  26620 non-null  category
 7   behavioral_outside_home      26625 non-null  category
 8   behavioral_touch_face        26579 non-null  category
 9   doctor_recc_h1n1             24547 non-null  category
 10  doctor_recc_seasonal         24547 non-null  category
 11  chronic_med_condition        25736 non-null  category
 12  child_under_6_months         25887 non-null  category
 13  health

In [5]:
y1 = pd.read_csv("Data/training_set_labels.csv", index_col="respondent_id", usecols=["respondent_id", "h1n1_vaccine"])
y2 = pd.read_csv("Data/training_set_labels.csv", index_col="respondent_id", usecols=["respondent_id", "seasonal_vaccine"])

X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y1, test_size=0.2, random_state=42, stratify=y1)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y2, test_size=0.2, random_state=42, stratify=y2)

y1: h1n1 labels

y2: seasonal labels

In [6]:
# First set-up Optuna function
#def objective(trial, X_train, y_train, X_test, y_test):
def objective_h1n1(trial):

    X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y1, test_size=0.2, random_state=42, stratify=y1)
    #X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y2, test_size=0.2, random_state=42, stratify=y2)

    dtrain = lgb.Dataset(X_train1, label=y_train1)
    dvalid = lgb.Dataset(X_test1, label=y_test1)

    params = {
        'objective': 'binary', # i the previous 2 files I didn't explicitly define this, as LGBM would use this as default for the classifier
        'metric': 'auc',
        'verbosity': -1,
        'max_depth': trial.suggest_int('max_depth', 1, 6),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0, log=False),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 100, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True), # in LGBM; the value is not limited to [0 to 1] also knwon as lambda_l1
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True), # in LGBM ; the value is not limited to [0 to 1] also knwon as lambda_l2
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "random_state": 8
    }

    # a callback for pruning
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'auc')
    model = lgb.train(params,
                      dtrain,
                      valid_sets=[dvalid],
                      callbacks=[pruning_callback],)
    
    preds = model.predict(X_test1)
    return roc_auc_score(y_test1, preds)

def objective_seas(trial):

    #X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y1, test_size=0.2, random_state=42, stratify=y1)
    X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y2, test_size=0.2, random_state=42, stratify=y2)

    dtrain = lgb.Dataset(X_train2, label=y_train2)
    dvalid = lgb.Dataset(X_test2, label=y_test2)

    params = {
        'objective': 'binary', # i the previous 2 files I didn't explicitly define this, as LGBM would use this as default for the classifier
        'metric': 'auc',
        'verbosity': -1,
        'max_depth': trial.suggest_int('max_depth', 1, 6),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0, log=False),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 100, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True), # in LGBM; the value is not limited to [0 to 1] also knwon as lambda_l1
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True), # in LGBM ; the value is not limited to [0 to 1] also knwon as lambda_l2
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "random_state": 8
    }

    # a callback for pruning
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'auc')
    model = lgb.train(params,
                      dtrain,
                      valid_sets=[dvalid],
                      callbacks=[pruning_callback])
    
    preds = model.predict(X_test2)
    return roc_auc_score(y_test2, preds)

In [7]:
#module = optunahub.load_module(package="samplers/auto_sampler")
study = optuna.create_study(direction='maximize', 
                            pruner=optuna.pruners.MedianPruner(), # Ist der Pruner wirklich eine gute Idee?
#                            sampler=module.AutoSampler(),
                            ) 
study.optimize(objective_h1n1, n_trials = 100) # Wo ist die eingabe für K-Fold CV =5?


[I 2024-11-27 12:13:29,316] A new study created in memory with name: no-name-65edb4a6-61e3-4952-beed-69d5b31e5815
[I 2024-11-27 12:13:29,734] Trial 0 finished with value: 0.8567087160166242 and parameters: {'max_depth': 6, 'learning_rate': 0.21407078573434835, 'subsample': 0.9522106535780931, 'subsample_freq': 7, 'colsample_bytree': 0.6318424998276143, 'min_child_samples': 77, 'min_child_weight': 0.017305700756464478, 'reg_alpha': 5.766175957779647e-08, 'reg_lambda': 7.871954880887852e-06, 'num_leaves': 50}. Best is trial 0 with value: 0.8567087160166242.
[I 2024-11-27 12:13:30,070] Trial 1 finished with value: 0.8194437841692417 and parameters: {'max_depth': 1, 'learning_rate': 0.009456920848839128, 'subsample': 0.2913940398886474, 'subsample_freq': 0, 'colsample_bytree': 0.973187330236381, 'min_child_samples': 81, 'min_child_weight': 2.184232370275735, 'reg_alpha': 1.8045736335068904e-06, 'reg_lambda': 8.894824644771073e-05, 'num_leaves': 211}. Best is trial 0 with value: 0.856708716

In [8]:
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 100
Best trial:
  Value: 0.8567087160166242
  Params: 
    max_depth: 6
    learning_rate: 0.21407078573434835
    subsample: 0.9522106535780931
    subsample_freq: 7
    colsample_bytree: 0.6318424998276143
    min_child_samples: 77
    min_child_weight: 0.017305700756464478
    reg_alpha: 5.766175957779647e-08
    reg_lambda: 7.871954880887852e-06
    num_leaves: 50


In [9]:
#module = optunahub.load_module(package="samplers/auto_sampler")
study2 = optuna.create_study(direction='maximize', 
                            pruner=optuna.pruners.MedianPruner(), # Ist der Pruner wirklich eine gute Idee?
                            #sampler=module.AutoSampler(),
                            ) 
study2.optimize(objective_seas, n_trials = 100) # Wo ist die eingabe für K-Fold CV =5?


[I 2024-11-27 12:13:46,750] A new study created in memory with name: no-name-2826a8a5-d62a-45bb-be91-7e4146c0ad9e
[I 2024-11-27 12:13:47,099] Trial 0 finished with value: 0.8519409862986302 and parameters: {'max_depth': 4, 'learning_rate': 0.3121193346359039, 'subsample': 0.41711715583353415, 'subsample_freq': 5, 'colsample_bytree': 0.6759985314730259, 'min_child_samples': 46, 'min_child_weight': 0.07623479570941365, 'reg_alpha': 1.1318169108468219e-06, 'reg_lambda': 1.205286763435109e-09, 'num_leaves': 70}. Best is trial 0 with value: 0.8519409862986302.
[I 2024-11-27 12:13:47,465] Trial 1 finished with value: 0.8502951600511803 and parameters: {'max_depth': 4, 'learning_rate': 0.4467103113774013, 'subsample': 0.3672571227604582, 'subsample_freq': 7, 'colsample_bytree': 0.41542665829393943, 'min_child_samples': 91, 'min_child_weight': 35.26912643105677, 'reg_alpha': 5.491928942361416e-09, 'reg_lambda': 8.622022116011e-08, 'num_leaves': 170}. Best is trial 0 with value: 0.8519409862986

In [10]:
print("Seasonal Vaccination")
print("Number of finished trials: {}".format(len(study2.trials)))
print("Best trial:")
trial = study2.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Seasonal Vaccination
Number of finished trials: 100
Best trial:
  Value: 0.863914280704497
  Params: 
    max_depth: 6
    learning_rate: 0.18755832369088704
    subsample: 0.9558393406367027
    subsample_freq: 8
    colsample_bytree: 0.6994378385759924
    min_child_samples: 40
    min_child_weight: 59.07655474972175
    reg_alpha: 5.382162260998821e-09
    reg_lambda: 5.1883835536972365
    num_leaves: 130


y1: h1n1 labels

y2: seasonal labels

# Submission Data

In [11]:
dtrain_h1n1 = lgb.Dataset(X_train1, label=y_train1)
dtrain_seas = lgb.Dataset(X_train2, label=y_train2)

In [12]:
h1n1_model = lgb.LGBMClassifier(**study.best_trial.params)
seas_model = lgb.LGBMClassifier(**study2.best_trial.params)

In [13]:
h1n1_model.fit(X_train1, np.ravel(y_train1), eval_set=(X_test1, np.ravel(y_test1)), eval_metric='auc')
seas_model.fit(X_train2, np.ravel(y_train2), eval_set=(X_test2, np.ravel(y_test2)), eval_metric='auc')

In [14]:
print('Best score for the H1N1 model')
for key, value in h1n1_model.best_score_.items():
    print(key, value) #h1n1_model.best_score_

print('Best score for the Seasonal model')
for key, value in seas_model.best_score_.items():
    print(key, value) #h1n1_model.best_score_

Best score for the H1N1 model
valid_0 OrderedDict({'auc': 0.8558783399599367, 'binary_logloss': 0.36107319898878903})
Best score for the Seasonal model
valid_0 OrderedDict({'auc': 0.864412704381523, 'binary_logloss': 0.45928310109426146})


In [42]:
temp_list = list(h1n1_model.best_score_.items())
#temp_list[0][1]
temp_temp_list = list(temp_list[0][1].items())
temp_temp_list[0][1]
print('Best auc score for the H1N1 model:', temp_temp_list[0][1])

temp_list = list(seas_model.best_score_.items())
#temp_list[0][1]
temp_temp_list = list(temp_list[0][1].items())
temp_temp_list[0][1]
print('Best auc score for the Seasonal model:', temp_temp_list[0][1])


Best auc score for the H1N1 model: 0.8558783399599367
Best auc score for the Seasonal model: 0.864412704381523


In [64]:
h1n1_model.predict(X_test1)

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [49]:
h1n1_model.predict_proba(X_test1)[:,1]

array([6.72554589e-02, 9.17602226e-01, 5.90640887e-04, ...,
       4.90199102e-01, 6.13295874e-02, 5.44400991e-03])

In [50]:
df_test_set_features = pd.read_csv('Data/test_set_features.csv', index_col='respondent_id')
df_test_set_features=df_test_set_features[:].astype("category")

In [51]:
h1n1_vaccine_probability=h1n1_model.predict_proba(df_test_set_features)
seas_vaccine_probability=seas_model.predict_proba(df_test_set_features)

df_submission = df_test_set_features[[]]
df_submission['h1n1_vaccine'] = h1n1_vaccine_probability[:,1].tolist()
df_submission['seasonal_vaccine'] = seas_vaccine_probability[:,1].tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_submission['h1n1_vaccine'] = h1n1_vaccine_probability[:,1].tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_submission['seasonal_vaccine'] = seas_vaccine_probability[:,1].tolist()


In [52]:
df_submission.to_csv("LGBM_Model/Optuna_submission.csv")