# Gradient Boosted Tree Model with Light GBM
using Optuna

<div class="alert alert-block alert-warning">

## Conclusion of all files 4_* to 8_*:
- consider using different samplers for Optuna (not done here)
- better recreate the pipeline without using a modified objective function overloaded with _study.best_trial_ !

In [1]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, TargetEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [3]:
X = pd.read_csv("Data/training_set_features.csv", index_col="respondent_id")

In [4]:
y1 = pd.read_csv("Data/training_set_labels.csv", index_col="respondent_id", usecols=["respondent_id", "h1n1_vaccine"])
y2 = pd.read_csv("Data/training_set_labels.csv", index_col="respondent_id", usecols=["respondent_id", "seasonal_vaccine"])

X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y1, test_size=0.2, random_state=42, stratify=y1)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y2, test_size=0.2, random_state=42, stratify=y2)

In [24]:
def objective_h1n1(trial):
    
    dtrain = lgb.Dataset(X_train1, label=y_train1)
    dvalid = lgb.Dataset(X_test1, label=y_test1)

    '''First select the encoder'''
    encoders = trial.suggest_categorical("encoders", ["TargetEncoder", "OrdinalEncoder"])

    if encoders == "TargetEncoder":
        encoder = TargetEncoder()
    else:
        encoder = OrdinalEncoder()
    
    '''2nd select the model LGBM-model'''
    # instantiate parameters for classifier
    params = {
        'objective': 'binary', # i the previous 2 files I didn't explicitly define this, as LGBM would use this as default for the classifier
        'metric': 'auc',
        'verbosity': 0,
        'max_depth': trial.suggest_int('max_depth', 1, 6),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 100, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True), # in LGBM; the value is not limited to [0 to 1] also knwon as lambda_l1
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True), # in LGBM ; the value is not limited to [0 to 1] also knwon as lambda_l2
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "random_state": 8,
        #"n_jobs": 1
    }

    #classifier = lgb.LGBMClassifier(**params, callbacks=[optuna.integration.LightGBMPruningCallback(trial, "auc")])
    classifier = lgb.LGBMClassifier(**params)

    pipe = Pipeline([('encoder', encoder), 
                     ('clf', classifier)])
    pipe.fit(X_train1, np.ravel(y_train1))

    y_pred = pipe.predict_proba(X_test1)[:, 1]
    score = roc_auc_score(y_test1, y_pred)

    return score
    

In [25]:
study_h1n1 = optuna.create_study(direction="maximize")#, pruner=optuna.pruners.MedianPruner())
study_h1n1.optimize(objective_h1n1, n_trials=100)

[I 2024-11-27 17:56:41,472] A new study created in memory with name: no-name-f4b6008a-8a42-44f8-90b0-84b97ad323aa
[I 2024-11-27 17:56:41,671] Trial 0 finished with value: 0.8410105247285571 and parameters: {'encoders': 'TargetEncoder', 'max_depth': 6, 'learning_rate': 0.0639777146421749, 'subsample': 0.8523817819913164, 'subsample_freq': 1, 'colsample_bytree': 0.17877578603550917, 'min_child_samples': 63, 'min_child_weight': 0.013980976720323135, 'reg_alpha': 4.342209418520701e-08, 'reg_lambda': 0.29190850453588846, 'num_leaves': 133}. Best is trial 0 with value: 0.8410105247285571.
[I 2024-11-27 17:56:41,828] Trial 1 finished with value: 0.8589831924765626 and parameters: {'encoders': 'TargetEncoder', 'max_depth': 3, 'learning_rate': 0.09331362328603034, 'subsample': 0.43350573067531595, 'subsample_freq': 4, 'colsample_bytree': 0.3996892930721282, 'min_child_samples': 94, 'min_child_weight': 0.1351160042015722, 'reg_alpha': 1.526104091764039e-08, 'reg_lambda': 1.5756112230950497e-09, 

In [26]:
print("Number of finished trials: {}".format(len(study_h1n1.trials)))
print("Best trial:")
trial = study_h1n1.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 100
Best trial:
  Value: 0.8651969813264866
  Params: 
    encoders: OrdinalEncoder
    max_depth: 5
    learning_rate: 0.10571507830065287
    subsample: 0.9251431543814541
    subsample_freq: 8
    colsample_bytree: 0.6787171445927545
    min_child_samples: 43
    min_child_weight: 5.625249967481208
    reg_alpha: 1.2708070859455648e-06
    reg_lambda: 0.03268611919666046
    num_leaves: 78


**werte mit Pruning**

Number of finished trials: 100

Best trial:

  Value: 0.8656566724852328

  Params: 
  
    encoders: TargetEncoder
    max_depth: 6
    learning_rate: 0.10742032000145753
    subsample: 0.8748896332353917
    subsample_freq: 0
    colsample_bytree: 0.4626980798205049
    min_child_samples: 32
    min_child_weight: 0.849752869542173
    reg_alpha: 1.2148872803471023e-06
    reg_lambda: 0.053174027171273695
    num_leaves: 239

**Werte ohne Pruning**

Number of finished trials: 100

Best trial:

  Value: 0.8656221171133909

  Params: 

    encoders: OrdinalEncoder
    max_depth: 4
    learning_rate: 0.14637656019953735
    subsample: 0.906919657644
    subsample_freq: 4
    colsample_bytree: 0.7873074510345616
    min_child_samples: 27
    min_child_weight: 0.0034631838050949815
    reg_alpha: 2.6787233260886746e-06
    reg_lambda: 0.005394939368622497
    num_leaves: 40

In [18]:
def objective_seas(trial):
    
    dtrain = lgb.Dataset(X_train2, label=y_train2)
    dvalid = lgb.Dataset(X_test2, label=y_test2)

    '''First select the encoder'''
    encoders = trial.suggest_categorical("encoders", ["TargetEncoder", "OrdinalEncoder"])

    if encoders == "TargetEncoder":
        encoder = TargetEncoder()
    else:
        encoder = OrdinalEncoder()
    
    '''2nd select the model LGBM-model'''
    # instantiate parameters for classifier
    params = {
        'objective': 'binary', # i the previous 2 files I didn't explicitly define this, as LGBM would use this as default for the classifier
        'metric': 'auc',
        'verbosity': 0,
        'max_depth': trial.suggest_int('max_depth', 1, 6),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 100, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True), # in LGBM; the value is not limited to [0 to 1] also knwon as lambda_l1
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True), # in LGBM ; the value is not limited to [0 to 1] also knwon as lambda_l2
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "random_state": 8,
        #"n_jobs": 1
    }

    classifier = lgb.LGBMClassifier(**params, callbacks=[optuna.integration.LightGBMPruningCallback(trial, "auc")])
    #classifier = lgb.LGBMClassifier(**params)

    pipe = Pipeline([('encoder', encoder), 
                     ('clf', classifier)])
    pipe.fit(X_train2, np.ravel(y_train2))

    y_pred = pipe.predict_proba(X_test2)[:, 1]
    score = roc_auc_score(y_test2, y_pred)

    return score
    

In [19]:
study_seas = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
study_seas.optimize(objective_h1n1, n_trials=100)

[I 2024-11-27 17:46:43,644] A new study created in memory with name: no-name-2f637cc1-947a-4a35-81b3-14d5390aafe0
[I 2024-11-27 17:46:43,792] Trial 0 finished with value: 0.817686381727957 and parameters: {'encoders': 'OrdinalEncoder', 'max_depth': 1, 'learning_rate': 0.0012824868368541038, 'subsample': 0.877031346127817, 'subsample_freq': 1, 'colsample_bytree': 0.33129855580692746, 'min_child_samples': 27, 'min_child_weight': 0.013699437877886956, 'reg_alpha': 1.6812700848369745e-09, 'reg_lambda': 0.000111818537244868, 'num_leaves': 87}. Best is trial 0 with value: 0.817686381727957.
[I 2024-11-27 17:46:43,956] Trial 1 finished with value: 0.8604461412644544 and parameters: {'encoders': 'OrdinalEncoder', 'max_depth': 4, 'learning_rate': 0.15022822346124712, 'subsample': 0.909992824557176, 'subsample_freq': 0, 'colsample_bytree': 0.27516160693060865, 'min_child_samples': 69, 'min_child_weight': 0.001129305800731012, 'reg_alpha': 2.6771299465578473e-05, 'reg_lambda': 4.283952807336383e-

In [20]:
print("Number of finished trials: {}".format(len(study_seas.trials)))
print("Best trial:")
trial_seas = study_seas.best_trial

print("  Value: {}".format(trial_seas.value))

print("  Params: ")
for key, value in trial_seas.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 100
Best trial:
  Value: 0.8658093444008256
  Params: 
    encoders: TargetEncoder
    max_depth: 5
    learning_rate: 0.12308857182346723
    subsample: 0.916553990715227
    subsample_freq: 0
    colsample_bytree: 0.8672065256050777
    min_child_samples: 22
    min_child_weight: 0.856992960312687
    reg_alpha: 0.024875317016880637
    reg_lambda: 0.6004171209430826
    num_leaves: 220


**Werte mit dem Versuch für Pruner**

Number of finished trials: 100

Best trial:

  Value: 0.8657634799981989

  Params: 

    encoders: TargetEncoder
    max_depth: 6
    learning_rate: 0.06115118796155541
    subsample: 0.9051437761260196
    subsample_freq: 8
    colsample_bytree: 0.8925410506480986
    min_child_samples: 31
    min_child_weight: 0.19653456438533892
    reg_alpha: 0.00545284829993156
    reg_lambda: 0.0005242068931854741
    num_leaves: 249

# Submission Data

In [31]:
df_test_set_features = pd.read_csv('Data/test_set_features.csv', index_col='respondent_id')
#df_test_set_features=df_test_set_features[:].astype("category")

In [34]:
def objective_predict_proba_h1n1(trial):
    '''First select the encoder'''
    encoders = trial.suggest_categorical("encoders", ["TargetEncoder", "OrdinalEncoder"])

    if encoders == "TargetEncoder":
        encoder = TargetEncoder()
    else:
        encoder = OrdinalEncoder()
    
    '''2nd select the model LGBM-model'''
    # instantiate parameters for classifier
    params = {
        'objective': 'binary', # i the previous 2 files I didn't explicitly define this, as LGBM would use this as default for the classifier
        'metric': 'auc',
        'verbosity': 0,
        'max_depth': trial.suggest_int('max_depth', 1, 6),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 100, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True), # in LGBM; the value is not limited to [0 to 1] also knwon as lambda_l1
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True), # in LGBM ; the value is not limited to [0 to 1] also knwon as lambda_l2
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "random_state": 8,
        #"n_jobs": 1
    }

    #classifier = lgb.LGBMClassifier(**params, callbacks=[optuna.integration.LightGBMPruningCallback(trial, "auc")])
    classifier = lgb.LGBMClassifier(**params)

    pipe = Pipeline([('encoder', encoder), 
                     ('clf', classifier)])
    pipe.fit(X_train1, np.ravel(y_train1))

    return pipe.predict_proba(df_test_set_features)[:,1]
    

In [37]:
def objective_predict_proba_seas(trial):
    '''First select the encoder'''
    encoders = trial.suggest_categorical("encoders", ["TargetEncoder", "OrdinalEncoder"])

    if encoders == "TargetEncoder":
        encoder = TargetEncoder()
    else:
        encoder = OrdinalEncoder()
    
    '''2nd select the model LGBM-model'''
    # instantiate parameters for classifier
    params = {
        'objective': 'binary', # i the previous 2 files I didn't explicitly define this, as LGBM would use this as default for the classifier
        'metric': 'auc',
        'verbosity': 0,
        'max_depth': trial.suggest_int('max_depth', 1, 6),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 100, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True), # in LGBM; the value is not limited to [0 to 1] also knwon as lambda_l1
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True), # in LGBM ; the value is not limited to [0 to 1] also knwon as lambda_l2
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "random_state": 8,
        #"n_jobs": 1
    }

    #classifier = lgb.LGBMClassifier(**params, callbacks=[optuna.integration.LightGBMPruningCallback(trial, "auc")])
    classifier = lgb.LGBMClassifier(**params)

    pipe = Pipeline([('encoder', encoder), 
                     ('clf', classifier)])
    pipe.fit(X_train1, np.ravel(y_train1))

    return pipe.predict_proba(df_test_set_features)[:,1]
    

In [35]:
h1n1_vaccine_probability = objective_predict_proba_h1n1(study_h1n1.best_trial)

In [38]:
seas_vaccine_probability = objective_predict_proba_seas(study_seas.best_trial)

In [40]:
df_submission = df_test_set_features[[]]
df_submission['h1n1_vaccine'] = h1n1_vaccine_probability.tolist()
df_submission['seasonal_vaccine'] = seas_vaccine_probability.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_submission['h1n1_vaccine'] = h1n1_vaccine_probability.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_submission['seasonal_vaccine'] = seas_vaccine_probability.tolist()


In [41]:
df_submission.to_csv("LGBM_Model/Optuna_Pipeline_submission.csv")