In [1]:
import os
import configparser
from tqdm import tqdm
from helpers.helper_functions import *
from helpers.helper_classes import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from wandb.lightgbm import wandb_callback, log_summary
import wandb

# Read config.ini file
config = configparser.ConfigParser()
config.read('src/config.ini')
os.chdir(config['PATH']['ROOT_DIR'])

# # Load data
df = pd.read_parquet(config['PATH']['INT_DIR'] + '/training_set_preprocessed_nodrop.parquet', engine = 'fastparquet')
df_test = pd.read_parquet(config['PATH']['INT_DIR'] + '/test_set_preprocessed_nodrop.parquet', engine = 'fastparquet')
df = df[df['srch_id'] < 10000]

  from .autonotebook import tqdm as notebook_tqdm
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.sort_values(by=['srch_id', target_str], ascending=[True, False], inplace=True)


In [8]:
import optuna
import lightgbm as lgb

def objective(trial):

    params_lgbm = {
        "objective": "lambdarank",
        "metric":"ndcg",
        'n_estimators': trial.suggest_int('n_estimators', 10, 30), 
        'max_depth': trial.suggest_int('max_depth', 1, 20), 
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1), 
        'subsample': trial.suggest_float('subsample', 0.4, 0.7), 
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.7), 
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 0.2), 
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 0.2),
    }
    params_other = {
        'val_size': trial.suggest_float('val_size', 0.05, 0.8)
    }

    X_train, X_val, X_test, y_train, y_val, y_test, test_ideal = train_val_test_split(df, 'target', test_size=.15, val_size=params_other['val_size'], random_state=7)
    _, desire_df_click = construct_desire(X_val)
    _, desire_df_book = construct_desire(X_val, target = 'booking_bool')
    X_train = merge_and_drop(X_train, desire_df_click, desire_df_book)
    X_test = merge_and_drop(X_test, desire_df_click, desire_df_book)

    group_train = X_train.groupby('srch_id').size().values
    group_val = X_test.groupby('srch_id').size().values

    X_train_lgb = X_train.drop(['srch_id'], axis=1)
    X_test_lgb = X_test.drop(['srch_id'], axis=1)

    params_all = {**params_lgbm, **params_other}
    run = wandb.init(project='DMT-2023', group = 'optuna_vardesire_size', config = params_all, reinit = True, )
    cb = wandb_callback()
    ranker = lgb.LGBMRanker(**params_lgbm)

    ranker.fit(
        X=X_train_lgb,
        y=y_train,
        group=group_train,
        eval_set=[(X_train_lgb, y_train),(X_test_lgb, y_test)],
        eval_group=[group_train, group_val],
        eval_at=[5],
        callbacks=[cb]
    )

    y_pred = ranker.predict(X_test_lgb)
    df_res = X_test.copy()
    df_res['pred_grades'] = y_pred
    df_res = df_res.sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False], inplace=False)
    df_res = df_res.merge(test_ideal, on=['srch_id', 'prop_id'], how='left')

    ndcg_score = calc_NDCG(test_ideal, df_res)
    
    with run:
        run.log({'ndcg': ndcg_score})
    
    return ndcg_score



# Create a study object and optimize the objective function.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=2)
wandb.finish()
# Extract the best hyperparameters
best_params = study.best_params
print(f'Best hyperparameters: {best_params}')

# Save best params to txt file
with open(config['PATH']['INT_DIR'] + '/optuna_best_params.txt', 'w') as f:
    f.write(str(best_params))


[32m[I 2023-05-18 15:02:08,549][0m A new study created in memory with name: no-name-22352b3a-ae8c-490d-a126-e75726ede743[0m


0,1
iteration,▁▂▂▃▄▅▅▆▇▇█
ndcg,▁
training_ndcg@5,▁▂▄▄▄▅▆▇▇██
valid_1_ndcg@5,▇▂▅▁▃▃▆▇▅█▇

0,1
iteration,10.0
ndcg,0.3807
training_ndcg@5,0.43065
valid_1_ndcg@5,0.36327


[32m[I 2023-05-18 15:02:14,398][0m Trial 0 finished with value: 0.38070222087303385 and parameters: {'n_estimators': 11, 'max_depth': 11, 'learning_rate': 0.01542215567246322, 'subsample': 0.4229122512041623, 'colsample_bytree': 0.5825390619739982, 'reg_alpha': 0.0029627482694911562, 'reg_lambda': 0.17950031633138536, 'val_size': 0.20602045431165272}. Best is trial 0 with value: 0.38070222087303385.[0m


0,1
iteration,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
ndcg,▁
training_ndcg@5,▁▁▂▂▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
valid_1_ndcg@5,▄▁▂▂▃▂▃▅▅▅▆▅▆▆▇▇▆▅▆█▇█▇▇▇▆

0,1
iteration,25.0
ndcg,0.38619
training_ndcg@5,0.46093
valid_1_ndcg@5,0.36896


[32m[I 2023-05-18 15:02:22,049][0m Trial 1 finished with value: 0.386191822004263 and parameters: {'n_estimators': 26, 'max_depth': 11, 'learning_rate': 0.013304979495638981, 'subsample': 0.6702157475502727, 'colsample_bytree': 0.5168942830771519, 'reg_alpha': 0.11704188308887337, 'reg_lambda': 0.04435518575397499, 'val_size': 0.6812201222249858}. Best is trial 1 with value: 0.386191822004263.[0m


Best hyperparameters: {'n_estimators': 26, 'max_depth': 11, 'learning_rate': 0.013304979495638981, 'subsample': 0.6702157475502727, 'colsample_bytree': 0.5168942830771519, 'reg_alpha': 0.11704188308887337, 'reg_lambda': 0.04435518575397499, 'val_size': 0.6812201222249858}
