In [4]:
import os
import configparser
from tqdm import tqdm
from helpers.helper_functions import *
from helpers.helper_classes import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from wandb.lightgbm import wandb_callback, log_summary
import wandb

# Read config.ini file
# The project root defaults to the original author's checkout; set the
# DMT_PROJECT_ROOT environment variable to run this notebook from another
# machine without editing the cell.
PROJECT_ROOT = os.environ.get(
    'DMT_PROJECT_ROOT',
    '/Users/hrvanelderen/Documents/Master/DMT/data-mining-techniques-vu',
)
os.chdir(PROJECT_ROOT)
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did parse, so fail loudly here rather than with an opaque
# KeyError on config['PATH'] below.
if not config.read('src/config.ini'):
    raise FileNotFoundError(f"Could not read 'src/config.ini' from {os.getcwd()}")
os.chdir(config['PATH']['ROOT_DIR'])

# # Load data
df = pd.read_parquet(config['PATH']['INT_DIR'] + '/training_set_preprocessed_nodrop.parquet', engine = 'auto')
df_test = pd.read_parquet(config['PATH']['INT_DIR'] + '/test_set_preprocessed_nodrop.parquet', engine = 'auto')
# Keep only searches with srch_id below 10000. The .copy() makes df an
# independent frame, so later column assignments (e.g. the categorical casts)
# do not raise SettingWithCopyWarning on a filtered slice.
df = df[df['srch_id'] < 10000].copy()

In [5]:
# Columns to be stored with pandas' 'category' dtype in both frames.
categorical_features = [
    'hour',
    'day',
    'month',
    'day_of_week',
    'site_id',
    'visitor_location_country_id',
    'prop_country_id',
    'prop_id',
    'srch_destination_id',
]

# Cast every listed column, frame by frame, assigning back into the same frame.
for frame in (df, df_test):
    for col in categorical_features:
        frame[col] = frame[col].astype('category')

In [6]:
import optuna
import lightgbm as lgb

def objective(trial):
    """Optuna objective: fit a LambdaRank ``LGBMRanker`` and return its NDCG.

    For each trial this samples the LightGBM hyperparameters and the
    validation-split fraction, splits the module-level ``df``, builds the
    click/booking "desire" frames from the validation split, merges them into
    the train and test splits, trains the ranker (logging to Weights & Biases)
    and scores the ranked test split with ``calc_NDCG``.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        The value returned by ``calc_NDCG(test_ideal, df_res)`` for the test
        split; the study maximises this value.
    """
    params_lgbm = {
        "objective": "lambdarank",
        "metric": "ndcg",
        'n_estimators': trial.suggest_int('n_estimators', 10, 30),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.4, 0.7),
        # LightGBM only applies `subsample` (bagging_fraction) when
        # `subsample_freq` (bagging_freq) is > 0; with the default of 0 the
        # sampled `subsample` value would have no effect on the model.
        'subsample_freq': 1,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.7),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 0.2),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 0.2),
    }
    params_other = {
        'val_size': trial.suggest_float('val_size', 0.05, 0.8)
    }

    X_train, X_val, X_test, y_train, y_val, y_test, test_ideal = train_val_test_split(
        df, 'target', test_size=.15, val_size=params_other['val_size'], random_state=7
    )

    # The desire frames are built from the validation split only and then
    # merged into the train and test splits.
    # NOTE(review): the recorded run of this cell failed with
    # "too many values to unpack (expected 2)"; the two-value unpacking of
    # construct_desire below is a likely site -- confirm how many values the
    # helper returns.
    _, desire_df_click = construct_desire(X_val)
    _, desire_df_book = construct_desire(X_val, target = 'booking_bool')
    X_train = merge_and_drop(X_train, desire_df_click, desire_df_book)
    X_test = merge_and_drop(X_test, desire_df_click, desire_df_book)

    # LightGBM reads group sizes in the row order of the data. sort=False
    # yields the groups in order of first appearance instead of sorted by
    # srch_id (identical when the rows are already sorted by srch_id).
    # Assumes the rows of each search are contiguous -- TODO confirm.
    group_train = X_train.groupby('srch_id', sort=False).size().values
    group_test = X_test.groupby('srch_id', sort=False).size().values

    # srch_id only identifies the query group; it is not passed as a feature.
    X_train_lgb = X_train.drop(['srch_id'], axis=1)
    X_test_lgb = X_test.drop(['srch_id'], axis=1)

    params_all = {**params_lgbm, **params_other}
    run = wandb.init(project='DMT-2023', group = 'optuna_vardesire_size', config = params_all, reinit = True, )

    # Entering the run as a context manager ensures it is finished even when
    # training or scoring raises, so a failed trial does not leave it open.
    with run:
        ranker = lgb.LGBMRanker(**params_lgbm)
        ranker.fit(
            X=X_train_lgb,
            y=y_train,
            group=group_train,
            eval_set=[(X_train_lgb, y_train), (X_test_lgb, y_test)],
            eval_group=[group_train, group_test],
            eval_at=[5],
            callbacks=[wandb_callback()],
        )

        # Rank the test rows within each search by descending predicted score,
        # then attach the columns of test_ideal on (srch_id, prop_id).
        df_res = X_test.copy()
        df_res['pred_grades'] = ranker.predict(X_test_lgb)
        df_res = df_res.sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False])
        df_res = df_res.merge(test_ideal, on=['srch_id', 'prop_id'], how='left')

        ndcg_score = calc_NDCG(test_ideal, df_res)
        run.log({'ndcg': ndcg_score})

    return ndcg_score



# Run the hyperparameter search; the study maximises the objective's return value.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=2)
wandb.finish()

# Report the best hyperparameters found by the study.
best_params = study.best_params
print(f'Best hyperparameters: {best_params}')

# Persist the best hyperparameters as their string representation.
best_params_path = config['PATH']['INT_DIR'] + '/optuna_best_params.txt'
with open(best_params_path, 'w') as params_file:
    params_file.write(str(best_params))


[I 2023-05-22 19:55:24,449] A new study created in memory with name: no-name-1b771028-dab6-4a90-9495-462173096569
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.sort_values(by=['srch_id', target_str], ascending=[True, False], inplace=True)
[W 2023-05-22 19:55:25,092] Trial 0 failed with parameters: {'n_estimators': 19, 'max_depth': 4, 'learning_rate': 0.07734275261036115, 'subsample': 0.40068456779200684, 'colsample_bytree': 0.5337558659376456, 'reg_alpha': 0.05570777365614721, 'reg_lambda': 0.13477562392212605, 'val_size': 0.6430913288735147} because of the following error: ValueError('too many values to unpack (expected 2)').
Traceback (most recent call last):
  File "/Users/hrvanelderen/anaconda3/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_va

ValueError: too many values to unpack (expected 2)