<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Evaluation" data-toc-modified-id="Evaluation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Evaluation</a></span></li></ul></div>

In [12]:
import os
import configparser
from tqdm import tqdm
from helpers.helper_functions import *
from helpers.helper_classes import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import lightgbm as lgb

# Show up to 500 rows and every column when a frame is displayed in the notebook.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

# Work from the repository root so that the relative 'src/config.ini' lookup resolves.
# The DMT_ROOT_DIR environment variable overrides the machine-specific default, which
# is kept as a fallback so the original author's setup keeps working unchanged.
os.chdir(os.environ.get('DMT_ROOT_DIR', '/Users/hrvanelderen/Documents/Master/DMT/data-mining-techniques-vu'))

In [13]:
# Read config.ini file (the path is relative to the current working directory)
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of files
# it actually parsed, so fail here with a clear message instead of a KeyError on 'PATH'.
if not config.read('src/config.ini'):
    raise FileNotFoundError(f"Could not read 'src/config.ini' from working directory {os.getcwd()}")
os.chdir(config['PATH']['ROOT_DIR'])

# Load the preprocessed training and test sets from the directory configured as INT_DIR
df = pd.read_parquet(config['PATH']['INT_DIR'] + '/training_set_preprocessed_nodrop.parquet', engine = 'auto')
df_test = pd.read_parquet(config['PATH']['INT_DIR'] + '/test_set_preprocessed_nodrop.parquet', engine = 'auto')
# Subset of the training rows whose srch_id is below 10000
df_mini = df[df['srch_id'] < 10000]


In [14]:
# Columns that are stored with the pandas 'category' dtype in both frames.
categorical_features = [
    'hour',
    'day',
    'month',
    'day_of_week',
    'site_id',
    'visitor_location_country_id',
    'prop_country_id',
    'prop_id',
    'srch_destination_id',
]

# Cast every listed column, first in the training frame and then in the test frame.
for frame in (df, df_test):
    for column in categorical_features:
        frame[column] = frame[column].astype('category')

## Submission

In [15]:
## Best run

# Settings of the best tuning run. 'val_size' is the hold-out fraction used for the split
# below, not a LightGBM argument, so it is separated out before the ranker is built.
# NOTE(review): LightGBM only applies 'subsample' when 'subsample_freq' > 0 and none is
# set here — confirm the row subsampling is actually in effect.
best_params = {'n_estimators': 421,
 'num_leaves': 36,
 'max_depth': 9,
 'learning_rate': 0.07500274597972945,
 'subsample': 0.44073175180110147,
 'colsample_bytree': 0.44512901152675777,
 'reg_alpha': 0.13399352212166932,
 'reg_lambda': 0.13466379319544275,
 'min_child_samples': 96,
 'min_child_weight': 0.06539586276577038,
 'val_size': 0.3154338751135015}

# Pop 'val_size' from a copy (as the evaluation cell does) so best_params remains the
# complete record of the run while lgb_params holds only the LightGBM arguments.
lgb_params = best_params.copy()
val_size = lgb_params.pop('val_size')

# NOTE(review): no random_state is passed here, unlike the evaluation split — confirm
# whether the helper seeds itself, otherwise the submission is not reproducible.
X_train_full, X_val_full, y_train_full, y_val_full, _ = train_test_split(df, 'target', test_size=val_size)

# All aggregate features below are computed from the held-out split only and are then
# handed to merge_and_drop for the training frame and the test frame.
_, desire_df_click_full = construct_desire(X_val_full)
_, desire_df_book_full = construct_desire(X_val_full, target = 'booking_bool')

# Number of held-out rows per property.
prop_counts = X_val_full['prop_id'].value_counts()
prop_counts = pd.DataFrame({'prop_id':prop_counts.index, 'count':prop_counts.values})

# Number of held-out rows per search destination.
# NOTE(review): both count frames expose a column literally named 'count' — confirm
# merge_and_drop keeps the two distinguishable after merging.
srch_dest_counts = X_val_full['srch_destination_id'].value_counts()
srch_dest_counts = pd.DataFrame({'srch_destination_id':srch_dest_counts.index, 'count':srch_dest_counts.values})

# (frame, join key) pairs consumed by merge_and_drop.
merge_df_list = [(desire_df_click_full, 'prop_id'), (desire_df_book_full, 'prop_id'), (prop_counts, 'prop_id'), (srch_dest_counts, 'srch_destination_id')]

X_train_full = merge_and_drop(X_train_full, merge_df_list)
# df_test is rebound to the merged frame, so re-running this cell feeds the already
# merged frame back into merge_and_drop; reload df_test before re-running.
df_test = merge_and_drop(df_test, merge_df_list, drop=False)
# Rebind instead of dropping in place so a frame that is a slice of df is never mutated.
X_val_full = X_val_full.drop(['click_bool', 'booking_bool'], axis=1)


# Create dataset
# Rows per srch_id, in sorted srch_id order (groupby sorts its keys by default).
# NOTE(review): LightGBM expects the rows of X to be contiguous per query and in the
# same order as these sizes — confirm X_train_full is ordered by srch_id.
group_train = X_train_full.groupby('srch_id').size().values
X_train_lgb = X_train_full.drop(['srch_id'], axis=1)

ranker = lgb.LGBMRanker(**lgb_params)

# Training the model; the only evaluation set is the training data itself, so the
# logged ndcg@5 is a training score, not a validation score.
ranker.fit(
      X=X_train_lgb,
      y=y_train_full,
      group=group_train,
      eval_set=[(X_train_lgb, y_train_full)],
      eval_group=[group_train],
      eval_at=[5],
      feature_name='auto',
      categorical_feature = 'auto')


[1]	training's ndcg@5: 0.32275
[2]	training's ndcg@5: 0.35759
[3]	training's ndcg@5: 0.365995
[4]	training's ndcg@5: 0.37607
[5]	training's ndcg@5: 0.384951
[6]	training's ndcg@5: 0.387053
[7]	training's ndcg@5: 0.387839
[8]	training's ndcg@5: 0.393158
[9]	training's ndcg@5: 0.393664
[10]	training's ndcg@5: 0.397127
[11]	training's ndcg@5: 0.4004
[12]	training's ndcg@5: 0.402784
[13]	training's ndcg@5: 0.403729
[14]	training's ndcg@5: 0.405827
[15]	training's ndcg@5: 0.408048
[16]	training's ndcg@5: 0.408987
[17]	training's ndcg@5: 0.4092
[18]	training's ndcg@5: 0.409163
[19]	training's ndcg@5: 0.411067
[20]	training's ndcg@5: 0.413123
[21]	training's ndcg@5: 0.414839
[22]	training's ndcg@5: 0.415175
[23]	training's ndcg@5: 0.415292
[24]	training's ndcg@5: 0.416967
[25]	training's ndcg@5: 0.417142
[26]	training's ndcg@5: 0.417607
[27]	training's ndcg@5: 0.418991
[28]	training's ndcg@5: 0.41938
[29]	training's ndcg@5: 0.419342
[30]	training's ndcg@5: 0.419123
[31]	training's ndcg@5: 0.4

[247]	training's ndcg@5: 0.471455
[248]	training's ndcg@5: 0.471511
[249]	training's ndcg@5: 0.471667
[250]	training's ndcg@5: 0.471806
[251]	training's ndcg@5: 0.471945
[252]	training's ndcg@5: 0.471964
[253]	training's ndcg@5: 0.472125
[254]	training's ndcg@5: 0.472196
[255]	training's ndcg@5: 0.472276
[256]	training's ndcg@5: 0.47247
[257]	training's ndcg@5: 0.472577
[258]	training's ndcg@5: 0.472552
[259]	training's ndcg@5: 0.472638
[260]	training's ndcg@5: 0.472785
[261]	training's ndcg@5: 0.472948
[262]	training's ndcg@5: 0.473048
[263]	training's ndcg@5: 0.473195
[264]	training's ndcg@5: 0.4733
[265]	training's ndcg@5: 0.473471
[266]	training's ndcg@5: 0.473535
[267]	training's ndcg@5: 0.47365
[268]	training's ndcg@5: 0.473703
[269]	training's ndcg@5: 0.473819
[270]	training's ndcg@5: 0.473908
[271]	training's ndcg@5: 0.474083
[272]	training's ndcg@5: 0.474218
[273]	training's ndcg@5: 0.47423
[274]	training's ndcg@5: 0.474398
[275]	training's ndcg@5: 0.474505
[276]	training's nd

In [16]:
# Predicting the scores
# test = X_val
test = df_test
# Model input: every column except the query id.
test_input = test.drop(['srch_id'], axis=1)

# Re-apply the category dtype on the model input.
# NOTE(review): presumably needed because the merge step can change these dtypes — confirm.
for c in categorical_features:
    test_input[c] = test_input[c].astype('category')

print("Predicting...")
y_pred = ranker.predict(test_input)
print("Done predicting")

# Attach the scores to a new frame (assign returns a copy) instead of aliasing df_test.
# Writing 'pred_grades' into df_test itself would turn it into an extra input column the
# next time this cell, or any later cell predicting from df_test, is executed.
df_res = test.assign(pred_grades=y_pred)

# Within each search, best-scored property first.
df_res = df_res.sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False])

# The submission file contains only the ordered (srch_id, prop_id) pairs.
lgbm_submission_desire = df_res[['srch_id', 'prop_id']]
lgbm_submission_desire.to_csv(config['PATH']['SUBMISSION_DIR'] + '/lgbm_submission_categorical_opt.csv', index=False)


Predicting...
Done predicting


In [None]:
# Feature-importance plot for the fitted ranker, drawn on a 12 x 20 figure.
importance_figsize = (12, 20)
lgb.plot_importance(ranker, figsize=importance_figsize)

## Evaluation

In [8]:
# Tuned settings for the evaluation run; 'val_size' is a split fraction, the remaining
# entries are passed to the LightGBM ranker.
best_params = dict(
    n_estimators=421,
    num_leaves=36,
    max_depth=9,
    learning_rate=0.07500274597972945,
    subsample=0.44073175180110147,
    colsample_bytree=0.44512901152675777,
    reg_alpha=0.13399352212166932,
    reg_lambda=0.13466379319544275,
    min_child_samples=96,
    min_child_weight=0.06539586276577038,
    val_size=0.3154338751135015,
)

# Separate the split fraction from the model arguments without touching best_params.
val_size = best_params['val_size']
lgb_params = {key: value for key, value in best_params.items() if key != 'val_size'}

# Seeded three-way split; test_ideal is later used as the reference for calc_NDCG.
X_train, X_val, X_test, y_train, y_val, y_test, test_ideal = train_val_test_split(
    df,
    'target',
    test_size=.15,
    val_size=val_size,
    random_state=7,
)

# Aggregate features are computed on the validation split only.
_, desire_df_click = construct_desire(X_val)
_, desire_df_book = construct_desire(X_val, target = 'booking_bool')

# Validation rows per property, as a two-column frame.
prop_tally = X_val['prop_id'].value_counts()
prop_counts = pd.DataFrame({'prop_id': prop_tally.index, 'count': prop_tally.values})

# Validation rows per search destination, as a two-column frame.
dest_tally = X_val['srch_destination_id'].value_counts()
srch_dest_counts = pd.DataFrame({'srch_destination_id': dest_tally.index, 'count': dest_tally.values})

# (frame, join key) pairs consumed by merge_and_drop.
merge_df_list = [
    (desire_df_click, 'prop_id'),
    (desire_df_book, 'prop_id'),
    (prop_counts, 'prop_id'),
    (srch_dest_counts, 'srch_destination_id'),
]

X_train = merge_and_drop(X_train, merge_df_list)
X_test = merge_and_drop(X_test, merge_df_list)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.sort_values(by=['srch_id', target_str], ascending=[True, False], inplace=True)


In [11]:
# Testing: fit the LightGBM ranker on the training split and score it on the test split
import lightgbm as lgb
# import wandb
# from wandb.lightgbm import wandb_callback, log_summary

# Query sizes (rows per srch_id). Note that the "val" names below hold the test split.
group_train = X_train.groupby('srch_id').size().values
group_val = X_test.groupby('srch_id').size().values

# Feature matrices without the query id.
X_train_lgb = X_train.drop(columns=['srch_id'])
X_val_lgb = X_test.drop(columns=['srch_id'])

ranker = lgb.LGBMRanker(**lgb_params)

# wandb.init(project='DMT-2023', config = best_params, notes='Now with class_weight = balanced', name='possibly-balanced-tiger-2')

# Training the model; ndcg@5 is logged for the training data and for the test split.
fit_arguments = dict(
    X=X_train_lgb,
    y=y_train,
    group=group_train,
    eval_set=[(X_train_lgb, y_train), (X_val_lgb, y_test)],
    eval_group=[group_train, group_val],
    eval_at=[5],
    callbacks=[],
    feature_name='auto',
    categorical_feature='auto',
)
ranker.fit(**fit_arguments)

# Predicting the scores on a separate copy of the test features
test = X_val_lgb.copy()

print("Predicting...")
y_pred = ranker.predict(test)
print("Done predicting")

# Scores attached to a copy of X_test, ordered best-first within each search,
# then joined with the ideal frame on the (srch_id, prop_id) pair.
df_res = (
    X_test
    .assign(pred_grades=y_pred)
    .sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False])
    .merge(test_ideal, on=['srch_id', 'prop_id'], how='left')
)

final_ndcg = calc_NDCG(test_ideal, df_res)
# wandb.log({'ndcg_final': final_ndcg})
print(f"result final:{final_ndcg}")
# wandb.finish()




[1]	training's ndcg@5: 0.37622	valid_1's ndcg@5: 0.370568
[2]	training's ndcg@5: 0.37095	valid_1's ndcg@5: 0.361069
[3]	training's ndcg@5: 0.372393	valid_1's ndcg@5: 0.361603
[4]	training's ndcg@5: 0.381609	valid_1's ndcg@5: 0.365202
[5]	training's ndcg@5: 0.387045	valid_1's ndcg@5: 0.366969
[6]	training's ndcg@5: 0.388141	valid_1's ndcg@5: 0.369443
[7]	training's ndcg@5: 0.389023	valid_1's ndcg@5: 0.370057
[8]	training's ndcg@5: 0.394476	valid_1's ndcg@5: 0.372953
[9]	training's ndcg@5: 0.39474	valid_1's ndcg@5: 0.373361
[10]	training's ndcg@5: 0.398864	valid_1's ndcg@5: 0.373979
[11]	training's ndcg@5: 0.401249	valid_1's ndcg@5: 0.374627
[12]	training's ndcg@5: 0.405009	valid_1's ndcg@5: 0.376207
[13]	training's ndcg@5: 0.405355	valid_1's ndcg@5: 0.376313
[14]	training's ndcg@5: 0.407963	valid_1's ndcg@5: 0.377503
[15]	training's ndcg@5: 0.410643	valid_1's ndcg@5: 0.378849
[16]	training's ndcg@5: 0.411314	valid_1's ndcg@5: 0.381057
[17]	training's ndcg@5: 0.411643	valid_1's ndcg@5: 0

[138]	training's ndcg@5: 0.462179	valid_1's ndcg@5: 0.397715
[139]	training's ndcg@5: 0.462173	valid_1's ndcg@5: 0.39764
[140]	training's ndcg@5: 0.462424	valid_1's ndcg@5: 0.397847
[141]	training's ndcg@5: 0.462565	valid_1's ndcg@5: 0.397854
[142]	training's ndcg@5: 0.462831	valid_1's ndcg@5: 0.39802
[143]	training's ndcg@5: 0.462968	valid_1's ndcg@5: 0.397943
[144]	training's ndcg@5: 0.462967	valid_1's ndcg@5: 0.398136
[145]	training's ndcg@5: 0.463046	valid_1's ndcg@5: 0.398064
[146]	training's ndcg@5: 0.463192	valid_1's ndcg@5: 0.398303
[147]	training's ndcg@5: 0.463367	valid_1's ndcg@5: 0.398254
[148]	training's ndcg@5: 0.463607	valid_1's ndcg@5: 0.398046
[149]	training's ndcg@5: 0.463687	valid_1's ndcg@5: 0.398139
[150]	training's ndcg@5: 0.463935	valid_1's ndcg@5: 0.398037
[151]	training's ndcg@5: 0.46419	valid_1's ndcg@5: 0.39815
[152]	training's ndcg@5: 0.464339	valid_1's ndcg@5: 0.398236
[153]	training's ndcg@5: 0.464344	valid_1's ndcg@5: 0.398187
[154]	training's ndcg@5: 0.4

[273]	training's ndcg@5: 0.479747	valid_1's ndcg@5: 0.401944
[274]	training's ndcg@5: 0.479936	valid_1's ndcg@5: 0.401975
[275]	training's ndcg@5: 0.480066	valid_1's ndcg@5: 0.402051
[276]	training's ndcg@5: 0.48021	valid_1's ndcg@5: 0.402021
[277]	training's ndcg@5: 0.480378	valid_1's ndcg@5: 0.402122
[278]	training's ndcg@5: 0.480571	valid_1's ndcg@5: 0.402239
[279]	training's ndcg@5: 0.480684	valid_1's ndcg@5: 0.402305
[280]	training's ndcg@5: 0.480668	valid_1's ndcg@5: 0.402379
[281]	training's ndcg@5: 0.480791	valid_1's ndcg@5: 0.402411
[282]	training's ndcg@5: 0.480928	valid_1's ndcg@5: 0.402384
[283]	training's ndcg@5: 0.48112	valid_1's ndcg@5: 0.40228
[284]	training's ndcg@5: 0.481284	valid_1's ndcg@5: 0.402266
[285]	training's ndcg@5: 0.481419	valid_1's ndcg@5: 0.402251
[286]	training's ndcg@5: 0.481506	valid_1's ndcg@5: 0.402385
[287]	training's ndcg@5: 0.481569	valid_1's ndcg@5: 0.402446
[288]	training's ndcg@5: 0.48165	valid_1's ndcg@5: 0.402501
[289]	training's ndcg@5: 0.4

[408]	training's ndcg@5: 0.494588	valid_1's ndcg@5: 0.403317
[409]	training's ndcg@5: 0.494582	valid_1's ndcg@5: 0.403207
[410]	training's ndcg@5: 0.494663	valid_1's ndcg@5: 0.403313
[411]	training's ndcg@5: 0.494725	valid_1's ndcg@5: 0.403287
[412]	training's ndcg@5: 0.494806	valid_1's ndcg@5: 0.403209
[413]	training's ndcg@5: 0.494929	valid_1's ndcg@5: 0.403176
[414]	training's ndcg@5: 0.495074	valid_1's ndcg@5: 0.40319
[415]	training's ndcg@5: 0.495186	valid_1's ndcg@5: 0.403137
[416]	training's ndcg@5: 0.495302	valid_1's ndcg@5: 0.403167
[417]	training's ndcg@5: 0.495449	valid_1's ndcg@5: 0.403305
[418]	training's ndcg@5: 0.495619	valid_1's ndcg@5: 0.403328
[419]	training's ndcg@5: 0.495746	valid_1's ndcg@5: 0.403189
[420]	training's ndcg@5: 0.495873	valid_1's ndcg@5: 0.40314
[421]	training's ndcg@5: 0.496058	valid_1's ndcg@5: 0.403153
Predicting...
Done predicting
result final:0.4260613335094323


## Evaluation


In [None]:
# Predicting the scores
# test = X_val
test = df_test
# Model input: drop the query id, plus any 'pred_grades' column that an earlier
# prediction cell wrote onto df_test, so only feature columns reach the model.
test_input = test.drop(['srch_id'], axis=1).drop(columns=['pred_grades'], errors='ignore')


print("Predicting...")
# NOTE(review): best_ranker is not defined in any cell visible in this notebook (the
# fitted models above are both named `ranker`) — confirm where it should come from.
y_pred = best_ranker.predict(test_input)
print("Done predicting")

# Attach the scores to a new frame (assign returns a copy) instead of aliasing df_test,
# so df_test is left unchanged and this cell can be re-run.
df_res = test.assign(pred_grades=y_pred)

# Within each search, best-scored property first.
df_res = df_res.sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False])

df_res

In [None]:

# Write the (srch_id, prop_id) pairs of df_res, in its current row order, to the submission CSV.
submission_path = config['PATH']['SUBMISSION_DIR'] + '/lgbm_submission_optuna.csv'
lgbm_submission = df_res.loc[:, ['srch_id', 'prop_id']]
lgbm_submission.to_csv(submission_path, index=False)

In [None]:
# print(f"RF: {calc_NDCG(test_ideal, pred_ideal_rf)}\n,XGB: {calc_NDCG(test_ideal, pred_xgb_optimized)},\nRandom: {calc_NDCG(test_ideal, pred_random)}")
# Both scores are computed against test_ideal; the first call previously referenced
# df_ideal, a name that no visible cell defines.
# NOTE(review): pred_xgb and pred_random are not assigned at notebook level in any
# visible cell either — confirm they exist before running this cell.
print(f"XGB: {calc_NDCG(test_ideal, pred_xgb)}, Random: {calc_NDCG(test_ideal, pred_random)}")

## Optuna + XGBClassifier

In [None]:
# Optimize XGB with optuna
import optuna
from functools import partial

def objective(trial, X_train, y_train, X_test, test_ideal):
    """Optuna objective: fit one XGBoost classifier for this trial and score it with calc_NDCG.

    Parameters
    ----------
    trial : object providing ``suggest_int`` / ``suggest_float`` (an Optuna trial).
    X_train, y_train : training features and labels; labels are cast to int and the
        value 5 is recoded to 2 before fitting.
    X_test : data passed to ``constructs_predictions`` together with the fitted model.
    test_ideal : passed to ``constructs_predictions`` as ``ideal_df`` and to ``calc_NDCG``.

    Returns
    -------
    The value returned by ``calc_NDCG(test_ideal, predictions)``, which the study maximizes.
    """
    # Cast the labels to int, then recode 5 as 2.
    # NOTE(review): presumably so the class ids are consecutive for multi:softprob — confirm the label set.
    labels = y_train.astype(int)
    labels[y_train == 5] = 2

    # Fixed settings first, then the searched ones; the suggest_* calls are issued in
    # the same order as before so sampling for a given trial is unaffected.
    settings = dict(objective="multi:softprob", random_state=42)
    settings["n_estimators"] = trial.suggest_int("n_estimators", 50, 500)
    settings["max_depth"] = trial.suggest_int("max_depth", 3, 10)
    settings["learning_rate"] = trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True)
    settings["subsample"] = trial.suggest_float("subsample", 0.5, 1.0)
    settings["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.5, 1.0)
    settings["gamma"] = trial.suggest_float("gamma", 0, 1)
    settings["reg_alpha"] = trial.suggest_float("reg_alpha", 1e-4, 1e-1, log=True)
    settings["reg_lambda"] = trial.suggest_float("reg_lambda", 1e-4, 1e-1, log=True)

    classifier = xgb.XGBClassifier(**settings)
    classifier.fit(X_train, labels)

    ranked_predictions = constructs_predictions(classifier, X_test, ideal_df=test_ideal)
    return calc_NDCG(test_ideal, ranked_predictions)

print("Training XGB")
# X_train, y_train, X_test and test_ideal must already be defined by an earlier cell.
# NOTE(review): `xgb` is not imported by name in any visible cell — confirm it is in scope.

# Bind the data arguments so that Optuna only has to supply the trial.
objective_with_data = partial(
    objective,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    test_ideal=test_ideal,
)

# Run 20 trials, maximizing the value returned by the objective.
study = optuna.create_study(direction="maximize")
study.optimize(objective_with_data, n_trials=20)


# Refit on the training data with the best trial's settings, using the same label
# recoding as the objective (cast to int, 5 becomes 2).
y_train_xgb = y_train.astype(int)
y_train_xgb[y_train == 5] = 2

# Note: this rebinds the notebook-level name best_params to the XGBoost settings.
best_params = study.best_params
xgb_model_optimized = xgb.XGBClassifier(objective="multi:softprob", random_state=42, **best_params)
xgb_model_optimized.fit(X_train, y_train_xgb)

# Predictions for the local test split (scored below) and for the submission set.
pred_xgb_optimized = constructs_predictions(xgb_model_optimized, X_test, ideal_df=test_ideal)
pred_xgb_submission = constructs_predictions(xgb_model_optimized, df_test)
optimized_ndcg = calc_NDCG(test_ideal, pred_xgb_optimized)
print(f"XGB Optimized: {optimized_ndcg}")

# The submission predictions are written under DATA_DIR.
xgb_submission_path = config['PATH']['DATA_DIR'] + '/submission_XGB.csv'
pred_xgb_submission.to_csv(xgb_submission_path, index=False)