<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Evaluation" data-toc-modified-id="Evaluation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Evaluation</a></span></li></ul></div>

In [1]:
import os
import configparser
from tqdm import tqdm
from helpers.helper_functions import *
from helpers.helper_classes import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import lightgbm as lgb

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read config.ini file
config = configparser.ConfigParser()
config.read('src/config.ini')
os.chdir(config['PATH']['ROOT_DIR'])

# # Load data
df = pd.read_parquet(config['PATH']['INT_DIR'] + '/training_set_preprocessed_nodrop.parquet', engine = 'auto')
df_test = pd.read_parquet(config['PATH']['INT_DIR'] + '/test_set_preprocessed_nodrop.parquet', engine = 'auto')

categorical_features = ['hour', 'day', 'month', 'day_of_week', 'site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_id', 'srch_destination_id']

for c in categorical_features:
    df[c] = df[c].astype('category')
    df_test[c] = df_test[c].astype('category')

## Submission

In [6]:
## Best run
best_params = {'n_estimators': 878,
               'num_leaves': 80,
               'max_depth': 14,
               'learning_rate': 0.018926700075124463,
               'subsample': 0.6001819915274639,
               'colsample_bytree': 0.7879556726353679,
               'reg_alpha': 0.06065988852935483,
               'reg_lambda': 0.14848222729700747,
               'min_child_samples': 7,
               'min_child_weight': 0.05206418484811052,
               'val_size': 0.37752777983423735}

# all but val_size in lgb_best_params
val_size = best_params.pop('val_size')

X_train_full, X_val_full, y_train_full, y_val_full, _ = train_test_split(df, 'target', test_size=val_size)

_, desire_df_click_full = construct_desire(X_val_full)
_, desire_df_book_full = construct_desire(X_val_full, target = 'booking_bool')

prop_counts = X_val_full['prop_id'].value_counts()
prop_counts.name = 'prop_counts'
srch_dest_counts = X_val_full['srch_destination_id'].value_counts()
srch_dest_counts.name = 'srch_dest_counts'

merge_df_list = [(desire_df_click_full, 'prop_id'), (desire_df_book_full, 'prop_id'), (prop_counts, 'prop_id'), (srch_dest_counts, 'srch_destination_id')]   

X_train_full = merge_and_drop(X_train_full, merge_df_list)
df_test = merge_and_drop(df_test, merge_df_list, drop=False)
X_val_full.drop(['click_bool', 'booking_bool'], axis=1, inplace=True)


# Create dataset
group_train = X_train_full.groupby('srch_id').size().values
X_train_lgb = X_train_full.drop(['srch_id'], axis=1)
# X_val_lgb = X_test.drop(['srch_id'], axis=1)

ranker = lgb.LGBMRanker(**best_params)

# Training the model
ranker.fit(
      X=X_train_lgb,
      y=y_train_full,
      group=group_train,
      eval_set=[(X_train_lgb, y_train_full)],
      eval_group=[group_train],
      eval_at=[5])

# Predicting the scores
# test = X_val
test = df_test
test_input = test.drop(['srch_id'], axis=1)
df_res = test


print("Predicting...")
y_pred = ranker.predict(test_input)
df_res['pred_grades'] = y_pred
print("Done predicting")

df_res = df_res.sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False], inplace=False)

df_res
lgbm_submission_desire = df_res[['srch_id', 'prop_id']]
lgbm_submission_desire.to_csv(config['PATH']['SUBMISSION_DIR'] + '/lgbm_submission_integrated_optuna.csv', index=False)


[1]	training's ndcg@5: 0.316736
[2]	training's ndcg@5: 0.353772
[3]	training's ndcg@5: 0.365537
[4]	training's ndcg@5: 0.372177
[5]	training's ndcg@5: 0.375806
[6]	training's ndcg@5: 0.378796
[7]	training's ndcg@5: 0.380743
[8]	training's ndcg@5: 0.382542
[9]	training's ndcg@5: 0.384543
[10]	training's ndcg@5: 0.385403
[11]	training's ndcg@5: 0.387131
[12]	training's ndcg@5: 0.388571
[13]	training's ndcg@5: 0.389367
[14]	training's ndcg@5: 0.390204
[15]	training's ndcg@5: 0.390675
[16]	training's ndcg@5: 0.391083
[17]	training's ndcg@5: 0.39164
[18]	training's ndcg@5: 0.391974
[19]	training's ndcg@5: 0.392089
[20]	training's ndcg@5: 0.39268
[21]	training's ndcg@5: 0.392878
[22]	training's ndcg@5: 0.393395
[23]	training's ndcg@5: 0.39423
[24]	training's ndcg@5: 0.394368
[25]	training's ndcg@5: 0.394814
[26]	training's ndcg@5: 0.394867
[27]	training's ndcg@5: 0.395193
[28]	training's ndcg@5: 0.395595
[29]	training's ndcg@5: 0.39574
[30]	training's ndcg@5: 0.395919
[31]	training's ndcg@5:

## Evaluation

In [3]:
best_params = {'n_estimators': 421, 
 'num_leaves': 36, 
 'max_depth': 9, 
 'learning_rate': 0.07500274597972945, 
 'subsample': 0.44073175180110147, 
 'colsample_bytree': 0.44512901152675777, 
 'reg_alpha': 0.13399352212166932, 
 'reg_lambda': 0.13466379319544275, 
 'min_child_samples': 96, 
 'min_child_weight': 0.06539586276577038, 
 'val_size': 0.3154338751135015}
               
lgb_params = best_params.copy()
val_size = lgb_params.pop('val_size')

X_train, X_val, X_test, y_train, y_val, y_test, test_ideal = train_val_test_split(df, 'target', test_size=.15, val_size=val_size, random_state=7)

_, desire_df_click = construct_desire(X_val)
_, desire_df_book = construct_desire(X_val, target = 'booking_bool')

prop_counts = X_val['prop_id'].value_counts()
prop_counts.name = 'prop_counts'
prop_counts = pd.DataFrame({'prop_id':prop_counts.index, 'count':prop_counts.values})

srch_dest_counts = X_val['srch_destination_id'].value_counts()
srch_dest_counts.name = 'srch_dest_counts'
srch_dest_counts = pd.DataFrame({'srch_destination_id':srch_dest_counts.index, 'count':srch_dest_counts.values})

merge_df_list = [(desire_df_click, 'prop_id'), (desire_df_book, 'prop_id'), (prop_counts, 'prop_id'), (srch_dest_counts, 'srch_destination_id')]   

X_train = merge_and_drop(X_train, merge_df_list)
X_test = merge_and_drop(X_test, merge_df_list)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.sort_values(by=['srch_id', target_str], ascending=[True, False], inplace=True)


In [4]:
# Testing
# LightGBM ranker
import lightgbm as lgb
import wandb
from wandb.lightgbm import wandb_callback, log_summary

# Create dataset
group_train = X_train.groupby('srch_id').size().values
group_val = X_test.groupby('srch_id').size().values

X_train_lgb = X_train.drop(['srch_id'], axis=1)
X_val_lgb = X_test.drop(['srch_id'], axis=1)


ranker = lgb.LGBMRanker(**lgb_params)

# wandb.init(project='DMT-2023', config = best_params, notes='Attempt 2 to replicate Reinier run', name='integrated-triceratops-41')


# Training the model
ranker.fit(
      X=X_train_lgb,
      y=y_train,
      group=group_train,
      eval_set=[(X_train_lgb, y_train),(X_val_lgb, y_test)],
      eval_group=[group_train, group_val],
      eval_at=[5],
      # callbacks=[wandb_callback()],
      feature_name = 'auto',
      categorical_feature= 'auto')

# Predicting the scores
test = X_test.drop(['srch_id'], axis=1).copy()

print("Predicting...")
y_pred = ranker.predict(test)
print("Done predicting")

df_res = X_test.copy()
df_res['pred_grades'] = y_pred
df_res = df_res.sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False], inplace=False)
df_res = df_res.merge(test_ideal, on=['srch_id', 'prop_id'], how='left')

final_ndcg = calc_NDCG(test_ideal, df_res)
# wandb.log({'ndcg_final': final_ndcg})
print(f"result final:{final_ndcg}")
# wandb.finish()




[1]	training's ndcg@5: 0.376153	valid_1's ndcg@5: 0.370391
[2]	training's ndcg@5: 0.370196	valid_1's ndcg@5: 0.360211
[3]	training's ndcg@5: 0.371251	valid_1's ndcg@5: 0.36062
[4]	training's ndcg@5: 0.381333	valid_1's ndcg@5: 0.36345
[5]	training's ndcg@5: 0.386158	valid_1's ndcg@5: 0.365806
[6]	training's ndcg@5: 0.387033	valid_1's ndcg@5: 0.367989
[7]	training's ndcg@5: 0.388286	valid_1's ndcg@5: 0.370421
[8]	training's ndcg@5: 0.393209	valid_1's ndcg@5: 0.371784
[9]	training's ndcg@5: 0.394209	valid_1's ndcg@5: 0.373215
[10]	training's ndcg@5: 0.398132	valid_1's ndcg@5: 0.374495
[11]	training's ndcg@5: 0.40199	valid_1's ndcg@5: 0.374007
[12]	training's ndcg@5: 0.404827	valid_1's ndcg@5: 0.374443
[13]	training's ndcg@5: 0.405485	valid_1's ndcg@5: 0.375718
[14]	training's ndcg@5: 0.408437	valid_1's ndcg@5: 0.376668
[15]	training's ndcg@5: 0.411027	valid_1's ndcg@5: 0.377951
[16]	training's ndcg@5: 0.411732	valid_1's ndcg@5: 0.37919
[17]	training's ndcg@5: 0.412175	valid_1's ndcg@5: 0.

In [13]:
# get feature importances of ranker sort descending
feature_imp = pd.DataFrame(sorted(zip(ranker.feature_importances_,X_train_lgb.columns)), columns=['Value','Feature'])
feature_imp.sort_values(by="Value", ascending=False, inplace=True)

# save to internal
feature_imp.to_csv(config['PATH']['INT_DIR'] + '/feature_importance.csv', index=False)
feature_imp

# Plot the feature importances
# plt.figure(figsize=(20, 10))
# sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
# plt.title('LightGBM Features (avg over folds)')
# plt.tight_layout()


Unnamed: 0,Value,Feature
100,2239,prop_id
99,1304,prop_country_id
98,509,norm_price_usd_srch_id
97,430,norm_price_usd_prop_id
96,412,rank_log_price_diff
95,391,prop_id_count
94,384,log_price_diff
93,359,norm_prop_location_score2_srch_id
92,346,norm_prop_starrating_srch_id
91,314,rank_price_usd


In [9]:
feature_imp

# sort descending

Unnamed: 0,Value,Feature
0,0,comp1_inv
1,0,count_y
2,0,day
3,0,day_of_week
4,0,hour
5,0,is_weekend
6,0,month
7,0,norm_prop_review_score_prop_id
8,0,norm_prop_starrating_prop_id
9,0,site_id
