<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import os
from collections import Counter

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
import xgboost as xgb
from sklearn.model_selection import GroupShuffleSplit, train_test_split


# Display limits for the notebook: show up to 500 rows and never elide columns.
for option_name, option_value in (('display.max_rows', 500), ('display.max_columns', None)):
    pd.set_option(option_name, option_value)

In [3]:
# Load data
# Only searches whose srch_id is below this bound are kept from the training set
# (presumably a development subsample to keep iterations fast -- confirm before a full run).
MAX_SRCH_ID = 10000

df_train = pd.read_parquet('../data/training_set.parquet', engine = 'auto')
# .copy() detaches the filtered rows from the loaded frame, so adding columns to
# df_train later does not raise pandas' SettingWithCopyWarning.
df_train = df_train[df_train['srch_id'] < MAX_SRCH_ID].copy()
df_test = pd.read_parquet('../data/test_set.parquet', engine = 'auto')

<h1>Data prep</h1>

In [4]:
def make_score(row):
    """Return the relevance grade of a single row.

    A row with ``booking_bool == 1`` is graded 5; otherwise a row with
    ``click_bool == 1`` is graded 1; every other row is graded 0.
    ``row`` only needs to support ``row[<column name>]`` lookups.
    """
    booked = row['booking_bool'] == 1
    if booked:
        return 5
    # click_bool is only consulted when the row was not booked.
    return 1 if row['click_bool'] == 1 else 0

def date_time(df):
    """Return a copy of ``df`` with ``date_time`` parsed and split into parts.

    The ``date_time`` column is converted with ``pd.to_datetime`` (it keeps its
    position), and ``hour``, ``day`` and ``month`` columns are derived from it.
    The frame passed in is not modified.
    """
    stamps = pd.to_datetime(df['date_time'])
    return df.assign(
        date_time=stamps,
        hour=stamps.dt.hour,
        day=stamps.dt.day,
        month=stamps.dt.month,
    )

def remove_cols(df, cols):
    """Return a new frame without the columns named in ``cols``; ``df`` is left as is."""
    trimmed = df.drop(columns=cols)
    return trimmed

In [5]:
# Columns removed before training: the outcome columns the score is built from
# (click_bool, booking_bool), gross_bookings_usd, position, and the raw date_time,
# which is replaced by the hour/day/month columns added by date_time().
target_cols = ['click_bool', 'booking_bool', 'gross_bookings_usd', 'position', 'date_time']

# Relevance grade, computed column-wise instead of with a per-row apply():
# booked -> 5, clicked but not booked -> 1, otherwise 0 (same rule as make_score).
relevance = np.select(
    [df_train['booking_bool'] == 1, df_train['click_bool'] == 1],
    [5, 1],
    default=0,
)

# Build the model frame as one non-mutating chain: add the label, expand the
# timestamp, drop the columns listed above, and mark missing values with -1.
df_train = (
    date_time(df_train.assign(score=relevance))
    .drop(columns=target_cols)
    .fillna(-1)
)

<h1>Data split</h1>

In [24]:
# The group sizes computed below come out in ascending srch_id order, and the
# ranker reads them in row order, so each search's rows must be contiguous and
# sorted by srch_id. A stable sort guarantees that and leaves already-sorted
# data unchanged.
df_train = df_train.sort_values('srch_id', kind='stable')

# Hold out 20% of the searches. Splitting on srch_id keeps every row of a
# search on the same side. Only the first split is consumed, so one is generated.
splitter = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state = 7)
split = splitter.split(df_train, groups=df_train['srch_id'])
train_inds, test_inds = next(split)

# Best achievable ordering of the held-out rows: per search, highest score first.
df_ideal = df_train.iloc[test_inds].sort_values(by=['srch_id', 'score'], ascending=[True, False])
test_ideal = df_ideal

X = df_train.drop(columns=['score'])
y = df_train['score']
X_train, y_train = X.iloc[train_inds], y.iloc[train_inds]
X_test, y_test = X.iloc[test_inds], y.iloc[test_inds]

# Number of rows per search, in ascending srch_id order.
train_groups = X_train.groupby('srch_id').size().to_numpy()
test_groups = X_test.groupby('srch_id').size().to_numpy()

# Sanity checks: group sizes line up with the row order, and no search is shared.
assert X_train['srch_id'].is_monotonic_increasing and X_test['srch_id'].is_monotonic_increasing
assert train_groups.sum() == len(X_train) and test_groups.sum() == len(X_test)
assert set(X_train['srch_id']).isdisjoint(X_test['srch_id'])


In [25]:
# Inspect the training group sizes (row count per srch_id) that are passed to fit() as `group`.
train_groups

array([32,  5, 21, ..., 31, 27, 31])

<h1>Training</h1>

In [40]:
# LightGBM ranker trained with the LambdaRank objective; NDCG is the evaluation metric.
gbm = lgb.LGBMRanker(metric="ndcg", objective="lambdarank")

In [41]:
# Fit on the training searches and evaluate on the held-out searches.
# `group` / `eval_group` carry the per-srch_id row counts of the matching frames.
# NOTE(review): X_train still contains the srch_id and prop_id identifier columns,
# so the model sees them as features -- confirm that this is intended.
gbm.fit(
    X_train,
    y_train,
    group=train_groups,
    eval_set=[(X_test, y_test)],
    eval_group=[test_groups],
)






[1]	valid_0's ndcg@1: 0.125472	valid_0's ndcg@2: 0.186334	valid_0's ndcg@3: 0.226413	valid_0's ndcg@4: 0.253247	valid_0's ndcg@5: 0.276805
[2]	valid_0's ndcg@1: 0.133744	valid_0's ndcg@2: 0.190656	valid_0's ndcg@3: 0.228076	valid_0's ndcg@4: 0.259267	valid_0's ndcg@5: 0.282754
[3]	valid_0's ndcg@1: 0.135404	valid_0's ndcg@2: 0.205887	valid_0's ndcg@3: 0.24777	valid_0's ndcg@4: 0.278135	valid_0's ndcg@5: 0.297019
[4]	valid_0's ndcg@1: 0.140436	valid_0's ndcg@2: 0.209217	valid_0's ndcg@3: 0.253742	valid_0's ndcg@4: 0.288904	valid_0's ndcg@5: 0.311603
[5]	valid_0's ndcg@1: 0.144639	valid_0's ndcg@2: 0.211085	valid_0's ndcg@3: 0.257939	valid_0's ndcg@4: 0.290431	valid_0's ndcg@5: 0.315178
[6]	valid_0's ndcg@1: 0.149592	valid_0's ndcg@2: 0.21641	valid_0's ndcg@3: 0.264201	valid_0's ndcg@4: 0.295921	valid_0's ndcg@5: 0.322168
[7]	valid_0's ndcg@1: 0.155401	valid_0's ndcg@2: 0.224313	valid_0's ndcg@3: 0.270895	valid_0's ndcg@4: 0.299009	valid_0's ndcg@5: 0.323311
[8]	valid_0's ndcg@1: 0.15371

[85]	valid_0's ndcg@1: 0.166966	valid_0's ndcg@2: 0.23488	valid_0's ndcg@3: 0.276055	valid_0's ndcg@4: 0.307535	valid_0's ndcg@5: 0.33509
[86]	valid_0's ndcg@1: 0.166966	valid_0's ndcg@2: 0.23488	valid_0's ndcg@3: 0.276055	valid_0's ndcg@4: 0.307893	valid_0's ndcg@5: 0.335137
[87]	valid_0's ndcg@1: 0.169455	valid_0's ndcg@2: 0.235275	valid_0's ndcg@3: 0.276442	valid_0's ndcg@4: 0.309141	valid_0's ndcg@5: 0.335927
[88]	valid_0's ndcg@1: 0.171115	valid_0's ndcg@2: 0.236732	valid_0's ndcg@3: 0.277782	valid_0's ndcg@4: 0.311567	valid_0's ndcg@5: 0.335837
[89]	valid_0's ndcg@1: 0.172748	valid_0's ndcg@2: 0.236265	valid_0's ndcg@3: 0.278556	valid_0's ndcg@4: 0.313213	valid_0's ndcg@5: 0.33765
[90]	valid_0's ndcg@1: 0.176897	valid_0's ndcg@2: 0.236749	valid_0's ndcg@3: 0.280713	valid_0's ndcg@4: 0.313214	valid_0's ndcg@5: 0.338614
[91]	valid_0's ndcg@1: 0.179387	valid_0's ndcg@2: 0.237871	valid_0's ndcg@3: 0.279869	valid_0's ndcg@4: 0.313044	valid_0's ndcg@5: 0.340039
[92]	valid_0's ndcg@1: 0

<h1>Validation</h1>

In [47]:
def construct_pred_ideal(df_in, df_ideal, y_pred):
    """Rank rows by predicted grade and attach their true score.

    Args:
        df_in: frame with at least ``srch_id`` and ``prop_id``; not modified.
        df_ideal: frame with ``srch_id``, ``prop_id`` and the true ``score``.
        y_pred: one predicted grade per row of ``df_in``, in the same row order.

    Returns:
        A frame with columns ``srch_id``, ``prop_id``, ``pred_grades`` and
        ``score``, ordered by ``srch_id`` ascending and, within each search,
        by ``pred_grades`` descending. ``score`` is missing for rows whose
        (``srch_id``, ``prop_id``) pair does not occur in ``df_ideal``.
    """
    keys = ['srch_id', 'prop_id']

    ranked = df_in.copy()
    ranked['pred_grades'] = y_pred
    ranked = ranked.sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False])

    # Merge only the keys plus the one column needed from each side. Merging the
    # full frames would duplicate every shared feature column with _x/_y
    # suffixes just to discard them in the final selection.
    ranked = ranked[keys + ['pred_grades']].merge(
        df_ideal[keys + ['score']], on=keys, how='left'
    )

    return ranked[['srch_id', 'prop_id', 'pred_grades', 'score']]

def construct_pred_submission(df_in, y_pred):
    """Order rows for a submission: per search, highest predicted grade first.

    ``y_pred`` holds one predicted grade per row of ``df_in``. The result is
    sorted by ``srch_id`` ascending and predicted grade descending, and holds
    only the ``srch_id`` and ``prop_id`` columns (the grades themselves are
    not returned). ``df_in`` is not modified.
    """
    ranked = df_in.assign(pred_grades=y_pred).sort_values(
        by=['srch_id', 'pred_grades'],
        ascending=[True, False],
    )
    return ranked.loc[:, ['srch_id', 'prop_id']]

def constructs_predictions(model, data, ideal_df = None):
    """Predict with ``model`` and return the rows of ``data`` in ranked order.

    Args:
        model: object exposing ``predict(data)``.
        data: feature frame; it is passed both to ``model.predict`` and to the
            ``construct_pred_*`` helper that builds the result.
        ideal_df: optional frame with the true scores. When given, the result
            comes from ``construct_pred_ideal`` (true ``score`` attached);
            when ``None``, it comes from ``construct_pred_submission``.

    Returns:
        The frame produced by the selected ``construct_pred_*`` helper.
    """
    y_pred = model.predict(data)

    if ideal_df is None:
        return construct_pred_submission(data, y_pred)

    # Use the frame supplied by the caller. Reading the notebook-global
    # `test_ideal` here would silently ignore the argument.
    return construct_pred_ideal(data, ideal_df, y_pred)

def calc_NDCG(df_ideal, df_pred, k = 5, per_query = False):
    """Compute NDCG@k of a predicted ranking against the ideal ranking.

    Both frames need ``srch_id`` and ``score`` columns. The row at 0-based
    rank ``i`` within its search contributes ``score / log2(i + 2)``, and only
    the first ``k`` rows of each search count. Searches with fewer than ``k``
    rows contribute the rows they have.

    Args:
        df_ideal: true scores. Re-sorted here by ``score`` descending within
            each search, so the caller's row order does not matter.
        df_pred: rows in predicted order within each search (best first),
            carrying the true ``score`` of each row.
        k: cut-off rank.
        per_query: when False (default), return the pooled ratio
            ``sum(DCG over all searches) / sum(ideal DCG over all searches)``.
            When True, return the mean of the per-search ``DCG / ideal DCG``
            ratios. A search with zero ideal DCG counts as 0, and a search
            missing from ``df_pred`` has DCG 0.

    Returns:
        The NDCG as a float, or 0.0 when there is no positive ideal DCG.
    """
    discounts = 1.0 / np.log2(np.arange(k) + 2)

    def _top_k_gains(df):
        # head(k) keeps the first k rows of every search (fewer for short
        # searches); cumcount() gives each kept row its rank in its search.
        top = df.groupby('srch_id').head(k)
        ranks = top.groupby('srch_id').cumcount().to_numpy()
        gains = top['score'].to_numpy(dtype=float) * discounts[ranks]
        return top['srch_id'].to_numpy(), gains

    ideal_sorted = df_ideal.sort_values(by=['srch_id', 'score'], ascending=[True, False])
    ideal_ids, ideal_gains = _top_k_gains(ideal_sorted)
    pred_ids, pred_gains = _top_k_gains(df_pred)

    if not per_query:
        ideal_total = ideal_gains.sum()
        if ideal_total == 0:
            return 0.0
        return float(pred_gains.sum() / ideal_total)

    ideal_dcg = pd.Series(ideal_gains, dtype=float).groupby(ideal_ids).sum()
    if len(ideal_dcg) == 0:
        return 0.0
    pred_dcg = (
        pd.Series(pred_gains, dtype=float)
        .groupby(pred_ids)
        .sum()
        .reindex(ideal_dcg.index, fill_value=0.0)
    )
    ideal_vals = ideal_dcg.to_numpy(dtype=float)
    pred_vals = pred_dcg.to_numpy(dtype=float)
    # Divide only where the ideal DCG is positive; other searches stay at 0.
    ndcg = np.divide(pred_vals, ideal_vals, out=np.zeros_like(ideal_vals), where=ideal_vals > 0)
    return float(ndcg.mean())

In [48]:
pred_lgbm = constructs_predictions(gbm, X_test, ideal_df=test_ideal)

# Random-ranking baseline. The generator is seeded so the reference NDCG is the
# same on every run of the notebook.
baseline_rng = np.random.default_rng(7)
pred_random = construct_pred_ideal(X_test, test_ideal, baseline_rng.random(len(X_test)))

In [49]:
# Score the model ranking and the random baseline against the ideal ordering.
ndcg_lgbm = calc_NDCG(test_ideal, pred_lgbm)
ndcg_random = calc_NDCG(test_ideal, pred_random)
print(f"LGBM: {ndcg_lgbm},\nRandom: {ndcg_random}")

LGBM: 0.35683031906854773,
Random: 0.15002041510166886
