<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import seaborn as sb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn import preprocessing
import timeit

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

In [2]:
# # Load data
df_train = pd.read_parquet('../data/training_set.parquet', engine = 'auto')
#df_train = df_train[df_train['srch_id'] < 10000]
df_test = pd.read_parquet('../data/test_set.parquet', engine = 'auto')

<h1>Data prep<h1\>

In [3]:
def make_score(row):
    if row['booking_bool'] == 1:
        return 5
    elif row['click_bool'] == 1:
        return 1
    return 0

# Add features for hour, day and month.
def date_time(df):
    df_copy = df.copy()
    df_copy['date_time'] = pd.to_datetime(df_copy['date_time'])
    df_copy['hour'] = df_copy['date_time'].dt.hour
    df_copy['day'] = df_copy['date_time'].dt.day
    df_copy['month'] = df_copy['date_time'].dt.month
    df_copy = df_copy.drop('date_time', axis=1)
    return df_copy

def remove_cols(df, cols):
    return df.drop(cols, axis=1)

def remove_cols_nan(df, limit):
    df_new = df.copy()
    for col in df_new.columns:
        if len(df_new[col]) * limit < df_new[col].isna().sum():
            df_new = df_new.drop(col, axis=1)
    return df_new

# Add column with a ranking for each property in a search based on another column.
def create_rank_feature(df, col):
    df_new = df.copy()
    df_new['rank_' + str(col)] = df.groupby('srch_id')[col].rank(ascending=False)
    return df_new

# adds a normalised version of a column based on a chosen grouping.
def add_normalized_column(df, col, group):
    df_new = df.copy()
    df_new['norm_' + str(col) + "_" + str(group)] = (
        (df_new[col] - df_new.groupby(group)[col].transform('mean')) 
        / df_new.groupby(group)[col].transform('std')
    )
    return df_new

def prep_data(df, target_cols, test=False):
    df_new = df.copy()
    if not test:
        df_new['score'] = df_new.apply(lambda row: make_score(row), axis=1)
        df_new = df_new.drop(target_cols, axis=1)
    df_new = date_time(df_new)
    #df_new = remove_cols_nan(df_new, 0.8)
    
    # difference features
    df_new['usd_diff'] = abs(df_new['visitor_hist_adr_usd'] - df_new['price_usd'])
    df_new['star_diff'] = abs(df_new['visitor_hist_starrating'] - df_new['prop_starrating'])
    
    # ranking features
    df_new = create_rank_feature(df_new, 'price_usd')
    df_new = create_rank_feature(df_new, 'prop_starrating')
    df_new = create_rank_feature(df_new, 'prop_review_score')
    df_new = create_rank_feature(df_new, 'prop_location_score1')
    df_new = create_rank_feature(df_new, 'prop_location_score2')
    
    df_new.fillna(-1, inplace=True)
    
    # Normalisation is very slow !!!
    df_new = add_normalized_column(df_new, 'price_usd', 'srch_id')
    df_new = add_normalized_column(df_new, 'prop_starrating', 'srch_id')
    df_new = add_normalized_column(df_new, 'prop_review_score', 'srch_id')
    df_new = add_normalized_column(df_new, 'prop_location_score1', 'srch_id')
    df_new = add_normalized_column(df_new, 'prop_location_score2', 'srch_id')

    df_new = add_normalized_column(df_new, 'price_usd', 'prop_id')
    df_new = add_normalized_column(df_new, 'prop_starrating', 'prop_id')
    df_new = add_normalized_column(df_new, 'prop_review_score', 'prop_id')
    df_new = add_normalized_column(df_new, 'prop_location_score1', 'prop_id')
    df_new = add_normalized_column(df_new, 'prop_location_score2', 'prop_id')
    
    df_new = add_normalized_column(df_new, 'price_usd', 'prop_country_id')
    df_new = add_normalized_column(df_new, 'prop_starrating', 'prop_country_id')
    df_new = add_normalized_column(df_new, 'prop_review_score', 'prop_country_id')
    df_new = add_normalized_column(df_new, 'prop_location_score1', 'prop_country_id')
    df_new = add_normalized_column(df_new, 'prop_location_score2', 'prop_country_id')
    
    df_new = add_normalized_column(df_new, 'price_usd', 'srch_destination_id')
    df_new = add_normalized_column(df_new, 'prop_starrating', 'srch_destination_id')
    df_new = add_normalized_column(df_new, 'prop_review_score', 'srch_destination_id')
    df_new = add_normalized_column(df_new, 'prop_location_score1', 'srch_destination_id')
    df_new = add_normalized_column(df_new, 'prop_location_score2', 'srch_destination_id')
    
    df_new = add_normalized_column(df_new, 'price_usd', 'srch_length_of_stay')
    df_new = add_normalized_column(df_new, 'prop_starrating', 'srch_length_of_stay')
    df_new = add_normalized_column(df_new, 'prop_review_score', 'srch_length_of_stay')
    df_new = add_normalized_column(df_new, 'prop_location_score1', 'srch_length_of_stay')
    df_new = add_normalized_column(df_new, 'prop_location_score2', 'srch_length_of_stay')
    
    df_new = add_normalized_column(df_new, 'price_usd', 'srch_booking_window')
    df_new = add_normalized_column(df_new, 'prop_starrating', 'srch_booking_window')
    df_new = add_normalized_column(df_new, 'prop_review_score', 'srch_booking_window')
    df_new = add_normalized_column(df_new, 'prop_location_score1', 'srch_booking_window')
    df_new = add_normalized_column(df_new, 'prop_location_score2', 'srch_booking_window')
    
    return df_new

In [4]:
target_cols = ['click_bool', 'booking_bool', 'gross_bookings_usd', 'position']
#df_new["norm_" + str(group) + str(col)] = df.groupby(group).col(df.col-g.transform('min')) / g.transform(np.ptp)

print("")
df_train = prep_data(df_train, target_cols, False)
df_test = prep_data(df_test, target_cols, True)

In [5]:
#df_new[['price_usd', 'norm_price_usdsrch_id']].loc[df_new['srch_id'] == 1]

<h1>Data split<h1\>

In [6]:
from sklearn.model_selection import GroupShuffleSplit

splitter = GroupShuffleSplit(test_size=0.2, n_splits=2, random_state = 7)
split = splitter.split(df_train, groups=df_train['srch_id'])
train_inds, test_inds = next(split)

df_ideal = df_train.iloc[test_inds].copy().sort_values(by=['srch_id', 'score'], ascending=[True, False], inplace=False)

X = df_train.drop(['score'], axis=1)
y = df_train['score']

X_train, X_test, y_train, y_test, test_ideal = X.iloc[train_inds], X.iloc[test_inds], y.iloc[train_inds], y.iloc[test_inds], df_ideal

train_groups = X_train.groupby('srch_id').size().to_frame('size')['size'].to_numpy()

test_groups = X_test.groupby('srch_id').size().to_frame('size')['size'].to_numpy()


<h1>Training <h1\>

In [7]:
params = {
    "objective": "lambdarank",
    "metric":"ndcg",
    'n_estimators': 498, 
    'max_depth': 7, 
    'learning_rate': 0.04938250379207737, 
    'subsample': 0.5098019827512731, 
    'colsample_bytree': 0.5433556425106324, 
    'gamma': 0.33103514405053813, 
    'reg_alpha': 0.0030927739265164565, 
    'reg_lambda': 0.0005679745733624298
}

gbm = lgb.LGBMRanker(**params)



In [8]:
gbm.fit(X_train, y_train, 
        group=train_groups, eval_set=[(X_test, y_test)], 
        eval_group=[test_groups])






[1]	valid_0's ndcg@1: 0.127975	valid_0's ndcg@2: 0.188662	valid_0's ndcg@3: 0.228738	valid_0's ndcg@4: 0.260907	valid_0's ndcg@5: 0.285288
[2]	valid_0's ndcg@1: 0.154076	valid_0's ndcg@2: 0.219122	valid_0's ndcg@3: 0.261174	valid_0's ndcg@4: 0.292745	valid_0's ndcg@5: 0.317599
[3]	valid_0's ndcg@1: 0.16704	valid_0's ndcg@2: 0.235308	valid_0's ndcg@3: 0.278814	valid_0's ndcg@4: 0.310975	valid_0's ndcg@5: 0.33601
[4]	valid_0's ndcg@1: 0.177126	valid_0's ndcg@2: 0.245096	valid_0's ndcg@3: 0.288788	valid_0's ndcg@4: 0.32161	valid_0's ndcg@5: 0.346388
[5]	valid_0's ndcg@1: 0.181856	valid_0's ndcg@2: 0.250536	valid_0's ndcg@3: 0.294339	valid_0's ndcg@4: 0.326282	valid_0's ndcg@5: 0.350205
[6]	valid_0's ndcg@1: 0.183387	valid_0's ndcg@2: 0.254242	valid_0's ndcg@3: 0.297848	valid_0's ndcg@4: 0.329817	valid_0's ndcg@5: 0.353981
[7]	valid_0's ndcg@1: 0.184682	valid_0's ndcg@2: 0.256165	valid_0's ndcg@3: 0.300272	valid_0's ndcg@4: 0.332195	valid_0's ndcg@5: 0.356204
[8]	valid_0's ndcg@1: 0.186238

[60]	valid_0's ndcg@1: 0.197079	valid_0's ndcg@2: 0.271508	valid_0's ndcg@3: 0.317605	valid_0's ndcg@4: 0.350076	valid_0's ndcg@5: 0.373948
[61]	valid_0's ndcg@1: 0.197254	valid_0's ndcg@2: 0.271645	valid_0's ndcg@3: 0.318028	valid_0's ndcg@4: 0.35034	valid_0's ndcg@5: 0.374212
[62]	valid_0's ndcg@1: 0.197454	valid_0's ndcg@2: 0.271938	valid_0's ndcg@3: 0.318007	valid_0's ndcg@4: 0.350464	valid_0's ndcg@5: 0.374546
[63]	valid_0's ndcg@1: 0.19733	valid_0's ndcg@2: 0.272138	valid_0's ndcg@3: 0.318007	valid_0's ndcg@4: 0.350571	valid_0's ndcg@5: 0.374788
[64]	valid_0's ndcg@1: 0.19768	valid_0's ndcg@2: 0.272056	valid_0's ndcg@3: 0.318227	valid_0's ndcg@4: 0.350557	valid_0's ndcg@5: 0.374664
[65]	valid_0's ndcg@1: 0.197258	valid_0's ndcg@2: 0.271963	valid_0's ndcg@3: 0.318298	valid_0's ndcg@4: 0.350641	valid_0's ndcg@5: 0.37465
[66]	valid_0's ndcg@1: 0.197634	valid_0's ndcg@2: 0.271831	valid_0's ndcg@3: 0.318258	valid_0's ndcg@4: 0.350524	valid_0's ndcg@5: 0.374684
[67]	valid_0's ndcg@1: 0

[119]	valid_0's ndcg@1: 0.199961	valid_0's ndcg@2: 0.276551	valid_0's ndcg@3: 0.321654	valid_0's ndcg@4: 0.355153	valid_0's ndcg@5: 0.37914
[120]	valid_0's ndcg@1: 0.200137	valid_0's ndcg@2: 0.276656	valid_0's ndcg@3: 0.321786	valid_0's ndcg@4: 0.355375	valid_0's ndcg@5: 0.379252
[121]	valid_0's ndcg@1: 0.200263	valid_0's ndcg@2: 0.276592	valid_0's ndcg@3: 0.321972	valid_0's ndcg@4: 0.355627	valid_0's ndcg@5: 0.37933
[122]	valid_0's ndcg@1: 0.200488	valid_0's ndcg@2: 0.276907	valid_0's ndcg@3: 0.322151	valid_0's ndcg@4: 0.355838	valid_0's ndcg@5: 0.37943
[123]	valid_0's ndcg@1: 0.200436	valid_0's ndcg@2: 0.277095	valid_0's ndcg@3: 0.322186	valid_0's ndcg@4: 0.355655	valid_0's ndcg@5: 0.379521
[124]	valid_0's ndcg@1: 0.200535	valid_0's ndcg@2: 0.277211	valid_0's ndcg@3: 0.322406	valid_0's ndcg@4: 0.35558	valid_0's ndcg@5: 0.379579
[125]	valid_0's ndcg@1: 0.200313	valid_0's ndcg@2: 0.27721	valid_0's ndcg@3: 0.322336	valid_0's ndcg@4: 0.355615	valid_0's ndcg@5: 0.37957
[126]	valid_0's ndc

[178]	valid_0's ndcg@1: 0.202065	valid_0's ndcg@2: 0.278185	valid_0's ndcg@3: 0.324506	valid_0's ndcg@4: 0.357997	valid_0's ndcg@5: 0.381673
[179]	valid_0's ndcg@1: 0.202017	valid_0's ndcg@2: 0.278156	valid_0's ndcg@3: 0.324489	valid_0's ndcg@4: 0.357925	valid_0's ndcg@5: 0.381579
[180]	valid_0's ndcg@1: 0.202115	valid_0's ndcg@2: 0.278298	valid_0's ndcg@3: 0.32477	valid_0's ndcg@4: 0.357921	valid_0's ndcg@5: 0.381691
[181]	valid_0's ndcg@1: 0.20209	valid_0's ndcg@2: 0.278268	valid_0's ndcg@3: 0.324748	valid_0's ndcg@4: 0.357898	valid_0's ndcg@5: 0.381736
[182]	valid_0's ndcg@1: 0.20214	valid_0's ndcg@2: 0.278403	valid_0's ndcg@3: 0.324734	valid_0's ndcg@4: 0.357865	valid_0's ndcg@5: 0.381775
[183]	valid_0's ndcg@1: 0.202365	valid_0's ndcg@2: 0.27855	valid_0's ndcg@3: 0.324965	valid_0's ndcg@4: 0.357957	valid_0's ndcg@5: 0.381869
[184]	valid_0's ndcg@1: 0.202566	valid_0's ndcg@2: 0.278665	valid_0's ndcg@3: 0.325082	valid_0's ndcg@4: 0.358126	valid_0's ndcg@5: 0.381903
[185]	valid_0's n

[237]	valid_0's ndcg@1: 0.203814	valid_0's ndcg@2: 0.28054	valid_0's ndcg@3: 0.326683	valid_0's ndcg@4: 0.359963	valid_0's ndcg@5: 0.383914
[238]	valid_0's ndcg@1: 0.203689	valid_0's ndcg@2: 0.280558	valid_0's ndcg@3: 0.326705	valid_0's ndcg@4: 0.359929	valid_0's ndcg@5: 0.383906
[239]	valid_0's ndcg@1: 0.203514	valid_0's ndcg@2: 0.280449	valid_0's ndcg@3: 0.326392	valid_0's ndcg@4: 0.359729	valid_0's ndcg@5: 0.383744
[240]	valid_0's ndcg@1: 0.203363	valid_0's ndcg@2: 0.280485	valid_0's ndcg@3: 0.326502	valid_0's ndcg@4: 0.359738	valid_0's ndcg@5: 0.383716
[241]	valid_0's ndcg@1: 0.203263	valid_0's ndcg@2: 0.280496	valid_0's ndcg@3: 0.326568	valid_0's ndcg@4: 0.35976	valid_0's ndcg@5: 0.383786
[242]	valid_0's ndcg@1: 0.203065	valid_0's ndcg@2: 0.280328	valid_0's ndcg@3: 0.326481	valid_0's ndcg@4: 0.35968	valid_0's ndcg@5: 0.383693
[243]	valid_0's ndcg@1: 0.202989	valid_0's ndcg@2: 0.280181	valid_0's ndcg@3: 0.32649	valid_0's ndcg@4: 0.359621	valid_0's ndcg@5: 0.383632
[244]	valid_0's n

[296]	valid_0's ndcg@1: 0.204464	valid_0's ndcg@2: 0.281444	valid_0's ndcg@3: 0.327777	valid_0's ndcg@4: 0.361402	valid_0's ndcg@5: 0.38512
[297]	valid_0's ndcg@1: 0.204513	valid_0's ndcg@2: 0.281695	valid_0's ndcg@3: 0.327872	valid_0's ndcg@4: 0.361455	valid_0's ndcg@5: 0.3852
[298]	valid_0's ndcg@1: 0.204587	valid_0's ndcg@2: 0.281669	valid_0's ndcg@3: 0.327915	valid_0's ndcg@4: 0.361454	valid_0's ndcg@5: 0.385239
[299]	valid_0's ndcg@1: 0.204564	valid_0's ndcg@2: 0.281739	valid_0's ndcg@3: 0.327912	valid_0's ndcg@4: 0.361462	valid_0's ndcg@5: 0.385281
[300]	valid_0's ndcg@1: 0.204539	valid_0's ndcg@2: 0.2817	valid_0's ndcg@3: 0.327907	valid_0's ndcg@4: 0.361454	valid_0's ndcg@5: 0.385252
[301]	valid_0's ndcg@1: 0.204765	valid_0's ndcg@2: 0.281606	valid_0's ndcg@3: 0.327928	valid_0's ndcg@4: 0.361494	valid_0's ndcg@5: 0.385293
[302]	valid_0's ndcg@1: 0.20479	valid_0's ndcg@2: 0.281637	valid_0's ndcg@3: 0.328042	valid_0's ndcg@4: 0.361468	valid_0's ndcg@5: 0.385445
[303]	valid_0's ndc

[355]	valid_0's ndcg@1: 0.205169	valid_0's ndcg@2: 0.281991	valid_0's ndcg@3: 0.328945	valid_0's ndcg@4: 0.362176	valid_0's ndcg@5: 0.386248
[356]	valid_0's ndcg@1: 0.205219	valid_0's ndcg@2: 0.282041	valid_0's ndcg@3: 0.329036	valid_0's ndcg@4: 0.362264	valid_0's ndcg@5: 0.386276
[357]	valid_0's ndcg@1: 0.205319	valid_0's ndcg@2: 0.282	valid_0's ndcg@3: 0.329101	valid_0's ndcg@4: 0.362242	valid_0's ndcg@5: 0.386315
[358]	valid_0's ndcg@1: 0.205244	valid_0's ndcg@2: 0.282004	valid_0's ndcg@3: 0.32905	valid_0's ndcg@4: 0.362271	valid_0's ndcg@5: 0.386391
[359]	valid_0's ndcg@1: 0.205169	valid_0's ndcg@2: 0.28196	valid_0's ndcg@3: 0.328914	valid_0's ndcg@4: 0.362262	valid_0's ndcg@5: 0.386336
[360]	valid_0's ndcg@1: 0.205245	valid_0's ndcg@2: 0.281972	valid_0's ndcg@3: 0.328897	valid_0's ndcg@4: 0.362259	valid_0's ndcg@5: 0.386372
[361]	valid_0's ndcg@1: 0.205169	valid_0's ndcg@2: 0.281927	valid_0's ndcg@3: 0.32889	valid_0's ndcg@4: 0.362247	valid_0's ndcg@5: 0.386411
[362]	valid_0's ndc

[414]	valid_0's ndcg@1: 0.205718	valid_0's ndcg@2: 0.282604	valid_0's ndcg@3: 0.329269	valid_0's ndcg@4: 0.363075	valid_0's ndcg@5: 0.387304
[415]	valid_0's ndcg@1: 0.205719	valid_0's ndcg@2: 0.282537	valid_0's ndcg@3: 0.329138	valid_0's ndcg@4: 0.363019	valid_0's ndcg@5: 0.387317
[416]	valid_0's ndcg@1: 0.205719	valid_0's ndcg@2: 0.282637	valid_0's ndcg@3: 0.329358	valid_0's ndcg@4: 0.363134	valid_0's ndcg@5: 0.387402
[417]	valid_0's ndcg@1: 0.205696	valid_0's ndcg@2: 0.282676	valid_0's ndcg@3: 0.329392	valid_0's ndcg@4: 0.363103	valid_0's ndcg@5: 0.38743
[418]	valid_0's ndcg@1: 0.20587	valid_0's ndcg@2: 0.282677	valid_0's ndcg@3: 0.329375	valid_0's ndcg@4: 0.363149	valid_0's ndcg@5: 0.387485
[419]	valid_0's ndcg@1: 0.205969	valid_0's ndcg@2: 0.282776	valid_0's ndcg@3: 0.329449	valid_0's ndcg@4: 0.363202	valid_0's ndcg@5: 0.38757
[420]	valid_0's ndcg@1: 0.205944	valid_0's ndcg@2: 0.282767	valid_0's ndcg@3: 0.32949	valid_0's ndcg@4: 0.363218	valid_0's ndcg@5: 0.387546
[421]	valid_0's n

[473]	valid_0's ndcg@1: 0.20587	valid_0's ndcg@2: 0.283808	valid_0's ndcg@3: 0.330165	valid_0's ndcg@4: 0.363852	valid_0's ndcg@5: 0.387631
[474]	valid_0's ndcg@1: 0.205795	valid_0's ndcg@2: 0.283765	valid_0's ndcg@3: 0.330121	valid_0's ndcg@4: 0.363841	valid_0's ndcg@5: 0.38758
[475]	valid_0's ndcg@1: 0.20572	valid_0's ndcg@2: 0.283698	valid_0's ndcg@3: 0.33016	valid_0's ndcg@4: 0.363809	valid_0's ndcg@5: 0.38759
[476]	valid_0's ndcg@1: 0.205846	valid_0's ndcg@2: 0.283709	valid_0's ndcg@3: 0.330225	valid_0's ndcg@4: 0.36387	valid_0's ndcg@5: 0.387638
[477]	valid_0's ndcg@1: 0.205821	valid_0's ndcg@2: 0.283759	valid_0's ndcg@3: 0.330299	valid_0's ndcg@4: 0.363898	valid_0's ndcg@5: 0.387768
[478]	valid_0's ndcg@1: 0.205796	valid_0's ndcg@2: 0.283766	valid_0's ndcg@3: 0.330306	valid_0's ndcg@4: 0.363915	valid_0's ndcg@5: 0.387776
[479]	valid_0's ndcg@1: 0.205846	valid_0's ndcg@2: 0.283775	valid_0's ndcg@3: 0.330207	valid_0's ndcg@4: 0.363936	valid_0's ndcg@5: 0.387738
[480]	valid_0's ndc

In [9]:
# # Optimize LGBM with optuna
# import optuna
# from functools import partial

# def objective(trial, X_train, y_train, X_test, test_ideal):
#     y_train_xgb = y_train.astype(int)
#     y_train_xgb[y_train == 5] = 2

#     params = {
#         "objective": "lambdarank",
#         "metric":"ndcg",
#         "random_state": 42,
#         "n_estimators": trial.suggest_int("n_estimators", 50, 500),
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#         "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
#         "subsample": trial.suggest_float("subsample", 0.5, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
#         "gamma": trial.suggest_float("gamma", 0, 1),
#         "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 1e-1, log=True),
#         "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 1e-1, log=True),
#     }

#     gbm = lgb.LGBMRanker(**params)
#     gbm.fit(X_train, y_train, group=train_groups, eval_set=[(X_test, y_test)], eval_group=[test_groups])

#     pred_lgbm = constructs_predictions(gbm, X_test, ideal_df=test_ideal)
#     ndcg = calc_NDCG(test_ideal, pred_lgbm)

#     return ndcg

# print("Training LGBM")

# # Wrap the objective function with the input data
# objective_with_data = partial(objective, X_train=X_train, y_train=y_train, X_test=X_test, test_ideal=test_ideal)

# # Create an Optuna study and optimize the objective function
# study = optuna.create_study(direction="maximize")
# study.optimize(objective_with_data, n_trials=20)



In [10]:
# best_params = study.best_params
# print(best_params)

In [11]:
# best_params = study.best_params
# gbm = lgb.LGBMRanker(objective="lambdarank",metric="ndcg", **best_params)
# gbm.fit(X_train, y_train, group=train_groups, eval_set=[(X_test, y_test)], eval_group=[test_groups])

<h1> Validation <h1\>

In [12]:
def construct_pred_ideal(df_in, df_ideal, y_pred):
    df = df_in.copy()
    df['pred_grades'] = y_pred
    df = df.sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False], inplace=False)

    # Merge grades from ideal on srch_id and prop_id
    df = df.merge(df_ideal, on=['srch_id', 'prop_id'], how='left')

    # Return srch_id, prop_id and pred_grades
    return df[['srch_id', 'prop_id', 'pred_grades', 'score']]

def construct_pred_submission(df_in, y_pred):
    df = df_in.copy()
    df['pred_grades'] = y_pred
    df = df.sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False], inplace=False)

    # Return srch_id, prop_id and pred_grades
    return df[['srch_id', 'prop_id']]

def constructs_predictions(model, data, ideal_df = None):
    y_pred = model.predict(data)

    if ideal_df is not None:
        pred_df = construct_pred_ideal(data, test_ideal, y_pred)
    else:
        pred_df = construct_pred_submission(data, y_pred)
    return pred_df

def calc_NDCG(df_ideal, df_pred, k = 5):
    # Group by 5
    df_ideal = df_ideal.groupby('srch_id').head(k)
    df_pred = df_pred.groupby('srch_id').head(k)

    assert df_ideal.shape[0] % k == 0
    assert df_pred.shape[0] % k == 0
    
    # Get grades matrices
    ideal_grades = df_ideal['score'].values.reshape(int(df_ideal.shape[0] / k), k)
    pred_grades = df_pred['score'].values.reshape(int(df_pred.shape[0] / k), k)

    discount_vec = [1/np.log2(i+2) for i in range(k)]

    # Calculate NDCG
    NDCG = (pred_grades @ discount_vec).sum() / (ideal_grades @ discount_vec).sum()

    return NDCG

In [None]:
pred_lgbm = constructs_predictions(gbm, X_test, ideal_df=test_ideal)
pred_random = construct_pred_ideal(X_test, test_ideal, np.random.rand(len(X_test)))
pred_lgbm_submission = constructs_predictions(gbm, df_test)

Highest score: 0.409492660282207

In [None]:
print(f"LGBM: {calc_NDCG(test_ideal, pred_lgbm)},\nRandom: {calc_NDCG(test_ideal, pred_random)}")

In [None]:
pred_lgbm_submission.to_csv('../data/submission_LGBM.csv', index=False)