<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import seaborn as sb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn import preprocessing
import timeit

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

In [2]:
# # Load data
df_train = pd.read_parquet('../data/training_set.parquet', engine = 'auto')
#df_train = df_train[df_train['srch_id'] < 10000]
df_test = pd.read_parquet('../data/test_set.parquet', engine = 'auto')

<h1>Data prep<h1\>

In [3]:
def make_score(row):
    if row['booking_bool'] == 1:
        return 5
    elif row['click_bool'] == 1:
        return 1
    return 0

# Add features for hour, day and month.
def date_time(df):
    df_copy = df.copy()
    df_copy['date_time'] = pd.to_datetime(df_copy['date_time'])
    df_copy['hour'] = df_copy['date_time'].dt.hour
    df_copy['day'] = df_copy['date_time'].dt.day
    df_copy['month'] = df_copy['date_time'].dt.month
    df_copy = df_copy.drop('date_time', axis=1)
    return df_copy

def remove_cols(df, cols):
    return df.drop(cols, axis=1)

def remove_cols_nan(df, limit):
    df_new = df.copy()
    for col in df_new.columns:
        if len(df_new[col]) * limit < df_new[col].isna().sum():
            df_new = df_new.drop(col, axis=1)
    return df_new

# Add column with a ranking for each property in a search based on another column.
def create_rank_feature(df, col):
    df_new = df.copy()
    df_new['rank_' + str(col)] = df.groupby('srch_id')[col].rank(ascending=False)
    return df_new

def add_normalized_column(df, col, group):
    df_new = df.copy()
    df_new['norm_' + str(col) + "_" + str(group)] = (
        (df_new[col] - df_new.groupby(group)[col].transform('mean')) 
        / df_new.groupby(group)[col].transform('std')
    )
    return df_new

def prep_data(df, target_cols, test=False):
    df_new = df.copy()
    if not test:
        df_new['score'] = df_new.apply(lambda row: make_score(row), axis=1)
        df_new = df_new.drop(target_cols, axis=1)
    df_new = date_time(df_new)
    #df_new = remove_cols_nan(df_new, 0.8)
    
    # difference features
    df_new['usd_diff'] = abs(df_new['visitor_hist_adr_usd'] - df_new['price_usd'])
    df_new['star_diff'] = abs(df_new['visitor_hist_starrating'] - df_new['prop_starrating'])
    
    # ranking features
    df_new = create_rank_feature(df_new, 'price_usd')
    df_new = create_rank_feature(df_new, 'prop_starrating')
    df_new = create_rank_feature(df_new, 'prop_review_score')
    df_new = create_rank_feature(df_new, 'prop_location_score1')
    df_new = create_rank_feature(df_new, 'prop_location_score2')
    
    df_new.fillna(-1, inplace=True)
    
    df_new = add_normalized_column(df_new, 'price_usd', 'srch_id')
    df_new = add_normalized_column(df_new, 'prop_starrating', 'srch_id')
    df_new = add_normalized_column(df_new, 'prop_review_score', 'srch_id')
    df_new = add_normalized_column(df_new, 'prop_location_score1', 'srch_id')
    df_new = add_normalized_column(df_new, 'prop_location_score2', 'srch_id')

    df_new = add_normalized_column(df_new, 'price_usd', 'prop_id')
    df_new = add_normalized_column(df_new, 'prop_starrating', 'prop_id')
    df_new = add_normalized_column(df_new, 'prop_review_score', 'prop_id')
    df_new = add_normalized_column(df_new, 'prop_location_score1', 'prop_id')
    df_new = add_normalized_column(df_new, 'prop_location_score2', 'prop_id')
    
    df_new = add_normalized_column(df_new, 'price_usd', 'prop_country_id')
    df_new = add_normalized_column(df_new, 'prop_starrating', 'prop_country_id')
    df_new = add_normalized_column(df_new, 'prop_review_score', 'prop_country_id')
    df_new = add_normalized_column(df_new, 'prop_location_score1', 'prop_country_id')
    df_new = add_normalized_column(df_new, 'prop_location_score2', 'prop_country_id')
    
    return df_new

In [None]:
target_cols = ['click_bool', 'booking_bool', 'gross_bookings_usd', 'position']
#df_new["norm_" + str(group) + str(col)] = df.groupby(group).col(df.col-g.transform('min')) / g.transform(np.ptp)

df_train = prep_data(df_train, target_cols, False)
df_test = prep_data(df_test, target_cols, True)

In [None]:
#df_new[['price_usd', 'norm_price_usdsrch_id']].loc[df_new['srch_id'] == 1]

<h1>Data split<h1\>

In [None]:
from sklearn.model_selection import GroupShuffleSplit

splitter = GroupShuffleSplit(test_size=0.2, n_splits=2, random_state = 7)
split = splitter.split(df_train, groups=df_train['srch_id'])
train_inds, test_inds = next(split)

df_ideal = df_train.iloc[test_inds].copy().sort_values(by=['srch_id', 'score'], ascending=[True, False], inplace=False)

X = df_train.drop(['score'], axis=1)
y = df_train['score']

X_train, X_test, y_train, y_test, test_ideal = X.iloc[train_inds], X.iloc[test_inds], y.iloc[train_inds], y.iloc[test_inds], df_ideal

train_groups = X_train.groupby('srch_id').size().to_frame('size')['size'].to_numpy()

test_groups = X_test.groupby('srch_id').size().to_frame('size')['size'].to_numpy()


<h1>Training <h1\>

In [None]:
params = {
    "objective": "lambdarank",
    "metric":"ndcg",
    'n_estimators': 498, 
    'max_depth': 7, 
    'learning_rate': 0.04938250379207737, 
    'subsample': 0.5098019827512731, 
    'colsample_bytree': 0.5433556425106324, 
    'gamma': 0.33103514405053813, 
    'reg_alpha': 0.0030927739265164565, 
    'reg_lambda': 0.0005679745733624298
}

gbm = lgb.LGBMRanker(**params)



In [None]:
gbm.fit(X_train, y_train, 
        group=train_groups, eval_set=[(X_test, y_test)], 
        eval_group=[test_groups])






In [None]:
# # Optimize LGBM with optuna
# import optuna
# from functools import partial

# def objective(trial, X_train, y_train, X_test, test_ideal):
#     y_train_xgb = y_train.astype(int)
#     y_train_xgb[y_train == 5] = 2

#     params = {
#         "objective": "lambdarank",
#         "metric":"ndcg",
#         "random_state": 42,
#         "n_estimators": trial.suggest_int("n_estimators", 50, 500),
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#         "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
#         "subsample": trial.suggest_float("subsample", 0.5, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
#         "gamma": trial.suggest_float("gamma", 0, 1),
#         "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 1e-1, log=True),
#         "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 1e-1, log=True),
#     }

#     gbm = lgb.LGBMRanker(**params)
#     gbm.fit(X_train, y_train, group=train_groups, eval_set=[(X_test, y_test)], eval_group=[test_groups])

#     pred_lgbm = constructs_predictions(gbm, X_test, ideal_df=test_ideal)
#     ndcg = calc_NDCG(test_ideal, pred_lgbm)

#     return ndcg

# print("Training LGBM")

# # Wrap the objective function with the input data
# objective_with_data = partial(objective, X_train=X_train, y_train=y_train, X_test=X_test, test_ideal=test_ideal)

# # Create an Optuna study and optimize the objective function
# study = optuna.create_study(direction="maximize")
# study.optimize(objective_with_data, n_trials=20)



In [None]:
# best_params = study.best_params
# print(best_params)

In [None]:
# best_params = study.best_params
# gbm = lgb.LGBMRanker(objective="lambdarank",metric="ndcg", **best_params)
# gbm.fit(X_train, y_train, group=train_groups, eval_set=[(X_test, y_test)], eval_group=[test_groups])

<h1> Validation <h1\>

In [None]:
def construct_pred_ideal(df_in, df_ideal, y_pred):
    df = df_in.copy()
    df['pred_grades'] = y_pred
    df = df.sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False], inplace=False)

    # Merge grades from ideal on srch_id and prop_id
    df = df.merge(df_ideal, on=['srch_id', 'prop_id'], how='left')

    # Return srch_id, prop_id and pred_grades
    return df[['srch_id', 'prop_id', 'pred_grades', 'score']]

def construct_pred_submission(df_in, y_pred):
    df = df_in.copy()
    df['pred_grades'] = y_pred
    df = df.sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False], inplace=False)

    # Return srch_id, prop_id and pred_grades
    return df[['srch_id', 'prop_id']]

def constructs_predictions(model, data, ideal_df = None):
    y_pred = model.predict(data)

    if ideal_df is not None:
        pred_df = construct_pred_ideal(data, test_ideal, y_pred)
    else:
        pred_df = construct_pred_submission(data, y_pred)
    return pred_df

def calc_NDCG(df_ideal, df_pred, k = 5):
    # Group by 5
    df_ideal = df_ideal.groupby('srch_id').head(k)
    df_pred = df_pred.groupby('srch_id').head(k)

    assert df_ideal.shape[0] % k == 0
    assert df_pred.shape[0] % k == 0
    
    # Get grades matrices
    ideal_grades = df_ideal['score'].values.reshape(int(df_ideal.shape[0] / k), k)
    pred_grades = df_pred['score'].values.reshape(int(df_pred.shape[0] / k), k)

    discount_vec = [1/np.log2(i+2) for i in range(k)]

    # Calculate NDCG
    NDCG = (pred_grades @ discount_vec).sum() / (ideal_grades @ discount_vec).sum()

    return NDCG

In [None]:
pred_lgbm = constructs_predictions(gbm, X_test, ideal_df=test_ideal)
pred_random = construct_pred_ideal(X_test, test_ideal, np.random.rand(len(X_test)))
pred_lgbm_submission = constructs_predictions(gbm, df_test)

Highest score: 0.40449893782893503

In [None]:
print(f"LGBM: {calc_NDCG(test_ideal, pred_lgbm)},\nRandom: {calc_NDCG(test_ideal, pred_random)}")

In [None]:
pred_lgbm_submission.to_csv('../data/submission_LGBM.csv', index=False)