<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import seaborn as sb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn import preprocessing
import timeit

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

In [2]:
# # Load data
df_train = pd.read_parquet('../data/training_set.parquet', engine = 'auto')
#df_train = df_train[df_train['srch_id'] < 10000]
df_test = pd.read_parquet('../data/test_set.parquet', engine = 'auto')

<h1>Data prep<h1\>

In [3]:
def make_score(row):
    if row['booking_bool'] == 1:
        return 5
    elif row['click_bool'] == 1:
        return 1
    return 0

# Add features for hour, day and month.
def date_time(df):
    df['date_time'] = pd.to_datetime(df['date_time'])
    df['hour'] = df['date_time'].dt.hour
    df['day'] = df['date_time'].dt.day
    df['month'] = df['date_time'].dt.month
    df = df.drop('date_time', axis=1)
    return df

def remove_cols(df, cols):
    return df.drop(cols, axis=1)

def remove_cols_nan(df, limit):
    for col in df.columns:
        if len(df[col]) * limit < df[col].isna().sum():
            df = df.drop(col, axis=1)
    return df

# Add column with a ranking for each property in a search based on another column.
def create_rank_feature(df, col):
    df['rank_' + str(col)] = df.groupby('srch_id')[col].rank(ascending=False)
    return df

# location score 2 has missing values for property on some rows while some rows have a score.
# take average of rows that do have a score. reduces nans from 1090348 to 182213.
def fill_location_score_2(df):
    df['prop_location_score2'] = df.groupby('prop_id')['prop_location_score2'].transform(lambda x: x.fillna(x.mean()))
    return df

# adds a normalised version of a column based on a chosen grouping.
def add_normalized_column(df, col, group):
    df['norm_' + str(col) + "_" + str(group)] = (
        (df[col] - df.groupby(group)[col].transform('mean')) 
        / df.groupby(group)[col].transform('std')
    )
    return df

def prep_data(df, target_cols, test=False):
    df_new = df.copy()
    
    if not test:
        print('add score')
        df_new['score'] = df_new.apply(lambda row: make_score(row), axis=1)
        df_new = df_new.drop(target_cols, axis=1)

        print('add hour, day, month')
        
    df_new = date_time(df_new)
    #df_new = remove_cols_nan(df_new, 0.9)
    
    # difference features assumes that users purchase in same category as history.
    print('add difference features')
    df_new['usd_diff'] = df_new['visitor_hist_adr_usd'] - df_new['price_usd']
    df_new['star_diff'] = df_new['visitor_hist_starrating'] - df_new['prop_starrating']    
    df_new['log_price_diff'] = df_new['prop_log_historical_price'] - np.log(df_new['price_usd'])
    
    # count variables 
    # theory: A property that is in more searches is purchased more often.
    df_new['prop_id_count'] = df.groupby('prop_id')['prop_id'].transform('count')
    
    df_new['srch_destination_id_count'] = df.groupby('srch_destination_id')['srch_destination_id'].transform('count')

    
    # ranking features
    print('add rank features')
    df_new = create_rank_feature(df_new, 'price_usd')
    df_new = create_rank_feature(df_new, 'prop_starrating')
    df_new = create_rank_feature(df_new, 'prop_review_score')
    df_new = create_rank_feature(df_new, 'prop_location_score1')
    df_new = create_rank_feature(df_new, 'prop_location_score2')
    
    print("fill ls2")
    df_new = fill_location_score_2(df_new)
    
    # Fill distance with mean.
    df['orig_destination_distance'].fillna(df['orig_destination_distance'].mean(),inplace=True)
    
    
    print("remove nan")
    # Fill rest of nan values with lowest.
    for i in df_new.columns[df_new.isnull().any(axis=0)]:
        df_new[i].fillna(0,inplace=True)
    
    print('add normalised features')
    groups = ['srch_id', 'prop_country_id', 'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window', 'month']
    targets = ['price_usd', 'prop_starrating', 'prop_review_score', 'prop_location_score1', 'prop_location_score2']
    for group in groups:
        for target in targets:
            df_new = add_normalized_column(df_new, target, group)
   
    df_new = add_normalized_column(df_new, 'price_usd', 'prop_id')
    df_new = add_normalized_column(df_new, 'price_usd', 'srch_room_count')
    
    # Normalisation might create nans
    print("remove nan\n")
    for i in df_new.columns[df_new.isnull().any(axis=0)]:
        df_new[i].fillna(0,inplace=True)
        
    for c in categorical_features:
        df_new[c] = df_new[c].astype('category')
    
    return df_new

In [4]:
# 'click_bool', 'booking_bool',
target_cols = ['gross_bookings_usd', 'position']
#df_new["norm_" + str(group) + str(col)] = df.groupby(group).col(df.col-g.transform('min')) / g.transform(np.ptp)
categorical_features = ['hour', 'day', 'month', 'site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_id', 'srch_destination_id']

print("Prepping training data")
df_train = prep_data(df_train, target_cols, False)
print("Prepping test data")
df_test = prep_data(df_test, target_cols, True)

Prepping training data
add score
add hour, day, month
add difference features


  result = getattr(ufunc, method)(*inputs, **kwargs)


add rank features
fill ls2
remove nan
add normalised features
remove nan

Prepping test data
add difference features


  result = getattr(ufunc, method)(*inputs, **kwargs)


add rank features
fill ls2
remove nan
add normalised features
remove nan



In [5]:
df_train

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,booking_bool,score,hour,day,month,usd_diff,star_diff,log_price_diff,prop_id_count,srch_destination_id_count,rank_price_usd,rank_prop_starrating,rank_prop_review_score,rank_prop_location_score1,rank_prop_location_score2,norm_price_usd_srch_id,norm_prop_starrating_srch_id,norm_prop_review_score_srch_id,norm_prop_location_score1_srch_id,norm_prop_location_score2_srch_id,norm_price_usd_prop_country_id,norm_prop_starrating_prop_country_id,norm_prop_review_score_prop_country_id,norm_prop_location_score1_prop_country_id,norm_prop_location_score2_prop_country_id,norm_price_usd_srch_destination_id,norm_prop_starrating_srch_destination_id,norm_prop_review_score_srch_destination_id,norm_prop_location_score1_srch_destination_id,norm_prop_location_score2_srch_destination_id,norm_price_usd_srch_length_of_stay,norm_prop_starrating_srch_length_of_stay,norm_prop_review_score_srch_length_of_stay,norm_prop_location_score1_srch_length_of_stay,norm_prop_location_score2_srch_length_of_stay,norm_price_usd_srch_booking_window,norm_prop_starrating_srch_booking_window,norm_prop_review_score_srch_booking_window,norm_prop_location_score1_srch_booking_window,norm_prop_location_score2_srch_booking_window,norm_price_usd_month,norm_prop_starrating_month,norm_prop_review_score_month,norm_prop_location_score1_month,norm_prop_location_score2_month,norm_price_usd_prop_id,norm_price_usd_srch_room_count
0,1,12,187,0.0,0.0,219,893,3,3.5,1,2.83,0.043800,4.95,104.77,0,23246,1,0,4,0,1,1,0.0,0.00,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,8,4,4,0.0,0.0,0.298233,612,13539,23.5,15.5,18.0,3.5,10.0,-0.611787,-0.093205,0.016094,1.022407,-0.035853,-0.011632,0.023546,-0.458546,0.171391,-0.511530,-0.414131,-0.172030,-0.344264,0.868014,0.117844,-0.010382,-0.059211,-0.263036,0.166135,-0.485319,-0.010413,0.041512,-0.265636,0.289655,-0.457185,-0.014101,-0.151051,-0.227749,-0.011815,-0.497167,-0.786824,-0.009158
1,1,12,187,0.0,0.0,219,10404,4,4.0,1,2.20,0.014900,5.03,170.74,0,23246,1,0,4,0,1,1,0.0,0.00,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,8,4,4,0.0,0.0,-0.110142,583,13539,10.0,5.0,9.5,19.0,19.0,0.072867,1.211671,0.466718,-0.192089,-0.647928,-0.003416,1.045138,0.144823,-0.288586,-0.708271,0.528882,1.057051,0.317057,-0.081617,-0.531846,-0.004627,0.907438,0.224972,-0.273203,-0.686222,-0.007990,1.039353,0.250049,-0.175353,-0.669803,-0.006236,0.794547,0.228380,-0.421392,-0.687283,0.047799,-0.005215
2,1,12,187,0.0,0.0,219,21315,3,4.5,1,2.20,0.024500,4.92,179.80,0,23246,1,0,4,0,1,1,0.0,0.00,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,8,4,4,0.0,0.0,-0.271845,551,13539,9.0,15.5,3.0,19.0,14.0,0.166894,-0.093205,0.917342,-0.192089,-0.444609,-0.002287,0.023546,0.748192,-0.288586,-0.642917,0.658391,-0.172030,0.978377,-0.081617,-0.316032,-0.003837,-0.059211,0.712980,-0.273203,-0.619486,-0.007657,0.041512,0.765734,-0.175353,-0.599176,-0.005156,-0.151051,0.684508,-0.421392,-0.624130,0.032590,-0.004673
3,1,12,187,0.0,0.0,219,27348,2,4.0,1,2.83,0.012500,4.39,602.77,0,23246,1,0,4,0,1,1,0.0,0.00,1,0.0,0.0,0.0,-1.0,0.0,5.0,-1.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,5.0,0,0,0,8,4,4,0.0,0.0,-2.011536,460,13539,1.0,25.0,9.5,3.5,21.0,4.556587,-1.398082,0.466718,1.022407,-0.698758,0.050392,-0.998046,0.144823,0.171391,-0.724609,6.704567,-1.401111,0.317057,0.868014,-0.585800,0.033059,-1.025860,0.224972,0.166135,-0.702906,0.007881,-0.956329,0.250049,0.289655,-0.687460,0.045266,-1.096649,0.228380,-0.011815,-0.703071,1.701209,0.020608
4,1,12,187,0.0,0.0,219,29604,4,3.5,1,2.64,0.124100,4.93,143.58,0,23246,1,0,4,0,1,1,0.0,0.00,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,8,4,4,0.0,0.0,-0.036892,665,13539,13.0,5.0,18.0,10.0,4.0,-0.209007,1.211671,0.016094,0.656130,1.664826,-0.006798,1.045138,-0.458546,0.032668,0.035126,0.140641,1.057051,-0.344264,0.581618,1.923039,-0.006996,0.907438,-0.263036,0.033636,0.072901,-0.008988,1.039353,-0.265636,0.149415,0.133586,-0.009474,0.794547,-0.227749,-0.135339,0.031077,0.013690,-0.006838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,5,219,0.0,0.0,219,77700,3,4.0,1,1.61,0.047100,0.00,118.00,0,16974,1,21,3,0,1,0,0.0,550.92,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,19,30,6,0.0,0.0,-4.770685,47,1246,2.0,3.5,2.5,4.0,3.0,0.583226,0.000000,0.475383,-0.089278,-0.540015,-0.009984,0.023546,0.144823,-0.719359,-0.489064,0.680922,0.946260,0.435114,0.240690,0.137403,-0.009228,-0.059211,0.224972,-0.684646,-0.462378,-0.225636,-0.216092,0.192004,-0.866169,-0.500730,-0.008266,-0.133918,0.242633,-0.794190,-0.468051,-0.789319,-0.008367
4958343,332785,5,219,0.0,0.0,219,88083,3,4.0,1,1.95,0.152000,0.00,89.00,0,16974,1,21,3,0,1,0,0.0,553.14,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,19,30,6,0.0,0.0,-4.488636,38,1246,5.0,3.5,2.5,2.5,1.0,-0.507971,0.000000,0.475383,0.639229,0.092665,-0.013596,0.023546,0.144823,-0.471117,0.225059,-0.039035,0.946260,0.435114,0.677880,2.203887,-0.011757,-0.059211,0.224972,-0.447543,0.266853,-0.382909,-0.216092,0.192004,-0.643544,0.194593,-0.009068,-0.133918,0.242633,-0.571548,0.230297,0.292226,-0.010100
4958344,332785,5,219,0.0,0.0,219,94508,3,3.5,1,1.10,0.016400,0.00,99.00,0,16974,1,21,3,0,1,0,0.0,544.43,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,19,30,6,0.0,0.0,-4.595120,43,1246,4.0,3.5,4.0,5.5,4.0,-0.131696,0.000000,0.118846,-1.182037,-0.725174,-0.012351,0.023546,-0.458546,-1.091722,-0.698059,0.209226,0.946260,-0.062273,-0.415095,-0.467373,-0.010885,-0.059211,-0.263036,-1.040301,-0.675795,-0.328677,-0.216092,-0.292067,-1.200105,-0.704223,-0.008791,-0.133918,-0.204595,-1.128154,-0.672429,-0.756023,-0.009503
4958345,332785,5,219,0.0,0.0,219,128360,3,5.0,1,1.95,0.066200,0.00,139.00,0,16974,1,21,3,0,1,0,0.0,550.38,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,5,19,30,6,0.0,0.0,-4.934474,37,1246,1.0,3.5,1.0,2.5,2.0,1.373404,0.000000,1.188457,0.639229,-0.424817,-0.007369,0.023546,1.351561,-0.471117,-0.359038,1.202271,0.946260,1.429887,0.677880,0.513665,-0.007396,-0.059211,1.200988,-0.447543,-0.329601,-0.111748,-0.216092,1.160148,-0.643544,-0.374127,-0.007685,-0.133918,1.137089,-0.571548,-0.340897,-0.411482,-0.007112


<h1>Data split<h1\>

In [23]:
from sklearn.model_selection import GroupShuffleSplit

splitter = GroupShuffleSplit(test_size=0.2, n_splits=2, random_state = 7)
split = splitter.split(df_train, groups=df_train['srch_id'])
train_inds, test_inds = next(split)

df_ideal = df_train.iloc[test_inds].copy().sort_values(by=['srch_id', 'score'], ascending=[True, False], inplace=False)

X = df_train.drop(['score'], axis=1)
y = df_train['score']

X_train, X_test, y_train, y_test, test_ideal = X.iloc[train_inds], X.iloc[test_inds], y.iloc[train_inds], y.iloc[test_inds], df_ideal

train_groups = X_train.groupby('srch_id').size().to_frame('size')['size'].to_numpy()

test_groups = X_test.groupby('srch_id').size().to_frame('size')['size'].to_numpy()


<h1>Training <h1\>

In [24]:
params = {
    "objective": "lambdarank",
    "boosting_type":"dart",
    "metric":"ndcg",
    'n_estimators': 500, 
    'num_leaves': 180,
    'max_depth': 10, 
    'learning_rate': 0.08925380432712779, 
    'subsample': 0.523890758165789, 
    'colsample_bytree': 0.5433556425106324, 
    'feature_fraction': 0.9677058301342538,
    'reg_alpha': 0.00011669441178850782, 
    'reg_lambda': 0.008250891056480582
}

gbm = lgb.LGBMRanker(**params)


In [None]:
gbm.fit(
    X=X_train,
    y=y_train,
    group=train_groups,
    eval_set=[(X_train, y_train),(X_test, y_test)],
    eval_group=[train_groups, test_groups],
    eval_at=[5],
    feature_name='auto', 
    categorical_feature = 'auto')




In [None]:
# # Optimize LGBM with optuna
# import optuna
# from functools import partial

# def objective(trial, X_train, y_train, X_test, test_ideal):
#     #y_train_gbm = y_train.astype(int)
#     #y_train_gbm[y_train == 5] = 2

#     params = {
#         "objective": "lambdarank",
#         "metric":"ndcg",
#         "random_state": 42,
#         "boosting_type": "dart",
#         #"early_stopping_round": trial.suggest_int("early_stopping_round", 10, 50),
#         "n_estimators": trial.suggest_int("n_estimators", 100, 500),
#         "num_leaves": trial.suggest_int("num_leaves", 20, 200),
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#         "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
#         "subsample": trial.suggest_float("subsample", 0.2, 1.0),
#         "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 1e-1, log=True),
#         "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 1e-1, log=True),
#         "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
#     }

#     gbm = lgb.LGBMRanker(**params)
#     gbm.fit(X_train, y_train, group=train_groups, eval_set=[(X_test, y_test)], eval_group=[test_groups])

#     pred_lgbm = constructs_predictions(gbm, X_test, ideal_df=test_ideal)
#     ndcg = calc_NDCG(test_ideal, pred_lgbm)

#     return ndcg

# print("Training LGBM")

# # Wrap the objective function with the input data
# objective_with_data = partial(objective, X_train=X_train, y_train=y_train, X_test=X_test, test_ideal=test_ideal)

# # Create an Optuna study and optimize the objective function
# study = optuna.create_study(direction="maximize")
# study.optimize(objective_with_data, n_trials=20)



In [None]:
# best_params = study.best_params
# print(best_params)

In [None]:
# best_params = study.best_params
# gbm = lgb.LGBMRanker(objective="lambdarank", metric="ndcg", boosting_type="dart", **best_params)
# gbm.fit(X_train, y_train, group=train_groups, eval_set=[(X_test, y_test)], eval_group=[test_groups])

<h1> Validation <h1\>

In [18]:
def construct_pred_ideal(df_in, df_ideal, y_pred):
    df = df_in.copy()
    df['pred_grades'] = y_pred
    df = df.sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False], inplace=False)

    # Merge grades from ideal on srch_id and prop_id
    df = df.merge(df_ideal, on=['srch_id', 'prop_id'], how='left')

    # Return srch_id, prop_id and pred_grades
    return df[['srch_id', 'prop_id', 'pred_grades', 'score']]

def construct_pred_submission(df_in, y_pred):
    df = df_in.copy()
    df['pred_grades'] = y_pred
    df = df.sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False], inplace=False)

    # Return srch_id, prop_id and pred_grades
    return df[['srch_id', 'prop_id']]

def constructs_predictions(model, data, ideal_df = None):
    y_pred = model.predict(data)

    if ideal_df is not None:
        pred_df = construct_pred_ideal(data, test_ideal, y_pred)
    else:
        pred_df = construct_pred_submission(data, y_pred)
    return pred_df

def calc_NDCG(df_ideal, df_pred, k = 5):
    # Group by 5
    df_ideal = df_ideal.groupby('srch_id').head(k)
    df_pred = df_pred.groupby('srch_id').head(k)

    assert df_ideal.shape[0] % k == 0
    assert df_pred.shape[0] % k == 0
    
    # Get grades matrices
    ideal_grades = df_ideal['score'].values.reshape(int(df_ideal.shape[0] / k), k)
    pred_grades = df_pred['score'].values.reshape(int(df_pred.shape[0] / k), k)

    discount_vec = [1/np.log2(i+2) for i in range(k)]

    # Calculate NDCG
    NDCG = (pred_grades @ discount_vec).sum() / (ideal_grades @ discount_vec).sum()

    return NDCG

In [20]:
pred_lgbm = constructs_predictions(gbm, X_test, ideal_df=test_ideal)
pred_random = construct_pred_ideal(X_test, test_ideal, np.random.rand(len(X_test)))
pred_lgbm_submission = constructs_predictions(gbm, df_test)

KeyError: 'srch_id'

Highest score: 0.4167321465628548

In [None]:
print(f"LGBM: {calc_NDCG(test_ideal, pred_lgbm)},\nRandom: {calc_NDCG(test_ideal, pred_random)}")

In [None]:
pred_lgbm_submission.to_csv('../data/submission_LGBM.csv', index=False)

In [None]:
lgb.plot_importance(gbm, figsize = (12,20))