In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import ndcg_score

# Load the full training data
df = pd.read_csv("../../data/training_set_VU_DM.csv")

# Hold out 20% of the searches for validation (train/val only; no separate
# test split is created here). Splitting on srch_id keeps every row of a
# search inside a single split.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(df, groups=df["srch_id"]))

# Independent copies, so columns added to the splits later never touch `df`
df_train, df_val = (df.iloc[rows].copy() for rows in (train_idx, val_idx))

# Per-property summary statistics, computed from the training split only
agg_cols = ['price_usd', 'prop_review_score', 'prop_location_score1', 'prop_location_score2']
agg_funcs = ['mean', 'std', 'median', 'min', 'max']

# Aggregate, flatten the (column, function) MultiIndex into names such as
# "price_usd_mean", and turn prop_id back into a regular column for merging.
prop_agg = df_train.groupby('prop_id')[agg_cols].agg(agg_funcs)
prop_agg.columns = [f"{col}_{func}" for col, func in prop_agg.columns]
prop_agg = prop_agg.reset_index()

# Attach the training-derived statistics to both splits. With a left join,
# properties that never occur in the training split get NaN statistics.
df_train = df_train.merge(prop_agg, how='left', on='prop_id')
df_val = df_val.merge(prop_agg, how='left', on='prop_id')

# Define features
# Per-property statistic columns are taken straight from the aggregation table
# rather than by suffix-matching every column of df_train: a suffix match would
# also pick up any unrelated raw column whose name happens to end in
# "min", "max", "std", "mean" or "median".
stat_features = [col for col in prop_agg.columns if col != 'prop_id']

# Raw listing / search / visitor columns used as-is
base_features = [
    'price_usd',
    'prop_review_score',
    'prop_starrating',
    'prop_location_score1',
    'prop_location_score2',
    'promotion_flag',
    'srch_length_of_stay',
    'srch_booking_window',
    'srch_adults_count',
    'srch_children_count',
    'srch_room_count',
    'srch_saturday_night_bool',
    'orig_destination_distance',
    'visitor_hist_starrating',
    'visitor_hist_adr_usd',
    'prop_log_historical_price'
]

# Final model input: raw columns first, then the per-property statistics
features = base_features + stat_features

# Helper for ranking
def make_lgb_dataset(df, features):
    """Build a LightGBM ranking dataset from a frame of search results.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns in ``features`` plus ``'srch_id'`` (query id)
        and ``'booking_bool'`` (label). All rows of one search must be
        adjacent, because the query boundaries are passed to LightGBM as
        consecutive group sizes in row order.
    features : list of str
        Column names used as model inputs.

    Returns
    -------
    tuple
        ``(dataset, X, y, group)`` where ``dataset`` is the ``lgb.Dataset``,
        ``X`` is ``df[features]``, ``y`` is ``df['booking_bool']`` and
        ``group`` is the list of rows per search, in order of appearance.

    Raises
    ------
    ValueError
        If the rows of some ``srch_id`` are not contiguous in ``df``.
    """
    srch_ids = df['srch_id'].to_numpy()

    # Count runs of identical consecutive ids. If a search is split across
    # non-adjacent rows there are more runs than distinct ids, and run-length
    # group sizes would describe the wrong queries.
    n_runs = 0 if len(srch_ids) == 0 else 1 + int(np.count_nonzero(srch_ids[1:] != srch_ids[:-1]))
    if n_runs != df['srch_id'].nunique():
        raise ValueError(
            "Rows of the same srch_id must be contiguous; "
            "sort the frame by 'srch_id' before building the dataset."
        )

    X = df[features]
    y = df['booking_bool']
    # sort=False keeps the sizes in order of first appearance, i.e. row order,
    # instead of sorted-key order, which only matches when df is sorted by id.
    group = df.groupby('srch_id', sort=False).size().to_list()
    return lgb.Dataset(X, label=y, group=group), X, y, group

# Build the LightGBM datasets (plus the raw X / y / group pieces) for both splits
train_set, X_train, y_train, train_group = make_lgb_dataset(df_train, features)
val_set, X_val, y_val, val_group = make_lgb_dataset(df_val, features)

# LambdaMART configuration: lambdarank objective, evaluated with NDCG@5
params = dict(
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[5],
    learning_rate=0.05,
    num_leaves=64,
    min_data_in_leaf=20,
    max_bin=255,
    verbosity=-1,
    random_state=42,
)

# Train for at most 1000 rounds, logging both splits every 50 rounds and
# stopping once the metric has not improved for 30 rounds.
model = lgb.train(
    params=params,
    train_set=train_set,
    num_boost_round=1000,
    valid_sets=[train_set, val_set],
    valid_names=["train", "valid"],
    callbacks=[
        lgb.early_stopping(stopping_rounds=30),
        lgb.log_evaluation(period=50),
    ],
)

# Evaluation – NDCG@5 on the validation split, with booking_bool as relevance.
# NOTE(review): this figure is expected to be lower than the "valid's ndcg@5"
# printed during training. Searches without any booking are skipped here,
# whereas LightGBM's built-in ndcg metric appears to score such queries as a
# perfect 1.0 — confirm against the LightGBM version in use.
val_df = df_val.copy()
# X_val is df_val[features], so predictions line up with val_df row by row
val_df["pred"] = model.predict(X_val)

ndcg_scores = []
for _, group in val_df.groupby("srch_id"):
    if group["booking_bool"].sum() == 0:
        continue  # skip queries with no relevant items
    if len(group) == 1:
        # A booked single-listing search has only one possible ranking, so its
        # NDCG is 1. Handled here because ndcg_score rejects one-document
        # inputs in recent scikit-learn versions.
        ndcg_scores.append(1.0)
        continue
    # ndcg_score expects 2-D input of shape (n_queries, n_documents)
    y_true = group["booking_bool"].values.reshape(1, -1)
    y_score = group["pred"].values.reshape(1, -1)
    ndcg_scores.append(ndcg_score(y_true, y_score, k=5))

# Report NaN explicitly (without a numpy warning) if no search had a booking
print("Validation NDCG@5:", np.mean(ndcg_scores) if ndcg_scores else float("nan"))

Training until validation scores don't improve for 30 rounds
[50]	train's ndcg@5: 0.59974	valid's ndcg@5: 0.591691
[100]	train's ndcg@5: 0.608435	valid's ndcg@5: 0.596799
[150]	train's ndcg@5: 0.615039	valid's ndcg@5: 0.599573
[200]	train's ndcg@5: 0.62134	valid's ndcg@5: 0.601094
[250]	train's ndcg@5: 0.626857	valid's ndcg@5: 0.603275
[300]	train's ndcg@5: 0.63207	valid's ndcg@5: 0.604558
[350]	train's ndcg@5: 0.636998	valid's ndcg@5: 0.605048
[400]	train's ndcg@5: 0.641569	valid's ndcg@5: 0.605345
[450]	train's ndcg@5: 0.645864	valid's ndcg@5: 0.605922
[500]	train's ndcg@5: 0.649663	valid's ndcg@5: 0.606286
[550]	train's ndcg@5: 0.653518	valid's ndcg@5: 0.606593
[600]	train's ndcg@5: 0.657325	valid's ndcg@5: 0.606585
Early stopping, best iteration is:
[586]	train's ndcg@5: 0.656249	valid's ndcg@5: 0.606983
Validation NDCG@5: 0.4295257724438483
