In [10]:
import pandas as pd
import numpy as np

# Columns to drop (sparsity > 80%)
DROPPED_COLS = [
    'comp1_rate_percent_diff', 'comp6_rate_percent_diff', 'comp1_rate', 'comp1_inv',
    'comp4_rate_percent_diff', 'gross_bookings_usd', 'comp7_rate_percent_diff',
    'comp6_rate', 'visitor_hist_starrating', 'visitor_hist_adr_usd',
    'comp6_inv', 'comp4_rate', 'comp7_rate', 'srch_query_affinity_score',
    'comp4_inv', 'comp7_inv', 'comp3_rate_percent_diff', 'comp2_rate_percent_diff',
    'comp8_rate_percent_diff', 'comp5_rate_percent_diff'
]

def get_season(month):
    if month in [12, 1, 2]: return 'winter'
    elif month in [3, 4, 5]: return 'spring'
    elif month in [6, 7, 8]: return 'summer'
    return 'fall'

def prepare_data(df, is_train=True):
    df = df.copy()
    
    # Drop sparse columns
    df.drop(columns=DROPPED_COLS, inplace=True, errors='ignore')
    
    # Parse datetime and extract components
    df['date_time'] = pd.to_datetime(df['date_time'], errors='coerce')
    df['year'] = df['date_time'].dt.year
    df['month'] = df['date_time'].dt.month
    df['day'] = df['date_time'].dt.day
    df['hour'] = df['date_time'].dt.hour
    df['day_of_week'] = df['date_time'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['season'] = df['month'].apply(get_season)
    
    # Map season to ordinal (optional)
    df['season'] = df['season'].map({'winter': 0, 'spring': 1, 'summer': 2, 'fall': 3})

    # Impute prop_review_score with 0
    df['prop_review_score'].fillna(0, inplace=True)

    # Impute prop_location_score2 using known values
    if 'prop_location_score2' in df.columns:
        score2_lookup = df[['prop_id', 'prop_location_score2']].dropna().drop_duplicates('prop_id').set_index('prop_id')['prop_location_score2']
        df['prop_location_score2'] = df.apply(
            lambda row: score2_lookup[row['prop_id']]
            if pd.isnull(row['prop_location_score2']) and row['prop_id'] in score2_lookup
            else row['prop_location_score2'],
            axis=1
        )
        df['prop_location_score2'].fillna(0, inplace=True)

    # Impute orig_destination_distance
    df['orig_distance_missing'] = df['orig_destination_distance'].isnull().astype(int)
    df['orig_destination_distance'].fillna(-1, inplace=True)

    # Remove target variables from train when preparing features
    if is_train:
        target_cols = ['booking_bool', 'click_bool', 'position', 'gross_bookings_usd']
        features = [col for col in df.columns if col not in target_cols]
        X = df[features]
        y = df['booking_bool']
        return X, y
    else:
        return df


In [11]:
# Prepare training data
X_train, y_train = prepare_data(train, is_train=True)

# Prepare test data
X_test = prepare_data(test, is_train=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['prop_review_score'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['prop_location_score2'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

In [12]:
# === Save cleaned versions ===
X_train.to_csv("train_features.csv", index=False)
y_train.to_csv("train_labels.csv", index=False)
X_test.to_csv("test_features.csv", index=False)

print("✅ Saved: train_features.csv, train_labels.csv, test_features.csv")

✅ Saved: train_features.csv, train_labels.csv, test_features.csv


In [13]:
# Assume prepare_data has already been called
X_train, y_train = prepare_data(train, is_train=True)
X_test = prepare_data(test, is_train=False)

# === 1. Check shape and preview ===
print(f"✅ X_train shape: {X_train.shape}")
print(f"✅ X_test shape:  {X_test.shape}")
print(f"🎯 y_train shape: {y_train.shape}")

# === 2. Check for NaNs ===
print("\n=== Missing values per column in X_train ===")
print(X_train.isnull().sum()[X_train.isnull().sum() > 0])

print("\n=== Missing values per column in X_test ===")
print(X_test.isnull().sum()[X_test.isnull().sum() > 0])

# === 3. Check for object dtype leftovers ===
print("\n=== Object-type columns in X_train ===")
print(X_train.select_dtypes(include=['object']).columns.tolist())

print("\n=== Object-type columns in X_test ===")
print(X_test.select_dtypes(include=['object']).columns.tolist())

# === 4. Confirm column match ===
train_cols = set(X_train.columns)
test_cols = set(X_test.columns)
print("\n✅ Columns match between train/test:", train_cols == test_cols)

if train_cols != test_cols:
    print("❌ Difference:", train_cols.symmetric_difference(test_cols))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['prop_review_score'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['prop_location_score2'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

✅ X_train shape: (4958347, 39)
✅ X_test shape:  (4959183, 39)
🎯 y_train shape: (4958347,)

=== Missing values per column in X_train ===
comp2_rate    2933675
comp2_inv     2828078
comp3_rate    3424059
comp3_inv     3307357
comp5_rate    2735974
comp5_inv     2598327
comp8_rate    3041693
comp8_inv     2970844
dtype: int64

=== Missing values per column in X_test ===
comp2_rate    2943222
comp2_inv     2837914
comp3_rate    3434198
comp3_inv     3317952
comp5_rate    2737262
comp5_inv     2598370
comp8_rate    3056794
comp8_inv     2986298
dtype: int64

=== Object-type columns in X_train ===
[]

=== Object-type columns in X_test ===
[]

✅ Columns match between train/test: True


In [14]:
# List of sparse competitor columns with known missing values
sparse_cols = [
    'comp2_rate', 'comp2_inv',
    'comp3_rate', 'comp3_inv',
    'comp5_rate', 'comp5_inv',
    'comp8_rate', 'comp8_inv'
]

# Add missing value indicator columns (1 = missing, 0 = present)
for col in sparse_cols:
    X_train[f'{col}_missing'] = X_train[col].isnull().astype(int)
    X_test[f'{col}_missing'] = X_test[col].isnull().astype(int)


In [15]:
X_train.to_csv("X_train_with_flags.csv", index=False)
y_train.to_csv("y_train.csv", index=False)
X_test.to_csv("X_test_with_flags.csv", index=False)


In [16]:
X_train['price_diff_vs_hist'] = X_train['price_usd'] - X_train['prop_log_historical_price']
X_test['price_diff_vs_hist'] = X_test['price_usd'] - X_test['prop_log_historical_price']


In [17]:
X_train['price_rank'] = X_train.groupby('srch_id')['price_usd'].rank(method='min')
X_test['price_rank'] = X_test.groupby('srch_id')['price_usd'].rank(method='min')


In [18]:
X_train['review_rank'] = X_train.groupby('srch_id')['prop_review_score'].rank(method='min')
X_test['review_rank'] = X_test.groupby('srch_id')['prop_review_score'].rank(method='min')


In [19]:
def bucket_star_rating(rating):
    if rating < 2.5:
        return 0  # low-tier
    elif rating < 4:
        return 1  # mid-tier
    else:
        return 2  # high-tier

X_train['star_rating_bucket'] = X_train['prop_starrating'].apply(bucket_star_rating)
X_test['star_rating_bucket'] = X_test['prop_starrating'].apply(bucket_star_rating)


In [20]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
X_train['star_rating_bucket'] = le.fit_transform(X_train['star_rating_bucket'])
X_test['star_rating_bucket'] = le.transform(X_test['star_rating_bucket'])


In [21]:
# Compute booking rate from training data
booking_rate = train.groupby('prop_id')['booking_bool'].mean()

# Map it to both train and test
X_train['prop_booking_rate'] = X_train['prop_id'].map(booking_rate)
X_test['prop_booking_rate'] = X_test['prop_id'].map(booking_rate)

# Fill unknown properties in test with global mean
global_booking_mean = train['booking_bool'].mean()
X_test['prop_booking_rate'].fillna(global_booking_mean, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test['prop_booking_rate'].fillna(global_booking_mean, inplace=True)


In [22]:
X_train['location_score_ratio'] = X_train['prop_location_score1'] / (X_train['prop_location_score2'] + 1e-5)
X_test['location_score_ratio'] = X_test['prop_location_score1'] / (X_test['prop_location_score2'] + 1e-5)


In [23]:
print("🔍 Final Check: Missing Values in X_train")
missing_train = X_train.isnull().sum()
print(missing_train[missing_train > 0].sort_values(ascending=False))

print("\n🔍 Final Check: Missing Values in X_test")
missing_test = X_test.isnull().sum()
print(missing_test[missing_test > 0].sort_values(ascending=False))

print("\n🧠 Columns with dtype 'object' in X_train:")
print(X_train.select_dtypes(include='object').columns.tolist())

print("\n🧠 Columns with dtype 'object' in X_test:")
print(X_test.select_dtypes(include='object').columns.tolist())


🔍 Final Check: Missing Values in X_train
comp3_rate    3424059
comp3_inv     3307357
comp8_rate    3041693
comp8_inv     2970844
comp2_rate    2933675
comp2_inv     2828078
comp5_rate    2735974
comp5_inv     2598327
dtype: int64

🔍 Final Check: Missing Values in X_test
comp3_rate    3434198
comp3_inv     3317952
comp8_rate    3056794
comp8_inv     2986298
comp2_rate    2943222
comp2_inv     2837914
comp5_rate    2737262
comp5_inv     2598370
dtype: int64

🧠 Columns with dtype 'object' in X_train:
[]

🧠 Columns with dtype 'object' in X_test:
[]


In [24]:
# Competitor columns to fill
comp_cols = [
    'comp2_rate', 'comp2_inv',
    'comp3_rate', 'comp3_inv',
    'comp5_rate', 'comp5_inv',
    'comp8_rate', 'comp8_inv'
]

for col in comp_cols:
    print(f"\n🔄 Imputing: {col}")
    
    # Step 1: compute mean per group (from X_train only)
    group_means = X_train.groupby('srch_destination_id')[col].mean()

    # Step 2: fallback value (global mean, can also use 0 if preferred)
    global_mean = X_train[col].mean()
    
    # Step 3: apply to train
    X_train[col] = X_train.apply(
        lambda row: group_means.get(row['srch_destination_id'], global_mean)
        if pd.isnull(row[col]) else row[col],
        axis=1
    )

    # Step 4: apply to test
    X_test[col] = X_test.apply(
        lambda row: group_means.get(row['srch_destination_id'], global_mean)
        if pd.isnull(row[col]) else row[col],
        axis=1
    )



🔄 Imputing: comp2_rate

🔄 Imputing: comp2_inv

🔄 Imputing: comp3_rate

🔄 Imputing: comp3_inv

🔄 Imputing: comp5_rate

🔄 Imputing: comp5_inv

🔄 Imputing: comp8_rate

🔄 Imputing: comp8_inv


In [25]:
print("\n✅ Post-Imputation Check")
print(X_train[comp_cols].isnull().sum())
print(X_test[comp_cols].isnull().sum())



✅ Post-Imputation Check
comp2_rate     829747
comp2_inv      805772
comp3_rate     782746
comp3_inv      751390
comp5_rate     245083
comp5_inv      230210
comp8_rate    1033198
comp8_inv     1018927
dtype: int64
comp2_rate    760936
comp2_inv     736525
comp3_rate    708357
comp3_inv     678913
comp5_rate    204110
comp5_inv     189586
comp8_rate    948855
comp8_inv     935236
dtype: int64


In [26]:
for col in comp_cols:
    global_mean = X_train[col].mean()
    X_train[col].fillna(global_mean, inplace=True)
    X_test[col].fillna(global_mean, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(global_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(global_mean, inplace=True)


In [27]:
print("\n✅ Final Missing Check")
print(X_train[comp_cols].isnull().sum())
print(X_test[comp_cols].isnull().sum())



✅ Final Missing Check
comp2_rate    0
comp2_inv     0
comp3_rate    0
comp3_inv     0
comp5_rate    0
comp5_inv     0
comp8_rate    0
comp8_inv     0
dtype: int64
comp2_rate    0
comp2_inv     0
comp3_rate    0
comp3_inv     0
comp5_rate    0
comp5_inv     0
comp8_rate    0
comp8_inv     0
dtype: int64


In [28]:
X_train.to_csv("X_train_final.csv", index=False)
y_train.to_csv("y_train_final.csv", index=False)
X_test.to_csv("X_test_final.csv", index=False)


In [29]:
X_train.head(20)


Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,...,comp5_rate_missing,comp5_inv_missing,comp8_rate_missing,comp8_inv_missing,price_diff_vs_hist,price_rank,review_rank,star_rating_bucket,prop_booking_rate,location_score_ratio
0,1,2013-04-04 08:32:15,12,187,219,893,3,3.5,1,2.83,...,0,0,0,0,99.82,5.0,7.0,1,0.01634,64.597124
1,1,2013-04-04 08:32:15,12,187,219,10404,4,4.0,1,2.2,...,0,0,0,0,165.71,19.0,16.0,2,0.015437,147.551979
2,1,2013-04-04 08:32:15,12,187,219,21315,3,4.5,1,2.2,...,0,0,0,0,174.88,20.0,24.0,1,0.00363,89.759282
3,1,2013-04-04 08:32:15,12,187,219,27348,2,4.0,1,2.83,...,0,0,0,0,598.38,28.0,16.0,0,0.013043,226.219025
4,1,2013-04-04 08:32:15,12,187,219,29604,4,3.5,1,2.64,...,0,0,0,0,138.65,16.0,7.0,2,0.03609,21.271453
5,1,2013-04-04 08:32:15,12,187,219,30184,4,4.5,1,2.77,...,0,0,0,0,190.12,25.0,24.0,2,0.021773,21.273328
6,1,2013-04-04 08:32:15,12,187,219,44147,3,3.5,1,2.2,...,1,1,0,0,124.54,12.0,7.0,1,0.0,61.780399
7,1,2013-04-04 08:32:15,12,187,219,50984,2,0.0,0,1.61,...,1,1,1,1,81.23,1.0,1.0,0,0.019231,161000.0
8,1,2013-04-04 08:32:15,12,187,219,53341,4,4.0,1,2.56,...,1,1,0,0,144.87,17.0,16.0,2,0.088272,20.676844
9,1,2013-04-04 08:32:15,12,187,219,56880,4,4.0,1,2.83,...,0,0,0,0,275.54,27.0,16.0,2,0.038519,27.526505


In [30]:
X_test.head(20)


Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,...,comp5_rate_missing,comp5_inv_missing,comp8_rate_missing,comp8_inv_missing,price_diff_vs_hist,price_rank,review_rank,star_rating_bucket,prop_booking_rate,location_score_ratio
0,1,2013-02-02 15:27:40,24,216,219,3180,3,4.5,1,2.94,...,1,1,1,1,113.97,22.0,13.0,1,0.043011,42.540877
1,1,2013-02-02 15:27:40,24,216,219,5543,3,4.5,1,2.64,...,1,1,1,1,113.07,21.0,13.0,1,0.053279,31.313012
2,1,2013-02-02 15:27:40,24,216,219,14142,2,3.5,1,2.71,...,0,0,1,1,44.84,3.0,5.0,0,0.013423,48.732242
3,1,2013-02-02 15:27:40,24,216,219,22393,3,4.5,1,2.4,...,0,0,1,1,137.97,25.0,13.0,1,0.013793,42.773124
4,1,2013-02-02 15:27:40,24,216,219,24194,3,4.5,1,2.94,...,0,0,1,1,74.28,11.0,13.0,1,0.014634,14.066313
5,1,2013-02-02 15:27:40,24,216,219,28181,3,4.5,1,2.3,...,1,1,1,1,79.47,15.0,13.0,1,0.075,12.595148
6,1,2013-02-02 15:27:40,24,216,219,34263,3,4.5,1,3.09,...,0,0,1,1,74.37,11.0,13.0,1,0.032086,23.767403
7,1,2013-02-02 15:27:40,24,216,219,37567,2,4.5,0,2.83,...,0,0,1,1,48.19,5.0,13.0,0,0.012579,204.923968
8,1,2013-02-02 15:27:40,24,216,219,50162,2,3.5,1,2.2,...,0,0,1,1,45.63,4.0,5.0,0,0.034247,27.224353
9,1,2013-02-02 15:27:40,24,216,219,54937,3,4.0,1,2.08,...,0,0,1,1,78.55,14.0,8.0,1,0.050891,12.61294


In [31]:
pip install lightgbm



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [32]:
# Compare columns in train and test (excluding target)
train_cols = set(X_train.columns)
test_cols = set(X_test.columns)

# If srch_id was dropped from train before modeling, you can temporarily re-add it for this check
if 'srch_id' not in X_train.columns and 'srch_id' in X_test.columns:
    train_cols.add('srch_id')

print("✅ Columns in Train but NOT in Test:")
print(train_cols - test_cols)

print("\n✅ Columns in Test but NOT in Train:")
print(test_cols - train_cols)

print("\n✅ Do train and test have the same columns?")
print(train_cols == test_cols)


✅ Columns in Train but NOT in Test:
set()

✅ Columns in Test but NOT in Train:
set()

✅ Do train and test have the same columns?
True


In [34]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import joblib
import os
from sklearn.model_selection import GroupKFold

# === Setup ===
os.makedirs("models/lgbm_folds", exist_ok=True)

params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "ndcg_eval_at": [5],
    "learning_rate": 0.05,
    "num_leaves": 31,
    "min_data_in_leaf": 20,
    "verbose": -1
}

kf = GroupKFold(n_splits=5)
group_values = X_train['srch_id'].values
drop_cols = ['srch_id', 'date_time']

# === Store predictions ===
oof_preds = np.zeros(X_train.shape[0])
test_preds = np.zeros(X_test.shape[0])

# === Train & Save each fold ===
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train, groups=group_values), start=1):
    print(f"\n🔁 Training Fold {fold}")

    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    group_tr = X_tr.groupby('srch_id').size().values
    group_val = X_val.groupby('srch_id').size().values

    train_data = lgb.Dataset(X_tr.drop(columns=drop_cols), label=y_tr, group=group_tr)
    valid_data = lgb.Dataset(X_val.drop(columns=drop_cols), label=y_val, group=group_val)

    model = lgb.train(
        params,
        train_set=train_data,
        valid_sets=[valid_data],
        valid_names=["val"],
        num_boost_round=200,
        callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)]
    )

    # Save model
    model_path = f"models/lgbm_folds/lgbm_fold_{fold}.pkl"
    joblib.dump(model, model_path)
    print(f"✅ Saved: {model_path}")

    # Predict on validation fold
    oof_preds[val_idx] = model.predict(X_val.drop(columns=drop_cols))

    # Predict on test set
    test_preds += model.predict(X_test.drop(columns=drop_cols))

# === Finalize predictions ===
test_preds /= 5  # average across 5 folds
np.save("models/lgbm_oof_preds.npy", oof_preds)
np.save("models/lgbm_test_preds.npy", test_preds)

print("\n✅ LightGBM LambdaRank training complete.")



🔁 Training Fold 1
Training until validation scores don't improve for 10 rounds
[10]	val's ndcg@5: 0.619283
[20]	val's ndcg@5: 0.622156
[30]	val's ndcg@5: 0.623142
[40]	val's ndcg@5: 0.625257
[50]	val's ndcg@5: 0.625974
[60]	val's ndcg@5: 0.627165
[70]	val's ndcg@5: 0.628051
[80]	val's ndcg@5: 0.628903
[90]	val's ndcg@5: 0.629569
[100]	val's ndcg@5: 0.630078
[110]	val's ndcg@5: 0.630745
[120]	val's ndcg@5: 0.631718
[130]	val's ndcg@5: 0.632268
[140]	val's ndcg@5: 0.6327
[150]	val's ndcg@5: 0.63303
[160]	val's ndcg@5: 0.633649
[170]	val's ndcg@5: 0.633843
[180]	val's ndcg@5: 0.634101
[190]	val's ndcg@5: 0.634485
[200]	val's ndcg@5: 0.63443
Did not meet early stopping. Best iteration is:
[192]	val's ndcg@5: 0.634545
✅ Saved: models/lgbm_folds/lgbm_fold_1.pkl

🔁 Training Fold 2
Training until validation scores don't improve for 10 rounds
[10]	val's ndcg@5: 0.618151
[20]	val's ndcg@5: 0.621668
[30]	val's ndcg@5: 0.623228
[40]	val's ndcg@5: 0.624841
[50]	val's ndcg@5: 0.625727
[60]	val's nd

In [35]:
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeCV

# Load predictions and srch_id/prop_id
lgbm_oof = np.load("models/lgbm_oof_preds.npy")
lgbm_test = np.load("models/lgbm_test_preds.npy")

# 1D → 2D because Ridge expects 2D input
X_meta = lgbm_oof.reshape(-1, 1)
X_meta_test = lgbm_test.reshape(-1, 1)

# Train meta-model
ridge = RidgeCV()
ridge.fit(X_meta, y_train)

# Predict on test set
final_preds = ridge.predict(X_meta_test)


In [36]:
submission_df = X_test[['srch_id', 'prop_id']].copy()
submission_df['score'] = final_preds

# Sort by srch_id and score descending
submission_df = submission_df.sort_values(['srch_id', 'score'], ascending=[True, False])

# Final format
submission_df = submission_df[['srch_id', 'prop_id']]
submission_df.to_csv("submission_lgbm_ridge.csv", index=False)

print("✅ Submission file created: submission_lgbm_ridge.csv")


✅ Submission file created: submission_lgbm_ridge.csv


#### THE ABOVE CODE WAS ABLE TO ACHEIVE A SCORE OF 0.35

### BELOW THE ENSEMBLE MODEL FAILED TO MATCH THE ABOVE

In [37]:
pip install xgboost



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [38]:
import xgboost as xgb
import numpy as np
import joblib
import os
from sklearn.model_selection import GroupKFold

# Setup
os.makedirs("models/xgb_folds", exist_ok=True)

params = {
    "objective": "rank:pairwise",
    "learning_rate": 0.05,
    "n_estimators": 200,
    "max_depth": 6,
    "verbosity": 1,
    "eval_metric": "ndcg@5"
}

kf = GroupKFold(n_splits=5)
group_values = X_train['srch_id'].values
drop_cols = ['srch_id', 'date_time']

oof_preds = np.zeros(X_train.shape[0])
test_preds = np.zeros(X_test.shape[0])

# Train across 5 folds
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train, groups=group_values), start=1):
    print(f"\n🔁 XGBoost - Fold {fold}")

    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    group_tr = X_tr.groupby('srch_id').size().values
    group_val = X_val.groupby('srch_id').size().values

    dtrain = xgb.DMatrix(X_tr.drop(columns=drop_cols), label=y_tr)
    dvalid = xgb.DMatrix(X_val.drop(columns=drop_cols), label=y_val)
    dtest = xgb.DMatrix(X_test.drop(columns=drop_cols))

    dtrain.set_group(group_tr)
    dvalid.set_group(group_val)

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=200,
        evals=[(dvalid, "validation")],
        early_stopping_rounds=10,
        verbose_eval=10
    )

    joblib.dump(model, f"models/xgb_folds/xgb_fold_{fold}.pkl")

    # Predict
    oof_preds[val_idx] = model.predict(xgb.DMatrix(X_val.drop(columns=drop_cols)))
    test_preds += model.predict(dtest)

# Average test predictions across folds
test_preds /= 5

# Save
np.save("models/xgb_oof_preds.npy", oof_preds)
np.save("models/xgb_test_preds.npy", test_preds)

print("✅ XGBoost training complete and saved.")



🔁 XGBoost - Fold 1


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	validation-ndcg@5:0.59194
[10]	validation-ndcg@5:0.60881
[20]	validation-ndcg@5:0.61069
[30]	validation-ndcg@5:0.61234
[40]	validation-ndcg@5:0.61292
[50]	validation-ndcg@5:0.61408
[60]	validation-ndcg@5:0.61497
[70]	validation-ndcg@5:0.61568
[80]	validation-ndcg@5:0.61676
[90]	validation-ndcg@5:0.61810
[100]	validation-ndcg@5:0.61911
[110]	validation-ndcg@5:0.61973
[120]	validation-ndcg@5:0.62066
[130]	validation-ndcg@5:0.62123
[140]	validation-ndcg@5:0.62196
[150]	validation-ndcg@5:0.62249
[160]	validation-ndcg@5:0.62279
[170]	validation-ndcg@5:0.62420
[180]	validation-ndcg@5:0.62463
[190]	validation-ndcg@5:0.62546
[199]	validation-ndcg@5:0.62581

🔁 XGBoost - Fold 2


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	validation-ndcg@5:0.59311
[10]	validation-ndcg@5:0.60936
[20]	validation-ndcg@5:0.61064
[30]	validation-ndcg@5:0.61151
[40]	validation-ndcg@5:0.61232
[50]	validation-ndcg@5:0.61317
[60]	validation-ndcg@5:0.61391
[70]	validation-ndcg@5:0.61533
[80]	validation-ndcg@5:0.61668
[90]	validation-ndcg@5:0.61779
[100]	validation-ndcg@5:0.61851
[110]	validation-ndcg@5:0.61977
[120]	validation-ndcg@5:0.62055
[130]	validation-ndcg@5:0.62123
[140]	validation-ndcg@5:0.62217
[150]	validation-ndcg@5:0.62292
[160]	validation-ndcg@5:0.62332
[170]	validation-ndcg@5:0.62396
[180]	validation-ndcg@5:0.62474
[190]	validation-ndcg@5:0.62524
[199]	validation-ndcg@5:0.62590

🔁 XGBoost - Fold 3


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	validation-ndcg@5:0.59238
[10]	validation-ndcg@5:0.60734
[20]	validation-ndcg@5:0.60932
[30]	validation-ndcg@5:0.61082
[40]	validation-ndcg@5:0.61151
[50]	validation-ndcg@5:0.61276
[60]	validation-ndcg@5:0.61367
[70]	validation-ndcg@5:0.61496
[80]	validation-ndcg@5:0.61613
[90]	validation-ndcg@5:0.61731
[100]	validation-ndcg@5:0.61865
[110]	validation-ndcg@5:0.61953
[120]	validation-ndcg@5:0.62043
[130]	validation-ndcg@5:0.62150
[140]	validation-ndcg@5:0.62238
[150]	validation-ndcg@5:0.62308
[160]	validation-ndcg@5:0.62409
[170]	validation-ndcg@5:0.62472
[180]	validation-ndcg@5:0.62540
[190]	validation-ndcg@5:0.62598
[199]	validation-ndcg@5:0.62658

🔁 XGBoost - Fold 4


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	validation-ndcg@5:0.59328
[10]	validation-ndcg@5:0.60831
[20]	validation-ndcg@5:0.60913
[30]	validation-ndcg@5:0.60957
[40]	validation-ndcg@5:0.61096
[50]	validation-ndcg@5:0.61175
[60]	validation-ndcg@5:0.61246
[70]	validation-ndcg@5:0.61390
[80]	validation-ndcg@5:0.61545
[90]	validation-ndcg@5:0.61676
[100]	validation-ndcg@5:0.61764
[110]	validation-ndcg@5:0.61846
[120]	validation-ndcg@5:0.61944
[130]	validation-ndcg@5:0.62081
[140]	validation-ndcg@5:0.62159
[150]	validation-ndcg@5:0.62259
[160]	validation-ndcg@5:0.62336
[170]	validation-ndcg@5:0.62403
[180]	validation-ndcg@5:0.62462
[190]	validation-ndcg@5:0.62549
[199]	validation-ndcg@5:0.62620

🔁 XGBoost - Fold 5


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	validation-ndcg@5:0.59257
[10]	validation-ndcg@5:0.60771
[20]	validation-ndcg@5:0.61016
[30]	validation-ndcg@5:0.61068
[40]	validation-ndcg@5:0.61216
[50]	validation-ndcg@5:0.61301
[60]	validation-ndcg@5:0.61397
[70]	validation-ndcg@5:0.61529
[80]	validation-ndcg@5:0.61645
[90]	validation-ndcg@5:0.61732
[100]	validation-ndcg@5:0.61838
[110]	validation-ndcg@5:0.61933
[120]	validation-ndcg@5:0.62034
[130]	validation-ndcg@5:0.62118
[140]	validation-ndcg@5:0.62214
[150]	validation-ndcg@5:0.62270
[160]	validation-ndcg@5:0.62346
[170]	validation-ndcg@5:0.62425
[180]	validation-ndcg@5:0.62481
[190]	validation-ndcg@5:0.62529
[199]	validation-ndcg@5:0.62577
✅ XGBoost training complete and saved.


In [39]:
pip install catboost



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [40]:
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import GroupKFold
import numpy as np
import os
import joblib

# Setup
os.makedirs("models/cat_folds", exist_ok=True)

kf = GroupKFold(n_splits=5)
group_values = X_train['srch_id'].values
drop_cols = ['srch_id', 'date_time']

oof_preds = np.zeros(X_train.shape[0])
test_preds = np.zeros(X_test.shape[0])

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train, groups=group_values), start=1):
    print(f"\n🔁 CatBoost - Fold {fold}")

    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    group_tr = X_tr.groupby('srch_id').size().values
    group_val = X_val.groupby('srch_id').size().values

    train_pool = Pool(
        X_tr.drop(columns=drop_cols),
        label=y_tr,
        group_id=X_tr['srch_id']
    )
    val_pool = Pool(
        X_val.drop(columns=drop_cols),
        label=y_val,
        group_id=X_val['srch_id']
    )
    test_pool = Pool(X_test.drop(columns=drop_cols), group_id=X_test['srch_id'])

    model = CatBoostRanker(
        iterations=200,
        learning_rate=0.05,
        depth=6,
        loss_function='YetiRank',
        eval_metric='NDCG:top=5',
        random_seed=42,
        verbose=10
    )

    model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=10)

    joblib.dump(model, f"models/cat_folds/cat_fold_{fold}.pkl")

    oof_preds[val_idx] = model.predict(val_pool)
    test_preds += model.predict(test_pool)

# Average test predictions
test_preds /= 5

np.save("models/cat_oof_preds.npy", oof_preds)
np.save("models/cat_test_preds.npy", test_preds)

print("✅ CatBoost training complete and saved.")



🔁 CatBoost - Fold 1
0:	test: 0.5272629	best: 0.5272629 (0)	total: 2.53s	remaining: 8m 22s
10:	test: 0.5978337	best: 0.5978337 (10)	total: 22.3s	remaining: 6m 22s
20:	test: 0.6047068	best: 0.6047068 (20)	total: 40.8s	remaining: 5m 47s
30:	test: 0.6086980	best: 0.6086980 (30)	total: 57.2s	remaining: 5m 11s
40:	test: 0.6116466	best: 0.6116466 (40)	total: 1m 13s	remaining: 4m 43s
50:	test: 0.6147988	best: 0.6147988 (50)	total: 1m 28s	remaining: 4m 18s
60:	test: 0.6176856	best: 0.6176856 (60)	total: 1m 46s	remaining: 4m 1s
70:	test: 0.6196941	best: 0.6196941 (70)	total: 2m 5s	remaining: 3m 47s
80:	test: 0.6212656	best: 0.6212656 (80)	total: 2m 23s	remaining: 3m 30s
90:	test: 0.6230012	best: 0.6230012 (90)	total: 2m 44s	remaining: 3m 17s
100:	test: 0.6239594	best: 0.6240103 (98)	total: 3m 4s	remaining: 3m 1s
110:	test: 0.6255987	best: 0.6255987 (110)	total: 3m 21s	remaining: 2m 41s
120:	test: 0.6263427	best: 0.6263427 (120)	total: 3m 38s	remaining: 2m 22s
130:	test: 0.6270471	best: 0.627047

In [41]:
from sklearn.linear_model import RidgeCV
import numpy as np
import pandas as pd

# Load base model predictions
lgb_oof = np.load("models/lgbm_oof_preds.npy").reshape(-1, 1)
xgb_oof = np.load("models/xgb_oof_preds.npy").reshape(-1, 1)
cat_oof = np.load("models/cat_oof_preds.npy").reshape(-1, 1)

lgb_test = np.load("models/lgbm_test_preds.npy").reshape(-1, 1)
xgb_test = np.load("models/xgb_test_preds.npy").reshape(-1, 1)
cat_test = np.load("models/cat_test_preds.npy").reshape(-1, 1)

# Stack into feature matrix
X_meta = np.hstack([lgb_oof, xgb_oof, cat_oof])
X_meta_test = np.hstack([lgb_test, xgb_test, cat_test])

# Meta-model
meta_model = RidgeCV()
meta_model.fit(X_meta, y_train)

# Final prediction
final_preds = meta_model.predict(X_meta_test)


In [42]:
submission_df = X_test[['srch_id', 'prop_id']].copy()
submission_df['score'] = final_preds

# Sort by search and predicted score
submission_df = submission_df.sort_values(['srch_id', 'score'], ascending=[True, False])

# Final format
submission_df = submission_df[['srch_id', 'prop_id']]
submission_df.to_csv("submission_stacked_ridge.csv", index=False)

print("✅ Final stacked submission file saved: submission_stacked_ridge.csv")


✅ Final stacked submission file saved: submission_stacked_ridge.csv


In [43]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd

# Load model predictions
lgb_oof = np.load("models/lgbm_oof_preds.npy").reshape(-1, 1)
xgb_oof = np.load("models/xgb_oof_preds.npy").reshape(-1, 1)
cat_oof = np.load("models/cat_oof_preds.npy").reshape(-1, 1)

lgb_test = np.load("models/lgbm_test_preds.npy").reshape(-1, 1)
xgb_test = np.load("models/xgb_test_preds.npy").reshape(-1, 1)
cat_test = np.load("models/cat_test_preds.npy").reshape(-1, 1)

# Stack features
X_meta = np.hstack([lgb_oof, xgb_oof, cat_oof])
X_meta_test = np.hstack([lgb_test, xgb_test, cat_test])

# Train Random Forest
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_meta, y_train)
final_preds = rf.predict(X_meta_test)


In [44]:
submission_df = X_test[['srch_id', 'prop_id']].copy()
submission_df['score'] = final_preds

# Sort and prepare
submission_df = submission_df.sort_values(['srch_id', 'score'], ascending=[True, False])
submission_df = submission_df[['srch_id', 'prop_id']]
submission_df.to_csv("submission_stacked_rf.csv", index=False)

print("🌲 RandomForest submission saved: submission_stacked_rf.csv")


🌲 RandomForest submission saved: submission_stacked_rf.csv


In [45]:
import numpy as np

# Load base model OOF and test predictions
lgb_oof = np.load("models/lgbm_oof_preds.npy").reshape(-1, 1)
xgb_oof = np.load("models/xgb_oof_preds.npy").reshape(-1, 1)
cat_oof = np.load("models/cat_oof_preds.npy").reshape(-1, 1)

lgb_test = np.load("models/lgbm_test_preds.npy").reshape(-1, 1)
xgb_test = np.load("models/xgb_test_preds.npy").reshape(-1, 1)
cat_test = np.load("models/cat_test_preds.npy").reshape(-1, 1)

# === Enriched meta features: OOF ===
X_meta = np.hstack([
    lgb_oof,
    xgb_oof,
    cat_oof,
    (lgb_oof + xgb_oof + cat_oof) / 3,              # mean
    (lgb_oof - xgb_oof),                            # disagreement
    (lgb_oof - cat_oof),
    (xgb_oof - cat_oof),
    (lgb_oof * cat_oof),                            # interactions
    (xgb_oof * cat_oof),
    (lgb_oof * xgb_oof),
])

# === Enriched meta features: TEST ===
X_meta_test = np.hstack([
    lgb_test,
    xgb_test,
    cat_test,
    (lgb_test + xgb_test + cat_test) / 3,
    (lgb_test - xgb_test),
    (lgb_test - cat_test),
    (xgb_test - cat_test),
    (lgb_test * cat_test),
    (xgb_test * cat_test),
    (lgb_test * xgb_test),
])


In [49]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Split for validation (not CV — just meta layer)
X_meta_train, X_meta_val, y_meta_train, y_meta_val = train_test_split(
    X_meta, y_train, test_size=0.1, random_state=42
)

# LightGBM dataset
train_data = lgb.Dataset(X_meta_train, label=y_meta_train)
val_data = lgb.Dataset(X_meta_val, label=y_meta_val)

# LightGBM params for meta-model
meta_params = {
    "objective": "regression",
    "metric": "rmse",
    "ndcg_eval_at": [5],
    "learning_rate": 0.03,
    "num_leaves": 15,
    "min_data_in_leaf": 30,
    "verbose": -1
}

# Train meta-model
meta_model = lgb.train(
    meta_params,
    train_data,
    valid_sets=[val_data],
    valid_names=["val"],
    num_boost_round=200,
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=10)  # replaces verbose_eval=10
    ]
)


# Predict test scores
final_preds = meta_model.predict(X_meta_test)


Training until validation scores don't improve for 10 rounds
[10]	val's rmse: 0.162756
[20]	val's rmse: 0.161429
[30]	val's rmse: 0.160693
[40]	val's rmse: 0.160283
[50]	val's rmse: 0.160054
[60]	val's rmse: 0.159925
[70]	val's rmse: 0.159851
[80]	val's rmse: 0.159808
[90]	val's rmse: 0.159785
[100]	val's rmse: 0.15977
[110]	val's rmse: 0.159761
[120]	val's rmse: 0.159756
[130]	val's rmse: 0.159751
[140]	val's rmse: 0.159748
[150]	val's rmse: 0.159746
[160]	val's rmse: 0.159745
[170]	val's rmse: 0.159744
[180]	val's rmse: 0.159744
Early stopping, best iteration is:
[174]	val's rmse: 0.159743


In [50]:
submission_df = X_test[['srch_id', 'prop_id']].copy()
submission_df['score'] = final_preds

# Groupwise rank normalization
submission_df['score'] = submission_df.groupby('srch_id')['score'].transform(
    lambda x: (x - x.min()) / (x.max() - x.min() + 1e-5)
)

# Final sort and format
submission_df = submission_df.sort_values(['srch_id', 'score'], ascending=[True, False])
submission_df = submission_df[['srch_id', 'prop_id']]
submission_df.to_csv("submission_superstack_lgbm.csv", index=False)

print("✅ submission_superstack_lgbm.csv is ready.")


✅ submission_superstack_lgbm.csv is ready.
