In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import GroupShuffleSplit
from xgboost import XGBClassifier



In [2]:
df = pd.read_csv("../../data/training_set_VU_DM.csv")

In [3]:
 to_drop = [
    'comp1_rate_percent_diff',
    'comp6_rate_percent_diff',
    'comp1_inv',
    'comp4_rate_percent_diff',
    'gross_bookings_usd',
    'comp7_rate_percent_diff',
    'visitor_hist_starrating',
    'visitor_hist_adr_usd',
    'comp6_inv',
    'comp4_inv',
    'comp7_inv',
    'comp3_rate_percent_diff',
    'comp2_rate_percent_diff', 
    'comp8_rate_percent_diff', 
    'comp5_rate_percent_diff',
    'comp2_rate'
]


def get_season(month):
    if month in [12, 1, 2]:
        return 'winter'
    elif month in [3, 4, 5]:
        return 'spring'
    elif month in [6, 7, 8]:
        return 'summer'
    else:
        return 'fall'

def get_stay_type(nights):
    if nights <= 2:
        return 'short'
    elif nights <= 7:
        return 'medium'
    else:
        return 'long'

def clean_and_engineer(df):
    df = df.copy()

    df['date_time'] = pd.to_datetime(df['date_time'])
    df['year'] = df['date_time'].dt.year
    df['month'] = df['date_time'].dt.month
    df['day'] = df['date_time'].dt.day
    df['day_of_week'] = df['date_time'].dt.dayofweek  # 0 == Monday, 6 == Sunday
    df['hour'] = df['date_time'].dt.hour
    hour_counts = df['hour'].value_counts().sort_index()
    df['is_weekend'] = df['day_of_week'].isin([5, 6])
    df['day_type'] = df['is_weekend'].map({False: 'Weekday', True: 'Weekend'})
    df['season'] = df['month'].apply(get_season)

    df['prop_review_score'].fillna(0, inplace=True)

    missing_score2_ids = df[df['prop_location_score2'].isnull()]['prop_id'].unique()
    known_score2_ids = df[~df['prop_location_score2'].isnull()]['prop_id'].unique()
    recoverable_score2_ids = list(set(missing_score2_ids) & set(known_score2_ids))
    score2_lookup = df.loc[~df['prop_location_score2'].isnull(), ['prop_id', 'prop_location_score2']] \
                    .drop_duplicates(subset='prop_id') \
                    .set_index('prop_id')['prop_location_score2']
    df['prop_location_score2'] = df.apply(
        lambda row: score2_lookup[row['prop_id']] 
        if pd.isnull(row['prop_location_score2']) and row['prop_id'] in score2_lookup 
        else row['prop_location_score2'],
        axis=1
    )
    df['prop_location_score2'].fillna(0, inplace=True)

    df['orig_distance_missing'] = df['orig_destination_distance'].isnull().astype(int)
    df['orig_destination_distance'].fillna(-1, inplace=True)

    inv_cols = [col for col in df.columns if col.endswith('_inv')]
    df[inv_cols] = df[inv_cols].fillna(-1)
    rate_cols = [col for col in df.columns if col.startswith('comp') and col.endswith('_rate')]
    df[rate_cols] = df[rate_cols].fillna(-2)

    df['srch_query_affinity_score'] = df['srch_query_affinity_score'].fillna(-999)
    df['price_per_night'] = df['price_usd'] / df['srch_length_of_stay'].replace(0, 1)

    df['stay_type'] = df['srch_length_of_stay'].apply(get_stay_type)
    df['was_position_1'] = (df['position'] == 1).astype(int)
    df['total_guests'] = df['srch_adults_count'] + df['srch_children_count']

    stars = df['prop_starrating'].replace(0, 1)
    df['score_per_star'] = df['prop_location_score1'] / stars
    df['loc2_per_star'] = df['prop_location_score2'] / stars

    #new part
    df['price_rank_pct'] = df.groupby('srch_id')['price_usd'].rank(pct=True)
    df['score1_rank_pct'] = df.groupby('srch_id')['prop_location_score1'].rank(pct=True)
    df['review_score_rank_pct'] = df.groupby('srch_id')['prop_review_score'].rank(pct=True)

    df['price_vs_group_mean'] = df['price_usd'] - df.groupby('srch_id')['price_usd'].transform('mean')
    df['score1_vs_group_mean'] = df['prop_location_score1'] - df.groupby('srch_id')['prop_location_score1'].transform('mean')
    df['review_vs_group_mean'] = df['prop_review_score'] - df.groupby('srch_id')['prop_review_score'].transform('mean')


    return df









In [None]:
def prepare_final_model_data(df):
    drop_cols = [
        'srch_id', 'date_time', 'click_bool', 'booking_bool',
        'prop_id', 'site_id', 'position', 'random_bool',
        'date', 'prop_log_historical_price', 'target'
    ]

    df = df.drop(columns=drop_cols, errors='ignore')

    df = pd.get_dummies(df, columns=['season', 'day_type', 'stay_type'], drop_first=True)

    return df


In [5]:
# Clean features
df_clean = clean_and_engineer(df)
df_clean.drop(columns=to_drop, inplace=True)

# Prepare modeling inputs
X = prepare_final_model_data(df_clean)
y = df['booking_bool']
srch_ids = df['srch_id']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['prop_review_score'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['prop_location_score2'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

In [6]:
def group_split(X, y, groups, test_size=0.2, seed=42):
    splitter = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
    train_idx, test_idx = next(splitter.split(X, y, groups=groups))

    return (
        X.iloc[train_idx], X.iloc[test_idx],
        y.iloc[train_idx], y.iloc[test_idx],
        groups.iloc[test_idx]
    )


In [7]:
X_train, X_test, y_train, y_test, srch_id_test = group_split(X, y, srch_ids)

In [None]:
# Compute imbalance ratio (otherwise the score be lower)
scale = (y_train == 0).sum() / (y_train == 1).sum()

# model
model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale,
    random_state=42,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Train 
model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [8]:
scale = (y_train == 0).sum() / (y_train == 1).sum()

final_model = XGBClassifier(
    n_estimators=100,
    max_depth=9,
    learning_rate=0.2,
    subsample=0.6,
    colsample_bytree=0.8,
    scale_pos_weight=34.78,
    random_state=42,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric='logloss'
)
final_model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [9]:
y_pred_proba = final_model.predict_proba(X_test)[:, 1]

In [10]:
test_results = X_test.copy()
test_results['srch_id'] = srch_id_test.values
test_results['booking_bool'] = y_test.values
test_results['pred_score'] = y_pred_proba


In [11]:
test_results_sorted = test_results.sort_values(['srch_id', 'pred_score'], ascending=[True, False])


In [13]:
import numpy as np

def dcg_at_k(r, k=5):
    r = np.asarray(r, dtype=np.float64)[:k]
    return np.sum((2**r - 1) / np.log2(np.arange(2, r.size + 2))) if r.size else 0.0

def ndcg_at_k(r, k=5):
    ideal = dcg_at_k(sorted(r, reverse=True), k)
    return dcg_at_k(r, k) / ideal if ideal > 0 else 0.0


In [15]:
ndcg_scores = [
    ndcg_at_k(group['booking_bool'].values, k=5)
    for _, group in test_results_sorted.groupby('srch_id')
]

avg_ndcg = np.mean(ndcg_scores)
print(f"\n Average NDCG@5: {avg_ndcg:.4f}")


 Average NDCG@5: 0.2680
