In [None]:
# 🛠️ Install
# NOTE(review): `--upgrade` without version pins installs whatever is newest at run time,
# so API availability (and results) can change between runs. Consider pinning exact
# versions (`pkg==x.y.z`) once a known-good combination is identified.
%pip install -q --upgrade xgboost lightgbm catboost polars pandas numpy scikit-learn

In [None]:

import os, gc, pickle, warnings
from pathlib import Path
import numpy as np
import pandas as pd
import polars as pl
from typing import List, Optional, Dict
from sklearn.model_selection import GroupShuffleSplit

warnings.filterwarnings("ignore")

def dcg_at_k(relevances, k=3):
    """Discounted cumulative gain over the first ``k`` relevances.

    Each position contributes ``(2**rel - 1) / log2(rank + 1)`` with ranks
    starting at 1.

    Args:
        relevances: Relevance labels, already arranged in ranked order.
        k: Cut-off; only the first ``k`` entries contribute.

    Returns:
        The DCG value, or ``0.0`` when there is nothing to score.
    """
    # np.asfarray was removed in NumPy 2.0; asarray(..., dtype=float) is the
    # drop-in equivalent and works on both old and new NumPy.
    rels = np.asarray(relevances, dtype=float)[:k]
    if rels.size:
        discounts = np.log2(np.arange(2, rels.size + 2))
        return np.sum((np.power(2, rels) - 1) / discounts)
    return 0.0

def ndcg_at_k(true_rels, pred_scores, k=3):
    """Normalised DCG@k for a single group.

    Args:
        true_rels: Ground-truth relevance labels of the group's items.
        pred_scores: Predicted scores, one per item (higher ranks first).
            Lists, arrays and Series are all accepted.
        k: Cut-off rank.

    Returns:
        DCG of the predicted ordering divided by the DCG of the ideal
        ordering; ``0.0`` when the ideal DCG is zero (no relevant items).
    """
    # Coerce before negating: unary minus on a plain Python list raises TypeError.
    scores = np.asarray(pred_scores, dtype=float)
    order = np.argsort(-scores)
    ideal = dcg_at_k(sorted(true_rels, reverse=True), k)
    if ideal == 0:
        return 0.0
    return dcg_at_k(np.asarray(true_rels)[order], k) / ideal

def hitrate_at_k(true_rels, pred_scores, k=3):
    """Hit indicator for a single group.

    Args:
        true_rels: Ground-truth relevance labels of the group's items.
        pred_scores: Predicted scores, one per item (higher ranks first).
            Lists, arrays and Series are all accepted.
        k: Number of top-scored items to inspect.

    Returns:
        ``1.0`` if any of the ``k`` highest-scored items has a label > 0,
        otherwise ``0.0``.
    """
    # Coerce before negating: unary minus on a plain Python list raises TypeError.
    scores = np.asarray(pred_scores, dtype=float)
    top = np.argsort(-scores)[:k]
    return 1.0 if np.any(np.asarray(true_rels)[top] > 0) else 0.0

def groupwise_metric(groups, y_true, y_pred, k=3):
    """Average NDCG@k and HitRate@k over consecutive groups of rows.

    Args:
        groups: Iterable of group sizes. The i-th size describes the next
            block of consecutive rows in ``y_true`` / ``y_pred``.
        y_true: Relevance labels for all rows, laid out group after group.
        y_pred: Predicted scores aligned row-for-row with ``y_true``.
        k: Cut-off rank passed to both metrics.

    Returns:
        ``(mean_ndcg, mean_hitrate)`` as floats; ``(0.0, 0.0)`` when
        ``groups`` is empty.

    Raises:
        ValueError: If the group sizes do not add up to the length of both
            arrays. Slicing would otherwise score misaligned or truncated
            groups without any error.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    sizes = [int(g) for g in groups]
    covered = sum(sizes)
    if covered != len(y_true) or covered != len(y_pred):
        raise ValueError(
            f"Group sizes cover {covered} rows but y_true has {len(y_true)} "
            f"and y_pred has {len(y_pred)}."
        )

    ndcgs, hits = [], []
    start = 0
    for size in sizes:
        end = start + size
        rels = y_true[start:end]
        scores = y_pred[start:end]
        ndcgs.append(ndcg_at_k(rels, scores, k=k))
        hits.append(hitrate_at_k(rels, scores, k=k))
        start = end
    return float(np.mean(ndcgs) if ndcgs else 0.0), float(np.mean(hits) if hits else 0.0)

def find_column(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
    """Return the first column of ``df`` that matches one of ``candidates``.

    Candidates are tried in order. For each one an exact match wins;
    failing that, the first column equal to it ignoring case is returned.

    Args:
        df: Frame whose column labels are searched.
        candidates: Column names to look for, in priority order.

    Returns:
        The matching column label as it appears in ``df``, or ``None``.
    """
    # Build the case-insensitive lookup once instead of rescanning per candidate.
    # setdefault keeps the first column when several differ only by case.
    # Non-string labels (e.g. integer headers) are skipped: they have no .lower().
    by_lower = {}
    for col in df.columns:
        if isinstance(col, str):
            by_lower.setdefault(col.lower(), col)

    for cand in candidates:
        if cand in df.columns:
            return cand
        match = by_lower.get(cand.lower())
        if match is not None:
            return match
    return None

def _read_table(path) -> pd.DataFrame:
    """Read ``path`` as Parquet (via polars) for .parquet/.pq, otherwise as CSV."""
    if Path(path).suffix.lower() in [".parquet", ".pq"]:
        return pl.read_parquet(path).to_pandas()
    return pd.read_csv(path)


def _is_text_column(series: pd.Series) -> bool:
    """True for object columns and for columns using a dedicated string dtype."""
    # Check object first: is_string_dtype inspects the elements of object columns.
    return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)


def load_data(train_path="train.parquet", test_path="test.parquet", features_pickle="features.pkl") -> Dict:
    """Load the train/test tables and work out which columns to use.

    Args:
        train_path: Training table; Parquet for .parquet/.pq, otherwise CSV.
        test_path: Optional test table, read only if the path exists.
        features_pickle: Optional pickle holding a dict with ``feature_cols``
            and ``cat_features_final``. Load failures only print a warning.

    Returns:
        Dict with keys ``train``, ``test`` (``None`` if absent), ``features``,
        ``cat_features``, ``target``, ``group`` and ``id`` (``None`` if no id
        column was found).

    Raises:
        ValueError: If no label column or no group column can be found.
    """
    train_df = _read_table(train_path)
    test_df = _read_table(test_path) if test_path and Path(test_path).exists() else None

    feature_cols = None
    cat_features = []
    if Path(features_pickle).exists():
        try:
            # SECURITY: pickle.load can execute arbitrary code embedded in the file.
            # Only point this at a features file you produced yourself.
            with open(features_pickle, "rb") as f:
                meta = pickle.load(f)
            loaded_features = meta.get("feature_cols")
            # `or []` also covers a key that is present but set to None.
            loaded_cats = meta.get("cat_features_final") or []
            print(f"Loaded {features_pickle}: {len(loaded_features or [])} features, {len(loaded_cats)} categorical.")
        except Exception as e:
            print(f"Warning loading {features_pickle}: {e}")
        else:
            # Commit only after everything succeeded, so a failure part-way
            # through cannot leave half-applied metadata behind.
            feature_cols, cat_features = loaded_features, loaded_cats

    target_col = find_column(train_df, ["label","target","relevance","clicked","y"])
    group_col  = find_column(train_df, ["ranker_id","query_id","session_id","qid","search_id"])
    id_col     = find_column(train_df, ["id","row_id","pair_id","doc_id","item_id"])

    if target_col is None or group_col is None:
        raise ValueError(f"Need label & group columns. Found label={target_col}, group={group_col}.")

    if feature_cols is None:
        exclude = {target_col, group_col}
        if id_col:
            exclude.add(id_col)
        candidates = [c for c in train_df.columns if c not in exclude]
        # Text columns are categorical candidates whichever dtype pandas chose.
        # Comparing against 'O' alone lets string-dtype columns through as numeric.
        text_cols = [c for c in candidates if _is_text_column(train_df[c])]
        text_set = set(text_cols)
        feature_cols = [c for c in candidates if c not in text_set]
        # High-cardinality text columns are dropped rather than encoded.
        low_card = [c for c in text_cols if train_df[c].nunique(dropna=False) <= 1000]
        cat_features = low_card
        feature_cols += low_card
        print(f"Inferred {len(feature_cols)} features ({len(cat_features)} categorical).")

    return dict(train=train_df, test=test_df, features=feature_cols, cat_features=cat_features,
                target=target_col, group=group_col, id=id_col)

def group_split(df: pd.DataFrame, group_col: str, test_size=0.2, seed=42):
    """Produce a single seeded train/validation split driven by ``group_col``.

    Args:
        df: Frame to split.
        group_col: Column whose values are handed to GroupShuffleSplit as groups.
        test_size: Passed through to GroupShuffleSplit.
        seed: Random state for the shuffle.

    Returns:
        ``(train_positions, validation_positions)`` as yielded by the splitter.
    """
    group_ids = df[group_col].values
    shuffler = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
    # One split was requested, so the first pair from the generator is the only one.
    train_pos, valid_pos = next(shuffler.split(df, y=group_ids, groups=group_ids))
    return train_pos, valid_pos

# Runs last in this cell, so seeing it in the output means the definitions above executed.
print("✅ Imports & utils ready.")


In [None]:

# 📦 Prepare data
TRAIN_PATH = "train.parquet"   # or CSV
TEST_PATH  = "test.parquet"    # or CSV or None
FEATURES_PKL = "features.pkl"  # optional

data = load_data(TRAIN_PATH, TEST_PATH, FEATURES_PKL)

df = data["train"].copy()
test_df = data["test"]
feature_cols = data["features"]
cat_features = data["cat_features"]
target_col = data["target"]
group_col = data["group"]
id_col = data["id"]

# NOTE(review): train and test are converted independently, so their category
# sets (and integer codes) may differ — confirm the downstream test-time code copes.
for c in cat_features:
    if c in df.columns:
        df[c] = df[c].astype("category")
        if test_df is not None and c in test_df.columns:
            test_df[c] = test_df[c].astype("category")

idx_tr, idx_va = group_split(df, group_col, test_size=0.2, seed=42)
# Group sizes only describe *consecutive* rows, so each split must be laid out
# group after group. Sorting by the group id guarantees that and matches the
# sorted order np.unique uses for its counts. The stable sort keeps the
# original row order within each group.
train_df = df.iloc[idx_tr].sort_values(group_col, kind="stable").reset_index(drop=True)
val_df   = df.iloc[idx_va].sort_values(group_col, kind="stable").reset_index(drop=True)

def to_group_sizes(g):
    """Sizes of the consecutive runs of equal group ids, in row order.

    Ranking libraries read group sizes as "the next n rows form one query",
    so the sizes must follow the rows. ``np.unique(..., return_counts=True)``
    reports counts in *sorted id* order, which only matches the rows when
    the data happens to be sorted by group id.

    Args:
        g: 1-D sequence of group ids, one per row.

    Returns:
        Integer array of run lengths whose sum equals ``len(g)``. Ids that
        reappear after a different id start a new run.
    """
    g = np.asarray(g)
    if g.size == 0:
        return np.array([], dtype=np.int64)
    # Positions where the id differs from the previous row start a new run.
    run_starts = np.flatnonzero(g[1:] != g[:-1]) + 1
    bounds = np.concatenate(([0], run_starts, [g.size]))
    return np.diff(bounds)

train_groups = to_group_sizes(train_df[group_col].values)
val_groups   = to_group_sizes(val_df[group_col].values)

# The sizes must account for every row, otherwise the rankers and
# groupwise_metric would slice labels and scores at the wrong boundaries.
assert int(np.sum(train_groups)) == len(train_df), "train group sizes do not cover all rows"
assert int(np.sum(val_groups)) == len(val_df), "validation group sizes do not cover all rows"

X_tr = train_df[feature_cols]; y_tr = train_df[target_col].values
X_va = val_df[feature_cols];   y_va = val_df[target_col].values
X_te = test_df[feature_cols] if test_df is not None else None

# Report each split separately; `df` is the full frame, not the training split.
print(
    f"All rows: {df.shape} | Train: {train_df.shape} | Val: {val_df.shape} | "
    f"Train groups: {len(train_groups)} | Val groups: {len(val_groups)}"
)
print("✅ Data ready.")


In [None]:

# 🚀 Model 1 — XGBoost Ranker
import xgboost as xgb

xgb_params = dict(
    objective="rank:pairwise",
    eval_metric="ndcg@3",
    tree_method="hist",
    learning_rate=0.08,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
)

# DMatrix rejects pandas "category" columns unless categorical support is
# enabled. Turn it on only when such columns are actually present, so purely
# numeric data behaves exactly as before.
xgb_use_categorical = any(str(dtype) == "category" for dtype in X_tr.dtypes)

# set_group takes the size of each consecutive block of rows forming one query.
dtrain = xgb.DMatrix(X_tr, label=y_tr, enable_categorical=xgb_use_categorical); dtrain.set_group(train_groups)
dvalid = xgb.DMatrix(X_va, label=y_va, enable_categorical=xgb_use_categorical); dvalid.set_group(val_groups)

xgb_model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=1000,
    evals=[(dtrain,"train"),(dvalid,"valid")],
    early_stopping_rounds=50,
    verbose_eval=50,
)

# Score with the trees up to and including the best early-stopping round.
xgb_va_pred = xgb_model.predict(dvalid, iteration_range=(0, xgb_model.best_iteration+1))
ndcg3, hit3 = groupwise_metric(val_groups, y_va, xgb_va_pred, k=3)
print(f"NDCG@3={ndcg3:.5f} | HitRate@3={hit3:.5f}")


In [None]:

# 🌿 Model 2 — LightGBM LambdaRank
import lightgbm as lgb

lgb_params = dict(
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[3],
    learning_rate=0.08,
    num_leaves=127,
    min_data_in_leaf=50,
    feature_fraction=0.85,
    bagging_fraction=0.8,
    bagging_freq=1,
    reg_lambda=1.0,
    verbosity=-1,
    deterministic=True,
    force_row_wise=True,
    seed=42,
)

# `group` is the size of each consecutive block of rows forming one query.
lgb_train = lgb.Dataset(X_tr, label=y_tr, group=train_groups, free_raw_data=False)
lgb_valid = lgb.Dataset(X_va, label=y_va, group=val_groups, reference=lgb_train, free_raw_data=False)

# LightGBM 4.0 removed the `early_stopping_rounds` and `verbose_eval` arguments
# of lgb.train. The callbacks below are the supported replacement and also work
# on 3.3.x.
lgb_model = lgb.train(
    params=lgb_params,
    train_set=lgb_train,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=["train","valid"],
    num_boost_round=5000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=100),
    ],
)

lgb_va_pred = lgb_model.predict(X_va, num_iteration=lgb_model.best_iteration)
ndcg3, hit3 = groupwise_metric(val_groups, y_va, lgb_va_pred, k=3)
print(f"NDCG@3={ndcg3:.5f} | HitRate@3={hit3:.5f}")


In [None]:

# 🐈 Model 3 — CatBoost Ranker
from catboost import CatBoostRanker, Pool

# Positions of the categorical columns within feature_cols.
cat_idx = [pos for pos, name in enumerate(feature_cols) if name in cat_features]

# Pools take per-row group ids rather than group sizes; an empty index list is
# passed on as None.
train_pool = Pool(
    X_tr,
    label=y_tr,
    group_id=train_df[group_col].values,
    cat_features=cat_idx or None,
)
valid_pool = Pool(
    X_va,
    label=y_va,
    group_id=val_df[group_col].values,
    cat_features=cat_idx or None,
)

cb_params = {
    "loss_function": "YetiRank",
    "eval_metric": "NDCG:top=3",
    "learning_rate": 0.08,
    "depth": 8,
    "l2_leaf_reg": 3.0,
    "random_seed": 42,
    "od_type": "Iter",
    "od_wait": 200,
    "iterations": 5000,
    "verbose": 100,
    "task_type": "CPU",
}
cb_model = CatBoostRanker(**cb_params)

cb_model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
cb_va_pred = cb_model.predict(valid_pool)

# Same hand-rolled validation metrics as the other two models, for comparison.
ndcg3, hit3 = groupwise_metric(val_groups, y_va, cb_va_pred, k=3)
print(f"NDCG@3={ndcg3:.5f} | HitRate@3={hit3:.5f}")
