In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# =========================
# Config
# =========================
DATA_PATH = "../final_data/data_260129_random.csv"
DATE_COL = "day"

Y_TRIP = "trip_cnt_per_user"
Y_NONR = "nonrepeat_cnt_per_user"
TARGETS = [Y_TRIP, Y_NONR]

TEST_START = pd.Timestamp("2026-01-05")
TEST_END   = pd.Timestamp("2026-01-11")  # inclusive

# Random 組有 treatment（共 8 種），也把它當作類別特徵
CAT_COLS = ["treatment", "ops_type_merged", "city_group"]

# ✅ 白名單：這些欄位才會放進模型
# 注意：同日的 trip/nonrepeat 不會放入特徵（只用 lag2w/lag3w）
LAG_DAYS = list(range(10, 22))  # 10..21 inclusive
LAG_VARS = ["trip_cnt_per_user", "nonrepeat_cnt_per_user", "match_rate", "coupon_BD_per_user", "coupon_CDP_per_user", "coupon_folk_per_user",
    "coupon_growth_other_per_user", "coupon_MGM_per_user", "coupon_MKT_per_user",
    "coupon_register_per_user", "coupon_daily_per_user"]

BASE_FEATURES = (
    ["treatment", "ops_type_merged", "city_group",
     "is_weekend_holiday", "is_rainy", "mgm_day"]
    + [f"{v}_lag{d}" for v in LAG_VARS for d in LAG_DAYS]
)

# =========================
# Separate params for each model
# =========================
PARAMS_TRIP = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.05,
    num_leaves=63,
    min_data_in_leaf=200,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l2=1.0,
    seed=42,
    verbosity=-1,
)

PARAMS_NONR = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.05,
    num_leaves=31,
    min_data_in_leaf=50,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l2=1.0,
    seed=42,
    verbosity=-1,
)

# Training controls
NUM_BOOST_ROUND_TRIP = 10000
NUM_BOOST_ROUND_NONR = 10000
EARLY_STOPPING_TRIP = 200
EARLY_STOPPING_NONR = 200
VALID_DAYS = 28

# =========================
# Helpers
# =========================
def rmse(y_true, y_pred) -> float:
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    return float(np.sqrt(mean_squared_error(y_true[mask], y_pred[mask])))

def align_categories(train_df: pd.DataFrame, test_df: pd.DataFrame, cat_cols):
    # 確保 test 的類別集合不會造成 unseen category 的問題（用 union 後轉 category）
    for c in cat_cols:
        if c not in train_df.columns or c not in test_df.columns:
            continue
        train_vals = train_df[c].astype("object")
        test_vals  = test_df[c].astype("object")
        cats = pd.Index(train_vals.dropna().unique()).union(pd.Index(test_vals.dropna().unique()))
        train_df[c] = pd.Categorical(train_vals, categories=cats)
        test_df[c]  = pd.Categorical(test_vals,  categories=cats)
    return train_df, test_df

def get_feature_cols(df, base_features, cat_cols):
    # 白名單欄位，且必須存在於 df
    feats = [c for c in base_features if c in df.columns]

    # 防呆：同日目標絕不進 X（只允許 lag 特徵）
    feats = [c for c in feats if c not in [Y_TRIP, Y_NONR]]

    # categorical cols 必須同時在 feats 裡才會被當 categorical
    cat = [c for c in cat_cols if c in feats]
    return feats, cat

def get_model_config(target: str):
    """Return (params, num_boost_round, early_stopping_rounds) for this target."""
    if target == Y_TRIP:
        return PARAMS_TRIP, NUM_BOOST_ROUND_TRIP, EARLY_STOPPING_TRIP
    elif target == Y_NONR:
        return PARAMS_NONR, NUM_BOOST_ROUND_NONR, EARLY_STOPPING_NONR
    else:
        raise ValueError(f"Unknown target: {target}")

# =========================
# Load & Split
# =========================
df = pd.read_csv(DATA_PATH)
df[DATE_COL] = pd.to_datetime(df[DATE_COL])
df = df.replace({"": np.nan})

train_df = df[df[DATE_COL] < TEST_START].copy()
test_df  = df[(df[DATE_COL] >= TEST_START) & (df[DATE_COL] <= TEST_END)].copy()

# 先對齊類別（針對 CAT_COLS）
train_df, test_df = align_categories(train_df, test_df, CAT_COLS)

# 取得特徵欄位（白名單）
feature_cols, cat_cols = get_feature_cols(df, BASE_FEATURES, CAT_COLS)
print("Features used:", feature_cols)
print("Categorical:", cat_cols)

# =========================
# Train & Predict
# =========================
pred_out = test_df[[DATE_COL] + [c for c in CAT_COLS if c in df.columns]].copy()

def train_one_target(target: str):
    # 確保同日目標沒有被放進特徵
    assert Y_TRIP not in feature_cols and Y_NONR not in feature_cols, "Leakage: same-day targets in features!"

    tr = train_df.dropna(subset=[target]).copy()

    # time-based valid（最後 VALID_DAYS 當 valid）
    cutoff = tr[DATE_COL].max() - pd.Timedelta(days=VALID_DAYS)
    tr_sub = tr[tr[DATE_COL] < cutoff]
    va_sub = tr[tr[DATE_COL] >= cutoff]

    dtrain = lgb.Dataset(
        tr_sub[feature_cols], label=tr_sub[target],
        categorical_feature=cat_cols, free_raw_data=False
    )
    dvalid = lgb.Dataset(
        va_sub[feature_cols], label=va_sub[target],
        categorical_feature=cat_cols, free_raw_data=False
    )

    params, num_boost_round, early_stop = get_model_config(target)

    print(f"\n=== Training target: {target} ===")
    print("Params:", params)

    model = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=num_boost_round,
        valid_sets=[dvalid],
        callbacks=[lgb.early_stopping(stopping_rounds=early_stop), lgb.log_evaluation(200)],
    )

    pred = model.predict(test_df[feature_cols], num_iteration=model.best_iteration)
    return pred

pred_out[f"pred_{Y_TRIP}"] = train_one_target(Y_TRIP)
pred_out[f"pred_{Y_NONR}"] = train_one_target(Y_NONR)

# =========================
# Quick RMSE on all test rows (row-level)
# =========================
print("\n=== Row-level RMSE on test ===")
print("test rows:", len(test_df))
print(f"{Y_TRIP}: {rmse(test_df[Y_TRIP].values, pred_out[f'pred_{Y_TRIP}'].values):.6f}")
print(f"{Y_NONR}: {rmse(test_df[Y_NONR].values, pred_out[f'pred_{Y_NONR}'].values):.6f}")


Features used: ['treatment', 'ops_type_merged', 'city_group', 'is_weekend_holiday', 'is_rainy', 'mgm_day', 'trip_cnt_per_user_lag10', 'trip_cnt_per_user_lag11', 'trip_cnt_per_user_lag12', 'trip_cnt_per_user_lag13', 'trip_cnt_per_user_lag14', 'trip_cnt_per_user_lag15', 'trip_cnt_per_user_lag16', 'trip_cnt_per_user_lag17', 'trip_cnt_per_user_lag18', 'trip_cnt_per_user_lag19', 'trip_cnt_per_user_lag20', 'trip_cnt_per_user_lag21', 'nonrepeat_cnt_per_user_lag10', 'nonrepeat_cnt_per_user_lag11', 'nonrepeat_cnt_per_user_lag12', 'nonrepeat_cnt_per_user_lag13', 'nonrepeat_cnt_per_user_lag14', 'nonrepeat_cnt_per_user_lag15', 'nonrepeat_cnt_per_user_lag16', 'nonrepeat_cnt_per_user_lag17', 'nonrepeat_cnt_per_user_lag18', 'nonrepeat_cnt_per_user_lag19', 'nonrepeat_cnt_per_user_lag20', 'nonrepeat_cnt_per_user_lag21', 'match_rate_lag10', 'match_rate_lag11', 'match_rate_lag12', 'match_rate_lag13', 'match_rate_lag14', 'match_rate_lag15', 'match_rate_lag16', 'match_rate_lag17', 'match_rate_lag18', 'matc

# optuna

In [5]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.metrics import mean_squared_error

optuna.logging.set_verbosity(optuna.logging.WARNING)

# =========================
# Config (RANDOM)
# =========================
DATA_PATH = "../final_data/data_260129_random.csv"
DATE_COL = "day"

Y_TRIP = "trip_cnt_per_user"
Y_NONR = "nonrepeat_cnt_per_user"

TEST_START = pd.Timestamp("2026-01-05")
TEST_END   = pd.Timestamp("2026-01-11")  # inclusive

# RANDOM 的 group keys：treatment x ops_type_merged x city_group = 8*8*3=192 groups
GROUP_KEYS = ["treatment", "ops_type_merged", "city_group"]
CAT_COLS   = ["treatment", "ops_type_merged", "city_group"]

# ✅ 白名單特徵：RANDOM 要把 treatment 放進模型
LAG_DAYS = list(range(10, 22))  # 10..21 inclusive
LAG_VARS = ["trip_cnt_per_user", "nonrepeat_cnt_per_user", "match_rate", "coupon_BD_per_user", "coupon_CDP_per_user", "coupon_folk_per_user",
    "coupon_growth_other_per_user", "coupon_MGM_per_user", "coupon_MKT_per_user",
    "coupon_register_per_user", "coupon_daily_per_user"]

BASE_FEATURES = (
    ["treatment", "ops_type_merged", "city_group",
     "is_weekend_holiday", "is_rainy", "mgm_day"]
    + [f"{v}_lag{d}" for v in LAG_VARS for d in LAG_DAYS]
)

# 你目前手動微調後的參數（baseline / center）
PARAMS_TRIP_BASE = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.04,
    num_leaves=31,
    min_data_in_leaf=200,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l2=1.0,
    seed=42,
    verbosity=-1,
)

PARAMS_NONR_BASE = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.05,
    num_leaves=31,
    min_data_in_leaf=50,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l2=1.0,
    seed=42,
    verbosity=-1,
)

NUM_BOOST_ROUND_TRIP = 10000
NUM_BOOST_ROUND_NONR = 10000
EARLY_STOPPING_TRIP = 200
EARLY_STOPPING_NONR = 200
VALID_DAYS = 28

N_TRIALS = 120  # per model

# =========================
# Helpers
# =========================
def rmse(y_true, y_pred) -> float:
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    return float(np.sqrt(mean_squared_error(y_true[mask], y_pred[mask])))

def weekly_sum_rmse_groups(test_df, pred_df, target_col, pred_col, expected_groups: int):
    """
    Weekly-sum RMSE across expected_groups groups.
    test_df: must contain GROUP_KEYS + DATE_COL + target_col
    pred_df: must contain GROUP_KEYS + DATE_COL + pred_col
    """
    eval_df = (
        test_df[GROUP_KEYS + [DATE_COL, target_col]]
        .merge(
            pred_df[GROUP_KEYS + [DATE_COL, pred_col]],
            on=GROUP_KEYS + [DATE_COL],
            how="left",
            validate="one_to_one",
        )
    )
    if eval_df[pred_col].isna().any():
        raise ValueError(f"Missing predictions after merge for {pred_col}")

    n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
    if n_groups != expected_groups:
        raise ValueError(f"Expected {expected_groups} groups, got {n_groups}")

    g = (
        eval_df
        .groupby(GROUP_KEYS, dropna=False)
        .agg(actual_sum=(target_col, "sum"), pred_sum=(pred_col, "sum"))
        .reset_index()
    )
    return float(np.sqrt(np.mean((g["actual_sum"] - g["pred_sum"]) ** 2)))

def align_categories_for_train_test(train_df, test_df, cat_cols):
    for c in cat_cols:
        if c not in train_df.columns or c not in test_df.columns:
            continue
        tr = train_df[c].astype("object")
        te = test_df[c].astype("object")
        cats = pd.Index(tr.dropna().unique()).union(pd.Index(te.dropna().unique()))
        train_df[c] = pd.Categorical(tr, categories=cats)
        test_df[c]  = pd.Categorical(te, categories=cats)
    return train_df, test_df

def get_feature_cols(df):
    feats = [c for c in BASE_FEATURES if c in df.columns]
    # 防呆：同日 target 永不進特徵
    feats = [c for c in feats if c not in [Y_TRIP, Y_NONR]]
    cat = [c for c in CAT_COLS if c in feats]
    return feats, cat

def time_based_train_valid_split(train_df, valid_days=28):
    cutoff = train_df[DATE_COL].max() - pd.Timedelta(days=valid_days)
    tr_sub = train_df[train_df[DATE_COL] < cutoff].copy()
    va_sub = train_df[train_df[DATE_COL] >= cutoff].copy()
    return tr_sub, va_sub

def build_params_from_trial_narrow(trial: optuna.Trial, base_params: dict, target: str):
    """
    Narrow search space around your hand-tuned base.
    (Same policy as control version)
    """
    params = dict(base_params)

    base_lr = float(base_params.get("learning_rate", 0.05))
    lr_low  = max(0.01, base_lr * 0.6)
    lr_high = min(0.12, base_lr * 1.6)
    params["learning_rate"] = trial.suggest_float("learning_rate", lr_low, lr_high, log=True)

    params["num_leaves"] = trial.suggest_int("num_leaves", 15, 63, step=2)

    if target == Y_TRIP:
        params["min_data_in_leaf"] = trial.suggest_int("min_data_in_leaf", 80, 350, step=10)
    else:
        params["min_data_in_leaf"] = trial.suggest_int("min_data_in_leaf", 20, 120, step=5)

    params["max_depth"] = trial.suggest_categorical("max_depth", [-1, 4, 5, 6, 7, 8, 9, 10])

    params["feature_fraction"] = trial.suggest_float("feature_fraction", 0.75, 1.0, step=0.05)
    params["bagging_fraction"] = trial.suggest_float("bagging_fraction", 0.75, 1.0, step=0.05)
    params["bagging_freq"] = trial.suggest_categorical("bagging_freq", [0, 1, 2, 5])

    params["lambda_l2"] = trial.suggest_float("lambda_l2", 0.2, 6.0, log=True)
    params["lambda_l1"] = trial.suggest_float("lambda_l1", 0.0, 2.0)

    params["min_gain_to_split"] = trial.suggest_float("min_gain_to_split", 0.0, 0.05)

    params["extra_trees"] = trial.suggest_categorical("extra_trees", [False, True])

    params["objective"] = "regression"
    params["metric"] = "rmse"
    params["seed"] = 42
    params["verbosity"] = -1

    return params

# =========================
# Load data + split
# =========================
df = pd.read_csv(DATA_PATH)
df[DATE_COL] = pd.to_datetime(df[DATE_COL])
df = df.replace({"": np.nan})

# Train is strictly before test week start
train_all = df[df[DATE_COL] < TEST_START].copy()

# The week you want to evaluate/tune on
test_week = df[(df[DATE_COL] >= TEST_START) & (df[DATE_COL] <= TEST_END)].copy()

feature_cols, cat_cols = get_feature_cols(df)

# Align categories for train/test week
train_all, test_week = align_categories_for_train_test(train_all, test_week, cat_cols)

# internal valid split for early stopping (only within train_all)
tr_sub, va_sub = time_based_train_valid_split(train_all, valid_days=VALID_DAYS)
tr_sub, va_sub = align_categories_for_train_test(tr_sub, va_sub, cat_cols)

# expected number of groups in random: 8*8*3 = 192 (you can hardcode 192)
EXPECTED_GROUPS = 192

# =========================
# Objective factory: minimize weekly-sum RMSE on 1/5-1/11 across 192 groups
# =========================
def make_objective_weeksum(target: str, base_params: dict, num_boost_round: int, early_stop: int):
    X_tr = tr_sub[feature_cols]
    y_tr = tr_sub[target].astype(float)
    X_va = va_sub[feature_cols]
    y_va = va_sub[target].astype(float)

    dtrain = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_cols, free_raw_data=False)
    dvalid = lgb.Dataset(X_va, label=y_va, categorical_feature=cat_cols, free_raw_data=False)

    X_test = test_week[feature_cols]

    def objective(trial: optuna.Trial) -> float:
        params = build_params_from_trial_narrow(trial, base_params, target)

        model = lgb.train(
            params=params,
            train_set=dtrain,
            num_boost_round=num_boost_round,
            valid_sets=[dvalid],
            callbacks=[lgb.early_stopping(stopping_rounds=early_stop, verbose=False)],
        )

        # predict 1/5-1/11 (expected rows: 7*192=1344)
        pred = model.predict(X_test, num_iteration=model.best_iteration)

        pred_df = test_week[GROUP_KEYS + [DATE_COL]].copy()
        pred_col = f"pred_{target}"
        pred_df[pred_col] = pred

        score = weekly_sum_rmse_groups(
            test_df=test_week,
            pred_df=pred_df,
            target_col=target,
            pred_col=pred_col,
            expected_groups=EXPECTED_GROUPS,
        )

        trial.set_user_attr("best_iteration", int(model.best_iteration))
        return score

    return objective

# =========================
# Run studies (120 trials each)
# =========================
sampler = optuna.samplers.TPESampler(seed=42)

# --- TRIP
study_trip = optuna.create_study(direction="minimize", sampler=sampler)
study_trip.optimize(
    make_objective_weeksum(Y_TRIP, PARAMS_TRIP_BASE, NUM_BOOST_ROUND_TRIP, EARLY_STOPPING_TRIP),
    n_trials=N_TRIALS,
    show_progress_bar=True
)
print("\n=== BEST (TRIP | weekly-sum RMSE across 192 groups) ===")
print("best value:", study_trip.best_value)
print("best params:", study_trip.best_params)
print("best_iteration:", study_trip.best_trial.user_attrs.get("best_iteration"))

# --- NONREPEAT
study_nonr = optuna.create_study(direction="minimize", sampler=sampler)
study_nonr.optimize(
    make_objective_weeksum(Y_NONR, PARAMS_NONR_BASE, NUM_BOOST_ROUND_NONR, EARLY_STOPPING_NONR),
    n_trials=N_TRIALS,
    show_progress_bar=True
)
print("\n=== BEST (NONREPEAT | weekly-sum RMSE across 192 groups) ===")
print("best value:", study_nonr.best_value)
print("best params:", study_nonr.best_params)
print("best_iteration:", study_nonr.best_trial.user_attrs.get("best_iteration"))

# =========================
# Merge best params back to full params dict
# =========================
def merge_best(base_params, best_params):
    merged = dict(base_params)
    merged.update(best_params)
    merged["objective"] = "regression"
    merged["metric"] = "rmse"
    merged["seed"] = 42
    merged["verbosity"] = -1
    return merged

BEST_PARAMS_TRIP = merge_best(PARAMS_TRIP_BASE, study_trip.best_params)
BEST_PARAMS_NONR = merge_best(PARAMS_NONR_BASE, study_nonr.best_params)

print("\nBEST_PARAMS_TRIP =", BEST_PARAMS_TRIP)
print("BEST_PARAMS_NONR =", BEST_PARAMS_NONR)


  0%|          | 0/120 [00:00<?, ?it/s]

  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.
[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.
  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.
[LightGBM] [Fatal] Reducing `min_data_in_leaf` wit


=== BEST (TRIP | weekly-sum RMSE across 192 groups) ===
best value: 0.03554975497131047
best params: {'learning_rate': 0.026617997246391686, 'num_leaves': 41, 'min_data_in_leaf': 280, 'max_depth': 4, 'feature_fraction': 0.85, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'lambda_l2': 0.7197684677490083, 'lambda_l1': 0.4627932237093651, 'min_gain_to_split': 0.0006605846471196759, 'extra_trees': True}
best_iteration: 1018


  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df


  0%|          | 0/120 [00:00<?, ?it/s]

  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.
[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.
  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
  n_groups = eval_df.group


=== BEST (NONREPEAT | weekly-sum RMSE across 192 groups) ===
best value: 0.04170637493294183
best params: {'learning_rate': 0.07523936698587146, 'num_leaves': 63, 'min_data_in_leaf': 120, 'max_depth': 9, 'feature_fraction': 0.9, 'bagging_fraction': 0.75, 'bagging_freq': 1, 'lambda_l2': 2.0059177129048513, 'lambda_l1': 1.0090268203431685, 'min_gain_to_split': 0.0016088836350458009, 'extra_trees': True}
best_iteration: 879

BEST_PARAMS_TRIP = {'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.026617997246391686, 'num_leaves': 41, 'min_data_in_leaf': 280, 'feature_fraction': 0.85, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'lambda_l2': 0.7197684677490083, 'seed': 42, 'verbosity': -1, 'max_depth': 4, 'lambda_l1': 0.4627932237093651, 'min_gain_to_split': 0.0006605846471196759, 'extra_trees': True}
BEST_PARAMS_NONR = {'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.07523936698587146, 'num_leaves': 63, 'min_data_in_leaf': 120, 'feature_fraction': 0.9, 'bagging_

  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df


# week RMSE

In [3]:
# =========================
# Evaluate BEST params on 2026-01-05 ~ 2026-01-11 (weekly-sum RMSE across 192 groups)  [RANDOM]
# =========================

def train_predict_with_best_params_random(
    target: str,
    best_params: dict,
    num_boost_round: int,
    early_stop: int,
):
    # rebuild train/valid split inside train_all
    tr_sub, va_sub = time_based_train_valid_split(train_all, valid_days=VALID_DAYS)
    tr_sub, va_sub = align_categories_for_train_test(tr_sub, va_sub, cat_cols)

    dtrain = lgb.Dataset(
        tr_sub[feature_cols], label=tr_sub[target].astype(float),
        categorical_feature=cat_cols, free_raw_data=False
    )
    dvalid = lgb.Dataset(
        va_sub[feature_cols], label=va_sub[target].astype(float),
        categorical_feature=cat_cols, free_raw_data=False
    )

    model = lgb.train(
        params=best_params,
        train_set=dtrain,
        num_boost_round=num_boost_round,
        valid_sets=[dvalid],
        callbacks=[lgb.early_stopping(stopping_rounds=early_stop, verbose=False)],
    )

    # predict on test_week (expected rows: 7 * 192 = 1344)
    pred = model.predict(test_week[feature_cols], num_iteration=model.best_iteration)

    pred_df = test_week[GROUP_KEYS + [DATE_COL]].copy()
    pred_col = f"pred_{target}"
    pred_df[pred_col] = pred

    # weekly-sum RMSE across 192 groups
    score = weekly_sum_rmse_groups(
        test_df=test_week,
        pred_df=pred_df,
        target_col=target,
        pred_col=pred_col,
        expected_groups=EXPECTED_GROUPS,   # should be 192
    )

    return score, int(model.best_iteration), pred_df


print("\n==================== BEST PARAMS EVALUATION (weekly-sum RMSE across 192 groups) [RANDOM] ====================")

# --- TRIP
rmse_trip_best, best_iter_trip, pred_trip_df = train_predict_with_best_params_random(
    target=Y_TRIP,
    best_params=BEST_PARAMS_TRIP,
    num_boost_round=NUM_BOOST_ROUND_TRIP,
    early_stop=EARLY_STOPPING_TRIP,
)
print("\n[TRIP] weekly-sum RMSE (192 groups):", f"{rmse_trip_best:.6f}")
print("[TRIP] best_iteration:", best_iter_trip)
print("[TRIP] BEST_PARAMS_TRIP:")
for k in sorted(BEST_PARAMS_TRIP.keys()):
    print(f"  {k}: {BEST_PARAMS_TRIP[k]}")

# --- NONREPEAT
rmse_nonr_best, best_iter_nonr, pred_nonr_df = train_predict_with_best_params_random(
    target=Y_NONR,
    best_params=BEST_PARAMS_NONR,
    num_boost_round=NUM_BOOST_ROUND_NONR,
    early_stop=EARLY_STOPPING_NONR,
)
print("\n[NONREPEAT] weekly-sum RMSE (192 groups):", f"{rmse_nonr_best:.6f}")
print("[NONREPEAT] best_iteration:", best_iter_nonr)
print("[NONREPEAT] BEST_PARAMS_NONR:")
for k in sorted(BEST_PARAMS_NONR.keys()):
    print(f"  {k}: {BEST_PARAMS_NONR[k]}")

print("\n=== SUMMARY [RANDOM] ===")
print(f"TRIP weekly-sum RMSE (192 groups): {rmse_trip_best:.6f}")
print(f"NONREPEAT weekly-sum RMSE (192 groups): {rmse_nonr_best:.6f}")





  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df



[TRIP] weekly-sum RMSE (192 groups): 0.035550
[TRIP] best_iteration: 1018
[TRIP] BEST_PARAMS_TRIP:
  bagging_fraction: 0.8
  bagging_freq: 1
  extra_trees: True
  feature_fraction: 0.85
  lambda_l1: 0.4627932237093651
  lambda_l2: 0.7197684677490083
  learning_rate: 0.026617997246391686
  max_depth: 4
  metric: rmse
  min_data_in_leaf: 280
  min_gain_to_split: 0.0006605846471196759
  num_leaves: 41
  objective: regression
  seed: 42
  verbosity: -1

[NONREPEAT] weekly-sum RMSE (192 groups): 0.041706
[NONREPEAT] best_iteration: 879
[NONREPEAT] BEST_PARAMS_NONR:
  bagging_fraction: 0.75
  bagging_freq: 1
  extra_trees: True
  feature_fraction: 0.9
  lambda_l1: 1.0090268203431685
  lambda_l2: 2.0059177129048513
  learning_rate: 0.07523936698587146
  max_depth: 9
  metric: rmse
  min_data_in_leaf: 120
  min_gain_to_split: 0.0016088836350458009
  num_leaves: 63
  objective: regression
  seed: 42
  verbosity: -1

=== SUMMARY [RANDOM] ===
TRIP weekly-sum RMSE (192 groups): 0.035550
NONREPEAT

  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
