In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# =========================
# Config
# =========================
DATA_PATH = "../final_data/data_260129_control.csv"
DATE_COL = "day"

Y_TRIP = "trip_cnt_per_user"
Y_NONR = "nonrepeat_cnt_per_user"
TARGETS = [Y_TRIP, Y_NONR]

TEST_START = pd.Timestamp("2025-12-15")
TEST_END   = pd.Timestamp("2025-12-28")  # inclusive

CAT_COLS = ["ops_type_merged", "city_group"]

BASE_FEATURES = [
    "ops_type_merged", "city_group",
    "is_weekend_holiday", "is_rainy",
    "trip_cnt_per_user_lag2w", "trip_cnt_per_user_lag3w",
    "nonrepeat_cnt_per_user_lag2w", "nonrepeat_cnt_per_user_lag3w",
    "match_rate_lag2w", "match_rate_lag3w",
]

# =========================
# Separate params for each model
# =========================
PARAMS_TRIP = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.04,
    num_leaves=31,
    min_data_in_leaf=200,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l2=1.0,
    seed=42,
    verbosity=-1,
)

PARAMS_NONR = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.05,
    num_leaves=31,
    min_data_in_leaf=50,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l2=1.0,
    seed=42,
    verbosity=-1,
)

# Training controls (also adjustable per model if you want)
NUM_BOOST_ROUND_TRIP = 5000
NUM_BOOST_ROUND_NONR = 5000
EARLY_STOPPING_TRIP = 200
EARLY_STOPPING_NONR = 200
VALID_DAYS = 28

# =========================
# Helpers
# =========================
def rmse(y_true, y_pred) -> float:
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    return float(np.sqrt(mean_squared_error(y_true[mask], y_pred[mask])))

def align_categories(train_df: pd.DataFrame, test_df: pd.DataFrame, cat_cols):
    for c in cat_cols:
        if c not in train_df.columns or c not in test_df.columns:
            continue
        train_vals = train_df[c].astype("object")
        test_vals  = test_df[c].astype("object")
        cats = pd.Index(train_vals.dropna().unique()).union(pd.Index(test_vals.dropna().unique()))
        train_df[c] = pd.Categorical(train_vals, categories=cats)
        test_df[c]  = pd.Categorical(test_vals,  categories=cats)
    return train_df, test_df

def get_feature_cols(df, base_features, cat_cols):
    feats = [c for c in base_features if c in df.columns]
    # 防呆：同日目標絕不進 X
    feats = [c for c in feats if c not in [Y_TRIP, Y_NONR]]
    cat = [c for c in cat_cols if c in feats]
    return feats, cat

def get_model_config(target: str):
    """Return (params, num_boost_round, early_stopping_rounds) for this target."""
    if target == Y_TRIP:
        return PARAMS_TRIP, NUM_BOOST_ROUND_TRIP, EARLY_STOPPING_TRIP
    elif target == Y_NONR:
        return PARAMS_NONR, NUM_BOOST_ROUND_NONR, EARLY_STOPPING_NONR
    else:
        raise ValueError(f"Unknown target: {target}")

# =========================
# Load & Split
# =========================
df = pd.read_csv(DATA_PATH)
df[DATE_COL] = pd.to_datetime(df[DATE_COL])
df = df.replace({"": np.nan})

train_df = df[df[DATE_COL] < TEST_START].copy()
test_df  = df[(df[DATE_COL] >= TEST_START) & (df[DATE_COL] <= TEST_END)].copy()

train_df, test_df = align_categories(train_df, test_df, CAT_COLS)

feature_cols, cat_cols = get_feature_cols(df, BASE_FEATURES, CAT_COLS)
print("Features used:", feature_cols)
print("Categorical:", cat_cols)

# =========================
# Train & Predict
# =========================
pred_out = test_df[[DATE_COL] + [c for c in CAT_COLS if c in df.columns]].copy()

def train_one_target(target: str):
    # 依你需求：trip 模型不放 nonrepeat 同日；nonrepeat 模型不放 trip 同日
    # 我們的 feature_cols 本身就不含這兩個同日欄位，因此天然滿足。
    assert Y_TRIP not in feature_cols and Y_NONR not in feature_cols, "Leakage: same-day targets in features!"

    tr = train_df.dropna(subset=[target]).copy()

    # time-based valid
    cutoff = tr[DATE_COL].max() - pd.Timedelta(days=VALID_DAYS)
    tr_sub = tr[tr[DATE_COL] < cutoff]
    va_sub = tr[tr[DATE_COL] >= cutoff]

    dtrain = lgb.Dataset(
        tr_sub[feature_cols], label=tr_sub[target],
        categorical_feature=cat_cols, free_raw_data=False
    )
    dvalid = lgb.Dataset(
        va_sub[feature_cols], label=va_sub[target],
        categorical_feature=cat_cols, free_raw_data=False
    )

    params, num_boost_round, early_stop = get_model_config(target)

    print(f"\n=== Training target: {target} ===")
    print("Params:", params)

    model = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=num_boost_round,
        valid_sets=[dvalid],
        callbacks=[lgb.early_stopping(stopping_rounds=early_stop), lgb.log_evaluation(200)],
    )

    pred = model.predict(test_df[feature_cols], num_iteration=model.best_iteration)
    return pred

pred_out[f"pred_{Y_TRIP}"] = train_one_target(Y_TRIP)
pred_out[f"pred_{Y_NONR}"] = train_one_target(Y_NONR)

# =========================
# (Optional) quick RMSE
# =========================
print("\n=== Row-level RMSE on test ===")
print(f"{Y_TRIP}: {rmse(test_df[Y_TRIP].values, pred_out[f'pred_{Y_TRIP}'].values):.6f}")
print(f"{Y_NONR}: {rmse(test_df[Y_NONR].values, pred_out[f'pred_{Y_NONR}'].values):.6f}")


Features used: ['ops_type_merged', 'city_group', 'is_weekend_holiday', 'is_rainy', 'trip_cnt_per_user_lag2w', 'trip_cnt_per_user_lag3w', 'nonrepeat_cnt_per_user_lag2w', 'nonrepeat_cnt_per_user_lag3w', 'match_rate_lag2w', 'match_rate_lag3w']
Categorical: ['ops_type_merged', 'city_group']

=== Training target: trip_cnt_per_user ===
Params: {'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.04, 'num_leaves': 31, 'min_data_in_leaf': 200, 'feature_fraction': 0.9, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'lambda_l2': 1.0, 'seed': 42, 'verbosity': -1}
Training until validation scores don't improve for 200 rounds
[200]	valid_0's rmse: 0.00634401
Early stopping, best iteration is:
[94]	valid_0's rmse: 0.00627914

=== Training target: nonrepeat_cnt_per_user ===
Params: {'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.05, 'num_leaves': 31, 'min_data_in_leaf': 50, 'feature_fraction': 0.9, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'lambda_l2': 1.0, 'seed': 42, 've

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.metrics import mean_squared_error

optuna.logging.set_verbosity(optuna.logging.WARNING)

# =========================
# Config
# =========================
DATA_PATH = "../final_data/data_260129_control.csv"
DATE_COL = "day"

Y_TRIP = "trip_cnt_per_user"
Y_NONR = "nonrepeat_cnt_per_user"

TEST_START = pd.Timestamp("2025-12-15")
TEST_END   = pd.Timestamp("2025-12-28")  # inclusive

# ✅ 兩個週窗（你要的拆法）
W1_START = pd.Timestamp("2025-12-15")
W1_END   = pd.Timestamp("2025-12-21")
W2_START = pd.Timestamp("2025-12-22")
W2_END   = pd.Timestamp("2025-12-28")

GROUP_KEYS = ["ops_type_merged", "city_group"]
CAT_COLS = ["ops_type_merged", "city_group"]

BASE_FEATURES = [
    "ops_type_merged", "city_group",
    "is_weekend_holiday", "is_rainy",
    "trip_cnt_per_user_lag2w", "trip_cnt_per_user_lag3w",
    "nonrepeat_cnt_per_user_lag2w", "nonrepeat_cnt_per_user_lag3w",
    "match_rate_lag2w", "match_rate_lag3w",
]

PARAMS_TRIP_BASE = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.04,
    num_leaves=31,
    min_data_in_leaf=200,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l2=1.0,
    seed=42,
    verbosity=-1,
)

PARAMS_NONR_BASE = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.05,
    num_leaves=31,
    min_data_in_leaf=50,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l2=1.0,
    seed=42,
    verbosity=-1,
)

NUM_BOOST_ROUND_TRIP = 5000
NUM_BOOST_ROUND_NONR = 5000
EARLY_STOPPING_TRIP = 200
EARLY_STOPPING_NONR = 200
VALID_DAYS = 28

N_TRIALS = 120  # per model


# =========================
# Helpers
# =========================
def rmse(y_true, y_pred) -> float:
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    return float(np.sqrt(mean_squared_error(y_true[mask], y_pred[mask])))

def add_week_window(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add week_window column: w1_1215_1221 or w2_1222_1228.
    Only keeps rows within the two windows.
    """
    out = df.copy()
    d = out[DATE_COL]

    cond1 = (d >= W1_START) & (d <= W1_END)
    cond2 = (d >= W2_START) & (d <= W2_END)

    out["week_window"] = pd.NA
    out.loc[cond1, "week_window"] = "w1_1215_1221"
    out.loc[cond2, "week_window"] = "w2_1222_1228"

    out = out[out["week_window"].notna()].copy()
    return out

def two_week_window_sum_rmse(test_df, pred_df, target_col, pred_col):
    """
    你要的評估：
    - 對每個 group (ops_type_merged, city_group)
      分別加總兩段：
        * 12/15-12/21
        * 12/22-12/28
    - 所以每個 group 有 2 筆 sum
    - 對所有 (group, week_window) 的 sum 誤差算 RMSE
    """
    eval_df = (
        test_df[GROUP_KEYS + [DATE_COL, target_col]]
        .merge(
            pred_df[GROUP_KEYS + [DATE_COL, pred_col]],
            on=GROUP_KEYS + [DATE_COL],
            how="left",
            validate="one_to_one",
        )
    )
    if eval_df[pred_col].isna().any():
        miss = eval_df[eval_df[pred_col].isna()][GROUP_KEYS + [DATE_COL]].head(10)
        raise ValueError(f"Missing predictions after merge for {pred_col}. Sample:\n{miss}")

    # add week_window and keep only the two windows
    eval_df = add_week_window(eval_df)

    # group count check (仍然是 24 組)
    n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
    if n_groups != 24:
        raise ValueError(f"Expected 24 groups, got {n_groups}")

    # aggregate to (group, week_window)
    g = (
        eval_df
        .groupby(GROUP_KEYS + ["week_window"], dropna=False)
        .agg(actual_sum=(target_col, "sum"), pred_sum=(pred_col, "sum"))
        .reset_index()
    )

    # should have 24 * 2 = 48 rows
    expected_rows = 24 * 2
    if len(g) != expected_rows:
        # 如果某些組別在某一週窗沒有資料，就會少於 48
        # 這裡直接讓 trial 爆掉，避免默默算錯
        raise ValueError(f"Expected {expected_rows} (24 groups * 2 windows) rows, got {len(g)}")

    return float(np.sqrt(np.mean((g["actual_sum"] - g["pred_sum"]) ** 2)))

def align_categories_for_train_test(train_df, test_df, cat_cols):
    for c in cat_cols:
        if c not in train_df.columns or c not in test_df.columns:
            continue
        tr = train_df[c].astype("object")
        te = test_df[c].astype("object")
        cats = pd.Index(tr.dropna().unique()).union(pd.Index(te.dropna().unique()))
        train_df[c] = pd.Categorical(tr, categories=cats)
        test_df[c]  = pd.Categorical(te, categories=cats)
    return train_df, test_df

def get_feature_cols(df):
    feats = [c for c in BASE_FEATURES if c in df.columns]
    feats = [c for c in feats if c not in [Y_TRIP, Y_NONR]]  # 防 leakage
    cat = [c for c in CAT_COLS if c in feats]
    return feats, cat

def time_based_train_valid_split(train_df, valid_days=28):
    cutoff = train_df[DATE_COL].max() - pd.Timedelta(days=valid_days)
    tr_sub = train_df[train_df[DATE_COL] < cutoff].copy()
    va_sub = train_df[train_df[DATE_COL] >= cutoff].copy()
    return tr_sub, va_sub

def build_params_from_trial_narrow(trial: optuna.Trial, base_params: dict, target: str):
    params = dict(base_params)

    base_lr = float(base_params.get("learning_rate", 0.05))
    lr_low  = max(0.01, base_lr * 0.6)
    lr_high = min(0.12, base_lr * 1.6)
    params["learning_rate"] = trial.suggest_float("learning_rate", lr_low, lr_high, log=True)

    params["num_leaves"] = trial.suggest_int("num_leaves", 15, 63, step=2)

    if target == Y_TRIP:
        params["min_data_in_leaf"] = trial.suggest_int("min_data_in_leaf", 80, 350, step=10)
    else:
        params["min_data_in_leaf"] = trial.suggest_int("min_data_in_leaf", 20, 120, step=5)

    params["max_depth"] = trial.suggest_categorical("max_depth", [-1, 4, 5, 6, 7, 8, 9, 10])

    params["feature_fraction"] = trial.suggest_float("feature_fraction", 0.75, 1.0, step=0.05)
    params["bagging_fraction"] = trial.suggest_float("bagging_fraction", 0.75, 1.0, step=0.05)
    params["bagging_freq"] = trial.suggest_categorical("bagging_freq", [0, 1, 2, 5])

    params["lambda_l2"] = trial.suggest_float("lambda_l2", 0.2, 6.0, log=True)
    params["lambda_l1"] = trial.suggest_float("lambda_l1", 0.0, 2.0)

    params["min_gain_to_split"] = trial.suggest_float("min_gain_to_split", 0.0, 0.05)

    params["extra_trees"] = trial.suggest_categorical("extra_trees", [False, True])

    params["objective"] = "regression"
    params["metric"] = "rmse"
    params["seed"] = 42
    params["verbosity"] = -1
    return params


# =========================
# Load data + split
# =========================
df = pd.read_csv(DATA_PATH)
df[DATE_COL] = pd.to_datetime(df[DATE_COL])
df = df.replace({"": np.nan})

train_all = df[df[DATE_COL] < TEST_START].copy()
test_period = df[(df[DATE_COL] >= TEST_START) & (df[DATE_COL] <= TEST_END)].copy()

feature_cols, cat_cols = get_feature_cols(df)

# Align categories for train/test
train_all, test_period = align_categories_for_train_test(train_all, test_period, cat_cols)

# internal valid split for early stopping
tr_sub, va_sub = time_based_train_valid_split(train_all, valid_days=VALID_DAYS)
tr_sub, va_sub = align_categories_for_train_test(tr_sub, va_sub, cat_cols)


# =========================
# Objective factory: minimize 2-week-window sum RMSE across 24 groups
# =========================
def make_objective_two_week_window_sum(target: str, base_params: dict, num_boost_round: int, early_stop: int):
    X_tr = tr_sub[feature_cols]
    y_tr = tr_sub[target].astype(float)
    X_va = va_sub[feature_cols]
    y_va = va_sub[target].astype(float)

    dtrain = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_cols, free_raw_data=False)
    dvalid = lgb.Dataset(X_va, label=y_va, categorical_feature=cat_cols, free_raw_data=False)

    X_test = test_period[feature_cols]

    def objective(trial: optuna.Trial) -> float:
        params = build_params_from_trial_narrow(trial, base_params, target)

        model = lgb.train(
            params=params,
            train_set=dtrain,
            num_boost_round=num_boost_round,
            valid_sets=[dvalid],
            callbacks=[lgb.early_stopping(stopping_rounds=early_stop, verbose=False)],
        )

        pred = model.predict(X_test, num_iteration=model.best_iteration)

        pred_df = test_period[GROUP_KEYS + [DATE_COL]].copy()
        pred_col = f"pred_{target}"
        pred_df[pred_col] = pred

        score = two_week_window_sum_rmse(
            test_df=test_period,
            pred_df=pred_df,
            target_col=target,
            pred_col=pred_col,
        )

        trial.set_user_attr("best_iteration", int(model.best_iteration))
        return score

    return objective


# =========================
# Run studies
# =========================
sampler = optuna.samplers.TPESampler(seed=42)

# --- TRIP
study_trip = optuna.create_study(direction="minimize", sampler=sampler)
study_trip.optimize(
    make_objective_two_week_window_sum(Y_TRIP, PARAMS_TRIP_BASE, NUM_BOOST_ROUND_TRIP, EARLY_STOPPING_TRIP),
    n_trials=N_TRIALS,
    show_progress_bar=True
)
print("\n=== BEST (TRIP | two-week-window sum RMSE across 24 groups) ===")
print("best value:", study_trip.best_value)
print("best params:", study_trip.best_params)
print("best_iteration:", study_trip.best_trial.user_attrs.get("best_iteration"))

# --- NONREPEAT
study_nonr = optuna.create_study(direction="minimize", sampler=sampler)
study_nonr.optimize(
    make_objective_two_week_window_sum(Y_NONR, PARAMS_NONR_BASE, NUM_BOOST_ROUND_NONR, EARLY_STOPPING_NONR),
    n_trials=N_TRIALS,
    show_progress_bar=True
)
print("\n=== BEST (NONREPEAT | two-week-window sum RMSE across 24 groups) ===")
print("best value:", study_nonr.best_value)
print("best params:", study_nonr.best_params)
print("best_iteration:", study_nonr.best_trial.user_attrs.get("best_iteration"))

def merge_best(base_params, best_params):
    merged = dict(base_params)
    merged.update(best_params)
    merged["objective"] = "regression"
    merged["metric"] = "rmse"
    merged["seed"] = 42
    merged["verbosity"] = -1
    return merged

BEST_PARAMS_TRIP = merge_best(PARAMS_TRIP_BASE, study_trip.best_params)
BEST_PARAMS_NONR = merge_best(PARAMS_NONR_BASE, study_nonr.best_params)

print("\nBEST_PARAMS_TRIP =", BEST_PARAMS_TRIP)
print("BEST_PARAMS_NONR =", BEST_PARAMS_NONR)


  0%|          | 0/120 [00:00<?, ?it/s]

  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.
[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.
  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.
[LightGBM] [Fatal] Reducing `min_data_in_leaf` wit


=== BEST (TRIP | two-week-window sum RMSE across 24 groups) ===
best value: 0.026979889967504847
best params: {'learning_rate': 0.05863472041870775, 'num_leaves': 57, 'min_data_in_leaf': 150, 'max_depth': 9, 'feature_fraction': 0.9, 'bagging_fraction': 0.95, 'bagging_freq': 1, 'lambda_l2': 5.977738741509028, 'lambda_l1': 0.07095869171859365, 'min_gain_to_split': 5.926169992780822e-06, 'extra_trees': True}
best_iteration: 88


  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df


  0%|          | 0/120 [00:00<?, ?it/s]

  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.
[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.
  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
  n_groups = eval_df.group


=== BEST (NONREPEAT | two-week-window sum RMSE across 24 groups) ===
best value: 0.03826737701978414
best params: {'learning_rate': 0.055280376387490913, 'num_leaves': 57, 'min_data_in_leaf': 75, 'max_depth': 10, 'feature_fraction': 0.95, 'bagging_fraction': 0.95, 'bagging_freq': 1, 'lambda_l2': 0.32730934126132305, 'lambda_l1': 0.029691351349368955, 'min_gain_to_split': 0.00022954203225914035, 'extra_trees': False}
best_iteration: 127

BEST_PARAMS_TRIP = {'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.05863472041870775, 'num_leaves': 57, 'min_data_in_leaf': 150, 'feature_fraction': 0.9, 'bagging_fraction': 0.95, 'bagging_freq': 1, 'lambda_l2': 5.977738741509028, 'seed': 42, 'verbosity': -1, 'max_depth': 9, 'lambda_l1': 0.07095869171859365, 'min_gain_to_split': 5.926169992780822e-06, 'extra_trees': True}
BEST_PARAMS_NONR = {'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.055280376387490913, 'num_leaves': 57, 'min_data_in_leaf': 75, 'feature_fraction': 

  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df


In [2]:
def train_predict_with_best_params_two_windows(
    target: str,
    best_params: dict,
    num_boost_round: int,
    early_stop: int,
):
    """
    Train on train_all (with internal time-based valid split),
    predict on test_period (12/15-12/28),
    evaluate using TWO-WINDOW sum RMSE across 24 groups (ops_type_merged, city_group).
    """
    # rebuild train/valid split inside train_all
    tr_sub, va_sub = time_based_train_valid_split(train_all, valid_days=VALID_DAYS)
    tr_sub, va_sub = align_categories_for_train_test(tr_sub, va_sub, cat_cols)

    dtrain = lgb.Dataset(
        tr_sub[feature_cols], label=tr_sub[target].astype(float),
        categorical_feature=cat_cols, free_raw_data=False
    )
    dvalid = lgb.Dataset(
        va_sub[feature_cols], label=va_sub[target].astype(float),
        categorical_feature=cat_cols, free_raw_data=False
    )

    model = lgb.train(
        params=best_params,
        train_set=dtrain,
        num_boost_round=num_boost_round,
        valid_sets=[dvalid],
        callbacks=[lgb.early_stopping(stopping_rounds=early_stop, verbose=False)],
    )

    # predict on test_period (12/15-12/28)
    pred = model.predict(test_period[feature_cols], num_iteration=model.best_iteration)

    pred_df = test_period[GROUP_KEYS + [DATE_COL]].copy()
    pred_col = f"pred_{target}"
    pred_df[pred_col] = pred

    # ✅ two-week-window RMSE (24 groups * 2 windows)
    score = two_week_window_sum_rmse(
        test_df=test_period,
        pred_df=pred_df,
        target_col=target,
        pred_col=pred_col,
    )

    return score, int(model.best_iteration), pred_df


print("\n==================== BEST PARAMS EVALUATION (TWO WINDOWS) ====================")

# --- TRIP
rmse_trip_best, best_iter_trip, pred_trip_df = train_predict_with_best_params_two_windows(
    target=Y_TRIP,
    best_params=BEST_PARAMS_TRIP,
    num_boost_round=NUM_BOOST_ROUND_TRIP,
    early_stop=EARLY_STOPPING_TRIP,
)
print("\n[TRIP] two-window-sum RMSE:", f"{rmse_trip_best:.6f}")
print("[TRIP] best_iteration:", best_iter_trip)
print("[TRIP] BEST_PARAMS_TRIP:")
for k in sorted(BEST_PARAMS_TRIP.keys()):
    print(f"  {k}: {BEST_PARAMS_TRIP[k]}")

# --- NONREPEAT
rmse_nonr_best, best_iter_nonr, pred_nonr_df = train_predict_with_best_params_two_windows(
    target=Y_NONR,
    best_params=BEST_PARAMS_NONR,
    num_boost_round=NUM_BOOST_ROUND_NONR,
    early_stop=EARLY_STOPPING_NONR,
)
print("\n[NONREPEAT] two-window-sum RMSE:", f"{rmse_nonr_best:.6f}")
print("[NONREPEAT] best_iteration:", best_iter_nonr)
print("[NONREPEAT] BEST_PARAMS_NONR:")
for k in sorted(BEST_PARAMS_NONR.keys()):
    print(f"  {k}: {BEST_PARAMS_NONR[k]}")

# summary
print("\n=== SUMMARY (TWO WINDOWS) ===")
print(f"TRIP two-window-sum RMSE: {rmse_trip_best:.6f}")
print(f"NONREPEAT two-window-sum RMSE: {rmse_nonr_best:.6f}")





  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df



[TRIP] two-window-sum RMSE: 0.026980
[TRIP] best_iteration: 88
[TRIP] BEST_PARAMS_TRIP:
  bagging_fraction: 0.95
  bagging_freq: 1
  extra_trees: True
  feature_fraction: 0.9
  lambda_l1: 0.07095869171859365
  lambda_l2: 5.977738741509028
  learning_rate: 0.05863472041870775
  max_depth: 9
  metric: rmse
  min_data_in_leaf: 150
  min_gain_to_split: 5.926169992780822e-06
  num_leaves: 57
  objective: regression
  seed: 42
  verbosity: -1

[NONREPEAT] two-window-sum RMSE: 0.038267
[NONREPEAT] best_iteration: 127
[NONREPEAT] BEST_PARAMS_NONR:
  bagging_fraction: 0.95
  bagging_freq: 1
  extra_trees: False
  feature_fraction: 0.95
  lambda_l1: 0.029691351349368955
  lambda_l2: 0.32730934126132305
  learning_rate: 0.055280376387490913
  max_depth: 10
  metric: rmse
  min_data_in_leaf: 75
  min_gain_to_split: 0.00022954203225914035
  num_leaves: 57
  objective: regression
  seed: 42
  verbosity: -1

=== SUMMARY (TWO WINDOWS) ===
TRIP two-window-sum RMSE: 0.026980
NONREPEAT two-window-sum RM

  n_groups = eval_df.groupby(GROUP_KEYS, dropna=False).ngroups
  eval_df
