In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

# --------------------------
# Config
# --------------------------
DATA_PATH = "../../final_data/data_260125_random.csv"
DATE_COL = "experiment_date"

TARGET_WEEKS = [pd.Timestamp("2025-12-15"), pd.Timestamp("2025-12-22")]

Y_COLS = ["nonrepeat_cnt_per_user", "trip_cnt_per_user"]

CAT_COLS = ["treatment", "source", "ops_type_merged", "city_group"]

NUM_FEATURES = [
    "avg_rainy_day",
    "avg_rainy_weekday",
    "avg_rainy_weekend",
    "mgm_day",
    "nonrepeat_cnt_per_user_lag2",
    "nonrepeat_cnt_per_user_roll4",
    "trip_cnt_per_user_lag2",
    "trip_cnt_per_user_roll4",
    "weekday_nonrepeat_cnt_per_user_lag2",
    "weekday_nonrepeat_cnt_per_user_roll4",
    "weekday_trip_cnt_per_user_lag2",
    "weekday_trip_cnt_per_user_roll4",
    "weekday_match_rate_lag2",
    "weekday_match_rate_roll4",
    "weekend_nonrepeat_cnt_per_user_lag2",
    "weekend_nonrepeat_cnt_per_user_roll4",
    "weekend_trip_cnt_per_user_lag2",
    "weekend_trip_cnt_per_user_roll4",
    "weekend_match_rate_lag2",
    "weekend_match_rate_roll4",
    "has_national_holiday",
    "coupon_BD_per_user_log1p_lag2",
    "coupon_BD_per_user_log1p_roll4",
    "coupon_CDP_per_user_log1p_lag2",
    "coupon_CDP_per_user_log1p_roll4",
    "coupon_folk_per_user_log1p_lag2",
    "coupon_folk_per_user_log1p_roll4",
    "coupon_growth_other_per_user_log1p_lag2",
    "coupon_growth_other_per_user_log1p_roll4",
    "coupon_MGM_per_user_log1p_lag2",
    "coupon_MGM_per_user_log1p_roll4",
    "coupon_MKT_per_user_log1p_lag2",
    "coupon_MKT_per_user_log1p_roll4",
    "coupon_register_per_user_log1p_lag2",
    "coupon_register_per_user_log1p_roll4",
    "coupon_daily_per_user_log1p_lag2",
    "coupon_daily_per_user_log1p_roll4",
]

# --------------------------
# Load
# --------------------------
df = pd.read_csv(DATA_PATH)
df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")
df = df[df[DATE_COL].notna()].copy()

# 只保留到最大 target week 為止（後面不會用到）
df = df[df[DATE_COL] <= max(TARGET_WEEKS)].copy()

# categorical cast
for c in CAT_COLS:
    if c in df.columns:
        df[c] = df[c].astype("category")

# feature availability check
missing_num = [c for c in NUM_FEATURES if c not in df.columns]
missing_cat = [c for c in CAT_COLS if c not in df.columns]
if missing_cat:
    print("[WARN] Missing categorical cols (will be dropped):", missing_cat)
if missing_num:
    print("[WARN] Missing numeric features (will be dropped):", missing_num)

X_COLS = [c for c in CAT_COLS if c in df.columns] + [c for c in NUM_FEATURES if c in df.columns]
CAT_IN_X = [c for c in CAT_COLS if c in df.columns]

# --------------------------
# Helpers
# --------------------------
def rmse(y_true, y_pred) -> float:
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

def mae(y_true, y_pred) -> float:
    return float(mean_absolute_error(y_true, y_pred))

def fit_predict_one_week(df_all: pd.DataFrame, target_week: pd.Timestamp, y_col: str):
    # 嚴格遵守：只能拿 target_week 之前的資料訓練
    train_df = df_all[df_all[DATE_COL] < target_week].copy()
    test_df  = df_all[df_all[DATE_COL] == target_week].copy()

    # drop rows with missing target
    train_df = train_df[train_df[y_col].notna()].copy()
    test_df  = test_df[test_df[y_col].notna()].copy()

    if len(train_df) == 0 or len(test_df) == 0:
        return None, None, len(train_df), len(test_df)

    X_train = train_df[X_COLS]
    y_train = train_df[y_col].astype(float)

    X_test = test_df[X_COLS]
    y_test = test_df[y_col].astype(float)

    model = lgb.LGBMRegressor(
        objective="regression",
        n_estimators=2000,
        learning_rate=0.02,
        num_leaves=63,
        min_child_samples=300,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=1.0,
        reg_lambda=5.0,
        random_state=42,
        n_jobs=-1,
        verbose=-1,
    )

    # categorical_feature: 用欄名即可
    model.fit(X_train, y_train, categorical_feature=[c for c in CAT_IN_X if c in X_train.columns])

    y_pred = model.predict(X_test)
    return y_test.values, y_pred, len(train_df), len(test_df)

# --------------------------
# Run + Print metrics
# --------------------------
for y_col in Y_COLS:
    print(f"\n==================== {y_col} ====================")
    all_y_true = []
    all_y_pred = []

    for tw in TARGET_WEEKS:
        y_true, y_pred, n_tr, n_te = fit_predict_one_week(df, tw, y_col)
        if y_true is None:
            print(f"[{tw.date()}] SKIP (train_rows={n_tr:,}, test_rows={n_te:,})")
            continue

        r = rmse(y_true, y_pred)
        m = mae(y_true, y_pred)
        print(f"[{tw.date()}] train_rows={n_tr:,} test_rows={n_te:,}  RMSE={r:.6f}  MAE={m:.6f}")

        all_y_true.append(y_true)
        all_y_pred.append(y_pred)

    if all_y_true:
        all_y_true = np.concatenate(all_y_true)
        all_y_pred = np.concatenate(all_y_pred)
        print(f"[Overall 2 weeks] test_rows={len(all_y_true):,}  RMSE={rmse(all_y_true, all_y_pred):.6f}  MAE={mae(all_y_true, all_y_pred):.6f}")



[2025-12-15] train_rows=1,536 test_rows=192  RMSE=0.049356  MAE=0.034668
[2025-12-22] train_rows=1,728 test_rows=192  RMSE=0.071611  MAE=0.050776
[Overall 2 weeks] test_rows=384  RMSE=0.061499  MAE=0.042722

[2025-12-15] train_rows=1,536 test_rows=192  RMSE=0.040214  MAE=0.026897
[2025-12-22] train_rows=1,728 test_rows=192  RMSE=0.053683  MAE=0.035547
[Overall 2 weeks] test_rows=384  RMSE=0.047429  MAE=0.031222


In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

# --------------------------
# Config
# --------------------------
DATA_PATH = "../../final_data/data_260125_random.csv"
DATE_COL = "experiment_date"
TARGET_WEEKS = [pd.Timestamp("2025-12-15"), pd.Timestamp("2025-12-22")]
Y_COL = "trip_cnt_per_user"

CAT_COLS = ["ops_type_merged", "city_group"]
NUM_FEATURES = [
    "avg_rainy_day",
    "avg_rainy_weekday",
    "avg_rainy_weekend",
    "mgm_day",
    "nonrepeat_cnt_per_user_lag2",
    "nonrepeat_cnt_per_user_roll4",
    "trip_cnt_per_user_lag2",
    "trip_cnt_per_user_roll4",
    "weekday_nonrepeat_cnt_per_user_lag2",
    "weekday_nonrepeat_cnt_per_user_roll4",
    "weekday_trip_cnt_per_user_lag2",
    "weekday_trip_cnt_per_user_roll4",
    "weekday_match_rate_lag2",
    "weekday_match_rate_roll4",
    "weekend_nonrepeat_cnt_per_user_lag2",
    "weekend_nonrepeat_cnt_per_user_roll4",
    "weekend_trip_cnt_per_user_lag2",
    "weekend_trip_cnt_per_user_roll4",
    "weekend_match_rate_lag2",
    "weekend_match_rate_roll4",
    "has_national_holiday",
    "coupon_BD_per_user_log1p_lag2",
    "coupon_BD_per_user_log1p_roll4",
    "coupon_CDP_per_user_log1p_lag2",
    "coupon_CDP_per_user_log1p_roll4",
    "coupon_folk_per_user_log1p_lag2",
    "coupon_folk_per_user_log1p_roll4",
    "coupon_growth_other_per_user_log1p_lag2",
    "coupon_growth_other_per_user_log1p_roll4",
    "coupon_MGM_per_user_log1p_lag2",
    "coupon_MGM_per_user_log1p_roll4",
    "coupon_MKT_per_user_log1p_lag2",
    "coupon_MKT_per_user_log1p_roll4",
    "coupon_register_per_user_log1p_lag2",
    "coupon_register_per_user_log1p_roll4",
    "coupon_daily_per_user_log1p_lag2",
    "coupon_daily_per_user_log1p_roll4",
    "face_value",
    "face_value_num"
]

# --------------------------
# 指定要評估的組別清單（兩週；只考慮 1 張；排除 25）
# key: (experiment_date, city_group, ops_type_merged, treatment)
# --------------------------
GROUP_ROWS = []

def add_rows(week_str, city, sub_group, face_values):
    for fv in face_values:
        if fv == 25:
            continue  # 排除 25
        GROUP_ROWS.append({
            "experiment_date": pd.Timestamp(week_str),
            "city_group": city,
            "ops_type_merged": sub_group,
            "treatment": f"{int(fv)}元1張",
        })

for w in ["2025-12-15", "2025-12-22"]:
    for city in ["中區", "北區", "南區"]:
        add_rows(w, city, "14天在其他尖峰預估車資", [15, 20])
        add_rows(w, city, "14天在晚尖峰預估車資", [15, 20])
        add_rows(w, city, "90天在尖峰預估車資", [20, 30])
        add_rows(w, city, "喚回-其他", [20, 30])
        add_rows(w, city, "喚回-高優惠敏感", [20, 30])
        add_rows(w, city, "既有regular鞏固", [15, 20, 25])  # 25 自動排除
        add_rows(w, city, "養成Regular-其他", [20, 30])
        add_rows(w, city, "養成Regular-高優惠敏感", [20, 30])

groups_df = pd.DataFrame(GROUP_ROWS).drop_duplicates()

# --------------------------
# Load
# --------------------------
df = pd.read_csv(DATA_PATH)
df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")
df = df[df[DATE_COL].notna()].copy()

# 只保留到最大 target week（後面不會用到）
df = df[df[DATE_COL] <= max(TARGET_WEEKS)].copy()

# categorical cast（只在這裡做一次）
for c in CAT_COLS:
    if c in df.columns:
        df[c] = df[c].astype("category")

# feature availability check
missing_num = [c for c in NUM_FEATURES if c not in df.columns]
missing_cat = [c for c in CAT_COLS if c not in df.columns]
if missing_cat:
    print("[WARN] Missing categorical cols (will be dropped):", missing_cat)
if missing_num:
    print("[WARN] Missing numeric features (will be dropped):", missing_num)

X_COLS = [c for c in CAT_COLS if c in df.columns] + [c for c in NUM_FEATURES if c in df.columns]
CAT_IN_X = [c for c in CAT_COLS if c in df.columns]

# --------------------------
# Helpers
# --------------------------
def rmse(y_true, y_pred) -> float:
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

def mae(y_true, y_pred) -> float:
    return float(mean_absolute_error(y_true, y_pred))

def align_categories(train_df: pd.DataFrame, test_df: pd.DataFrame, cat_cols):
    """
    保證 train/test 的 categorical dtype & categories 完全一致：
    test categories 強制沿用 train 的 categories（同集合、同順序）。
    """
    for c in cat_cols:
        if c in train_df.columns and c in test_df.columns:
            train_df[c] = train_df[c].astype("category")
            test_df[c] = test_df[c].astype("category")
            test_df[c] = test_df[c].cat.set_categories(train_df[c].cat.categories)
    return train_df, test_df

def filter_to_groups(test_df: pd.DataFrame, target_week: pd.Timestamp) -> pd.DataFrame:
    """
    只保留你表格中有出現的組別 (week+city+ops_type+treatment)。
    重要：不轉 dtype、不 merge，避免 categorical mismatch。
    """
    key_cols = ["experiment_date", "city_group", "ops_type_merged", "treatment"]
    g = groups_df[groups_df["experiment_date"] == target_week][key_cols].drop_duplicates()

    g_keys = set(map(tuple, g.itertuples(index=False, name=None)))
    test_keys = list(map(tuple, test_df[key_cols].itertuples(index=False, name=None)))
    mask = [k in g_keys for k in test_keys]
    return test_df.loc[mask].copy()

def fit_predict_one_week_on_groups(df_all: pd.DataFrame, target_week: pd.Timestamp):
    # 只能拿 target_week 之前的資料訓練
    train_df = df_all[df_all[DATE_COL] < target_week].copy()
    test_df  = df_all[df_all[DATE_COL] == target_week].copy()

    # test 只挑指定組別（不改 dtype）
    test_df = filter_to_groups(test_df, target_week)

    # drop rows with missing target
    train_df = train_df[train_df[Y_COL].notna()].copy()
    test_df  = test_df[test_df[Y_COL].notna()].copy()

    if len(train_df) == 0 or len(test_df) == 0:
        return None, None, len(train_df), len(test_df)

    # ✅ 對齊 categories，確保 predict 不會 categorical mismatch
    train_df, test_df = align_categories(train_df, test_df, CAT_IN_X)

    X_train = train_df[X_COLS]
    y_train = train_df[Y_COL].astype(float)

    X_test = test_df[X_COLS]
    y_test = test_df[Y_COL].astype(float)

    model = lgb.LGBMRegressor(
        objective="regression",
        n_estimators=2200,
        learning_rate=0.02,
        num_leaves=63,
        min_child_samples=150,
        subsample=0.7,
        colsample_bytree=1.0,
        reg_alpha=3.5,
        reg_lambda=2.0,
        random_state=42,
        n_jobs=-1,
        verbose=-1,
    )

    model.fit(
        X_train, y_train,
        categorical_feature=[c for c in CAT_IN_X if c in X_train.columns]
    )

    y_pred = model.predict(X_test)
    return y_test.values, y_pred, len(train_df), len(test_df)

# --------------------------
# Run + Print metrics (只對「指定組別」算 trip_cnt_per_user)
# --------------------------
print(f"\n==================== {Y_COL} (ONLY specified groups) ====================")
all_y_true, all_y_pred = [], []

for tw in TARGET_WEEKS:
    y_true, y_pred, n_tr, n_te = fit_predict_one_week_on_groups(df, tw)
    if y_true is None:
        print(f"[{tw.date()}] SKIP (train_rows={n_tr:,}, test_rows={n_te:,})")
        continue
    print(f"[{tw.date()}] train_rows={n_tr:,} test_rows={n_te:,} RMSE={rmse(y_true, y_pred):.6f} MAE={mae(y_true, y_pred):.6f}")
    all_y_true.append(y_true)
    all_y_pred.append(y_pred)

if all_y_true:
    all_y_true = np.concatenate(all_y_true)
    all_y_pred = np.concatenate(all_y_pred)
    print(f"[Overall 2 weeks] test_rows={len(all_y_true):,} RMSE={rmse(all_y_true, all_y_pred):.6f} MAE={mae(all_y_true, all_y_pred):.6f}")



[2025-12-15] train_rows=1,536 test_rows=48 RMSE=0.029264 MAE=0.022822
[2025-12-22] train_rows=1,728 test_rows=48 RMSE=0.068643 MAE=0.041979
[Overall 2 weeks] test_rows=96 RMSE=0.052765 MAE=0.032401


In [35]:
# ==========================
# Optuna tuning (around your current params)
# - Objective: avg RMSE over 2025-12-15 & 2025-12-22
# - Evaluation: ONLY specified groups (your groups_df)
# - Training constraint: ONLY use data strictly before each target week
# ==========================

import optuna

# ---- (optional) make optuna quieter
optuna.logging.set_verbosity(optuna.logging.WARNING)

BASE_PARAMS = dict(
    objective="regression",
    n_estimators=2000,
    learning_rate=0.02,
    num_leaves=63,
    min_child_samples=300,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,
    reg_lambda=5.0,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)

def fit_predict_one_week_on_groups_with_params(df_all: pd.DataFrame, target_week: pd.Timestamp, model_params: dict):
    train_df = df_all[df_all[DATE_COL] < target_week].copy()
    test_df  = df_all[df_all[DATE_COL] == target_week].copy()

    test_df = filter_to_groups(test_df, target_week)

    train_df = train_df[train_df[Y_COL].notna()].copy()
    test_df  = test_df[test_df[Y_COL].notna()].copy()

    if len(train_df) == 0 or len(test_df) == 0:
        return None, None, len(train_df), len(test_df)

    # align categorical categories
    train_df, test_df = align_categories(train_df, test_df, CAT_IN_X)

    X_train = train_df[X_COLS]
    y_train = train_df[Y_COL].astype(float)

    X_test = test_df[X_COLS]
    y_test = test_df[Y_COL].astype(float)

    model = lgb.LGBMRegressor(**model_params)

    model.fit(
        X_train, y_train,
        categorical_feature=[c for c in CAT_IN_X if c in X_train.columns],
    )

    y_pred = model.predict(X_test)
    return y_test.values, y_pred, len(train_df), len(test_df)

def objective(trial: optuna.Trial) -> float:
    # --- search space (centered around your current settings)
    params = dict(BASE_PARAMS)

    # keep learning_rate & n_estimators in a sensible tradeoff range
    params["learning_rate"] = trial.suggest_float("learning_rate", 0.012, 0.035, log=True)
    params["n_estimators"]  = trial.suggest_int("n_estimators", 1200, 3200, step=200)

    params["num_leaves"] = trial.suggest_int("num_leaves", 31, 255, step=8)

    # min_child_samples around 300 but allow +- a lot
    params["min_child_samples"] = trial.suggest_int("min_child_samples", 50, 600, step=25)

    params["subsample"] = trial.suggest_float("subsample", 0.6, 1.0, step=0.05)
    params["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.6, 1.0, step=0.05)

    # regularization around (1,5)
    params["reg_alpha"] = trial.suggest_float("reg_alpha", 0.0, 5.0)
    params["reg_lambda"] = trial.suggest_float("reg_lambda", 0.0, 15.0)

    # ---- evaluate on your 2 target weeks (ONLY specified groups)
    rmses = []
    for tw in TARGET_WEEKS:
        y_true, y_pred, n_tr, n_te = fit_predict_one_week_on_groups_with_params(df, tw, params)
        if y_true is None or len(y_true) == 0:
            # no test rows -> heavily penalize (shouldn't happen if groups exist)
            return 1e9
        rmses.append(rmse(y_true, y_pred))

        # report intermediate value for pruning
        trial.report(float(np.mean(rmses)), step=len(rmses))
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(rmses))

# ---- run study
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=0)

study = optuna.create_study(direction="minimize", sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials=60, show_progress_bar=True)

print("\n===== Optuna Best Result =====")
print("Best RMSE (avg over 2 weeks):", study.best_value)
print("Best params:", study.best_params)

# ---- evaluate best params with RMSE + MAE (and print per-week + overall)
best_params = dict(BASE_PARAMS)
best_params.update(study.best_params)

print(f"\n==================== {Y_COL} (BEST params, ONLY specified groups) ====================")
all_y_true, all_y_pred = [], []

for tw in TARGET_WEEKS:
    y_true, y_pred, n_tr, n_te = fit_predict_one_week_on_groups_with_params(df, tw, best_params)
    if y_true is None:
        print(f"[{tw.date()}] SKIP (train_rows={n_tr:,}, test_rows={n_te:,})")
        continue
    print(f"[{tw.date()}] train_rows={n_tr:,} test_rows={n_te:,} RMSE={rmse(y_true, y_pred):.6f} MAE={mae(y_true, y_pred):.6f}")
    all_y_true.append(y_true)
    all_y_pred.append(y_pred)

if all_y_true:
    all_y_true = np.concatenate(all_y_true)
    all_y_pred = np.concatenate(all_y_pred)
    print(f"[Overall 2 weeks] test_rows={len(all_y_true):,} RMSE={rmse(all_y_true, all_y_pred):.6f} MAE={mae(all_y_true, all_y_pred):.6f}")


  0%|          | 0/60 [00:00<?, ?it/s]


===== Optuna Best Result =====
Best RMSE (avg over 2 weeks): 0.04877249529813014
Best params: {'learning_rate': 0.014672871245679103, 'n_estimators': 2200, 'num_leaves': 63, 'min_child_samples': 125, 'subsample': 0.65, 'colsample_bytree': 1.0, 'reg_alpha': 3.486636659858854, 'reg_lambda': 2.0235598656898537}

[2025-12-15] train_rows=1,536 test_rows=48 RMSE=0.029175 MAE=0.022394
[2025-12-22] train_rows=1,728 test_rows=48 RMSE=0.068370 MAE=0.042161
[Overall 2 weeks] test_rows=96 RMSE=0.052562 MAE=0.032278
