In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# =========================
# Config
# =========================
# NOTE: 如果你在這個環境跑，檔案也可能在 /mnt/data/data_260129_random.csv
DATA_PATH = "../final_data/data_260129_random.csv"
DATE_COL = "day"  # 你的資料欄位名

Y_TRIP = "trip_cnt_per_user"
Y_NONR = "nonrepeat_cnt_per_user"
TARGETS = [Y_TRIP, Y_NONR]

TEST_START = pd.Timestamp("2025-12-15")
TEST_END   = pd.Timestamp("2025-12-28")  # inclusive

# Random 組有 treatment，也把它當作類別特徵
CAT_COLS = ["treatment", "ops_type_merged", "city_group"]

# ✅ 白名單：這些欄位才會放進模型
BASE_FEATURES = [
    "treatment", "ops_type_merged", "city_group",
    "is_weekend_holiday", "is_rainy",
    "trip_cnt_per_user_lag14", "trip_cnt_per_user_lag21",
    "nonrepeat_cnt_per_user_lag14", "nonrepeat_cnt_per_user_lag21",
    "match_rate_lag14", "match_rate_lag21",
]

# =========================
# Separate params for each model
# =========================
PARAMS_TRIP = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.05,
    num_leaves=63,
    min_data_in_leaf=200,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l2=1.0,
    seed=42,
    verbosity=-1,
)

PARAMS_NONR = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.05,
    num_leaves=31,
    min_data_in_leaf=50,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l2=1.0,
    seed=42,
    verbosity=-1,
)

# Training controls
NUM_BOOST_ROUND_TRIP = 5000
NUM_BOOST_ROUND_NONR = 5000
EARLY_STOPPING_TRIP = 200
EARLY_STOPPING_NONR = 200
VALID_DAYS = 28

# =========================
# Helpers
# =========================
def rmse(y_true, y_pred) -> float:
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    return float(np.sqrt(mean_squared_error(y_true[mask], y_pred[mask])))

def align_categories(train_df: pd.DataFrame, test_df: pd.DataFrame, cat_cols):
    # 確保 test 的類別集合不會造成 unseen category 的問題（用 union 後轉 category）
    for c in cat_cols:
        if c not in train_df.columns or c not in test_df.columns:
            continue
        train_vals = train_df[c].astype("object")
        test_vals  = test_df[c].astype("object")
        cats = pd.Index(train_vals.dropna().unique()).union(pd.Index(test_vals.dropna().unique()))
        train_df[c] = pd.Categorical(train_vals, categories=cats)
        test_df[c]  = pd.Categorical(test_vals,  categories=cats)
    return train_df, test_df

def get_feature_cols(df, base_features, cat_cols):
    feats = [c for c in base_features if c in df.columns]
    # 防呆：同日目標絕不進 X（只允許 lag 特徵）
    feats = [c for c in feats if c not in [Y_TRIP, Y_NONR]]
    cat = [c for c in cat_cols if c in feats]
    return feats, cat

def get_model_config(target: str):
    if target == Y_TRIP:
        return PARAMS_TRIP, NUM_BOOST_ROUND_TRIP, EARLY_STOPPING_TRIP
    elif target == Y_NONR:
        return PARAMS_NONR, NUM_BOOST_ROUND_NONR, EARLY_STOPPING_NONR
    else:
        raise ValueError(f"Unknown target: {target}")

# =========================
# 1) Build whitelist grid (你的篩選條件)
# =========================
GROUP_ROWS = []

def add_rows(week_str, city, sub_group, face_values):
    for fv in face_values:
        if fv == 25:
            continue  # 排除 25
        GROUP_ROWS.append({
            DATE_COL: pd.Timestamp(week_str),     # 這裡用 DATE_COL 對齊你的資料
            "city_group": city,
            "ops_type_merged": sub_group,
            "treatment": f"{int(fv)}元1張",
        })

for w in ["2025-12-15", "2025-12-22"]:
    for city in ["中區", "北區", "南區"]:
        add_rows(w, city, "14天在其他尖峰預估車資", [15, 20])
        add_rows(w, city, "14天在晚尖峰預估車資", [15, 20])
        add_rows(w, city, "90天在尖峰預估車資", [20, 30])
        add_rows(w, city, "喚回-其他", [20, 30])
        add_rows(w, city, "喚回-高優惠敏感", [20, 30])
        add_rows(w, city, "既有regular鞏固", [15, 20, 25])  # 25 自動排除
        add_rows(w, city, "養成Regular-其他", [20, 30])
        add_rows(w, city, "養成Regular-高優惠敏感", [20, 30])

group_filter = pd.DataFrame(GROUP_ROWS).drop_duplicates()

# =========================
# 2) Load
# =========================
df = pd.read_csv(DATA_PATH)
df[DATE_COL] = pd.to_datetime(df[DATE_COL])
df = df.replace({"": np.nan})

# =========================
# 3) Train/Test split
# =========================
train_df = df[df[DATE_COL] < TEST_START].copy()

# 先取出 test 區間的原始資料（用來計 RMSE 時可對照實際值）
raw_test_df = df[(df[DATE_COL] >= TEST_START) & (df[DATE_COL] <= TEST_END)].copy()

# =========================
# 4) 只保留「白名單組合」要預測的列
#    - 用 group_filter 建「你要的完整 grid」
#    - 再 left-join 回 df 拿特徵/真值
#    - 即使 df 缺某些列，也會保留（特徵缺就會是 NaN；LGBM 可吃 NaN）
# =========================
test_df = group_filter.merge(
    raw_test_df,
    on=[DATE_COL, "city_group", "ops_type_merged", "treatment"],
    how="left",
    validate="one_to_one",
)

# 若你想知道哪些組合在原始資料不存在，可用這個（不影響模型）
# missing_mask = test_df[[Y_TRIP, Y_NONR]].isna().all(axis=1)
# print("Missing rows in raw_test_df (kept for prediction):", missing_mask.sum())

# =========================
# 5) Category alignment
# =========================
train_df, test_df = align_categories(train_df, test_df, CAT_COLS)

# Features used
feature_cols, cat_cols = get_feature_cols(df, BASE_FEATURES, CAT_COLS)
print("Features used:", feature_cols)
print("Categorical:", cat_cols)

# =========================
# 6) Train & Predict
# =========================
pred_out = test_df[[DATE_COL] + [c for c in CAT_COLS if c in test_df.columns]].copy()

def train_one_target(target: str):
    # 確保同日目標沒有被放進特徵
    assert Y_TRIP not in feature_cols and Y_NONR not in feature_cols, "Leakage: same-day targets in features!"

    tr = train_df.dropna(subset=[target]).copy()

    # time-based valid（最後 VALID_DAYS 當 valid）
    cutoff = tr[DATE_COL].max() - pd.Timedelta(days=VALID_DAYS)
    tr_sub = tr[tr[DATE_COL] < cutoff]
    va_sub = tr[tr[DATE_COL] >= cutoff]

    dtrain = lgb.Dataset(
        tr_sub[feature_cols], label=tr_sub[target],
        categorical_feature=cat_cols, free_raw_data=False
    )
    dvalid = lgb.Dataset(
        va_sub[feature_cols], label=va_sub[target],
        categorical_feature=cat_cols, free_raw_data=False
    )

    params, num_boost_round, early_stop = get_model_config(target)

    print(f"\n=== Training target: {target} ===")
    print("Params:", params)

    model = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=num_boost_round,
        valid_sets=[dvalid],
        callbacks=[
            lgb.early_stopping(stopping_rounds=early_stop),
            lgb.log_evaluation(200),
        ],
    )

    pred = model.predict(test_df[feature_cols], num_iteration=model.best_iteration)
    return pred

pred_out[f"pred_{Y_TRIP}"] = train_one_target(Y_TRIP)
pred_out[f"pred_{Y_NONR}"] = train_one_target(Y_NONR)

# =========================
# 7) Quick RMSE (只針對「白名單 test_df」且有真值的列)
# =========================
print("\n=== Row-level RMSE on filtered test (only rows with ground truth) ===")
mask_trip = np.isfinite(test_df.get(Y_TRIP, np.nan).astype(float).values)
mask_nonr = np.isfinite(test_df.get(Y_NONR, np.nan).astype(float).values)

print("filtered test rows (grid):", len(test_df))
print("rows with trip truth:", int(mask_trip.sum()))
print("rows with nonrepeat truth:", int(mask_nonr.sum()))

if mask_trip.sum() > 0:
    print(f"{Y_TRIP}: {rmse(test_df.loc[mask_trip, Y_TRIP].values, pred_out.loc[mask_trip, f'pred_{Y_TRIP}'].values):.6f}")
if mask_nonr.sum() > 0:
    print(f"{Y_NONR}: {rmse(test_df.loc[mask_nonr, Y_NONR].values, pred_out.loc[mask_nonr, f'pred_{Y_NONR}'].values):.6f}")

# =========================
# 8) Save (optional)
# =========================
# pred_out.to_csv("../result/random/pred_1215_1228_filtered.csv", index=False, encoding="utf-8-sig")


Features used: ['treatment', 'ops_type_merged', 'city_group', 'is_weekend_holiday', 'is_rainy', 'trip_cnt_per_user_lag2w', 'trip_cnt_per_user_lag3w', 'nonrepeat_cnt_per_user_lag2w', 'nonrepeat_cnt_per_user_lag3w', 'match_rate_lag2w', 'match_rate_lag3w']
Categorical: ['treatment', 'ops_type_merged', 'city_group']

=== Training target: trip_cnt_per_user ===
Params: {'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.05, 'num_leaves': 63, 'min_data_in_leaf': 200, 'feature_fraction': 0.9, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'lambda_l2': 1.0, 'seed': 42, 'verbosity': -1}
Training until validation scores don't improve for 200 rounds
[200]	valid_0's rmse: 0.0134059
Early stopping, best iteration is:
[53]	valid_0's rmse: 0.0133059

=== Training target: nonrepeat_cnt_per_user ===
Params: {'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.05, 'num_leaves': 31, 'min_data_in_leaf': 50, 'feature_fraction': 0.9, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'lambda_l

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# =========================
# Config
# =========================
DATA_PATH = "../final_data/data_260129_random.csv"
DATE_COL = "day"

Y_TRIP = "trip_cnt_per_user"
Y_NONR = "nonrepeat_cnt_per_user"

TEST_START = pd.Timestamp("2025-12-15")
TEST_END   = pd.Timestamp("2025-12-28")  # inclusive

# --- 你要的 ops+treatment 規則（evaluation 時才用）
OPS_TREAT_RULES = {
    "14天在其他尖峰預估車資": [15, 20],
    "14天在晚尖峰預估車資":   [15, 20],
    "90天在尖峰預估車資":     [20, 30],
    "喚回-其他":              [20, 30],
    "喚回-高優惠敏感":        [20, 30],
    "既有regular鞏固":        [15, 20],
    "養成Regular-其他":       [20, 30],
    "養成Regular-高優惠敏感": [20, 30],
}

# ✅ 你要包含 city_group 的組別定義（RMSE 會在這個層級做兩週加總）
EVAL_GROUP_KEYS = ["city_group", "ops_type_merged", "treatment"]

# --- 模型的類別欄位
CAT_COLS = ["treatment", "ops_type_merged", "city_group"]

# --- 特徵白名單
BASE_FEATURES = [
    "treatment", "ops_type_merged", "city_group",
    "is_weekend_holiday", "is_rainy",
    "trip_cnt_per_user_lag14", "trip_cnt_per_user_lag21",
    "nonrepeat_cnt_per_user_lag14", "nonrepeat_cnt_per_user_lag21",
    "match_rate_lag14", "match_rate_lag21",
]

# --- LGB params（你可替換成 optuna best）
PARAMS_TRIP = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.05,
    num_leaves=63,
    min_data_in_leaf=200,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l2=1.0,
    seed=42,
    verbosity=-1,
)

PARAMS_NONR = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.05,
    num_leaves=31,
    min_data_in_leaf=50,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l2=1.0,
    seed=42,
    verbosity=-1,
)

NUM_BOOST_ROUND_TRIP = 5000
NUM_BOOST_ROUND_NONR = 5000
EARLY_STOPPING_TRIP = 200
EARLY_STOPPING_NONR = 200
VALID_DAYS = 28

# evaluation 子集合如果缺某些 (city, ops, treat, week_window) 要不要補 0？
FILL_MISSING_WITH_ZERO = True


# =========================
# Helpers
# =========================
def align_categories(train_df: pd.DataFrame, test_df: pd.DataFrame, cat_cols):
    for c in cat_cols:
        if c not in train_df.columns or c not in test_df.columns:
            continue
        tr = train_df[c].astype("object")
        te = test_df[c].astype("object")
        cats = pd.Index(tr.dropna().unique()).union(pd.Index(te.dropna().unique()))
        train_df[c] = pd.Categorical(tr, categories=cats)
        test_df[c]  = pd.Categorical(te, categories=cats)
    return train_df, test_df

def get_feature_cols(df: pd.DataFrame):
    feats = [c for c in BASE_FEATURES if c in df.columns]
    feats = [c for c in feats if c not in [Y_TRIP, Y_NONR]]  # 防 leakage
    cat = [c for c in CAT_COLS if c in feats]
    return feats, cat

def time_based_train_valid_split(train_df: pd.DataFrame, valid_days=28):
    cutoff = train_df[DATE_COL].max() - pd.Timedelta(days=valid_days)
    tr_sub = train_df[train_df[DATE_COL] < cutoff].copy()
    va_sub = train_df[train_df[DATE_COL] >= cutoff].copy()
    return tr_sub, va_sub

def build_allowed_pairs(rules: dict) -> pd.DataFrame:
    rows = []
    for ops, fvs in rules.items():
        for fv in fvs:
            rows.append({"ops_type_merged": ops, "treatment": f"{int(fv)}元1張"})
    return pd.DataFrame(rows).drop_duplicates()

def build_expected_eval_groups(df_test_all: pd.DataFrame, rules: dict, eval_group_keys: list) -> pd.DataFrame:
    """
    依照「test 期間實際出現的 city_group」×（允許的 ops,treat）建立 expected groups，
    再乘上兩個 week_window，作為缺值補 0 或 sanity check 的基準。
    """
    allowed_pairs = build_allowed_pairs(rules)

    cities = (
        df_test_all[["city_group"]]
        .drop_duplicates()
        .assign(_k=1)
    )

    base_groups = (
        cities.merge(allowed_pairs.assign(_k=1), on="_k", how="inner")
        .drop(columns=["_k"])
    )

    windows = pd.DataFrame({"week_window": ["w1_1215_1221", "w2_1222_1228"]}).assign(_k=1)

    expected = (
        base_groups.assign(_k=1)
        .merge(windows, on="_k", how="inner")
        .drop(columns=["_k"])
    )

    # columns order: eval_group_keys + week_window
    return expected[eval_group_keys + ["week_window"]].drop_duplicates()

def two_week_sum_rmse_on_subset_city(
    df_test_all: pd.DataFrame,
    target_col: str,
    pred_col: str,
    rules: dict,
    eval_group_keys: list,
    fill_missing_with_zero: bool = True,
) -> float:
    """
    先用全部 test_pred，再篩選指定的 (ops,treat)，並且「包含 city_group」：
    - (city, ops, treat) 分別加總 12/15-12/21 與 12/22-12/28
    - 每個 (city,ops,treat) 會有 2 個加總值
    - 對所有 (city,ops,treat,week_window) 的加總誤差算 RMSE
    """
    allowed_pairs = build_allowed_pairs(rules)

    # ---- evaluation 子集合（只篩 ops+treat；city 不篩）
    sub = df_test_all.merge(allowed_pairs, on=["ops_type_merged", "treatment"], how="inner")
    sub = sub[(sub[DATE_COL] >= TEST_START) & (sub[DATE_COL] <= TEST_END)].copy()

    # ---- week_window
    d = sub[DATE_COL]
    cond1 = (d >= pd.Timestamp("2025-12-15")) & (d <= pd.Timestamp("2025-12-21"))
    cond2 = (d >= pd.Timestamp("2025-12-22")) & (d <= pd.Timestamp("2025-12-28"))

    sub["week_window"] = pd.NA
    sub.loc[cond1, "week_window"] = "w1_1215_1221"
    sub.loc[cond2, "week_window"] = "w2_1222_1228"
    sub = sub[sub["week_window"].notna()].copy()

    # ---- missing check
    if sub[target_col].isna().any():
        miss = sub[sub[target_col].isna()][eval_group_keys + [DATE_COL]].head(10)
        raise ValueError(f"Missing ground truth in subset for {target_col}. Sample:\n{miss}")
    if sub[pred_col].isna().any():
        miss = sub[sub[pred_col].isna()][eval_group_keys + [DATE_COL]].head(10)
        raise ValueError(f"Missing prediction in subset for {pred_col}. Sample:\n{miss}")

    # ---- 直接在 (city, ops, treat, week_window) 做加總（跨日）
    g = (
        sub
        .groupby(eval_group_keys + ["week_window"], dropna=False)
        .agg(actual_sum=(target_col, "sum"), pred_sum=(pred_col, "sum"))
        .reset_index()
    )

    # ---- 對齊 expected (city × allowed ops,treat × 2 windows)
    expected = build_expected_eval_groups(df_test_all, rules, eval_group_keys)
    idx = pd.MultiIndex.from_frame(expected)

    g2 = g.set_index(eval_group_keys + ["week_window"]).reindex(idx)

    if g2[["actual_sum", "pred_sum"]].isna().any().any():
        if fill_missing_with_zero:
            g2[["actual_sum", "pred_sum"]] = g2[["actual_sum", "pred_sum"]].fillna(0.0)
        else:
            missing_rows = g2[g2["actual_sum"].isna() | g2["pred_sum"].isna()].head(10)
            raise ValueError(f"Missing some (city,ops,treat,week_window) sums. Sample:\n{missing_rows}")

    g2 = g2.reset_index()
    err = (g2["actual_sum"] - g2["pred_sum"]).to_numpy(dtype=float)
    return float(np.sqrt(np.mean(err ** 2)))


# =========================
# Load & Split (ALL rows)
# =========================
df = pd.read_csv(DATA_PATH)
df[DATE_COL] = pd.to_datetime(df[DATE_COL])
df = df.replace({"": np.nan})

train_df = df[df[DATE_COL] < TEST_START].copy()
test_df  = df[(df[DATE_COL] >= TEST_START) & (df[DATE_COL] <= TEST_END)].copy()

# Align categories
train_df, test_df = align_categories(train_df, test_df, CAT_COLS)

# Features
feature_cols, cat_cols = get_feature_cols(df)
print("Features used:", feature_cols)
print("Categorical:", cat_cols)

# internal valid split (within train)
tr_sub, va_sub = time_based_train_valid_split(train_df, valid_days=VALID_DAYS)
tr_sub, va_sub = align_categories(tr_sub, va_sub, cat_cols)

# =========================
# Train & Predict (ALL TEST rows)
# =========================
def train_and_predict_one_target(target: str, params: dict, num_boost_round: int, early_stop: int):
    tr = tr_sub.dropna(subset=[target]).copy()
    va = va_sub.dropna(subset=[target]).copy()

    dtrain = lgb.Dataset(
        tr[feature_cols], label=tr[target].astype(float),
        categorical_feature=cat_cols, free_raw_data=False
    )
    dvalid = lgb.Dataset(
        va[feature_cols], label=va[target].astype(float),
        categorical_feature=cat_cols, free_raw_data=False
    )

    model = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=num_boost_round,
        valid_sets=[dvalid],
        callbacks=[lgb.early_stopping(stopping_rounds=early_stop, verbose=False)],
    )

    pred = model.predict(test_df[feature_cols], num_iteration=model.best_iteration)
    return pred, int(model.best_iteration)

pred_trip, best_it_trip = train_and_predict_one_target(
    Y_TRIP, PARAMS_TRIP, NUM_BOOST_ROUND_TRIP, EARLY_STOPPING_TRIP
)
pred_nonr, best_it_nonr = train_and_predict_one_target(
    Y_NONR, PARAMS_NONR, NUM_BOOST_ROUND_NONR, EARLY_STOPPING_NONR
)

# attach predictions to full test
test_pred = test_df[[DATE_COL, "city_group", "ops_type_merged", "treatment", Y_TRIP, Y_NONR]].copy()
test_pred["pred_trip_cnt_per_user"] = pred_trip
test_pred["pred_nonrepeat_cnt_per_user"] = pred_nonr

print("best_iteration trip:", best_it_trip)
print("best_iteration nonrepeat:", best_it_nonr)

# =========================
# Evaluate RMSE on SUBSET ONLY (include city_group)
# =========================
rmse_trip_subset = two_week_sum_rmse_on_subset_city(
    df_test_all=test_pred,
    target_col=Y_TRIP,
    pred_col="pred_trip_cnt_per_user",
    rules=OPS_TREAT_RULES,
    eval_group_keys=EVAL_GROUP_KEYS,
    fill_missing_with_zero=FILL_MISSING_WITH_ZERO,
)

rmse_nonr_subset = two_week_sum_rmse_on_subset_city(
    df_test_all=test_pred,
    target_col=Y_NONR,
    pred_col="pred_nonrepeat_cnt_per_user",
    rules=OPS_TREAT_RULES,
    eval_group_keys=EVAL_GROUP_KEYS,
    fill_missing_with_zero=FILL_MISSING_WITH_ZERO,
)

print("\n=== Two-week-sum RMSE on subset (city_group + ops + treat rules) ===")
print("TRIP:", rmse_trip_subset)
print("NONR:", rmse_nonr_subset)

# (optional) save all predictions
# test_pred.to_csv("../result/random/pred_all_1215_1228.csv", index=False, encoding="utf-8-sig")


Features used: ['treatment', 'ops_type_merged', 'city_group', 'is_weekend_holiday', 'is_rainy', 'trip_cnt_per_user_lag2w', 'trip_cnt_per_user_lag3w', 'nonrepeat_cnt_per_user_lag2w', 'nonrepeat_cnt_per_user_lag3w', 'match_rate_lag2w', 'match_rate_lag3w']
Categorical: ['treatment', 'ops_type_merged', 'city_group']
best_iteration trip: 53
best_iteration nonrepeat: 56

=== Two-week-sum RMSE on subset (city_group + ops + treat rules) ===
TRIP: 0.05468680028747053
NONR: 0.07246133067691629


  sub
  sub


In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.metrics import mean_squared_error

optuna.logging.set_verbosity(optuna.logging.WARNING)

# =========================
# Config
# =========================
DATA_PATH = "../final_data/data_260129_random.csv"
DATE_COL = "day"

Y_TRIP = "trip_cnt_per_user"
Y_NONR = "nonrepeat_cnt_per_user"

TEST_START = pd.Timestamp("2025-12-15")
TEST_END   = pd.Timestamp("2025-12-28")  # inclusive

OPS_TREAT_RULES = {
    "14天在其他尖峰預估車資": [15, 20],
    "14天在晚尖峰預估車資":   [15, 20],
    "90天在尖峰預估車資":     [20, 30],
    "喚回-其他":              [20, 30],
    "喚回-高優惠敏感":        [20, 30],
    "既有regular鞏固":        [15, 20],
    "養成Regular-其他":       [20, 30],
    "養成Regular-高優惠敏感": [20, 30],
}

EVAL_GROUP_KEYS = ["city_group", "ops_type_merged", "treatment"]
CAT_COLS = ["treatment", "ops_type_merged", "city_group"]

BASE_FEATURES = [
    "treatment", "ops_type_merged", "city_group",
    "is_weekend_holiday", "is_rainy",
    "trip_cnt_per_user_lag14", "trip_cnt_per_user_lag21",
    "nonrepeat_cnt_per_user_lag14", "nonrepeat_cnt_per_user_lag21",
    "match_rate_lag14", "match_rate_lag21",
]

# baseline centers (optuna 會在附近搜尋)
PARAMS_TRIP_BASE = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.05,
    num_leaves=63,
    min_data_in_leaf=200,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l2=1.0,
    seed=42,
    verbosity=-1,
)

PARAMS_NONR_BASE = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.05,
    num_leaves=31,
    min_data_in_leaf=50,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l2=1.0,
    seed=42,
    verbosity=-1,
)

NUM_BOOST_ROUND_TRIP = 5000
NUM_BOOST_ROUND_NONR = 5000
EARLY_STOPPING_TRIP = 200
EARLY_STOPPING_NONR = 200
VALID_DAYS = 28

N_TRIALS = 120
FILL_MISSING_WITH_ZERO = True


# =========================
# Helpers
# =========================
def align_categories(train_df: pd.DataFrame, test_df: pd.DataFrame, cat_cols):
    for c in cat_cols:
        if c not in train_df.columns or c not in test_df.columns:
            continue
        tr = train_df[c].astype("object")
        te = test_df[c].astype("object")
        cats = pd.Index(tr.dropna().unique()).union(pd.Index(te.dropna().unique()))
        train_df[c] = pd.Categorical(tr, categories=cats)
        test_df[c]  = pd.Categorical(te, categories=cats)
    return train_df, test_df

def get_feature_cols(df: pd.DataFrame):
    feats = [c for c in BASE_FEATURES if c in df.columns]
    feats = [c for c in feats if c not in [Y_TRIP, Y_NONR]]
    cat = [c for c in CAT_COLS if c in feats]
    return feats, cat

def time_based_train_valid_split(train_df: pd.DataFrame, valid_days=28):
    cutoff = train_df[DATE_COL].max() - pd.Timedelta(days=valid_days)
    tr_sub = train_df[train_df[DATE_COL] < cutoff].copy()
    va_sub = train_df[train_df[DATE_COL] >= cutoff].copy()
    return tr_sub, va_sub

def build_allowed_pairs(rules: dict) -> pd.DataFrame:
    rows = []
    for ops, fvs in rules.items():
        for fv in fvs:
            rows.append({"ops_type_merged": ops, "treatment": f"{int(fv)}元1張"})
    return pd.DataFrame(rows).drop_duplicates()

def build_expected_eval_groups(df_test_all: pd.DataFrame, rules: dict, eval_group_keys: list) -> pd.DataFrame:
    allowed_pairs = build_allowed_pairs(rules)

    cities = (
        df_test_all[["city_group"]]
        .drop_duplicates()
        .assign(_k=1)
    )

    base_groups = (
        cities.merge(allowed_pairs.assign(_k=1), on="_k", how="inner")
        .drop(columns=["_k"])
    )

    windows = pd.DataFrame({"week_window": ["w1_1215_1221", "w2_1222_1228"]}).assign(_k=1)

    expected = (
        base_groups.assign(_k=1)
        .merge(windows, on="_k", how="inner")
        .drop(columns=["_k"])
    )
    return expected[eval_group_keys + ["week_window"]].drop_duplicates()

def two_week_sum_rmse_on_subset_city(
    df_test_all: pd.DataFrame,
    target_col: str,
    pred_col: str,
    rules: dict,
    eval_group_keys: list,
    fill_missing_with_zero: bool = True,
) -> float:
    allowed_pairs = build_allowed_pairs(rules)

    sub = df_test_all.merge(allowed_pairs, on=["ops_type_merged", "treatment"], how="inner")
    sub = sub[(sub[DATE_COL] >= TEST_START) & (sub[DATE_COL] <= TEST_END)].copy()

    d = sub[DATE_COL]
    cond1 = (d >= pd.Timestamp("2025-12-15")) & (d <= pd.Timestamp("2025-12-21"))
    cond2 = (d >= pd.Timestamp("2025-12-22")) & (d <= pd.Timestamp("2025-12-28"))

    sub["week_window"] = pd.NA
    sub.loc[cond1, "week_window"] = "w1_1215_1221"
    sub.loc[cond2, "week_window"] = "w2_1222_1228"
    sub = sub[sub["week_window"].notna()].copy()

    if sub[target_col].isna().any():
        miss = sub[sub[target_col].isna()][eval_group_keys + [DATE_COL]].head(10)
        raise ValueError(f"Missing ground truth in subset for {target_col}. Sample:\n{miss}")
    if sub[pred_col].isna().any():
        miss = sub[sub[pred_col].isna()][eval_group_keys + [DATE_COL]].head(10)
        raise ValueError(f"Missing prediction in subset for {pred_col}. Sample:\n{miss}")

    g = (
        sub
        .groupby(eval_group_keys + ["week_window"], dropna=False)
        .agg(actual_sum=(target_col, "sum"), pred_sum=(pred_col, "sum"))
        .reset_index()
    )

    expected = build_expected_eval_groups(df_test_all, rules, eval_group_keys)
    idx = pd.MultiIndex.from_frame(expected)
    g2 = g.set_index(eval_group_keys + ["week_window"]).reindex(idx)

    if g2[["actual_sum", "pred_sum"]].isna().any().any():
        if fill_missing_with_zero:
            g2[["actual_sum", "pred_sum"]] = g2[["actual_sum", "pred_sum"]].fillna(0.0)
        else:
            missing_rows = g2[g2["actual_sum"].isna() | g2["pred_sum"].isna()].head(10)
            raise ValueError(f"Missing some (city,ops,treat,week_window) sums. Sample:\n{missing_rows}")

    g2 = g2.reset_index()
    err = (g2["actual_sum"] - g2["pred_sum"]).to_numpy(dtype=float)
    return float(np.sqrt(np.mean(err ** 2)))

def build_params_from_trial(trial: optuna.Trial, base_params: dict, target: str):
    params = dict(base_params)

    base_lr = float(base_params.get("learning_rate", 0.05))
    lr_low  = max(0.01, base_lr * 0.6)
    lr_high = min(0.12, base_lr * 1.6)
    params["learning_rate"] = trial.suggest_float("learning_rate", lr_low, lr_high, log=True)

    params["num_leaves"] = trial.suggest_int("num_leaves", 15, 127, step=2)

    if target == Y_TRIP:
        params["min_data_in_leaf"] = trial.suggest_int("min_data_in_leaf", 50, 400, step=10)
    else:
        params["min_data_in_leaf"] = trial.suggest_int("min_data_in_leaf", 20, 200, step=5)

    params["max_depth"] = trial.suggest_categorical("max_depth", [-1, 4, 5, 6, 7, 8, 9, 10])

    params["feature_fraction"] = trial.suggest_float("feature_fraction", 0.7, 1.0, step=0.05)
    params["bagging_fraction"] = trial.suggest_float("bagging_fraction", 0.7, 1.0, step=0.05)
    params["bagging_freq"] = trial.suggest_categorical("bagging_freq", [0, 1, 2, 5])

    params["lambda_l2"] = trial.suggest_float("lambda_l2", 0.2, 8.0, log=True)
    params["lambda_l1"] = trial.suggest_float("lambda_l1", 0.0, 3.0)

    params["min_gain_to_split"] = trial.suggest_float("min_gain_to_split", 0.0, 0.05)
    params["extra_trees"] = trial.suggest_categorical("extra_trees", [False, True])

    params["objective"] = "regression"
    params["metric"] = "rmse"
    params["seed"] = 42
    params["verbosity"] = -1
    return params


# =========================
# Load & Split
# =========================
df = pd.read_csv(DATA_PATH)
df[DATE_COL] = pd.to_datetime(df[DATE_COL])
df = df.replace({"": np.nan})

train_all = df[df[DATE_COL] < TEST_START].copy()
test_df   = df[(df[DATE_COL] >= TEST_START) & (df[DATE_COL] <= TEST_END)].copy()

feature_cols, cat_cols = get_feature_cols(df)

# Align categories for train/test
train_all, test_df = align_categories(train_all, test_df, cat_cols)

# internal valid split
tr_sub, va_sub = time_based_train_valid_split(train_all, valid_days=VALID_DAYS)
tr_sub, va_sub = align_categories(tr_sub, va_sub, cat_cols)


# =========================
# Objective factory
# =========================
def make_objective(target: str, base_params: dict, num_boost_round: int, early_stop: int):
    X_tr = tr_sub[feature_cols]
    y_tr = tr_sub[target].astype(float)
    X_va = va_sub[feature_cols]
    y_va = va_sub[target].astype(float)

    dtrain = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_cols, free_raw_data=False)
    dvalid = lgb.Dataset(X_va, label=y_va, categorical_feature=cat_cols, free_raw_data=False)

    X_test = test_df[feature_cols]

    def objective(trial: optuna.Trial) -> float:
        params = build_params_from_trial(trial, base_params, target)

        model = lgb.train(
            params=params,
            train_set=dtrain,
            num_boost_round=num_boost_round,
            valid_sets=[dvalid],
            callbacks=[lgb.early_stopping(stopping_rounds=early_stop, verbose=False)],
        )

        pred = model.predict(X_test, num_iteration=model.best_iteration)

        # attach pred to full test_df, then evaluate subset windows RMSE
        test_pred = test_df[[DATE_COL, "city_group", "ops_type_merged", "treatment", target]].copy()
        pred_col = f"pred_{target}"
        test_pred[pred_col] = pred

        score = two_week_sum_rmse_on_subset_city(
            df_test_all=test_pred,
            target_col=target,
            pred_col=pred_col,
            rules=OPS_TREAT_RULES,
            eval_group_keys=EVAL_GROUP_KEYS,
            fill_missing_with_zero=FILL_MISSING_WITH_ZERO,
        )

        trial.set_user_attr("best_iteration", int(model.best_iteration))
        return score

    return objective


# =========================
# Run studies
# =========================
sampler = optuna.samplers.TPESampler(seed=42)

# --- TRIP
study_trip = optuna.create_study(direction="minimize", sampler=sampler)
study_trip.optimize(
    make_objective(Y_TRIP, PARAMS_TRIP_BASE, NUM_BOOST_ROUND_TRIP, EARLY_STOPPING_TRIP),
    n_trials=N_TRIALS,
    show_progress_bar=True
)
print("\n=== BEST (TRIP | subset two-window sum RMSE) ===")
print("best value:", study_trip.best_value)
print("best params:", study_trip.best_params)
print("best_iteration:", study_trip.best_trial.user_attrs.get("best_iteration"))

# --- NONREPEAT
study_nonr = optuna.create_study(direction="minimize", sampler=sampler)
study_nonr.optimize(
    make_objective(Y_NONR, PARAMS_NONR_BASE, NUM_BOOST_ROUND_NONR, EARLY_STOPPING_NONR),
    n_trials=N_TRIALS,
    show_progress_bar=True
)
print("\n=== BEST (NONREPEAT | subset two-window sum RMSE) ===")
print("best value:", study_nonr.best_value)
print("best params:", study_nonr.best_params)
print("best_iteration:", study_nonr.best_trial.user_attrs.get("best_iteration"))

def merge_best(base_params, best_params):
    merged = dict(base_params)
    merged.update(best_params)
    merged["objective"] = "regression"
    merged["metric"] = "rmse"
    merged["seed"] = 42
    merged["verbosity"] = -1
    return merged

BEST_PARAMS_TRIP = merge_best(PARAMS_TRIP_BASE, study_trip.best_params)
BEST_PARAMS_NONR = merge_best(PARAMS_NONR_BASE, study_nonr.best_params)

print("\nBEST_PARAMS_TRIP =", BEST_PARAMS_TRIP)
print("BEST_PARAMS_NONR =", BEST_PARAMS_NONR)

  0%|          | 0/120 [00:00<?, ?it/s]

  sub
[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.
[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.
  sub
[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.
[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You n


=== BEST (TRIP | subset two-window sum RMSE) ===
best value: 0.05247441293908558
best params: {'learning_rate': 0.07123313101669058, 'num_leaves': 99, 'min_data_in_leaf': 270, 'max_depth': 10, 'feature_fraction': 0.85, 'bagging_fraction': 0.8999999999999999, 'bagging_freq': 1, 'lambda_l2': 0.7074849462783035, 'lambda_l1': 1.0915777681191887, 'min_gain_to_split': 0.00012024869366484924, 'extra_trees': False}
best_iteration: 611


  sub


  0%|          | 0/120 [00:00<?, ?it/s]

  sub
[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.
[LightGBM] [Fatal] Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause unexpected behaviour for features that were pre-filtered by the larger `min_data_in_leaf`.
You need to set `feature_pre_filter=false` to dynamically change the `min_data_in_leaf`.
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  sub
  


=== BEST (NONREPEAT | subset two-window sum RMSE) ===
best value: 0.06797062812404076
best params: {'learning_rate': 0.0488352361261404, 'num_leaves': 53, 'min_data_in_leaf': 155, 'max_depth': 4, 'feature_fraction': 0.7, 'bagging_fraction': 0.75, 'bagging_freq': 5, 'lambda_l2': 3.754002727329246, 'lambda_l1': 0.041454698251695454, 'min_gain_to_split': 0.002106948977215376, 'extra_trees': True}
best_iteration: 100

BEST_PARAMS_TRIP = {'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.07123313101669058, 'num_leaves': 99, 'min_data_in_leaf': 270, 'feature_fraction': 0.85, 'bagging_fraction': 0.8999999999999999, 'bagging_freq': 1, 'lambda_l2': 0.7074849462783035, 'seed': 42, 'verbosity': -1, 'max_depth': 10, 'lambda_l1': 1.0915777681191887, 'min_gain_to_split': 0.00012024869366484924, 'extra_trees': False}
BEST_PARAMS_NONR = {'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.0488352361261404, 'num_leaves': 53, 'min_data_in_leaf': 155, 'feature_fraction': 0.7, '

  sub
