In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# -------------------------
# 0) Helpers
# -------------------------
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    The mean squared error is computed with scikit-learn's
    ``mean_squared_error``; its square root is returned as a plain Python
    ``float``.
    """
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return float(root)

# -------------------------
# 1) Load
# -------------------------
# Input CSV (relative path).
path = "../final_data/data_260125_random.csv"

# Read the file and convert ``experiment_date`` to datetime so it can be
# compared against ``TEST_WEEK``.
df = pd.read_csv(path).assign(
    experiment_date=lambda frame: pd.to_datetime(frame["experiment_date"])
)

# Date used to separate training rows from evaluation rows.
TEST_WEEK = pd.Timestamp("2026-01-05")

# -------------------------
# 2) Columns
# -------------------------
# Identifier columns kept alongside the predictions, and the two targets.
id_cols = [
    "experiment_date",
    "treatment",
    "source",
    "ops_type_merged",
    "city_group",
]
target_cols = [
    "nonrepeat_cnt_per_user",
    "trip_cnt_per_user",
]

# Explicitly listed feature columns.
base_cols = [
    "face_value", "face_value_num",
    "avg_rainy_day", "avg_rainy_weekday", "avg_rainy_weekend",
    "mgm_day",
    "delta_trip_per_user", "delta_nonrepeat_per_user",
    "ratio_trip_per_user", "ratio_nonrepeat_per_user",
]

# Any column whose name contains "roll4" (case-insensitive) is also a feature.
roll4_cols = [name for name in df.columns if "roll4" in name.lower()]
feature_cols = [*base_cols, *roll4_cols]

# Fail early, listing every required column that the loaded frame lacks.
need_cols = [*id_cols, *target_cols, *feature_cols]
missing = [name for name in need_cols if name not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}")

# -------------------------
# 3) Split (train < test_week, test == test_week)
# -------------------------
# Time-based split: rows strictly before TEST_WEEK are used for training,
# rows dated exactly TEST_WEEK are held out for evaluation. Only the required
# columns are kept, and .copy() detaches both frames from `df`.
train_df = df.loc[df["experiment_date"] < TEST_WEEK, need_cols].copy()
test_df = df.loc[df["experiment_date"] == TEST_WEEK, need_cols].copy()

# Build the messages from TEST_WEEK so they stay correct when the constant
# is changed (the date used to be hard-coded in the string).
if len(test_df) == 0:
    raise ValueError(f"No rows found for experiment_date == {TEST_WEEK.date()}")

# Without training rows the model fit would fail later with a less
# informative error; report the cause here instead.
if len(train_df) == 0:
    raise ValueError(f"No rows found for experiment_date < {TEST_WEEK.date()}")

# -------------------------
# 4) Build X / y
# -------------------------
# Targets as float series, one train/test pair per target column.
y_train_nonrepeat = train_df["nonrepeat_cnt_per_user"].astype(float)
y_train_trip = train_df["trip_cnt_per_user"].astype(float)

y_test_nonrepeat = test_df["nonrepeat_cnt_per_user"].astype(float)
y_test_trip = test_df["trip_cnt_per_user"].astype(float)

# One-hot encode face_value (the random forest needs numeric features).
# The column is cast to categorical first; dummy_na=True also emits an
# indicator column for missing values.
X_train = pd.get_dummies(
    train_df[feature_cols].astype({"face_value": "category"}),
    columns=["face_value"],
    dummy_na=True,
)
X_test = pd.get_dummies(
    test_df[feature_cols].astype({"face_value": "category"}),
    columns=["face_value"],
    dummy_na=True,
)

# Give the test matrix exactly the training columns: columns missing from
# the test side are added filled with 0, and columns that exist only on the
# test side are dropped (join="left").
X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)

# -------------------------
# 5) Train two RF models (one per target)
# -------------------------
# Hyper-parameters shared by both forests.
rf_params = {
    "n_estimators": 800,
    "max_depth": None,
    "min_samples_leaf": 5,
    "n_jobs": -1,
    "random_state": 42,
}

# One independent regressor per target; fit() returns the fitted estimator,
# so construction and fitting are chained.
rf_nonrepeat = RandomForestRegressor(**rf_params).fit(X_train, y_train_nonrepeat)
pred_nonrepeat = rf_nonrepeat.predict(X_test)

rf_trip = RandomForestRegressor(**rf_params).fit(X_train, y_train_trip)
pred_trip = rf_trip.predict(X_test)

# -------------------------
# 6) Evaluate + keep prediction vs true
# -------------------------
# Collect identifiers, true targets, predictions and signed errors
# (prediction minus truth) for every held-out row.
eval_df = test_df[id_cols + target_cols].copy()
eval_df["pred_nonrepeat_cnt_per_user"] = pred_nonrepeat
eval_df["pred_trip_cnt_per_user"] = pred_trip
eval_df["err_nonrepeat"] = eval_df["pred_nonrepeat_cnt_per_user"] - eval_df["nonrepeat_cnt_per_user"]
eval_df["err_trip"] = eval_df["pred_trip_cnt_per_user"] - eval_df["trip_cnt_per_user"]

# Label the report with the date taken from TEST_WEEK rather than a
# hard-coded string, so the printout always matches the evaluated split.
test_week_label = TEST_WEEK.date()
print(f"RMSE (nonrepeat) on {test_week_label}:", rmse(y_test_nonrepeat, pred_nonrepeat))
print(f"RMSE (trip) on {test_week_label}:", rmse(y_test_trip, pred_trip))

# (optional) save
# eval_df.to_csv("/mnt/data/rf_pred_eval_2026-01-05.csv", index=False, encoding="utf-8-sig")


RMSE (nonrepeat) on 2026-01-05: 0.053941036180851665
RMSE (trip) on 2026-01-05: 0.04065525766944273
