# Add user_cnt

In [6]:
import pandas as pd

WEEKLY_PATH = "../../final_data/data_260116.csv"
DAILY_PATH  = "../data/daily_trip_merged_with_weekend_and_weather.csv"
OUT_PATH    = "../data/daily_trip_merged_with_weekend_and_weather_with_user_cnt.csv"

wk = pd.read_csv(WEEKLY_PATH)
dy = pd.read_csv(DAILY_PATH)

wk["experiment_date"] = pd.to_datetime(wk["experiment_date"])
dy["day"] = pd.to_datetime(dy["day"])

# 用週一當週別（只用來對齊，不會留在最後輸出）
dy["_week_monday"] = dy["day"] - pd.to_timedelta(dy["day"].dt.weekday, unit="D")

keys_wk = ["experiment_date", "treatment", "source", "ops_type_merged", "city_group"]
keys_dy = ["_week_monday",     "treatment", "source", "ops_type_merged", "city_group"]

wk_cnt = (
    wk.groupby(keys_wk, as_index=False)["user_cnt"]
      .max()
)

dy2 = dy.merge(
    wk_cnt,
    how="left",
    left_on=keys_dy,
    right_on=keys_wk
)

# 清掉 merge 用的欄位，不把 experiment_date 加進最後資料
dy2 = dy2.drop(columns=["_week_monday", "experiment_date"])

dy2.to_csv(OUT_PATH, index=False, encoding="utf-8-sig")
print("Saved:", OUT_PATH)

Saved: ../data/daily_trip_merged_with_weekend_and_weather_with_user_cnt.csv


# Takeout 8/11 week (no data)

In [16]:
import pandas as pd

IN_PATH  = "../data/daily_trip_merged_with_weekend_and_weather_with_user_cnt.csv"
OUT_PATH = "../data/daily_trip_merged_with_weekend_and_weather_with_user_cnt.csv"

df = pd.read_csv(IN_PATH)
df["day"] = pd.to_datetime(df["day"])

start = pd.Timestamp("2025-08-11")
end   = pd.Timestamp("2025-08-18")

before = len(df)
df2 = df[~df["day"].between(start, end, inclusive="both")].copy()
after = len(df2)

df2.to_csv(OUT_PATH, index=False, encoding="utf-8-sig")
print("Saved:", OUT_PATH)

Saved: ../data/daily_trip_merged_with_weekend_and_weather_with_user_cnt.csv


# per_user + NaN->0

In [18]:
import pandas as pd
import numpy as np

IN_PATH  = "../data/daily_trip_merged_with_weekend_and_weather_with_user_cnt.csv"
OUT_PATH = "../data/daily_trip_merged_add_cnt_per_user.csv"

df = pd.read_csv(IN_PATH)

# 需要轉數字、且要補 0 的欄位（依你目前的欄位清單）
num_cols = [
    "trip_cnt", "nonrepeat_cnt", "match_rate",
    "is_weekend_holiday", "is_rainy", "user_cnt"
]

# 轉成數字（轉不了就變 NaN）
for c in num_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# 空值補 0
df[num_cols] = df[num_cols].fillna(0)

# 計算 per_user：user_cnt=0 時避免除以 0，直接給 0
den = df["user_cnt"].replace(0, np.nan)
df["trip_cnt_per_user"] = (df["trip_cnt"] / den).fillna(0)
df["nonrepeat_cnt_per_user"] = (df["nonrepeat_cnt"] / den).fillna(0)

# 輸出
df.to_csv(OUT_PATH, index=False, encoding="utf-8-sig")
print("Saved:", OUT_PATH)

Saved: ../data/daily_trip_merged_add_cnt_per_user.csv


# lag

In [26]:
import pandas as pd
import numpy as np

IN_PATH  = "../data/daily_trip_merged_add_cnt_per_user.csv"
OUT_PATH = "../data/daily_trip_with_lag.csv"

df = pd.read_csv(IN_PATH)
df["day"] = pd.to_datetime(df["day"])

keys = ["treatment", "source", "ops_type_merged", "city_group"]
lag_cols = ["trip_cnt_per_user", "nonrepeat_cnt_per_user", "match_rate"]

# ensure numeric
for c in lag_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

df = df.sort_values(keys + ["day"]).reset_index(drop=True)

base = df[keys + ["day"] + lag_cols].copy()

# build lag10~lag21 (days)
for k in range(10, 22):  # 10..21 inclusive
    lagk = base.copy()
    lagk["day"] = lagk["day"] + pd.Timedelta(days=k)  # shift forward so it aligns as "k days ago"
    lagk = lagk.rename(columns={c: f"{c}_lag{k}" for c in lag_cols})
    df = df.merge(lagk, on=keys + ["day"], how="left")

df.to_csv(OUT_PATH, index=False, encoding="utf-8-sig")
print("Saved:", OUT_PATH)

Saved: ../data/daily_trip_with_lag.csv


# mgm

In [30]:
import pandas as pd
import numpy as np

IN_PATH  = "../data/daily_trip_with_lag.csv"
OUT_PATH = "../data/daily_trip_with_lag_mgm.csv"

df = pd.read_csv(IN_PATH)
df["day"] = pd.to_datetime(df["day"])

# MGM date ranges (inclusive)
mgm_ranges = [
    ("2025-07-28", "2025-08-03"),
    ("2025-08-27", "2025-09-02"),
    ("2025-09-27", "2025-10-06"),
    ("2025-10-23", "2025-10-31"),
    ("2025-11-26", "2025-12-02"),
    ("2025-12-24", "2026-01-01"),
]

mgm_day = pd.Series(False, index=df.index)
for s, e in mgm_ranges:
    s = pd.Timestamp(s)
    e = pd.Timestamp(e)
    mgm_day |= df["day"].between(s, e, inclusive="both")

df["mgm_day"] = mgm_day.astype(int)

df.to_csv(OUT_PATH, index=False, encoding="utf-8-sig")
print("Saved:", OUT_PATH)

Saved: ../data/daily_trip_with_lag_mgm.csv


# Face value

In [31]:
import pandas as pd
import re

IN_PATH  = "../data/daily_trip_with_lag_mgm.csv"
OUT_PATH = "../final_data/data_260129.csv"

df = pd.read_csv(IN_PATH)

def parse_treatment(t):
    if pd.isna(t):
        return 0, 0
    s = str(t).strip()

    # 不發
    if s == "不發":
        return 0, 0

    # 1) 15x2元1張 / 15 x 2 ...
    m = re.search(r"(\d+)\s*[xX]\s*(\d+)", s)
    if m:
        return int(m.group(1)), int(m.group(2))

    # 2) 15元1張 / 15 元 1 張
    m = re.search(r"(\d+)\s*元\s*(\d+)\s*張", s)
    if m:
        return int(m.group(1)), int(m.group(2))

    # 其他格式：先回 NA（你也可改成 0,0）
    return pd.NA, pd.NA

df[["face_value", "face_value_num"]] = df["treatment"].apply(
    lambda x: pd.Series(parse_treatment(x))
)

df["face_value"] = df["face_value"].astype("Int64")
df["face_value_num"] = df["face_value_num"].astype("Int64")

df.to_csv(OUT_PATH, index=False, encoding="utf-8-sig")
print("Saved:", OUT_PATH)

Saved: ../final_data/data_260129.csv


# split random and control

In [32]:
import pandas as pd

IN_PATH = "../final_data/data_260129.csv"
OUT_RANDOM  = "../final_data/data_260129_random.csv"
OUT_CONTROL = "../final_data/data_260129_control.csv"

df = pd.read_csv(IN_PATH)

# 以 source 分組（避免有空白）
s = df["source"].astype(str).str.strip()

df_random = df[s.eq("隨機組")].copy()
df_control = df[s.eq("控制組")].copy()

# 如果控制組命名有變體（例如含「控制」兩字），把下面取消註解
# if df_control.empty:
#     df_control = df[s.str.contains("控制", na=False)].copy()

df_random.to_csv(OUT_RANDOM, index=False, encoding="utf-8-sig")
df_control.to_csv(OUT_CONTROL, index=False, encoding="utf-8-sig")

print("rows total  :", len(df))
print("rows random :", len(df_random), "->", OUT_RANDOM)
print("rows control:", len(df_control), "->", OUT_CONTROL)
print("unique source:", sorted(s.dropna().unique()))


rows total  : 19968
rows random : 16128 -> ../final_data/data_260129_random.csv
rows control: 3840 -> ../final_data/data_260129_control.csv
unique source: ['控制組', '隨機組']
