# 合併資料

In [1]:
from pathlib import Path
import pandas as pd

# =========================
# 0) Path setup: everything is resolved relative to the current working directory
# =========================
BASE_DIR = Path.cwd()
PARENT_DIR = BASE_DIR.parent

MERGED_DIR = PARENT_DIR.joinpath("merged_data")
DATA_DIR = PARENT_DIR.joinpath("data")

# Create the output folder up front so the later export cannot fail on a missing directory.
MERGED_DIR.mkdir(parents=True, exist_ok=True)

# =========================
# 1) Locate coupon_1.csv ... coupon_4.csv
#    Each file is looked up in the working directory, then its parent, then
#    the parent's data/ folder; the first location that has it wins.
# =========================
search_dirs = [BASE_DIR, PARENT_DIR, DATA_DIR]

coupon_paths = []
for i in range(1, 5):
    fname = f"coupon_{i}.csv"
    # First existing candidate in search order, or None when no directory has the file.
    found = next(
        (folder / fname for folder in search_dirs if (folder / fname).exists()),
        None,
    )
    if found is None:
        raise FileNotFoundError(
            f"找不到 {fname}。\n"
            f"我有搜尋這些路徑：\n" + "\n".join(map(str, search_dirs))
        )
    coupon_paths.append(found)

print("✅ 找到以下檔案：")
for located in coupon_paths:
    print(" -", located)

# =========================
# 2) Read every coupon part and concatenate them
# =========================
date_cols_coupon = ["expiry_date", "receive_time", "first_use", "redeem_time", "last_update"]

dfs = []
for csv_path in coupon_paths:
    part = pd.read_csv(csv_path)
    # Only columns that actually exist in this part are parsed; values that
    # cannot be parsed become NaT (errors="coerce") instead of raising.
    present_date_cols = [name for name in date_cols_coupon if name in part.columns]
    for name in present_date_cols:
        part[name] = pd.to_datetime(part[name], errors="coerce", utc=True)
    dfs.append(part)

coupon = pd.concat(dfs, ignore_index=True)

# (Optional) If `id` is known to be a unique key and the four parts may overlap,
# the rows can be de-duplicated, keeping the most recently updated one:
# coupon = coupon.sort_values("last_update").drop_duplicates(subset=["id"], keep="last")

# =========================
# 3) Write the merged table to ../merged_data
# =========================
out_path = MERGED_DIR / "coupon_merged.csv"
coupon.to_csv(out_path, encoding="utf-8-sig", index=False)
print(f"✅ 已輸出：{out_path}  | rows={len(coupon):,} cols={coupon.shape[1]}")

# =========================
# 4) Load coupon_category from ../data
# =========================
coupon_category_path = DATA_DIR / "coupon_category.csv"
if not coupon_category_path.exists():
    raise FileNotFoundError(f"找不到 coupon_category.csv：{coupon_category_path}")

coupon_category = pd.read_csv(coupon_category_path)

# Parse whichever of these date columns are present; unparseable values become NaT.
date_cols_cat = ["start_date", "end_date", "enable_date", "created_at", "update_date"]
present_cat_date_cols = [name for name in date_cols_cat if name in coupon_category.columns]
for name in present_cat_date_cols:
    parsed = pd.to_datetime(coupon_category[name], errors="coerce")
    coupon_category[name] = parsed

print(f"✅ coupon_category loaded | rows={len(coupon_category):,} cols={coupon_category.shape[1]}")

# =========================
# 5) Sanity check: give promo_id the same nullable integer dtype in both
#    tables so a later join on it behaves predictably
# =========================
# Both frames are cast the same way; a frame without promo_id is skipped.
for frame in (coupon, coupon_category):
    if "promo_id" in frame.columns:
        frame["promo_id"] = pd.to_numeric(frame["promo_id"], errors="coerce").astype("Int64")

# Report the resulting dtypes. The casts above tolerate a missing promo_id
# column, so the report must tolerate it too instead of raising KeyError.
promo_id_dtypes = [
    frame["promo_id"].dtype if "promo_id" in frame.columns else "<missing>"
    for frame in (coupon, coupon_category)
]
print("promo_id dtype:", *promo_id_dtypes)


✅ 找到以下檔案：
 - d:\minhsiang.chang\Desktop\2026winter_project\data\coupon_1.csv
 - d:\minhsiang.chang\Desktop\2026winter_project\data\coupon_2.csv
 - d:\minhsiang.chang\Desktop\2026winter_project\data\coupon_3.csv
 - d:\minhsiang.chang\Desktop\2026winter_project\data\coupon_4.csv
✅ 已輸出：d:\minhsiang.chang\Desktop\2026winter_project\merged_data\coupon_merged.csv  | rows=18,282,581 cols=15
✅ coupon_category loaded | rows=7,327 cols=14
promo_id dtype: Int64 Int64


# 處理資料

In [3]:
from pathlib import Path
import pandas as pd
import numpy as np

# =========================
# 0) Paths
# =========================
BASE_DIR = Path.cwd()
PARENT_DIR = BASE_DIR.parent

USER_PATH = PARENT_DIR.joinpath("cleaned_data", "user_with_cnt_cleaned.csv")
COUPON_PATH = PARENT_DIR.joinpath("merged_data", "coupon_merged.csv")
CAT_PATH = PARENT_DIR.joinpath("data", "coupon_category.csv")

group_cols = ["experiment_date", "treatment", "source", "ops_type_merged", "city"]

# =========================
# 1) Load user (determines which group each user belongs to on each experiment_date)
# =========================
user = pd.read_csv(USER_PATH).assign(
    # experiment_date is reduced to plain date objects (unparseable values -> NaT).
    experiment_date=lambda frame: pd.to_datetime(frame["experiment_date"], errors="coerce").dt.date,
    # user_id becomes a nullable integer so joins do not happen on strings.
    user_id=lambda frame: pd.to_numeric(frame["user_id"], errors="coerce").astype("Int64"),
)

# Key lookup table: one row per distinct (user_id, experiment_date, group attributes).
user_map = user.loc[:, ["user_id", *group_cols]].drop_duplicates()

# =========================
# 2) Load coupon_category (usually small enough to load whole)
# =========================
coupon_category = pd.read_csv(CAT_PATH)
coupon_category["promo_id"] = pd.to_numeric(coupon_category["promo_id"], errors="coerce").astype("Int64")

# Mirrors DATE(cc.enable_date): keep only the calendar date.
coupon_category["enable_date"] = pd.to_datetime(coupon_category["enable_date"], errors="coerce").dt.date

# SQL: NOT REGEXP_CONTAINS(promo_title, '(?i)機場|機接|接機|送機|出國|test|測試|租車券|旅遊')
# The alternation is intentionally NOT wrapped in a capturing group: with a
# group, Series.str.contains emits a "pattern ... has match groups" UserWarning.
# Case-insensitivity is supplied by case=False below.
pattern = r"機場|機接|接機|送機|出國|test|測試|租車券|旅遊"
coupon_category["promo_title"] = coupon_category["promo_title"].astype(str)

# Lowest promo_id kept by the filter. NOTE(review): presumably the same
# threshold as the SQL this mirrors — confirm against that query.
MIN_PROMO_ID = 10508458

is_recent_promo = coupon_category["promo_id"].fillna(0) >= MIN_PROMO_ID
is_excluded_title = coupon_category["promo_title"].str.contains(pattern, case=False, regex=True, na=False)

coupon_category_f = coupon_category.loc[
    is_recent_promo & ~is_excluded_title,
    ["promo_id", "coupon_category", "coupon_type", "enable_date"],
].copy()

# Suggestion: use promo_id as the index/key (merging would be faster)
# coupon_category_f = coupon_category_f.set_index("promo_id")

# =========================
# 3) Helper：UTC時間字串 -> Asia/Taipei 的 date
# =========================
def utcstr_to_taipei_date(series: pd.Series) -> pd.Series:
    """Convert timestamp strings to calendar dates in the Asia/Taipei time zone.

    Values are parsed with ``utc=True`` so the result is tz-aware (required
    for ``tz_convert``); anything that cannot be parsed becomes NaT.

    Args:
        series: Timestamp values (typically strings) to convert.

    Returns:
        A Series of ``datetime.date`` objects (NaT where parsing failed),
        aligned with the index of ``series``.
    """
    parsed_utc = pd.to_datetime(series, errors="coerce", utc=True)
    taipei_local = parsed_utc.dt.tz_convert("Asia/Taipei")
    return taipei_local.dt.date

# =========================
# 4) Stream coupon_merged.csv in chunks and accumulate, per group,
#    the total number of coupons of each kind
# =========================
usecols = ["user_id", "promo_id", "expiry_date", "redeem_time"]  # read only what is needed (saves RAM)
dtype = {
    "user_id": "Int64",
    "promo_id": "Int64",
    # timestamp columns are read as strings and parsed later
    "expiry_date": "string",
    "redeem_time": "string",
}

coupon_cols = [
    "coupon_BD", "coupon_CDP", "coupon_folk", "coupon_growth_other",
    "coupon_MGM", "coupon_MKT", "coupon_register", "coupon_daily"
]

# Running totals: key = tuple of group_cols values, value = int64 array with one slot per coupon_cols entry.
acc = {}

chunksize = 1_000_000  # tune to the machine; 500k~2M rows are all reasonable

reader = pd.read_csv(
    COUPON_PATH,
    usecols=usecols,
    dtype=dtype,
    chunksize=chunksize,
    low_memory=True
)


def _summarize_chunk(chunk):
    """Return per-group coupon counts for one chunk, or None if nothing qualifies.

    Steps: join the filtered coupon categories, derive the effective expiry
    date, expand to the user's (experiment_date, group) rows, keep rows whose
    experiment_date lies within [enable_date, effective expiry], then count
    coupons per kind for each group.
    """
    # Re-cast the id columns defensively (they are already read as Int64).
    for id_col in ("user_id", "promo_id"):
        chunk[id_col] = pd.to_numeric(chunk[id_col], errors="coerce").astype("Int64")

    # Inner join applies the promo_title exclusion and promo_id condition
    # already baked into coupon_category_f.
    matched = chunk.merge(coupon_category_f, on="promo_id", how="inner")
    if matched.empty:
        return None

    # Effective expiry = IFNULL(DATE(redeem_time), DATE(expiry_date)), both in Asia/Taipei.
    matched["expiry_date_eff"] = utcstr_to_taipei_date(matched["redeem_time"])
    not_redeemed = matched["expiry_date_eff"].isna()
    if not_redeemed.any():
        matched.loc[not_redeemed, "expiry_date_eff"] = utcstr_to_taipei_date(
            matched.loc[not_redeemed, "expiry_date"]
        )

    # Rows lacking either window bound cannot be compared, so drop them.
    has_window = matched["enable_date"].notna() & matched["expiry_date_eff"].notna()
    matched = matched.loc[has_window].copy()
    if matched.empty:
        return None

    # Expand each coupon to every (experiment_date, group) row of its user.
    expanded = matched.merge(user_map, on="user_id", how="inner")
    if expanded.empty:
        return None

    # SQL: week.experiment_date BETWEEN enable_date AND expiry_date (inclusive on both ends).
    in_window = expanded["experiment_date"].between(
        expanded["enable_date"], expanded["expiry_date_eff"]
    )
    expanded = expanded.loc[in_window].copy()
    if expanded.empty:
        return None

    # COUNTIF equivalent: one 0/1 indicator column per coupon kind, summed per group.
    kind = expanded["coupon_type"]
    category = expanded["coupon_category"]
    is_folksonomy = kind == "Folksonomy"
    indicators = {
        "coupon_BD": kind.isin(["BD", "異業合作"]),
        "coupon_CDP": is_folksonomy & (category == "Event-Triggered"),
        "coupon_folk": is_folksonomy & (category == "Gene"),
        "coupon_growth_other": is_folksonomy & ~category.isin(["Event-Triggered", "Gene"]),
        "coupon_MGM": kind == "MGM",
        "coupon_MKT": kind == "MKT",
        "coupon_register": kind == "新註冊",
        "coupon_daily": kind == "天天領",
    }
    for col_name, flag in indicators.items():
        expanded[col_name] = flag.astype(np.int32)

    return expanded.groupby(group_cols, as_index=False)[coupon_cols].sum()


for i, chunk in enumerate(reader, start=1):
    g = _summarize_chunk(chunk)
    if g is None:
        continue

    # Fold this chunk's per-group counts into the running totals.
    group_keys = g[group_cols].itertuples(index=False, name=None)
    group_counts = g[coupon_cols].to_numpy(dtype=np.int64)
    for key, counts in zip(group_keys, group_counts):
        if key in acc:
            acc[key] += counts
        else:
            # Copy so the stored total does not alias a row of the chunk's matrix.
            acc[key] = counts.copy()

    if i % 5 == 0:
        print(f"processed chunks: {i}, acc groups: {len(acc):,}")

print(f"✅ Finished reading coupons. Total groups aggregated: {len(acc):,}")

# =========================
# 5) Turn the accumulator into a DataFrame and merge it back onto user
# =========================
total_cols = [f"{x}_total" for x in coupon_cols]

if not acc:
    # No coupon data at all: still provide the *_total columns, filled with 0.
    group_coupon_total = (
        user[group_cols]
        .drop_duplicates()
        .assign(**dict.fromkeys(total_cols, 0))
    )
else:
    # One row per accumulated group: the key fields followed by its counts.
    group_coupon_total = pd.DataFrame(
        [[*key, *counts] for key, counts in acc.items()],
        columns=group_cols + total_cols,
    )

final = user.merge(group_coupon_total, how="left", on=group_cols)

# Groups with no accumulated coupons come back as NaN from the left join; treat them as 0.
for total_col in total_cols:
    final[total_col] = final[total_col].fillna(0).astype(np.int64)

  (~coupon_category["promo_title"].str.contains(pattern, case=False, regex=True, na=False))


processed chunks: 5, acc groups: 7,923
processed chunks: 10, acc groups: 14,833
processed chunks: 15, acc groups: 18,446
✅ Finished reading coupons. Total groups aggregated: 18,754


In [7]:
# Show every column when displaying wide frames (setting persists for the session).
pd.options.display.max_columns = None
final.head(5)

Unnamed: 0,experiment_date,user_id,treatment,source,ops_type_merged,city,user_cnt,nonrepeat_cnt,trip_cnt,weekday_nonrepeat_cnt,weekend_nonrepeat_cnt,weekday_trip_cnt,weekend_trip_cnt,nonrepeat_cnt_per_user,trip_cnt_per_user,weekday_nonrepeat_cnt_per_user,weekend_nonrepeat_cnt_per_user,weekday_trip_cnt_per_user,weekend_trip_cnt_per_user,weekday_match_rate,weekend_match_rate,coupon_BD_total,coupon_CDP_total,coupon_folk_total,coupon_growth_other_total,coupon_MGM_total,coupon_MKT_total,coupon_register_total,coupon_daily_total
0,2025-12-22,5145040,15x2元1張,隨機組,14天在其他尖峰預估車資,臺北市,447,150.0,111.0,88.0,62.0,60.0,51.0,0.33557,0.248322,0.196868,0.138702,0.134228,0.114094,0.82,0.77,83,85,894,1,92,18,24,0
1,2025-11-17,302812,15x2元1張,隨機組,14天在其他尖峰預估車資,新北市,297,101.0,61.0,82.0,19.0,47.0,14.0,0.340067,0.205387,0.276094,0.063973,0.158249,0.047138,0.67,0.84,93,153,594,12,0,1,7,0
2,2025-12-01,4375821,15x2元1張,隨機組,14天在其他尖峰預估車資,新北市,260,54.0,41.0,39.0,15.0,27.0,14.0,0.207692,0.157692,0.15,0.057692,0.103846,0.053846,0.76,0.88,69,110,520,0,8,1,8,0
3,2025-11-24,2273154,15x2元1張,隨機組,14天在其他尖峰預估車資,臺北市,304,75.0,54.0,52.0,23.0,34.0,20.0,0.246711,0.177632,0.171053,0.075658,0.111842,0.065789,0.84,0.74,111,172,608,2,0,15,5,0
4,2025-12-22,433188,15x2元1張,隨機組,14天在其他尖峰預估車資,臺北市,447,150.0,111.0,88.0,62.0,60.0,51.0,0.33557,0.248322,0.196868,0.138702,0.134228,0.114094,0.82,0.77,83,85,894,1,92,18,24,0


In [8]:
group_cols = ["experiment_date", "treatment", "source", "ops_type_merged", "city"]

# Collapse the per-user rows to one row per group. `max` is used only as a
# de-duplication device: it returns the shared value as long as every numeric
# column is constant within a group (NaN entries are skipped by max).
final_grouped = (
    final
    .drop(columns="user_id", errors="ignore")
    .groupby(by=group_cols, as_index=False)
    .max(numeric_only=True)
)

In [10]:
group_cols = ["experiment_date", "treatment", "source", "ops_type_merged", "city"]

# Count distinct values per column inside each group. dropna=False makes NaN
# count as a value of its own, so a group that mixes NaN with a real value is
# reported too (the default would ignore NaN and call that group consistent,
# while a max-based collapse would silently pick the non-NaN value).
dup_check = (
    final.drop(columns=["user_id"], errors="ignore")
         .groupby(group_cols)
         .nunique(dropna=False)
)

# Flag groups where at least one column holds more than one distinct value.
bad = (dup_check > 1).any(axis=1)
print("有問題的組別數：", bad.sum())


有問題的組別數： 0


In [14]:
# Print the column dtypes and non-null counts of the grouped result.
final_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 28 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   experiment_date                 19326 non-null  object 
 1   treatment                       19326 non-null  object 
 2   source                          19326 non-null  object 
 3   ops_type_merged                 19326 non-null  object 
 4   city                            19326 non-null  object 
 5   user_cnt                        19326 non-null  int64  
 6   nonrepeat_cnt                   19326 non-null  float64
 7   trip_cnt                        19326 non-null  float64
 8   weekday_nonrepeat_cnt           19326 non-null  float64
 9   weekend_nonrepeat_cnt           19326 non-null  float64
 10  weekday_trip_cnt                19326 non-null  float64
 11  weekend_trip_cnt                19326 non-null  float64
 12  nonrepeat_cnt_per_user          

In [13]:
# Preview the first rows of the grouped result.
final_grouped.head()

Unnamed: 0,experiment_date,treatment,source,ops_type_merged,city,user_cnt,nonrepeat_cnt,trip_cnt,weekday_nonrepeat_cnt,weekend_nonrepeat_cnt,weekday_trip_cnt,weekend_trip_cnt,nonrepeat_cnt_per_user,trip_cnt_per_user,weekday_nonrepeat_cnt_per_user,weekend_nonrepeat_cnt_per_user,weekday_trip_cnt_per_user,weekend_trip_cnt_per_user,weekday_match_rate,weekend_match_rate,coupon_BD_total,coupon_CDP_total,coupon_folk_total,coupon_growth_other_total,coupon_MGM_total,coupon_MKT_total,coupon_register_total,coupon_daily_total
0,2025-07-28,不發,控制組,14天在其他尖峰預估車資,南投縣,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0,3,0,0,0,0,0,0
1,2025-07-28,不發,控制組,14天在其他尖峰預估車資,嘉義市,15,4.0,4.0,1.0,3.0,0.0,4.0,0.266667,0.266667,0.066667,0.2,0.0,0.266667,,1.0,0,0,0,0,0,0,0,0
2,2025-07-28,不發,控制組,14天在其他尖峰預估車資,嘉義縣,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0,3,0,0,0,0,0,0
3,2025-07-28,不發,控制組,14天在其他尖峰預估車資,基隆市,50,13.0,10.0,6.0,7.0,7.0,3.0,0.26,0.2,0.12,0.14,0.14,0.06,0.89,0.75,0,28,0,0,4,0,0,0
4,2025-07-28,不發,控制組,14天在其他尖峰預估車資,宜蘭縣,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0,3,0,0,0,0,0,0


In [15]:
from pathlib import Path

# Destination is relative to the working directory: ../cleaned_data/<file>.
out_path = Path("..", "cleaned_data", "user_with_cnt_coupon_cleaned.csv")
final_grouped.to_csv(out_path, encoding="utf-8-sig", index=False)

# Echo the absolute location so the saved file is easy to find.
print("Saved:", out_path.resolve())

Saved: D:\minhsiang.chang\Desktop\2026winter_project\cleaned_data\user_with_cnt_coupon_cleaned.csv
