# 整理趟次資料

## 合併資料

In [1]:
import os
import pandas as pd

DATA_DIR = "../data"

files = [os.path.join(DATA_DIR, f"RDS_trip_{i}.csv") for i in range(1, 6)]

# 檢查檔案是否存在
missing = [f for f in files if not os.path.exists(f)]
if missing:
    raise FileNotFoundError("找不到以下檔案：\n" + "\n".join(missing))

dfs = []
for f in files:
    df = pd.read_csv(f)
    dfs.append(df)

# 合併（保留所有欄位；欄位不一致會自動對齊）
rds_trip_merged = pd.concat(dfs, ignore_index=True, sort=False)

out_path = os.path.join("../merged_data", "RDS_trip_merged.csv")
rds_trip_merged.to_csv(out_path, index=False, encoding="utf-8-sig")

print("Merged shape:", rds_trip_merged.shape)
print("Saved to:", out_path)

  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Merged shape: (6333679, 57)
Saved to: ../merged_data\RDS_trip_merged.csv


In [2]:
import os
import pandas as pd

DATA_DIR = "../data"

files = [os.path.join(DATA_DIR, f"trip_{i}.csv") for i in range(1, 4)]

# 檢查檔案是否存在
missing = [f for f in files if not os.path.exists(f)]
if missing:
    raise FileNotFoundError("找不到以下檔案：\n" + "\n".join(missing))

dfs = []
for f in files:
    df = pd.read_csv(f)
    dfs.append(df)

# 合併（保留所有欄位；欄位不一致會自動對齊）
trip_merged = pd.concat(dfs, ignore_index=True, sort=False)

out_path = os.path.join("../merged_data", "trip_merged.csv")
trip_merged.to_csv(out_path, index=False, encoding="utf-8-sig")

print("Merged shape:", trip_merged.shape)
print("Saved to:", out_path)

Merged shape: (6335051, 21)
Saved to: ../merged_data\trip_merged.csv


In [5]:
import os
import pandas as pd

DATA_DIR = "../data"

files = [os.path.join(DATA_DIR, f"trip_label_{i}.csv") for i in range(1, 3)]

# 檢查檔案是否存在
missing = [f for f in files if not os.path.exists(f)]
if missing:
    raise FileNotFoundError("找不到以下檔案：\n" + "\n".join(missing))

dfs = []
for f in files:
    df = pd.read_csv(f)
    dfs.append(df)

# 合併（保留所有欄位；欄位不一致會自動對齊）
trip_label_merged = pd.concat(dfs, ignore_index=True, sort=False)

out_path = os.path.join("../merged_data", "trip_label_merged.csv")
trip_label_merged.to_csv(out_path, index=False, encoding="utf-8-sig")

print("Merged shape:", trip_label_merged.shape)
print("Saved to:", out_path)

  df = pd.read_csv(f)
  df = pd.read_csv(f)


Merged shape: (4061235, 66)
Saved to: ../merged_data\trip_label_merged.csv


## 處理資料

## 準備key

In [None]:
import numpy as np
import pandas as pd

TZ = "Asia/Taipei"
HOLIDAYS_MD = {(9,29),(10,6),(10,10),(10,24),(12,25),(1,1)}

def rds_unixtime_to_local_date(s):
    return pd.to_datetime(s, unit="s", utc=True, errors="coerce").dt.tz_convert(TZ).dt.date

def week_monday_from_date(d):
    dd = pd.to_datetime(d, errors="coerce")
    return (dd - pd.to_timedelta(dd.dt.weekday, unit="D")).dt.date

def add_weekend_flag_from_weekday_md(df):
    is_weekend = df["weekday"].isin([1,7])
    md = list(zip(df["month"], df["day"]))
    is_holiday = pd.Series([x in HOLIDAYS_MD for x in md], index=df.index)
    df["is_weekend"] = is_weekend | is_holiday
    df["is_weekday"] = ~df["is_weekend"]
    return df

# --- 讀小檔 ---
user_cleaned = pd.read_csv("../cleaned_data/user_cleaned.csv", dtype={"user_id":"int64"})
target_user_ids = set(user_cleaned["user_id"].unique())

test_trip = pd.read_csv("../data/test_trip.csv", usecols=["trip_id"], dtype={"trip_id":"int64"})
test_trip_ids = set(test_trip["trip_id"].unique())

target_date_start = pd.to_datetime("2025-07-28").date()
target_date_end   = pd.to_datetime("2026-01-11").date()
start_minus_60 = (pd.to_datetime(target_date_start) - pd.Timedelta(days=60)).date()

## trip_stats_agg

In [None]:
from collections import defaultdict

RDS_PATH = "../merged_data/RDS_trip_merged.csv"

usecols_rds = ["trip_id","user_id","reserve_time","duplicate_id","month","day","weekday"]
dtype_rds = {
    "trip_id":"int64",
    "user_id":"int64",
    "reserve_time":"int64",
    "duplicate_id":"int64",
    "month":"int16",
    "day":"int16",
    "weekday":"int8",
}

agg = defaultdict(lambda: [0,0,0])  # [weekday_nonrepeat, weekend_nonrepeat, nonrepeat]
ts_trip_ids = set()

chunksize = 500_000

for chunk in pd.read_csv(RDS_PATH, usecols=usecols_rds, dtype=dtype_rds, chunksize=chunksize):
    # filters
    chunk = chunk[chunk["duplicate_id"].fillna(0).astype(int) == 0]
    chunk = chunk[~chunk["trip_id"].isin(test_trip_ids)]
    chunk = chunk[chunk["user_id"].isin(target_user_ids)]

    # trip_date & date filter
    chunk["trip_date"] = rds_unixtime_to_local_date(chunk["reserve_time"])
    chunk = chunk[chunk["trip_date"].between(target_date_start, target_date_end)]
    if chunk.empty:
        continue

    # week & weekend flag
    chunk["week"] = week_monday_from_date(chunk["trip_date"])
    chunk = add_weekend_flag_from_weekday_md(chunk)

    # collect trip ids for later join with trip
    ts_trip_ids.update(chunk["trip_id"].tolist())

    # group and accumulate
    g = chunk.groupby(["user_id","week"], as_index=False).agg(
        weekday_nonrepeat_cnt=("is_weekday","sum"),
        weekend_nonrepeat_cnt=("is_weekend","sum"),
        nonrepeat_cnt=("trip_id","size"),
    )
    for row in g.itertuples(index=False):
        key = (int(row.user_id), row.week)
        agg[key][0] += int(row.weekday_nonrepeat_cnt)
        agg[key][1] += int(row.weekend_nonrepeat_cnt)
        agg[key][2] += int(row.nonrepeat_cnt)

trip_stats_agg = pd.DataFrame(
    [(k[0], k[1], v[0], v[1], v[2]) for k, v in agg.items()],
    columns=["user_id","week","weekday_nonrepeat_cnt","weekend_nonrepeat_cnt","nonrepeat_cnt"]
)

## trip_trip

In [None]:
TRIP_PATH = "../merged_data/trip_merged.csv"

usecols_trip = ["trip_id","user_id","request_time","reserve_time","driver_id"]
dtype_trip = {"trip_id":"int64", "user_id":"int64"}

def utc_string_to_local_date(s):
    return pd.to_datetime(s, utc=True, errors="coerce").dt.tz_convert(TZ).dt.date

def add_weekend_flag_from_date(df, date_col):
    d = pd.to_datetime(df[date_col], errors="coerce")
    pd_dow = d.dt.dayofweek
    bq_dow = ((pd_dow + 1) % 7) + 1
    is_weekend = bq_dow.isin([1,7])
    md = list(zip(d.dt.month, d.dt.day))
    is_holiday = pd.Series([x in HOLIDAYS_MD for x in md], index=df.index)
    df["is_weekend"] = is_weekend | is_holiday
    df["is_weekday"] = ~df["is_weekend"]
    return df

agg2 = defaultdict(lambda: [0,0,0,0])  # [weekday_match, weekday_total, weekend_match, weekend_total]

for chunk in pd.read_csv(TRIP_PATH, usecols=usecols_trip, dtype=dtype_trip, chunksize=chunksize):
    chunk = chunk[chunk["trip_id"].isin(ts_trip_ids)]
    chunk = chunk[chunk["user_id"].isin(target_user_ids)]
    if chunk.empty:
        continue

    chunk["trip_date"] = utc_string_to_local_date(chunk["request_time"])
    chunk["reserve_date"] = utc_string_to_local_date(chunk["reserve_time"])

    chunk = chunk[
        chunk["trip_date"].between(start_minus_60, target_date_end) &
        chunk["reserve_date"].between(target_date_start, target_date_end)
    ]
    if chunk.empty:
        continue

    chunk["week"] = week_monday_from_date(chunk["trip_date"])
    chunk["is_match"] = chunk["driver_id"].notna()
    chunk = add_weekend_flag_from_date(chunk, "trip_date")

    chunk["weekday_match"] = (chunk["is_match"] & chunk["is_weekday"]).astype(int)
    chunk["weekend_match"] = (chunk["is_match"] & chunk["is_weekend"]).astype(int)
    chunk["weekday_total"] = chunk["is_weekday"].astype(int)
    chunk["weekend_total"] = chunk["is_weekend"].astype(int)

    g = chunk.groupby(["user_id","week"], as_index=False)[
        ["weekday_match","weekday_total","weekend_match","weekend_total"]
    ].sum()

    for row in g.itertuples(index=False):
        key = (int(row.user_id), row.week)
        agg2[key][0] += int(row.weekday_match)
        agg2[key][1] += int(row.weekday_total)
        agg2[key][2] += int(row.weekend_match)
        agg2[key][3] += int(row.weekend_total)

trip_trip = pd.DataFrame(
    [(k[0], k[1], v[0], v[1], v[2], v[3]) for k, v in agg2.items()],
    columns=["user_id","week","weekday_match","weekday_total","weekend_match","weekend_total"]
)
trip_trip["weekday_match_rate"] = (trip_trip["weekday_match"] / trip_trip["weekday_total"].replace({0:np.nan})).round(2)
trip_trip["weekend_match_rate"] = (trip_trip["weekend_match"] / trip_trip["weekend_total"].replace({0:np.nan})).round(2)

## trip_label_agg

In [None]:
TL_PATH = "../merged_data/trip_label_merged.csv"

usecols_tl = ["trip_id","user_id","trip_date","week"]
dtype_tl = {"trip_id":"int64","user_id":"int64"}

agg3 = defaultdict(lambda: [0,0,0])  # [weekday_trip_cnt, weekend_trip_cnt, trip_cnt]

for chunk in pd.read_csv(TL_PATH, usecols=usecols_tl, dtype=dtype_tl, chunksize=chunksize):
    chunk = chunk[~chunk["trip_id"].isin(test_trip_ids)]
    chunk = chunk[chunk["user_id"].isin(target_user_ids)]
    if chunk.empty:
        continue

    chunk["trip_date"] = pd.to_datetime(chunk["trip_date"], errors="coerce").dt.date
    chunk = chunk[chunk["trip_date"].between(target_date_start, target_date_end)]
    if chunk.empty:
        continue

    chunk["week"] = pd.to_datetime(chunk["week"], errors="coerce").dt.date

    chunk = add_weekend_flag_from_date(chunk, "trip_date")

    g = chunk.groupby(["user_id","week"], as_index=False).agg(
        weekday_trip_cnt=("is_weekday","sum"),
        weekend_trip_cnt=("is_weekend","sum"),
        trip_cnt=("trip_id","size"),
    )

    for row in g.itertuples(index=False):
        key = (int(row.user_id), row.week)
        agg3[key][0] += int(row.weekday_trip_cnt)
        agg3[key][1] += int(row.weekend_trip_cnt)
        agg3[key][2] += int(row.trip_cnt)

trip_label_agg = pd.DataFrame(
    [(k[0], k[1], v[0], v[1], v[2]) for k, v in agg3.items()],
    columns=["user_id","week","weekday_trip_cnt","weekend_trip_cnt","trip_cnt"]
)

In [None]:
weekly_features = trip_stats_agg.merge(trip_trip, on=["user_id","week"], how="outer")
weekly_features = weekly_features.merge(trip_label_agg, on=["user_id","week"], how="outer")

print("weekly_features:", weekly_features.shape)
weekly_features.head()

weekly_features: (1405448, 14)


Unnamed: 0,user_id,week,weekday_nonrepeat_cnt,weekend_nonrepeat_cnt,nonrepeat_cnt,weekday_match,weekday_total,weekend_match,weekend_total,weekday_match_rate,weekend_match_rate,weekday_trip_cnt,weekend_trip_cnt,trip_cnt
0,93,2025-12-01,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,,1.0,0.0,1.0
1,93,2025-12-22,1.0,0.0,1.0,0.0,0.0,0.0,1.0,,0.0,,,
2,96,2025-08-11,1.0,0.0,1.0,0.0,0.0,1.0,1.0,,1.0,0.0,1.0,1.0
3,96,2025-09-01,3.0,0.0,3.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,3.0
4,96,2025-09-08,1.0,0.0,1.0,0.0,0.0,1.0,1.0,,1.0,0.0,1.0,1.0


In [18]:
import numpy as np
import pandas as pd

GROUP_KEYS = ["experiment_date", "treatment", "source", "ops_type_merged", "city"]

def week_monday_from_date(d: pd.Series) -> pd.Series:
    dd = pd.to_datetime(d, errors="coerce")
    return (dd - pd.to_timedelta(dd.dt.weekday, unit="D")).dt.date

def safe_divide(a, b):
    b = b.replace({0: np.nan})
    return a / b

# 1) user_cleaned: 補 week
uc = user_cleaned.copy()
uc["experiment_date"] = pd.to_datetime(uc["experiment_date"], errors="coerce").dt.date
uc["week"] = week_monday_from_date(uc["experiment_date"])
uc["user_id"] = pd.to_numeric(uc["user_id"], errors="coerce").astype("Int64")

# 2) 把 weekly_features 併回每一列 user_cleaned（每人/那週）
wf = weekly_features.copy()
wf["user_id"] = pd.to_numeric(wf["user_id"], errors="coerce").astype("Int64")
wf["week"] = pd.to_datetime(wf["week"], errors="coerce").dt.date

ucw = uc.merge(wf, on=["user_id", "week"], how="left")

# 3) 計數缺值補 0（表示該 user 在該週沒發生）
count_cols = [
    "nonrepeat_cnt", "trip_cnt",
    "weekday_nonrepeat_cnt", "weekend_nonrepeat_cnt",
    "weekday_trip_cnt", "weekend_trip_cnt",
    "weekday_match", "weekday_total", "weekend_match", "weekend_total",
]
for c in count_cols:
    if c in ucw.columns:
        ucw[c] = pd.to_numeric(ucw[c], errors="coerce").fillna(0)

# 4) 依你要的維度做「組別彙總」
g = ucw.groupby(GROUP_KEYS, as_index=False).agg(
    user_cnt=("user_id", "nunique"),
    # 叫車數/趟次數（總和）
    nonrepeat_cnt=("nonrepeat_cnt", "sum"),
    trip_cnt=("trip_cnt", "sum"),
    # 平假日叫車數/趟次
    weekday_nonrepeat_cnt=("weekday_nonrepeat_cnt", "sum"),
    weekend_nonrepeat_cnt=("weekend_nonrepeat_cnt", "sum"),
    weekday_trip_cnt=("weekday_trip_cnt", "sum"),
    weekend_trip_cnt=("weekend_trip_cnt", "sum"),
    # 媒合聚合用 totals（最正確）
    weekday_match=("weekday_match", "sum"),
    weekday_total=("weekday_total", "sum"),
    weekend_match=("weekend_match", "sum"),
    weekend_total=("weekend_total", "sum"),
)

# 5) 人均欄位
g["nonrepeat_cnt_per_user"] = safe_divide(g["nonrepeat_cnt"], g["user_cnt"])
g["trip_cnt_per_user"] = safe_divide(g["trip_cnt"], g["user_cnt"])
g["weekday_nonrepeat_cnt_per_user"] = safe_divide(g["weekday_nonrepeat_cnt"], g["user_cnt"])
g["weekend_nonrepeat_cnt_per_user"] = safe_divide(g["weekend_nonrepeat_cnt"], g["user_cnt"])
g["weekday_trip_cnt_per_user"] = safe_divide(g["weekday_trip_cnt"], g["user_cnt"])
g["weekend_trip_cnt_per_user"] = safe_divide(g["weekend_trip_cnt"], g["user_cnt"])

# 6) 媒合率（組別 = match總和 / total總和）
g["weekday_match_rate"] = safe_divide(g["weekday_match"], g["weekday_total"]).round(2)
g["weekend_match_rate"] = safe_divide(g["weekend_match"], g["weekend_total"]).round(2)

# 如果你不想把中間的 match/total 留著，可以 drop
g = g.drop(columns=["weekday_match","weekday_total","weekend_match","weekend_total"])

# 7) merge 回 user_cleaned：同組每列都會帶同一套數字
user_cleaned_with_metrics = uc.merge(g, on=GROUP_KEYS, how="left").drop(columns=["week"], errors="ignore")

print("group_metrics shape:", g.shape)
print("user_cleaned_with_metrics shape:", user_cleaned_with_metrics.shape)


group_metrics shape: (19326, 20)
user_cleaned_with_metrics shape: (1569791, 21)


In [19]:
pd.set_option("display.max_columns", None)
user_cleaned_with_metrics.head()

Unnamed: 0,experiment_date,user_id,treatment,source,ops_type_merged,city,user_cnt,nonrepeat_cnt,trip_cnt,weekday_nonrepeat_cnt,weekend_nonrepeat_cnt,weekday_trip_cnt,weekend_trip_cnt,nonrepeat_cnt_per_user,trip_cnt_per_user,weekday_nonrepeat_cnt_per_user,weekend_nonrepeat_cnt_per_user,weekday_trip_cnt_per_user,weekend_trip_cnt_per_user,weekday_match_rate,weekend_match_rate
0,2025-12-22,5145040,15x2元1張,隨機組,14天在其他尖峰預估車資,臺北市,447,150.0,111.0,88.0,62.0,60.0,51.0,0.33557,0.248322,0.196868,0.138702,0.134228,0.114094,0.82,0.77
1,2025-11-17,302812,15x2元1張,隨機組,14天在其他尖峰預估車資,新北市,297,101.0,61.0,82.0,19.0,47.0,14.0,0.340067,0.205387,0.276094,0.063973,0.158249,0.047138,0.67,0.84
2,2025-12-01,4375821,15x2元1張,隨機組,14天在其他尖峰預估車資,新北市,260,54.0,41.0,39.0,15.0,27.0,14.0,0.207692,0.157692,0.15,0.057692,0.103846,0.053846,0.76,0.88
3,2025-11-24,2273154,15x2元1張,隨機組,14天在其他尖峰預估車資,臺北市,304,75.0,54.0,52.0,23.0,34.0,20.0,0.246711,0.177632,0.171053,0.075658,0.111842,0.065789,0.84,0.74
4,2025-12-22,433188,15x2元1張,隨機組,14天在其他尖峰預估車資,臺北市,447,150.0,111.0,88.0,62.0,60.0,51.0,0.33557,0.248322,0.196868,0.138702,0.134228,0.114094,0.82,0.77


In [20]:
from pathlib import Path

out_path = Path("..") / "cleaned_data" / "user_with_cnt_cleaned.csv"
user_cleaned_with_metrics.to_csv(out_path, index=False, encoding="utf-8-sig")

print("Saved:", out_path.resolve())

Saved: D:\minhsiang.chang\Desktop\2026winter_project\cleaned_data\user_with_cnt_cleaned.csv
