In [None]:
import pandas as pd
import numpy as np

# =========================
# 0) Read data
# =========================
weather = pd.read_csv("../data/weather.csv")
user = pd.read_csv("../cleaned_data/user_with_cnt_coupon_cleaned.csv")

weather["forecastDate"] = pd.to_datetime(weather["forecastDate"])
weather["publishDate"] = pd.to_datetime(weather["publishDate"])
user["experiment_date"] = pd.to_datetime(user["experiment_date"])

# =========================
# 1) 國定假日（視為假日）
#    依你提供：9/29, 10/6, 10/10, 10/24, 12/25, 1/1
#    這裡用 2025 的 9/29~12/25 + 2026/1/1
# =========================
holiday_dates = pd.to_datetime([
    "2025-09-29",
    "2025-10-06",
    "2025-10-10",
    "2025-10-24",
    "2025-12-25",
    "2026-01-01",
]).normalize()

holiday_set = set(holiday_dates)

# =========================
# 2) SQL inner query 等價（先做到 cityName/publishDate/forecastDate 每天一列）
#    rainy_period = COUNTIF(precipChance > 60)
#    rainfall_amount_mm = SUM(REGEXP_EXTRACT(subNarrative, r'([\d.]+)公釐'))
# =========================
mm = (
    weather["subNarrative"]
    .astype(str)
    .str.extract(r"([\d.]+)公釐", expand=False)
)
weather["rainfall_amount_mm_part"] = pd.to_numeric(mm, errors="coerce").fillna(0.0)
weather["rainy_period_part"] = (weather["precipChance"] >= 60).astype(int)

daily = (
    weather.groupby(["cityName", "publishDate", "forecastDate"], as_index=False)
    .agg(
        rainy_period=("rainy_period_part", "sum"),
        rainfall_amount_mm=("rainfall_amount_mm_part", "sum"),
    )
)

# =========================
# 3) 對應 SQL where：
#    publishDate = target_publish_date
#    forecastDate between target_date_start and target_date_end
#
#    我們做成：experiment_date = publishDate + 3
#            forecastDate in [experiment_date, experiment_date+6]
# =========================
daily["experiment_date"] = daily["publishDate"] + pd.Timedelta(days=3)

start = daily["experiment_date"]
end = daily["experiment_date"] + pd.Timedelta(days=6)
daily_win = daily[(daily["forecastDate"] >= start) & (daily["forecastDate"] <= end)].copy()

# =========================
# 4) 定義平日/假日（六日 + 國定假日）
#    注意：daily_win 已經是「每天一列」(city/publish/forecastDate) 的資料
# =========================
dow = daily_win["forecastDate"].dt.dayofweek  # Mon=0 ... Sun=6
daily_win["is_holiday"] = daily_win["forecastDate"].dt.normalize().isin(holiday_set)

daily_win["is_weekend"] = dow.isin([5, 6]) | daily_win["is_holiday"]
daily_win["is_weekday"] = ~daily_win["is_weekend"]

# =========================
# 5) 依 (cityName, experiment_date) 聚合出：
#    rainy_day: rainy_period>0 的天數
#    is_rainy_weekday/weekend: rainfall>10 且平日/假日 的天數
#    分母：該週窗口內實際平日/假日天數（動態）
# =========================
def safe_div(num, den):
    return np.where(den > 0, num / den, np.nan)

feat = (
    daily_win.groupby(["cityName", "experiment_date"], as_index=False)
    .apply(lambda g: pd.Series({
        # 分子（天數）
        "rainy_day": (g["rainy_period"] > 0).sum(),
        "is_rainy_weekday": ((g["rainfall_amount_mm"] > 10) & (g["is_weekday"])).sum(),
        "is_rainy_weekend": ((g["rainfall_amount_mm"] > 10) & (g["is_weekend"])).sum(),

        # 分母（實際天數；因為 daily_win 已是「每天一列」，所以直接 sum bool 就是天數）
        "den_all_days": len(g),
        "den_weekday_days": g["is_weekday"].sum(),
        "den_weekend_days": g["is_weekend"].sum(),
    }))
    .reset_index(drop=True)
)

# 平均（比例）— 分母動態
feat["avg_rainy_day"] = safe_div(feat["rainy_day"], feat["den_all_days"])
feat["avg_rainy_weekday"] = safe_div(feat["is_rainy_weekday"], feat["den_weekday_days"])
feat["avg_rainy_weekend"] = safe_div(feat["is_rainy_weekend"], feat["den_weekend_days"])

# join key 對齊
feat = feat.rename(columns={"cityName": "city"})

# =========================
# 6) Left join 回 user
# =========================
user_with_weather = user.merge(
    feat[[
        "city", "experiment_date",
        "avg_rainy_day", "avg_rainy_weekday", "avg_rainy_weekend",
        # 下面三個分母我強烈建議先留著做 sanity check
        "den_all_days", "den_weekday_days", "den_weekend_days",
    ]],
    on=["city", "experiment_date"],
    how="left",
)

print(user_with_weather.shape)


(19326, 34)


  .apply(lambda g: pd.Series({


In [17]:
# daily_win 是你用來做特徵的那張 (cityName, experiment_date, forecastDate, rainy_period, rainfall_amount_mm, ...)
cnt_days = (daily_win.groupby(["cityName","experiment_date"])["forecastDate"]
            .nunique()
            .reset_index(name="n_days_in_window"))

cnt_days["n_days_in_window"].value_counts().sort_index()

n_days_in_window
7    4070
Name: count, dtype: int64

In [18]:
(weather["subNarrative"].astype(str).str.contains("公釐")).mean()

np.float64(0.06899709626982355)

In [20]:
CITY = "屏東縣"
EXP_DATE = pd.to_datetime("2025-07-28")

tmp = daily_win[(daily_win["cityName"]==CITY) & (daily_win["experiment_date"]==EXP_DATE)].copy()
tmp = tmp.sort_values("forecastDate")

tmp["dow"] = tmp["forecastDate"].dt.dayofweek
tmp["is_weekday"] = tmp["dow"].between(0,4)
tmp["is_weekend"] = tmp["dow"].isin([5,6])

tmp[["forecastDate","rainy_period","rainfall_amount_mm","is_weekday","is_weekend"]]

Unnamed: 0,forecastDate,rainy_period,rainfall_amount_mm,is_weekday,is_weekend
10288,2025-07-28,1,0.0,True,False
10289,2025-07-29,2,75.0,True,False
10290,2025-07-30,2,100.0,True,False
10291,2025-07-31,2,75.0,True,False
10292,2025-08-01,1,25.0,True,False
10293,2025-08-02,0,75.0,False,True
10294,2025-08-03,0,75.0,False,True


In [None]:
pd.set_option("display.max_columns", None)
user_with_weather.head()

Unnamed: 0,experiment_date,treatment,source,ops_type_merged,city,user_cnt,nonrepeat_cnt,trip_cnt,weekday_nonrepeat_cnt,weekend_nonrepeat_cnt,weekday_trip_cnt,weekend_trip_cnt,nonrepeat_cnt_per_user,trip_cnt_per_user,weekday_nonrepeat_cnt_per_user,weekend_nonrepeat_cnt_per_user,weekday_trip_cnt_per_user,weekend_trip_cnt_per_user,weekday_match_rate,weekend_match_rate,coupon_BD_total,coupon_CDP_total,coupon_folk_total,coupon_growth_other_total,coupon_MGM_total,coupon_MKT_total,coupon_register_total,coupon_daily_total,avg_rainy_day,avg_rainy_weekday,avg_rainy_weekend
0,2025-07-28,不發,控制組,14天在其他尖峰預估車資,南投縣,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0,3,0,0,0,0,0,0,1.0,0.0,0.5
1,2025-07-28,不發,控制組,14天在其他尖峰預估車資,嘉義市,15,4.0,4.0,1.0,3.0,0.0,4.0,0.266667,0.266667,0.066667,0.2,0.0,0.266667,,1.0,0,0,0,0,0,0,0,0,1.0,0.0,1.0
2,2025-07-28,不發,控制組,14天在其他尖峰預估車資,嘉義縣,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0,3,0,0,0,0,0,0,1.0,0.0,0.5
3,2025-07-28,不發,控制組,14天在其他尖峰預估車資,基隆市,50,13.0,10.0,6.0,7.0,7.0,3.0,0.26,0.2,0.12,0.14,0.14,0.06,0.89,0.75,0,28,0,0,4,0,0,0,0.714286,0.0,0.0
4,2025-07-28,不發,控制組,14天在其他尖峰預估車資,宜蘭縣,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0,3,0,0,0,0,0,0,0.857143,0.0,0.0
5,2025-07-28,不發,控制組,14天在其他尖峰預估車資,屏東縣,13,2.0,2.0,1.0,1.0,2.0,0.0,0.153846,0.153846,0.076923,0.076923,0.153846,0.0,1.0,,0,7,0,0,0,0,4,0,1.0,0.8,1.0
6,2025-07-28,不發,控制組,14天在其他尖峰預估車資,彰化縣,18,2.0,2.0,2.0,0.0,2.0,0.0,0.111111,0.111111,0.111111,0.0,0.111111,0.0,1.0,,0,3,0,0,0,0,0,0,1.0,0.0,0.5
7,2025-07-28,不發,控制組,14天在其他尖峰預估車資,新北市,437,68.0,50.0,49.0,19.0,39.0,11.0,0.155606,0.114416,0.112128,0.043478,0.089245,0.025172,0.77,0.86,57,187,0,6,34,0,2,0,0.857143,0.0,0.0
8,2025-07-28,不發,控制組,14天在其他尖峰預估車資,新竹市,27,5.0,5.0,2.0,3.0,5.0,0.0,0.185185,0.185185,0.074074,0.111111,0.185185,0.0,1.0,,0,7,0,2,2,0,1,0,0.571429,0.0,0.0
9,2025-07-28,不發,控制組,14天在其他尖峰預估車資,新竹縣,25,2.0,0.0,2.0,0.0,0.0,0.0,0.08,0.0,0.08,0.0,0.0,0.0,0.0,,0,8,0,0,2,0,1,0,0.571429,0.0,0.0


In [13]:
user_with_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 31 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   experiment_date                 19326 non-null  datetime64[ns]
 1   treatment                       19326 non-null  object        
 2   source                          19326 non-null  object        
 3   ops_type_merged                 19326 non-null  object        
 4   city                            19326 non-null  object        
 5   user_cnt                        19326 non-null  int64         
 6   nonrepeat_cnt                   19326 non-null  float64       
 7   trip_cnt                        19326 non-null  float64       
 8   weekday_nonrepeat_cnt           19326 non-null  float64       
 9   weekend_nonrepeat_cnt           19326 non-null  float64       
 10  weekday_trip_cnt                19326 non-null  float64       
 11  we

In [12]:
user_with_weather.describe()

Unnamed: 0,experiment_date,user_cnt,nonrepeat_cnt,trip_cnt,weekday_nonrepeat_cnt,weekend_nonrepeat_cnt,weekday_trip_cnt,weekend_trip_cnt,nonrepeat_cnt_per_user,trip_cnt_per_user,weekday_nonrepeat_cnt_per_user,weekend_nonrepeat_cnt_per_user,weekday_trip_cnt_per_user,weekend_trip_cnt_per_user,weekday_match_rate,weekend_match_rate,coupon_BD_total,coupon_CDP_total,coupon_folk_total,coupon_growth_other_total,coupon_MGM_total,coupon_MKT_total,coupon_register_total,coupon_daily_total,avg_rainy_day,avg_rainy_weekday,avg_rainy_weekend
count,19326,19326.0,19326.0,19326.0,19326.0,19326.0,19326.0,19326.0,19326.0,19326.0,19326.0,19326.0,19326.0,19326.0,10425.0,9240.0,19326.0,19326.0,19326.0,19326.0,19326.0,19326.0,19326.0,19326.0,19326.0,19326.0,19326.0
mean,2025-11-22 04:48:48.283141632,81.226896,11.042482,7.804305,7.576633,3.465849,4.848805,2.9555,0.119196,0.083621,0.081213,0.037982,0.051637,0.031984,0.762626,0.729455,13.495188,25.236107,63.0549,0.501604,1.551537,0.797061,1.373797,0.020025,0.206081,0.072628,0.013686
min,2025-07-28 00:00:00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2025-11-03 00:00:00,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.67,0.59,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2025-11-24 00:00:00,18.0,1.0,1.0,1.0,0.0,0.0,0.0,0.067073,0.038462,0.03687,0.0,0.0,0.0,0.84,0.8,0.0,3.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2025-12-22 00:00:00,68.0,8.0,6.0,5.0,2.0,3.0,2.0,0.166667,0.117647,0.113377,0.047619,0.068966,0.038647,1.0,1.0,11.0,15.0,50.0,0.0,0.0,0.0,1.0,0.0,0.285714,0.0,0.0
max,2026-01-12 00:00:00,3795.0,573.0,439.0,390.0,227.0,292.0,151.0,4.0,3.666667,3.666667,3.0,3.0,2.5,1.0,1.0,620.0,1729.0,1652.0,82.0,1646.0,101.0,83.0,4.0,1.0,1.0,1.0
std,,182.847088,30.991056,21.865559,21.428368,10.049814,14.063118,8.36436,0.186399,0.146743,0.143449,0.091034,0.109421,0.080558,0.287841,0.306659,33.003034,81.049927,136.373231,2.340061,18.173851,4.097606,4.132387,0.159762,0.298299,0.191166,0.097744


In [14]:
from pathlib import Path

out_path = Path("..") / "cleaned_data" / "user_with_cnt_coupon_weather_cleaned.csv"
user_with_weather.to_csv(out_path, index=False, encoding="utf-8-sig")

print("Saved:", out_path.resolve())

Saved: D:\minhsiang.chang\Desktop\2026winter_project\cleaned_data\user_with_cnt_coupon_weather_cleaned.csv
