# 將用戶分群

In [11]:
import numpy as np
import pandas as pd

def add_ops_type_merged(user: pd.DataFrame) -> pd.DataFrame:
    df = user.copy()

    if "sub_group" not in df.columns:
        raise KeyError("DataFrame must contain column: 'sub_group'")
    if "ops_type" not in df.columns:
        raise KeyError("DataFrame must contain column: 'ops_type'")

    # 1) 先建空欄位：代表 CASE 的 default（沒命中就是 NaN）
    df["ops_type_merged"] = np.nan

    # 2) helper：只填「目前還沒被填過」的列，確保順序優先（像 CASE WHEN）
    def set_when(mask, value):
        df.loc[mask & df["ops_type_merged"].isna(), "ops_type_merged"] = value

    set_when(
        (df["ops_type"] == "喚回") &
        (df["sub_group"].isin([
            "D-90>=2,D-90優惠敏感度高,D-14=0",
            "D-90>=2,90D優惠敏感度高,D-14=0",
        ])),
        "喚回-高優惠敏感"
    )

    set_when(
        (df["ops_type"] == "喚回"),
        "喚回-其他"
    )

    set_when(
        (df["ops_type"] == "既有regular鞏固"),
        "既有regular鞏固"
    )

    set_when(
        (df["ops_type"] == "養成Regular人數") &
        (df["sub_group"].isin([
            "M-1=reactived,D-90優惠敏感度高,D-14=0",
            "M-1=reactived,90D優惠敏感度高,D-14=0",
            "M=reactived,90D優惠敏感度高,D-14=0",
        ])),
        "養成Regular-高優惠敏感"
    )

    set_when(
        (df["ops_type"] == "養成Regular人數"),
        "養成Regular-其他"
    )

    set_when(
        df["sub_group"].isin([
            "D-90小晚尖＋午夜(20~2)預估車資人群>=3,D-14=0",
            "D-90平日晚尖峰(17~19)預估車資人群>=3,D-14=0",
            "D-90平日早尖峰(7~9)預估車資人群>=3,D-14=0",
        ]),
        "90天在尖峰預估車資"
    )

    set_when(
        df["sub_group"].isin([
            "D-14小晚尖＋午夜(20~2)預估車資未完成叫車",
            "D-90潮汐族、D-90>=3、D-14=0",
            "D-90潮汐族,D-90>=3,D-14=0",
            "D-14平日早尖峰(7~9)有預估車資未完成叫車",
        ]),
        "14天在其他尖峰預估車資"
    )

    set_when(
        df["sub_group"].isin([
            "D-14平日晚尖峰(17~19)預估車資未完成叫車",
        ]),
        "14天在晚尖峰預估車資"
    )

    return df

In [12]:
user = pd.read_csv("../data/user_list.csv", encoding="utf-8")

user = add_ops_type_merged(user)

  df.loc[mask & df["ops_type_merged"].isna(), "ops_type_merged"] = value


In [13]:
user.head()

Unnamed: 0,experiment_date,user_id,treatment,ops_type,sub_group,source,ops_type_merged
0,2025-12-22,5145040,15x2元1張,時段鞏固-小晚尖＋午夜,D-14小晚尖＋午夜(20~2)預估車資未完成叫車,隨機組,14天在其他尖峰預估車資
1,2025-11-17,302812,15x2元1張,時段鞏固-小晚尖＋午夜,D-14小晚尖＋午夜(20~2)預估車資未完成叫車,隨機組,14天在其他尖峰預估車資
2,2025-12-01,4375821,15x2元1張,時段鞏固-小晚尖＋午夜,D-14小晚尖＋午夜(20~2)預估車資未完成叫車,隨機組,14天在其他尖峰預估車資
3,2025-11-24,2273154,15x2元1張,時段鞏固-小晚尖＋午夜,D-14小晚尖＋午夜(20~2)預估車資未完成叫車,隨機組,14天在其他尖峰預估車資
4,2025-12-22,433188,15x2元1張,時段鞏固-小晚尖＋午夜,D-14小晚尖＋午夜(20~2)預估車資未完成叫車,隨機組,14天在其他尖峰預估車資


In [14]:
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1569791 entries, 0 to 1569790
Data columns (total 7 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   experiment_date  1569791 non-null  object
 1   user_id          1569791 non-null  int64 
 2   treatment        1569791 non-null  object
 3   ops_type         1569791 non-null  object
 4   sub_group        1569791 non-null  object
 5   source           1569791 non-null  object
 6   ops_type_merged  1569791 non-null  object
dtypes: int64(1), object(6)
memory usage: 83.8+ MB


In [None]:
# 這個cell跑完之後並無刪減數據，可能此資料集已經有處理過

exclude_ops = ['原排除人群, 加入部分人群塞券', '全客流失率標籤']

user = user[~user["ops_type"].isin(exclude_ops)].copy()

user = user[user.groupby(["user_id", "experiment_date"])["user_id"].transform("size") == 1].copy()

In [16]:
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1569791 entries, 0 to 1569790
Data columns (total 7 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   experiment_date  1569791 non-null  object
 1   user_id          1569791 non-null  int64 
 2   treatment        1569791 non-null  object
 3   ops_type         1569791 non-null  object
 4   sub_group        1569791 non-null  object
 5   source           1569791 non-null  object
 6   ops_type_merged  1569791 non-null  object
dtypes: int64(1), object(6)
memory usage: 83.8+ MB


In [6]:
user.head()

Unnamed: 0,experiment_date,user_id,treatment,ops_type,sub_group,source,ops_type_merged
0,2025-12-22,5145040,15x2元1張,時段鞏固-小晚尖＋午夜,D-14小晚尖＋午夜(20~2)預估車資未完成叫車,隨機組,14天在其他尖峰預估車資
1,2025-11-17,302812,15x2元1張,時段鞏固-小晚尖＋午夜,D-14小晚尖＋午夜(20~2)預估車資未完成叫車,隨機組,14天在其他尖峰預估車資
2,2025-12-01,4375821,15x2元1張,時段鞏固-小晚尖＋午夜,D-14小晚尖＋午夜(20~2)預估車資未完成叫車,隨機組,14天在其他尖峰預估車資
3,2025-11-24,2273154,15x2元1張,時段鞏固-小晚尖＋午夜,D-14小晚尖＋午夜(20~2)預估車資未完成叫車,隨機組,14天在其他尖峰預估車資
4,2025-12-22,433188,15x2元1張,時段鞏固-小晚尖＋午夜,D-14小晚尖＋午夜(20~2)預估車資未完成叫車,隨機組,14天在其他尖峰預估車資


In [17]:
user = user.drop(columns=["ops_type", "sub_group"])

In [18]:
user.head()

Unnamed: 0,experiment_date,user_id,treatment,source,ops_type_merged
0,2025-12-22,5145040,15x2元1張,隨機組,14天在其他尖峰預估車資
1,2025-11-17,302812,15x2元1張,隨機組,14天在其他尖峰預估車資
2,2025-12-01,4375821,15x2元1張,隨機組,14天在其他尖峰預估車資
3,2025-11-24,2273154,15x2元1張,隨機組,14天在其他尖峰預估車資
4,2025-12-22,433188,15x2元1張,隨機組,14天在其他尖峰預估車資


# 加入用戶的縣市資料

In [21]:
import pandas as pd

# 讀兩個檔
user_city_1 = pd.read_csv("../data/user_city_1.csv", encoding="utf-8")
user_city_2 = pd.read_csv("../data/user_city_2.csv", encoding="utf-8")

# 合併（直向疊加）
user_city = pd.concat([user_city_1, user_city_2], ignore_index=True)

# 欄位名清理（避免有前後空白導致後面用不到）
user_city.columns = user_city.columns.str.strip()

# 日期型別統一（後面要 join 會比較穩）
if "experiment_date" in user_city.columns:
    user_city["experiment_date"] = pd.to_datetime(user_city["experiment_date"], errors="coerce")

# 去重：同 key 多筆只留第一筆（若你想改成最後一筆，把 keep="last"）
if set(["experiment_date", "user_id"]).issubset(user_city.columns):
    user_city = user_city.drop_duplicates(subset=["experiment_date", "user_id"], keep="first")

In [22]:
import pandas as pd

# 1) key 欄位型別統一（避免 join 不到）
user = user.copy()
user_city = user_city.copy()

user["experiment_date"] = pd.to_datetime(user["experiment_date"])
user_city["experiment_date"] = pd.to_datetime(user_city["experiment_date"])

# 2) 如果 user_city 同一個 key 有多筆，先做去重（保留第一筆；你也可改成最多次/最新等規則）
user_city_key = user_city[["experiment_date", "user_id", "city"]].drop_duplicates(
    subset=["experiment_date", "user_id"],
    keep="first"
)

# 3) left join 回 user
user = user.merge(
    user_city_key,
    on=["experiment_date", "user_id"],
    how="left"
)

# 4) city 缺值補「臺北市」；順便把空字串也當缺值
user["city"] = user["city"].replace("", pd.NA).fillna("臺北市")

In [23]:
user.head()

Unnamed: 0,experiment_date,user_id,treatment,source,ops_type_merged,city
0,2025-12-22,5145040,15x2元1張,隨機組,14天在其他尖峰預估車資,臺北市
1,2025-11-17,302812,15x2元1張,隨機組,14天在其他尖峰預估車資,新北市
2,2025-12-01,4375821,15x2元1張,隨機組,14天在其他尖峰預估車資,新北市
3,2025-11-24,2273154,15x2元1張,隨機組,14天在其他尖峰預估車資,臺北市
4,2025-12-22,433188,15x2元1張,隨機組,14天在其他尖峰預估車資,臺北市


In [24]:
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1569791 entries, 0 to 1569790
Data columns (total 6 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   experiment_date  1569791 non-null  datetime64[ns]
 1   user_id          1569791 non-null  int64         
 2   treatment        1569791 non-null  object        
 3   source           1569791 non-null  object        
 4   ops_type_merged  1569791 non-null  object        
 5   city             1569791 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 71.9+ MB


In [25]:
col = "city"
if col not in user.columns:
    raise KeyError(f"Missing column: {col}. Available: {list(user.columns)}")

# 取不重複值（保留原出現順序），NaN 會印成 <NA>
u = pd.unique(user[col].astype("object").where(~user[col].isna(), other=pd.NA))

print("\n" + "=" * 80)
print(f"Column: {col} | dtype: {user[col].dtype} | n_unique: {len(u)}")
print("-" * 80)
for v in u:
    print("<NA>" if pd.isna(v) else repr(v))

print("\nDONE.")


Column: city | dtype: object | n_unique: 22
--------------------------------------------------------------------------------
'臺北市'
'新北市'
'臺中市'
'桃園市'
'彰化縣'
'高雄市'
'雲林縣'
'臺南市'
'屏東縣'
'基隆市'
'新竹縣'
'嘉義市'
'花蓮縣'
'臺東縣'
'新竹市'
'苗栗縣'
'宜蘭縣'
'澎湖縣'
'金門縣'
'嘉義縣'
'南投縣'
'連江縣'

DONE.


In [26]:
import pandas as pd

s = user["city"]

counts = s.value_counts(dropna=False)
shares = s.value_counts(dropna=False, normalize=True)

summary = pd.DataFrame({
    "count": counts,
    "share": shares
})

# 想要百分比欄位（可選）
summary["share_pct"] = (summary["share"] * 100).round(2)

print(summary)

       count     share  share_pct
city                             
臺北市   407188  0.259390      25.94
新北市   338403  0.215572      21.56
臺中市   212011  0.135057      13.51
高雄市   161468  0.102860      10.29
桃園市   128943  0.082140       8.21
臺南市    84652  0.053926       5.39
花蓮縣    45655  0.029083       2.91
基隆市    38000  0.024207       2.42
新竹市    25681  0.016360       1.64
新竹縣    21803  0.013889       1.39
嘉義市    16677  0.010624       1.06
雲林縣    15793  0.010061       1.01
臺東縣    13238  0.008433       0.84
彰化縣    12967  0.008260       0.83
宜蘭縣    12457  0.007935       0.79
嘉義縣    12074  0.007691       0.77
屏東縣    11538  0.007350       0.74
苗栗縣     6433  0.004098       0.41
南投縣     2696  0.001717       0.17
澎湖縣     1155  0.000736       0.07
金門縣      952  0.000606       0.06
連江縣        7  0.000004       0.00


In [27]:
from pathlib import Path

out_path = Path("..") / "cleaned_data" / "user_cleaned.csv"
user.to_csv(out_path, index=False, encoding="utf-8-sig")

print("Saved:", out_path.resolve())

Saved: D:\minhsiang.chang\Desktop\2026winter_project\cleaned_data\user_cleaned.csv
