In [1]:
%pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.4
Note: you may need to restart the kernel to use updated packages.


In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
from sklearn.metrics import top_k_accuracy_score


df = pd.read_csv('retail_store_sales_cleaned_feature_engineering.csv')
# 原始排序
df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])
df = df.sort_values(['Customer ID', 'Transaction Date']).reset_index(drop=True)

# 建立 Category LabelEncoder
le = LabelEncoder()
df['Category_ID'] = le.fit_transform(df['Category'])

# 建立嵌入矩陣（例如每個類別對應 8 維向量）
NUM_CATEGORIES = df['Category_ID'].nunique()
EMBEDDING_DIM = 16
embedding_layer = nn.Embedding(NUM_CATEGORIES, EMBEDDING_DIM)

# 記錄所有特徵向量
feature_rows = []

# 設定 N：考慮過去 N 筆購買行為
N = 10

# 在原有特徵基礎上，新增更多時間序列特徵
for cust_id, cust_df in df.groupby('Customer ID'):
    cust_df = cust_df.sort_values('Transaction Date')
    for i in range(N, len(cust_df) - 1):
        history = cust_df.iloc[i-N:i]
        current = cust_df.iloc[i]
        next_row = cust_df.iloc[i + 1]

        # 嵌入特徵
        cat_ids = torch.tensor(history['Category_ID'].tolist(), dtype=torch.long)
        embedded = embedding_layer(cat_ids)
        embedded_flat = embedded.flatten().detach().numpy()

        # 增強的特徵工程
        row = {
            # 原有特徵
            'TotalSpent_Mean': history['Total Spent'].mean(),
            'TotalSpent_Std': history['Total Spent'].std(),
            'TotalSpent_Last': history['Total Spent'].iloc[-1],
            'Quantity_Mean': history['Quantity'].mean(),
            'Quantity_Sum': history['Quantity'].sum(),
            'Discount_Used_Count': history['Disc_True'].sum(),
            'Discount_Rate': history['Disc_True'].mean(),
            
            # 時間特徵
            'Recency_Days': (current['Transaction Date'] - history['Transaction Date'].max()).days,
            'Frequency': len(history),
            'Days_Between_Purchases': history['Transaction Date'].diff().dt.days.mean(),
            
            # 類別序列特徵
            'Most_Frequent_Category': history['Category'].mode().iloc[0] if not history['Category'].mode().empty else 'Unknown',
            'Category_Diversity': history['Category'].nunique(),
            'Last_Category': history['Category'].iloc[-1],
            'Category_Repeat_Rate': (history['Category'] == history['Category'].iloc[-1]).mean(),
            
            # 現在的交易特徵
            'Is_Weekend': current['Is_Weekend'],
            'Is_Holiday': current['Is_Holiday'],
            'Is_NonWorkday': current['Is_NonWorkday'],
            'PM_Credit Card': current['PM_Credit Card'],
            'PM_Digital Wallet': current['PM_Digital Wallet'],
            'Loc_Online': current['Loc_Online'],
            'Disc_True': current['Disc_True'],
            'Disc_Unknown': current['Disc_Unknown'],
            'Current_Category': current['Category'],
            'Next_Category': next_row['Category']
        }

        # 拼接嵌入特徵
        for j, val in enumerate(embedded_flat):
            row[f'Embed_{j}'] = val

        feature_rows.append(row)
        

feature_df = pd.DataFrame(feature_rows).fillna(0)

# 對目標變數進行標籤編碼
y_encoder = LabelEncoder()
feature_df['Next_Category_Encoded'] = y_encoder.fit_transform(feature_df['Next_Category'])

# 對字串類型的特徵進行編碼
category_encoder = LabelEncoder()
feature_df['Most_Frequent_Category_Encoded'] = category_encoder.fit_transform(feature_df['Most_Frequent_Category'])

last_category_encoder = LabelEncoder()
feature_df['Last_Category_Encoded'] = last_category_encoder.fit_transform(feature_df['Last_Category'])

# 移除原始字串特徵，保留編碼後的特徵
feature_df = feature_df.drop(columns=['Most_Frequent_Category', 'Last_Category'])

# One-hot encode 現在購買的類別
X = pd.get_dummies(feature_df.drop(columns=['Next_Category', 'Next_Category_Encoded']), columns=['Current_Category'])
y = feature_df['Next_Category_Encoded']  # 使用編碼後的標籤

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# XGBoost 模型訓練
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
model.fit(X_train, y_train)

# 預測與評估
y_pred = model.predict(X_test)

# 將預測結果轉換回原始類別名稱以便查看
y_test_labels = y_encoder.inverse_transform(y_test)
y_pred_labels = y_encoder.inverse_transform(y_pred)

print(classification_report(y_test_labels, y_pred_labels))

# 顯示類別映射
print("\n類別編碼映射:")
for i, category in enumerate(y_encoder.classes_):
    print(f"{i}: {category}")

y_pred_proba = model.predict_proba(X_test)

# 使用 sklearn 的 top_k_accuracy_score
top1_acc = top_k_accuracy_score(y_test, y_pred_proba, k=1)
top3_acc = top_k_accuracy_score(y_test, y_pred_proba, k=3)

print(f"Top-1 準確率: {top1_acc:.4f}")
print(f"Top-3 準確率: {top3_acc:.4f}")

Parameters: { "use_label_encoder" } are not used.



                                    precision    recall  f1-score   support

                         Beverages       0.10      0.10      0.10       308
                          Butchers       0.13      0.13      0.13       305
Computers and electric accessories       0.14      0.14      0.14       305
     Electric household essentials       0.12      0.12      0.12       312
                              Food       0.10      0.10      0.10       313
                         Furniture       0.11      0.11      0.11       310
                     Milk Products       0.13      0.13      0.13       308
                        Patisserie       0.12      0.11      0.11       299

                          accuracy                           0.12      2460
                         macro avg       0.12      0.12      0.12      2460
                      weighted avg       0.12      0.12      0.12      2460


類別編碼映射:
0: Beverages
1: Butchers
2: Computers and electric accessories
3: Electric h

In [3]:
# 計算每位顧客的購買次數
cust_counts = df.groupby('Customer ID').size()

# 最大購買次數
max_n = cust_counts.max()
print(f"✅ 所有顧客中最多的購買次數是：{max_n}")


✅ 所有顧客中最多的購買次數是：544


In [7]:
purchase_counts = df.groupby('Customer ID').size()
max_available_N = purchase_counts.max() - 1  # 最長可產生序列的長度

print(f"🔧 你最多可以設 N 為：{max_available_N}")

🔧 你最多可以設 N 為：543


📊 Total training samples created: 232


In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm
PAD_TOKEN = -1
df = pd.read_csv('retail_store_sales_cleaned_feature_engineering.csv')
# =======================
# 資料處理：組成序列資料
# =======================
df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])
df = df.sort_values(['Customer ID', 'Transaction Date']).reset_index(drop=True)

# 編碼類別
le = LabelEncoder()
df['Category_ID'] = le.fit_transform(df['Category'])
num_classes = df['Category_ID'].nunique()



PAD_TOKEN = df['Category_ID'].nunique()  # 類別數，保證 PAD 不重複
N = 500  # 固定序列長度
seq_data = []

for cust_id, cust_df in df.groupby('Customer ID'):
    category_ids = cust_df['Category_ID'].tolist()
    T = len(category_ids)

    for i in range(1, T):  # 每一筆要預測第 i 筆（從第 1 筆開始）
        history = category_ids[:i]           # 顧客第 i 筆前的所有紀錄
        padded = [PAD_TOKEN] * max(0, N - len(history)) + history[-N:]  # 補 PAD 至長度 N
        label = category_ids[i]              # 第 i 筆的類別就是 label
        seq_data.append((padded, label))



print(f"📊 Total training samples created: {len(seq_data)}")
# 分訓練測試集
X_seq, y_seq = zip(*seq_data)
X_train, X_test, y_train, y_test = train_test_split(list(X_seq), list(y_seq), test_size=0.1, stratify=y_seq)

# =======================
# 建立 Dataset & DataLoader
# =======================
class PurchaseDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.LongTensor(X)
        self.y = torch.LongTensor(y)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = PurchaseDataset(X_train, y_train)
test_ds = PurchaseDataset(X_test, y_test)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=32)

# =======================
# 建立 LSTM 模型
# =======================
class LSTMPredictor(nn.Module):
    def __init__(self, num_classes, emb_dim=32, hidden_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=num_classes + 1, embedding_dim=32, padding_idx=PAD_TOKEN)
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
    def forward(self, x):
        x = self.embedding(x)  # (batch, seq_len, emb_dim)
        _, (h_n, _) = self.lstm(x)  # 只取最後一層 hidden
        out = self.fc(h_n.squeeze(0))  # (batch, num_classes)
        return out

# =======================
# 模型訓練
# =======================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMPredictor(num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

EPOCHS = 100
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    correct_top1 = 0
    correct_top3 = 0
    total_samples = 0

    print(f"\n🚀 Epoch {epoch+1}/{EPOCHS}")
    for X_batch, y_batch in tqdm(train_loader, desc="Training", leave=False):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)  # (batch_size, num_classes)

        # 計算損失
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # -------- 準確率計算 --------
        _, pred_top1 = torch.max(outputs, dim=1)
        correct_top1 += (pred_top1 == y_batch).sum().item()

        top3 = torch.topk(outputs, 3, dim=1).indices
        for i in range(len(y_batch)):
            if y_batch[i] in top3[i]:
                correct_top3 += 1

        total_samples += y_batch.size(0)
        # ---------------------------

    avg_loss = total_loss / len(train_loader)
    top1_acc = correct_top1 / total_samples
    top3_acc = correct_top3 / total_samples

    print(f"✅ Epoch {epoch+1} - Loss: {avg_loss:.4f} | Top-1 Accuracy: {top1_acc:.2%} | Top-3 Accuracy: {top3_acc:.2%}")
# =======================
# Top-3 預測與評估
# =======================
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        logits = model(X_batch)
        top3 = torch.topk(logits, 3, dim=1).indices.cpu().numpy()
        all_preds.extend(top3)
        all_labels.extend(y_batch.numpy())

# Top-3 命中率
hits = [label in pred for label, pred in zip(all_labels, all_preds)]
top3_acc = np.mean(hits)
print(f"\n✅ Top-3 Accuracy: {top3_acc:.2%}")


📊 Total training samples created: 12550

🚀 Epoch 1/100


                                                          

KeyboardInterrupt: 

In [1]:
%pip install  lightgbm
%pip install xgboost

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m202.7 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
from collections import deque
from scipy.stats import randint, uniform

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, make_scorer

# -----------------------------
# 1. 讀檔 + 時間排序
# -----------------------------
df = pd.read_csv(
    "retail_store_sales_cleaned_feature_engineering.csv",
    parse_dates=["Transaction Date"]
)
df = df.sort_values(["Customer ID", "Transaction Date"])\
       .reset_index(drop=True)

# -----------------------------
# 2. 目標編碼
# -----------------------------
le = LabelEncoder()
df["Category_ID"] = le.fit_transform(df["Category"])

# -----------------------------
# 3. 衍生特徵：Inter_Days、30天RFM、週期編碼、flags
# -----------------------------
df["Prev_Date"] = df.groupby("Customer ID")["Transaction Date"].shift(1)
df["Inter_Days"] = (df["Transaction Date"] - df["Prev_Date"])\
                   .dt.days.fillna(0)

window_times, window_amounts = deque(), deque()
freq30 = np.zeros(len(df), int)
amt30_sum = np.zeros(len(df))
amt30_count = np.zeros(len(df))

for i, (cust, t, amt) in enumerate(zip(
    df["Customer ID"],
    df["Transaction Date"],
    df["Total Spent"]
)):
    if i == 0 or cust != df.loc[i-1, "Customer ID"]:
        window_times.clear(); window_amounts.clear()
    window_times.append(t); window_amounts.append(amt)
    while (t - window_times[0]).days > 30:
        window_times.popleft(); window_amounts.popleft()
    freq30[i]     = len(window_times) - 1
    amt30_sum[i]  = sum(window_amounts)
    amt30_count[i]= len(window_times)

df["freq30"]     = freq30
df["amt30_mean"] = amt30_sum / amt30_count

# 週期編碼
df["month_sin"] = np.sin(2*np.pi*df["Month"]  / 12)
df["month_cos"] = np.cos(2*np.pi*df["Month"]  / 12)
df["day_sin"]   = np.sin(2*np.pi*(df["Day"]-1)/31)
df["day_cos"]   = np.cos(2*np.pi*(df["Day"]-1)/31)

bool_cols = [
    "Is_Weekend","Is_Holiday","Is_NonWorkday",
    "PM_Credit Card","PM_Digital Wallet",
    "Loc_Online","Disc_True","Disc_Unknown"
]
df[bool_cols] = df[bool_cols].astype(int)

feature_cols = [
    "Price Per Unit","Quantity","Total Spent","Recency_Cust",
    "Inter_Days","freq30","amt30_mean",
    "month_sin","month_cos","day_sin","day_cos"
] + bool_cols

# -----------------------------
# 4. per-user 時間切分 80%/20%
# -----------------------------
train_list, test_list = [], []
for _, grp in df.groupby("Customer ID"):
    grp = grp.sort_values("Transaction Date").reset_index(drop=True)
    cut = int(len(grp) * 0.8)
    train_list.append(grp.iloc[:cut])
    test_list.append(grp.iloc[cut:])

train_df = pd.concat(train_list).reset_index(drop=True)
test_df  = pd.concat(test_list).reset_index(drop=True)

# -----------------------------
# 5. 標準化
# -----------------------------
scaler = StandardScaler()
train_df[feature_cols] = scaler.fit_transform(train_df[feature_cols])
test_df [feature_cols] = scaler.transform(test_df[feature_cols])

X_train, y_train = train_df[feature_cols], train_df["Category_ID"]
X_test,  y_test  = test_df [feature_cols], test_df ["Category_ID"]

# -----------------------------
# 6. Recall@3 打分函式（先轉成 numpy）
# -----------------------------
def recall_at_3(y_true, y_score):
    y_true = np.array(y_true)  # <- 這行很重要
    top3 = np.argsort(y_score, axis=1)[:, -3:]
    return np.mean([y_true[i] in top3[i] for i in range(len(y_true))])

recall3_scorer = make_scorer(recall_at_3, needs_proba=True)

# -----------------------------
# 7. 模型與超參搜尋空間
# -----------------------------
models_and_spaces = {
    "LogisticRegression": (
        LogisticRegression(random_state=42, max_iter=1000),
        {
            "C": uniform(0.01, 10),
            "penalty": ["l2"],
            "solver": ["lbfgs"]
        }
    ),
    "RandomForest": (
        RandomForestClassifier(random_state=42),
        {
            "n_estimators": randint(50, 300),
            "max_depth": randint(3, 20),
            "max_features": ["sqrt", "log2", None]
        }
    ),
    "HistGradientBoost": (
        HistGradientBoostingClassifier(random_state=42),
        {
            "learning_rate": uniform(0.01, 0.3),
            "max_iter": randint(50, 300),
            "max_leaf_nodes": randint(10, 100)
        }
    ),
    "ExtraTrees": (
        ExtraTreesClassifier(random_state=42),
        {
            "n_estimators": randint(50, 300),
            "max_depth": randint(3, 20),
            "max_features": ["sqrt", "log2", None]
        }
    ),
    "LightGBM": (
        LGBMClassifier(random_state=42),
        {
            "n_estimators": randint(50, 300),
            "learning_rate": uniform(0.01, 0.3),
            "num_leaves": randint(10, 150)
        }
    ),
    # "XGBoost": (
    #     XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="mlogloss"),
    #     {
    #         "n_estimators": randint(50, 300),
    #         "learning_rate": uniform(0.01, 0.3),
    #         "max_depth": randint(3, 20),
    #         "subsample": uniform(0.5, 0.5)
    #     }
    # )
}

# -----------------------------
# 8. RandomizedSearchCV
# -----------------------------
best_results = []
for name, (estimator, param_dist) in models_and_spaces.items():
    print(f"\n>> Tuning {name} ...")
    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_dist,
        n_iter=20,
        scoring={"acc": "accuracy", "rec3": recall3_scorer},
        refit="rec3",
        cv=3,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    search.fit(X_train, y_train)

    best = search.best_estimator_
    test_acc  = accuracy_score(y_test, best.predict(X_test))
    test_rec3 = recall_at_3(y_test, best.predict_proba(X_test))

    best_results.append({
        "Model": name,
        "BestParams": search.best_params_,
        "Val_Recall@3": f"{search.best_score_:.2%}",
        "Test_Acc": f"{test_acc:.2%}",
        "Test_Recall@3": f"{test_rec3:.2%}"
    })

# -----------------------------
# 9. 列印結果
# -----------------------------
results_df = pd.DataFrame(best_results)
print("\n=== Final Comparison ===")
print(results_df.to_string(index=False))



>> Tuning LogisticRegression ...
Fitting 3 folds for each of 20 candidates, totalling 60 fits





>> Tuning RandomForest ...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

>> Tuning HistGradientBoost ...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

>> Tuning ExtraTrees ...
Fitting 3 folds for each of 20 candidates, totalling 60 fits





>> Tuning LightGBM ...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002555 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 690
[LightGBM] [Info] Number of data points in the train set: 6699, number of used features: 19
[LightGBM] [Info] Start training from score -2.093119
[LightGBM] [Info] Start training from score -2.069194
[LightGBM] [Info] Start training from score -2.107753
[LightGBM] [Info] Start training from score -2.053945
[LightGBM] [Info] Start training from score -2.082282
[LightGBM] [Info] Start training from score -2.060954
[LightGBM] [Info] Start training from score -2.064477
[LightGBM] [Info] Start training from score -2.105299
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002902 seconds.
You can set `force_col_wise=



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009535 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 691
[LightGBM] [Info] Number of data points in the train set: 6699, number of used features: 19
[LightGBM] [Info] Start training from score -2.093119
[LightGBM] [Info] Start training from score -2.069194
[LightGBM] [Info] Start training from score -2.107753
[LightGBM] [Info] Start training from score -2.052781
[LightGBM] [Info] Start training from score -2.083480
[LightGBM] [Info] Start training from score -2.060954
[LightGBM] [Info] Start training from score -2.064477
[LightGBM] [Info] Start training from score -2.105299
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003346 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 690
[LightGBM] [Info] Numb

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode


=== Final Comparison ===
             Model                                                                                                  BestParams Val_Recall@3 Test_Acc Test_Recall@3
LogisticRegression                                               {'C': 2.1333911067827613, 'penalty': 'l2', 'solver': 'lbfgs'}       39.54%   14.57%        39.55%
      RandomForest                                                 {'max_depth': 6, 'max_features': None, 'n_estimators': 199}       51.65%   20.78%        53.84%
 HistGradientBoost                              {'learning_rate': 0.010233629752304298, 'max_iter': 237, 'max_leaf_nodes': 30}       49.05%   19.91%        50.71%
        ExtraTrees                                                 {'max_depth': 6, 'max_features': None, 'n_estimators': 199}       52.86%   20.86%        53.64%
          LightGBM                              {'learning_rate': 0.010233629752304298, 'n_estimators': 237, 'num_leaves': 30}       49.36%   18.49%        50.

In [6]:
results_df

Unnamed: 0,Model,BestParams,Val_Recall@3,Test_Acc,Test_Recall@3
0,LogisticRegression,"{'C': 2.1333911067827613, 'penalty': 'l2', 'so...",39.54%,14.57%,39.55%
1,RandomForest,"{'max_depth': 6, 'max_features': None, 'n_esti...",51.65%,20.78%,53.84%
2,HistGradientBoost,"{'learning_rate': 0.010233629752304298, 'max_i...",49.05%,19.91%,50.71%
3,ExtraTrees,"{'max_depth': 6, 'max_features': None, 'n_esti...",52.86%,20.86%,53.64%
4,LightGBM,"{'learning_rate': 0.010233629752304298, 'n_est...",49.36%,18.49%,50.91%
5,XGBoost,"{'learning_rate': 0.05958008171890075, 'max_de...",48.10%,17.97%,50.00%


In [7]:
import pandas as pd
import numpy as np
from collections import deque
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans


import pandas as pd
import numpy as np
from collections import deque
from scipy.stats import randint, uniform

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, make_scorer

# -----------------------------
# 1. 讀檔 + 時間排序
# -----------------------------
df = pd.read_csv(
    "retail_store_sales_cleaned_feature_engineering.csv",
    parse_dates=["Transaction Date"]
)
df = df.sort_values(["Customer ID", "Transaction Date"])\
       .reset_index(drop=True)

# -----------------------------
# 2. 目標編碼
# -----------------------------
le = LabelEncoder()
df["Category_ID"] = le.fit_transform(df["Category"])

# -----------------------------
# 3. 衍生特徵：Inter_Days、30天RFM、週期編碼、flags
# -----------------------------
df["Prev_Date"] = df.groupby("Customer ID")["Transaction Date"].shift(1)
df["Inter_Days"] = (df["Transaction Date"] - df["Prev_Date"])\
                   .dt.days.fillna(0)

window_times, window_amounts = deque(), deque()
freq30 = np.zeros(len(df), int)
amt30_sum = np.zeros(len(df))
amt30_count = np.zeros(len(df))

for i, (cust, t, amt) in enumerate(zip(
    df["Customer ID"],
    df["Transaction Date"],
    df["Total Spent"]
)):
    if i == 0 or cust != df.loc[i-1, "Customer ID"]:
        window_times.clear(); window_amounts.clear()
    window_times.append(t); window_amounts.append(amt)
    while (t - window_times[0]).days > 30:
        window_times.popleft(); window_amounts.popleft()
    freq30[i]     = len(window_times) - 1
    amt30_sum[i]  = sum(window_amounts)
    amt30_count[i]= len(window_times)

df["freq30"]     = freq30
df["amt30_mean"] = amt30_sum / amt30_count

# 週期編碼
df["month_sin"] = np.sin(2*np.pi*df["Month"]  / 12)
df["month_cos"] = np.cos(2*np.pi*df["Month"]  / 12)
df["day_sin"]   = np.sin(2*np.pi*(df["Day"]-1)/31)
df["day_cos"]   = np.cos(2*np.pi*(df["Day"]-1)/31)

bool_cols = [
    "Is_Weekend","Is_Holiday","Is_NonWorkday",
    "PM_Credit Card","PM_Digital Wallet",
    "Loc_Online","Disc_True","Disc_Unknown"
]
df[bool_cols] = df[bool_cols].astype(int)

feature_cols = [
    "Price Per Unit","Quantity","Total Spent","Recency_Cust",
    "Inter_Days","freq30","amt30_mean",
    "month_sin","month_cos","day_sin","day_cos"
] + bool_cols

def rolling_rfm(df, window_days, amt_col="Total Spent"):
    times, amts, freqs, sums = deque(), deque(), [], []
    for cust, group in df.groupby("Customer ID", sort=False):
        times.clear(); amts.clear()
        for t, amt in zip(group["Transaction Date"], group[amt_col]):
            times.append(t); amts.append(amt)
            # pop 超出 window_days
            while (t - times[0]).days > window_days:
                times.popleft(); amts.popleft()
            freqs.append(len(times)-1)            # 扣掉自己
            sums.append(sum(amts))
    return freqs, sums

df = df.sort_values(["Customer ID","Transaction Date"]).reset_index(drop=True)
# 7 天
df["freq7"],  df["amt7_sum"]  = rolling_rfm(df,  7)
# 30 天（以前已有 freq30, amt30_mean），如果想保留 sum 可覆蓋或改名
# 90 天
df["freq90"], df["amt90_sum"] = rolling_rfm(df, 90)

# 為了跟 amt30_mean 同步，上面 sum 改成 mean 也很簡單：
df["amt7_mean"]  = df["amt7_sum"]  / (df["freq7"]  + 1)  # +1 包含當前交易
df["amt90_mean"] = df["amt90_sum"] / (df["freq90"] + 1)

# --- 2. 距今最後購買日的指數衰減特徵（Recency） ---------------------

# 先算「距今天數」
today = df["Transaction Date"].max()  # or datetime.today()
last_purchase = df.groupby("Customer ID")["Transaction Date"].transform("max")
df["days_since_last"] = (today - last_purchase).dt.days

# 再做指數衰減：exp(-λ·days)，λ 可調
lam = 0.05
df["recency_exp"] = np.exp(-lam * df["days_since_last"])

# --- 3. 客戶整體 RFM 聚類 -----------------------------------------

# 為每位客戶計算整體 RFM 向量
rfm = df.groupby("Customer ID").agg({
    "days_since_last": "min",               # Recency：最近那筆交易距今
    "Transaction ID": "count",              # Frequency：交易次數
    "Total Spent": "sum"                    # Monetary：總花費
}).rename(columns={
    "days_since_last": "R",
    "Transaction ID": "F",
    "Total Spent": "M"
})

# 標準化後做 KMeans
scaler_rfm = StandardScaler()
rfm_scaled = scaler_rfm.fit_transform(rfm[["R","F","M"]])
kmeans = KMeans(n_clusters=5, random_state=42)
rfm["cluster"] = kmeans.fit_predict(rfm_scaled)

# Merge 回原始 df
df = df.merge(rfm["cluster"], left_on="Customer ID", right_index=True)

# --- 4. 更新 feature_cols ---------------------------------------

new_feats = [
    "freq7", "amt7_mean",
    "freq90", "amt90_mean",
    "days_since_last", "recency_exp",
    "cluster"
]
feature_cols += new_feats


train_list, test_list = [], []
for _, grp in df.groupby("Customer ID"):
    grp = grp.sort_values("Transaction Date").reset_index(drop=True)
    cut = int(len(grp) * 0.8)
    train_list.append(grp.iloc[:cut])
    test_list.append(grp.iloc[cut:])

train_df = pd.concat(train_list).reset_index(drop=True)
test_df  = pd.concat(test_list).reset_index(drop=True)

# -----------------------------
# 5. 標準化
# -----------------------------
scaler = StandardScaler()
train_df[feature_cols] = scaler.fit_transform(train_df[feature_cols])
test_df [feature_cols] = scaler.transform(test_df[feature_cols])

X_train, y_train = train_df[feature_cols], train_df["Category_ID"]
X_test,  y_test  = test_df [feature_cols], test_df ["Category_ID"]

# -----------------------------
# 6. Recall@3 打分函式（先轉成 numpy）
# -----------------------------
def recall_at_3(y_true, y_score):
    y_true = np.array(y_true)  # <- 這行很重要
    top3 = np.argsort(y_score, axis=1)[:, -3:]
    return np.mean([y_true[i] in top3[i] for i in range(len(y_true))])

recall3_scorer = make_scorer(recall_at_3, needs_proba=True)

# -----------------------------
# 7. 模型與超參搜尋空間
# -----------------------------
models_and_spaces = {
    "LogisticRegression": (
        LogisticRegression(random_state=42, max_iter=1000),
        {
            "C": uniform(0.01, 10),
            "penalty": ["l2"],
            "solver": ["lbfgs"]
        }
    ),
    "RandomForest": (
        RandomForestClassifier(random_state=42),
        {
            "n_estimators": randint(50, 300),
            "max_depth": randint(3, 20),
            "max_features": ["sqrt", "log2", None]
        }
    ),
    "HistGradientBoost": (
        HistGradientBoostingClassifier(random_state=42),
        {
            "learning_rate": uniform(0.01, 0.3),
            "max_iter": randint(50, 300),
            "max_leaf_nodes": randint(10, 100)
        }
    ),
    "ExtraTrees": (
        ExtraTreesClassifier(random_state=42),
        {
            "n_estimators": randint(50, 300),
            "max_depth": randint(3, 20),
            "max_features": ["sqrt", "log2", None]
        }
    ),
    "LightGBM": (
        LGBMClassifier(random_state=42),
        {
            "n_estimators": randint(50, 300),
            "learning_rate": uniform(0.01, 0.3),
            "num_leaves": randint(10, 150)
        }
    ),
    # "XGBoost": (
    #     XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="mlogloss"),
    #     {
    #         "n_estimators": randint(50, 300),
    #         "learning_rate": uniform(0.01, 0.3),
    #         "max_depth": randint(3, 20),
    #         "subsample": uniform(0.5, 0.5)
    #     }
    # )
}

# -----------------------------
# 8. RandomizedSearchCV
# -----------------------------
best_results = []
for name, (estimator, param_dist) in models_and_spaces.items():
    print(f"\n>> Tuning {name} ...")
    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_dist,
        n_iter=20,
        scoring={"acc": "accuracy", "rec3": recall3_scorer},
        refit="rec3",
        cv=3,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    search.fit(X_train, y_train)

    best = search.best_estimator_
    test_acc  = accuracy_score(y_test, best.predict(X_test))
    test_rec3 = recall_at_3(y_test, best.predict_proba(X_test))

    best_results.append({
        "Model": name,
        "BestParams": search.best_params_,
        "Val_Recall@3": f"{search.best_score_:.2%}",
        "Test_Acc": f"{test_acc:.2%}",
        "Test_Recall@3": f"{test_rec3:.2%}"
    })

# -----------------------------
# 9. 列印結果
# -----------------------------
results_df = pd.DataFrame(best_results)
print("\n=== Final Comparison ===")
print(results_df.to_string(index=False))



>> Tuning LogisticRegression ...
Fitting 3 folds for each of 20 candidates, totalling 60 fits





>> Tuning RandomForest ...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

>> Tuning HistGradientBoost ...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

>> Tuning ExtraTrees ...
Fitting 3 folds for each of 20 candidates, totalling 60 fits





>> Tuning LightGBM ...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002774 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1293
[LightGBM] [Info] Number of data points in the train set: 6699, number of used features: 26
[LightGBM] [Info] Start training from score -2.093119
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001884 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Start training from score -2.069194[LightGBM] [Info] Total Bins 1296

[LightGBM] [Info] Number of data points in the train set: 6699, number of used features: 26
[LightGBM] [Info] Start training from score -2.107753
[LightGBM] [Info] Start training from score -2.053945
[LightGBM] [Info] Start training from score -2.093119
[Ligh



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001109 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1293
[LightGBM] [Info] Number of data points in the train set: 6700, number of used features: 26
[LightGBM] [Info] Start training from score -2.093268
[LightGBM] [Info] Start training from score -2.069343
[LightGBM] [Info] Start training from score -2.107902
[LightGBM] [Info] Start training from score -2.054094
[LightGBM] [Info] Start training from score -2.082431
[LightGBM] [Info] Start training from score -2.059932
[LightGBM] [Info] Start training from score -2.064626
[LightGBM] [Info] Start training from score -2.105448
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001036 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wi

In [8]:
results_df

Unnamed: 0,Model,BestParams,Val_Recall@3,Test_Acc,Test_Recall@3
0,LogisticRegression,"{'C': 0.21584494295802448, 'penalty': 'l2', 's...",39.89%,13.66%,40.02%
1,RandomForest,"{'max_depth': 6, 'max_features': None, 'n_esti...",51.17%,20.55%,53.52%
2,HistGradientBoost,"{'learning_rate': 0.010233629752304298, 'max_i...",48.19%,18.69%,50.36%
3,ExtraTrees,"{'max_depth': 6, 'max_features': None, 'n_esti...",52.37%,20.59%,53.92%
4,LightGBM,"{'learning_rate': 0.010233629752304298, 'n_est...",48.24%,18.05%,50.36%


In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRanker
from sklearn.metrics import ndcg_score, recall_score
from lightgbm import LGBMRanker, log_evaluation, early_stopping
# 1. 读取并预处理（跟你之前的 pipeline 一致，只留关键步骤）
df = pd.read_csv("retail_store_sales_cleaned_feature_engineering.csv",
                 parse_dates=["Transaction Date"])
df = df.sort_values(["Customer ID", "Transaction Date"]).reset_index(drop=True)

# 假设你已经做完所有特征工程，并把 RFM、多窗口特征、聚类特征都加进来了：

le = LabelEncoder()
df["Category_ID"] = le.fit_transform(df["Category"])
# -----------------------------
# 3. 衍生特徵：Inter_Days、30天RFM、週期編碼、flags
# -----------------------------
df["Prev_Date"] = df.groupby("Customer ID")["Transaction Date"].shift(1)
df["Inter_Days"] = (df["Transaction Date"] - df["Prev_Date"])\
                   .dt.days.fillna(0)

window_times, window_amounts = deque(), deque()
freq30 = np.zeros(len(df), int)
amt30_sum = np.zeros(len(df))
amt30_count = np.zeros(len(df))

for i, (cust, t, amt) in enumerate(zip(
    df["Customer ID"],
    df["Transaction Date"],
    df["Total Spent"]
)):
    if i == 0 or cust != df.loc[i-1, "Customer ID"]:
        window_times.clear(); window_amounts.clear()
    window_times.append(t); window_amounts.append(amt)
    while (t - window_times[0]).days > 30:
        window_times.popleft(); window_amounts.popleft()
    freq30[i]     = len(window_times) - 1
    amt30_sum[i]  = sum(window_amounts)
    amt30_count[i]= len(window_times)

df["freq30"]     = freq30
df["amt30_mean"] = amt30_sum / amt30_count

# 週期編碼
df["month_sin"] = np.sin(2*np.pi*df["Month"]  / 12)
df["month_cos"] = np.cos(2*np.pi*df["Month"]  / 12)
df["day_sin"]   = np.sin(2*np.pi*(df["Day"]-1)/31)
df["day_cos"]   = np.cos(2*np.pi*(df["Day"]-1)/31)

bool_cols = [
    "Is_Weekend","Is_Holiday","Is_NonWorkday",
    "PM_Credit Card","PM_Digital Wallet",
    "Loc_Online","Disc_True","Disc_Unknown"
]
df[bool_cols] = df[bool_cols].astype(int)

feature_cols = [
    "Price Per Unit","Quantity","Total Spent","Recency_Cust",
    "Inter_Days","freq30","amt30_mean",
    "month_sin","month_cos","day_sin","day_cos"
] + bool_cols

def rolling_rfm(df, window_days, amt_col="Total Spent"):
    times, amts, freqs, sums = deque(), deque(), [], []
    for cust, group in df.groupby("Customer ID", sort=False):
        times.clear(); amts.clear()
        for t, amt in zip(group["Transaction Date"], group[amt_col]):
            times.append(t); amts.append(amt)
            # pop 超出 window_days
            while (t - times[0]).days > window_days:
                times.popleft(); amts.popleft()
            freqs.append(len(times)-1)            # 扣掉自己
            sums.append(sum(amts))
    return freqs, sums

df = df.sort_values(["Customer ID","Transaction Date"]).reset_index(drop=True)
# 7 天
df["freq7"],  df["amt7_sum"]  = rolling_rfm(df,  7)
# 30 天（以前已有 freq30, amt30_mean），如果想保留 sum 可覆蓋或改名
# 90 天
df["freq90"], df["amt90_sum"] = rolling_rfm(df, 90)

# 為了跟 amt30_mean 同步，上面 sum 改成 mean 也很簡單：
df["amt7_mean"]  = df["amt7_sum"]  / (df["freq7"]  + 1)  # +1 包含當前交易
df["amt90_mean"] = df["amt90_sum"] / (df["freq90"] + 1)

# --- 2. 距今最後購買日的指數衰減特徵（Recency） ---------------------

# 先算「距今天數」
today = df["Transaction Date"].max()  # or datetime.today()
last_purchase = df.groupby("Customer ID")["Transaction Date"].transform("max")
df["days_since_last"] = (today - last_purchase).dt.days

# 再做指數衰減：exp(-λ·days)，λ 可調
lam = 0.05
df["recency_exp"] = np.exp(-lam * df["days_since_last"])

# --- 3. 客戶整體 RFM 聚類 -----------------------------------------

# 為每位客戶計算整體 RFM 向量
rfm = df.groupby("Customer ID").agg({
    "days_since_last": "min",               # Recency：最近那筆交易距今
    "Transaction ID": "count",              # Frequency：交易次數
    "Total Spent": "sum"                    # Monetary：總花費
}).rename(columns={
    "days_since_last": "R",
    "Transaction ID": "F",
    "Total Spent": "M"
})

# 標準化後做 KMeans
scaler_rfm = StandardScaler()
rfm_scaled = scaler_rfm.fit_transform(rfm[["R","F","M"]])
kmeans = KMeans(n_clusters=5, random_state=42)
rfm["cluster"] = kmeans.fit_predict(rfm_scaled)

# Merge 回原始 df
df = df.merge(rfm["cluster"], left_on="Customer ID", right_index=True)

# --- 4. 更新 feature_cols ---------------------------------------

new_feats = [
    "freq7", "amt7_mean",
    "freq90", "amt90_mean",
    "days_since_last", "recency_exp",
    "cluster"
]
feature_cols += new_feats
# 2. 为每个“查询”（query），即每一次要预测下一品类的历史序列，构造正负样本：
#    把每条正样本 (seq → true next Category_ID) 和随机采的几个负样本拼到一起，用同一个 query_id 分组。
records = []
for cust, grp in df.groupby("Customer ID"):
    cats = grp["Category_ID"].values
    feats = grp[feature_cols].values
    for i in range(1, len(grp)):
        hist_feat = feats[i-1]  # 这里用“上一笔”特征，也可以用窗口聚合等
        true_cat  = cats[i]
        # 正样本
        records.append((cust, hist_feat, true_cat, 1))
        # 负采样 3 个
        for _ in range(3):
            neg = np.random.randint(0, df["Category_ID"].nunique())
            while neg == true_cat:
                neg = np.random.randint(0, df["Category_ID"].nunique())
            records.append((cust, hist_feat, neg, 0))

rank_df = pd.DataFrame(records, columns=["query_id","feat","cat_id","label"])

# 3. 构造训练矩阵：将 feat 展开、cat_id 用 one-hot 或 embedding 特征拼进去
#    这里示例：直接把 feat + “cat_id” 做 one-hot（也可用 embedding lookup）。
X = np.stack(rank_df["feat"].values)
# one-hot encode cat_id：
K = df["Category_ID"].nunique()
onehots = np.eye(K)[rank_df["cat_id"].values]
X = np.hstack([X, onehots])

y = rank_df["label"].values
# group sizes = 每个 query_id 下的样本数量
group = rank_df.groupby("query_id").size().astype(int).tolist()

# 4. 切分（保证 query 整体不拆分）
qids = rank_df["query_id"].unique()
q_train, q_test = train_test_split(qids, test_size=0.2, random_state=42)
train_mask = rank_df["query_id"].isin(q_train)
X_train, y_train, g_train = X[train_mask], y[train_mask], rank_df[train_mask]\
    .groupby("query_id")\
    .size()\
    .astype(int)\
    .tolist()
X_test, y_test, g_test = X[~train_mask], y[~train_mask], rank_df[~train_mask]\
    .groupby("query_id")\
    .size()\
    .astype(int)\
    .tolist()

# 5. 用 LGBMRanker 训练
callbacks = [
    log_evaluation(period=10),

]

ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="gbdt",
    n_estimators=1000,
    learning_rate=0.01,
    random_state=42,
    importance_type="gain"
)

ranker.fit(
    X_train, y_train,
    group=g_train,
    eval_set=[(X_test, y_test)],
    eval_group=[g_test],
    eval_at=[3],          # NDCG@3
    callbacks=callbacks    # ← 在这里传入 callbacks，取代 verbose
)
# 6. 评估 Top-3 Recall
#    对测试每个 query，predict 得分后取 top3，看正样本是否在内
pred_scores = ranker.predict(X_test)
# 重组为 per-query scores
offset = 0
hits = []
for grp_size in g_test:
    grp_scores = pred_scores[offset:offset+grp_size]
    grp_labels = y_test      [offset:offset+grp_size]
    topk_idx   = np.argsort(grp_scores)[-3:]
    # 只要正样本(label=1)在 top3 就算命中
    hits.append(any(grp_labels[i]==1 for i in topk_idx))
    offset += grp_size

print("Ranker Test Top-3 Recall: ", np.mean(hits))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001460 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1303
[LightGBM] [Info] Number of data points in the train set: 40004, number of used features: 34
[10]	valid_0's ndcg@3: 0.106144
[20]	valid_0's ndcg@3: 0
[30]	valid_0's ndcg@3: 0.0469279
[40]	valid_0's ndcg@3: 0
[50]	valid_0's ndcg@3: 0
[60]	valid_0's ndcg@3: 0.0592164
[70]	valid_0's ndcg@3: 0.0592164
[80]	valid_0's ndcg@3: 0
[90]	valid_0's ndcg@3: 0
[100]	valid_0's ndcg@3: 0.0469279
[110]	valid_0's ndcg@3: 0.0469279
[120]	valid_0's ndcg@3: 0
[130]	valid_0's ndcg@3: 0.0592164
[140]	valid_0's ndcg@3: 0.106144
[150]	valid_0's ndcg@3: 0.106144
[160]	valid_0's ndcg@3: 0.106144
[170]	valid_0's ndcg@3: 0.106144
[180]	valid_0's ndcg@3: 0.0592164
[190]	valid_0's ndcg@3: 0.106144
[200]	valid_0's ndcg@3: 0.153072
[210]	valid_0's ndcg@3: 0.15307

In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRanker
from sklearn.metrics import ndcg_score, recall_score
from lightgbm import LGBMRanker, log_evaluation, early_stopping
# 1. 读取并预处理（跟你之前的 pipeline 一致，只留关键步骤）
df = pd.read_csv("retail_store_sales_cleaned_feature_engineering.csv",
                 parse_dates=["Transaction Date"])
df = df.sort_values(["Customer ID", "Transaction Date"]).reset_index(drop=True)

# 假设你已经做完所有特征工程，并把 RFM、多窗口特征、聚类特征都加进来了：

le = LabelEncoder()
df["Category_ID"] = le.fit_transform(df["Category"])
# -----------------------------
# 3. 衍生特徵：Inter_Days、30天RFM、週期編碼、flags
# -----------------------------
df["Prev_Date"] = df.groupby("Customer ID")["Transaction Date"].shift(1)
df["Inter_Days"] = (df["Transaction Date"] - df["Prev_Date"])\
                   .dt.days.fillna(0)

window_times, window_amounts = deque(), deque()
freq30 = np.zeros(len(df), int)
amt30_sum = np.zeros(len(df))
amt30_count = np.zeros(len(df))

for i, (cust, t, amt) in enumerate(zip(
    df["Customer ID"],
    df["Transaction Date"],
    df["Total Spent"]
)):
    if i == 0 or cust != df.loc[i-1, "Customer ID"]:
        window_times.clear(); window_amounts.clear()
    window_times.append(t); window_amounts.append(amt)
    while (t - window_times[0]).days > 30:
        window_times.popleft(); window_amounts.popleft()
    freq30[i]     = len(window_times) - 1
    amt30_sum[i]  = sum(window_amounts)
    amt30_count[i]= len(window_times)

df["freq30"]     = freq30
df["amt30_mean"] = amt30_sum / amt30_count

# 週期編碼
df["month_sin"] = np.sin(2*np.pi*df["Month"]  / 12)
df["month_cos"] = np.cos(2*np.pi*df["Month"]  / 12)
df["day_sin"]   = np.sin(2*np.pi*(df["Day"]-1)/31)
df["day_cos"]   = np.cos(2*np.pi*(df["Day"]-1)/31)

bool_cols = [
    "Is_Weekend","Is_Holiday","Is_NonWorkday",
    "PM_Credit Card","PM_Digital Wallet",
    "Loc_Online","Disc_True","Disc_Unknown"
]
df[bool_cols] = df[bool_cols].astype(int)

feature_cols = [
    "Price Per Unit","Quantity","Total Spent","Recency_Cust",
    "Inter_Days","freq30","amt30_mean",
    "month_sin","month_cos","day_sin","day_cos"
] + bool_cols

def rolling_rfm(df, window_days, amt_col="Total Spent"):
    times, amts, freqs, sums = deque(), deque(), [], []
    for cust, group in df.groupby("Customer ID", sort=False):
        times.clear(); amts.clear()
        for t, amt in zip(group["Transaction Date"], group[amt_col]):
            times.append(t); amts.append(amt)
            # pop 超出 window_days
            while (t - times[0]).days > window_days:
                times.popleft(); amts.popleft()
            freqs.append(len(times)-1)            # 扣掉自己
            sums.append(sum(amts))
    return freqs, sums

df = df.sort_values(["Customer ID","Transaction Date"]).reset_index(drop=True)
# 7 天
df["freq7"],  df["amt7_sum"]  = rolling_rfm(df,  7)
# 30 天（以前已有 freq30, amt30_mean），如果想保留 sum 可覆蓋或改名
# 90 天
df["freq90"], df["amt90_sum"] = rolling_rfm(df, 90)

# 為了跟 amt30_mean 同步，上面 sum 改成 mean 也很簡單：
df["amt7_mean"]  = df["amt7_sum"]  / (df["freq7"]  + 1)  # +1 包含當前交易
df["amt90_mean"] = df["amt90_sum"] / (df["freq90"] + 1)

# --- 2. 距今最後購買日的指數衰減特徵（Recency） ---------------------

# 先算「距今天數」
today = df["Transaction Date"].max()  # or datetime.today()
last_purchase = df.groupby("Customer ID")["Transaction Date"].transform("max")
df["days_since_last"] = (today - last_purchase).dt.days

# 再做指數衰減：exp(-λ·days)，λ 可調
lam = 0.05
df["recency_exp"] = np.exp(-lam * df["days_since_last"])

# --- 3. 客戶整體 RFM 聚類 -----------------------------------------

# 為每位客戶計算整體 RFM 向量
rfm = df.groupby("Customer ID").agg({
    "days_since_last": "min",               # Recency：最近那筆交易距今
    "Transaction ID": "count",              # Frequency：交易次數
    "Total Spent": "sum"                    # Monetary：總花費
}).rename(columns={
    "days_since_last": "R",
    "Transaction ID": "F",
    "Total Spent": "M"
})

# 標準化後做 KMeans
scaler_rfm = StandardScaler()
rfm_scaled = scaler_rfm.fit_transform(rfm[["R","F","M"]])
kmeans = KMeans(n_clusters=5, random_state=42)
rfm["cluster"] = kmeans.fit_predict(rfm_scaled)

# Merge 回原始 df
df = df.merge(rfm["cluster"], left_on="Customer ID", right_index=True)

# --- 4. 更新 feature_cols ---------------------------------------

new_feats = [
    "freq7", "amt7_mean",
    "freq90", "amt90_mean",
    "days_since_last", "recency_exp",
    "cluster"
]
feature_cols += new_feats
# 2. 为每个“查询”（query），即每一次要预测下一品类的历史序列，构造正负样本：
#    把每条正样本 (seq → true next Category_ID) 和随机采的几个负样本拼到一起，用同一个 query_id 分组。
records = []
for cust, grp in df.groupby("Customer ID"):
    cats = grp["Category_ID"].values
    feats = grp[feature_cols].values
    for i in range(1, len(grp)):
        hist_feat = feats[i-1]  # 这里用“上一笔”特征，也可以用窗口聚合等
        true_cat  = cats[i]
        # 正样本
        records.append((cust, hist_feat, true_cat, 1))
        # 负采样 3 个
        for _ in range(3):
            neg = np.random.randint(0, df["Category_ID"].nunique())
            while neg == true_cat:
                neg = np.random.randint(0, df["Category_ID"].nunique())
            records.append((cust, hist_feat, neg, 0))

rank_df = pd.DataFrame(records, columns=["query_id","feat","cat_id","label"])

# 3. 构造训练矩阵：将 feat 展开、cat_id 用 one-hot 或 embedding 特征拼进去
#    这里示例：直接把 feat + “cat_id” 做 one-hot（也可用 embedding lookup）。
X = np.stack(rank_df["feat"].values)
# one-hot encode cat_id：

# K: 總類別數
K = df["Category_ID"].nunique()

records = []
query_counter = 0

for cust, grp in df.groupby("Customer ID", sort=False):
    feats = grp[feature_cols].values
    cats  = grp["Category_ID"].values
    for i in range(1, len(grp)):
        hist_feat = feats[i-1]
        true_cat  = cats[i]
        # 這筆歷史序列就用 query_counter 當作唯一 ID
        for cat_id in range(K):
            label = 1 if cat_id == true_cat else 0
            records.append((query_counter, hist_feat, cat_id, label))
        query_counter += 1

rank_df = pd.DataFrame(records,
    columns=["query_id","feat","cat_id","label"])

# 展開特徵
X = np.vstack(rank_df["feat"].values)
onehots = np.eye(K)[rank_df["cat_id"].values]
X = np.hstack([X, onehots])
y = rank_df["label"].values

# 正確地用 query_id 分組，得到每筆 query 的樣本數（應該都等於 K）
group_sizes = rank_df.groupby("query_id").size().tolist()  

# 切分 query_ids
all_qids = np.arange(query_counter)
q_train, q_test = train_test_split(all_qids,
    test_size=0.2, random_state=42)

# 建立訓練／測試遮罩
train_mask = rank_df["query_id"].isin(q_train)

X_train = X[train_mask]
y_train = y[train_mask]
g_train = [K] * len(q_train)

X_test = X[~train_mask]
y_test = y[~train_mask]
g_test = [K] * len(q_test)
# 5. 用 LGBMRanker 训练
callbacks = [
    log_evaluation(period=10),

]

ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="gbdt",
    n_estimators=1000,
    learning_rate=0.01,
    random_state=42,
    importance_type="gain"
)

ranker.fit(
    X_train, y_train,
    group=g_train,
    eval_set=[(X_test, y_test)],
    eval_group=[g_test],
    eval_at=[3],          # NDCG@3
    callbacks=callbacks    # ← 在这里传入 callbacks，取代 verbose
)
# 6. 评估 Top-3 Recall
#    对测试每个 query，predict 得分后取 top3，看正样本是否在内
pred_scores = ranker.predict(X_test)
# 重组为 per-query scores
offset = 0
hits = []
for grp_size in g_test:
    grp_scores = pred_scores[offset:offset+grp_size]
    grp_labels = y_test      [offset:offset+grp_size]
    topk_idx   = np.argsort(grp_scores)[-3:]
    # 只要正样本(label=1)在 top3 就算命中
    hits.append(any(grp_labels[i]==1 for i in topk_idx))
    offset += grp_size

print("Ranker Test Top-3 Recall: ", np.mean(hits))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002837 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1305
[LightGBM] [Info] Number of data points in the train set: 80320, number of used features: 34
[10]	valid_0's ndcg@3: 0.274812
[20]	valid_0's ndcg@3: 0.276216
[30]	valid_0's ndcg@3: 0.276363
[40]	valid_0's ndcg@3: 0.279313
[50]	valid_0's ndcg@3: 0.272303
[60]	valid_0's ndcg@3: 0.27135
[70]	valid_0's ndcg@3: 0.271317
[80]	valid_0's ndcg@3: 0.276088
[90]	valid_0's ndcg@3: 0.272052
[100]	valid_0's ndcg@3: 0.271577
[110]	valid_0's ndcg@3: 0.2718
[120]	valid_0's ndcg@3: 0.270657
[130]	valid_0's ndcg@3: 0.268855
[140]	valid_0's ndcg@3: 0.265246
[150]	valid_0's ndcg@3: 0.264112
[160]	valid_0's ndcg@3: 0.265046
[170]	valid_0's ndcg@3: 0.266398
[180]	valid_0's ndcg@3: 0.266588
[190]	valid_0's ndcg@3: 0.267029
[200]	valid_0's ndcg@3: 0.267982