In [1]:
# ==============================================================================
# 01_Data_Preprocessing.ipynb
# ==============================================================================
# 本 Notebook 負責資料清洗、時序切分與類別不平衡處理
# 對應計畫書章節: 5.1 模組一：動態圖譜建構 [Source 47] 與 7.2 資料前處理 [Source 109]

import pandas as pd
import numpy as np
import torch
import os
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Configure input/output locations. Both paths are relative, so they resolve
# against whatever the current working directory is when this cell runs.
RAW_DATA_PATH = "../data/raw/"
PROCESSED_DATA_PATH = "../data/processed/"
# Create the output directory up front; exist_ok=True keeps re-runs from failing.
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

print("Libraries imported successfully.")

# ==============================================================================
# 1. 載入或生成模擬數據 (Load or Generate Dummy Data)
# ==============================================================================
# 由於原始 Elliptic 資料集很大，這裡提供一個自動偵測機制：
# 如果 data/raw 裡面沒有檔案，就自動生成「模擬數據」來跑流程。

def load_or_generate_data(raw_data_path=None, seed=None):
    """Load the Elliptic feature/class tables, or synthesize a small stand-in.

    If both ``elliptic_txs_features.csv`` and ``elliptic_txs_classes.csv``
    exist under ``raw_data_path`` they are read in full; otherwise 1000
    random transactions spread over time steps 1..49 are generated so the
    rest of the pipeline can still be demonstrated.

    Args:
        raw_data_path: Directory holding the raw CSV files. Defaults to the
            module-level ``RAW_DATA_PATH``.
        seed: Optional seed for the dummy-data generator. ``None`` (default)
            draws from NumPy's global random state, as before.

    Returns:
        A ``(df_features, df_classes)`` tuple. ``df_features`` has columns
        ``txId``, ``time_step`` and ``feat_0`` .. ``feat_164``.
        ``df_classes`` has columns ``txId`` and ``class``, where ``class`` is
        an integer code: 0 = unknown, 1 = illicit, 2 = licit.
    """
    if raw_data_path is None:
        raw_data_path = RAW_DATA_PATH
    features_file = os.path.join(raw_data_path, "elliptic_txs_features.csv")
    classes_file = os.path.join(raw_data_path, "elliptic_txs_classes.csv")

    if os.path.exists(features_file) and os.path.exists(classes_file):
        print("檢測到原始資料集，正在載入...")
        # The complete files are loaded (no row limit): truncating the
        # features file would keep only the earliest time steps and leave
        # the temporal test split empty.
        df_features = pd.read_csv(features_file, header=None)
        df_classes = pd.read_csv(classes_file)
        # The features file has no header row; assign the Elliptic layout.
        df_features.columns = ['txId', 'time_step'] + [f'feat_{i}' for i in range(165)]

        # The raw class column may be textual ("unknown", "1", "2"). Map it
        # onto the integer codes 0/1/2 so that downstream comparisons such
        # as `class != 0` and `class == 1` behave as intended.
        raw_class = df_classes['class'].astype(str).str.strip().str.lower()
        df_classes['class'] = pd.to_numeric(
            raw_class.where(raw_class != 'unknown', '0')
        ).astype(int)
    else:
        print("⚠️ 未檢測到原始資料集，正在生成「模擬數據」以供演示...")
        # Legacy RandomState API in both cases so the call sequence (and the
        # stream under a user-set global seed) matches the unseeded path.
        rng = np.random if seed is None else np.random.RandomState(seed)

        # Simulate 1000 transactions with time steps 1..49 [Source 49]
        n_samples = 1000
        tx_ids = np.arange(n_samples)
        time_steps = rng.randint(1, 50, n_samples)

        # 165 random features; with txId and time_step that is 167 columns.
        features = rng.randn(n_samples, 165)
        df_features = pd.DataFrame(features, columns=[f'feat_{i}' for i in range(165)])
        df_features.insert(0, 'time_step', time_steps)
        df_features.insert(0, 'txId', tx_ids)

        # Heavily imbalanced labels: ~98% licit (2), ~2% illicit (1).
        # No unknown (0) rows are generated for the demo.
        classes = rng.choice([2, 1], size=n_samples, p=[0.98, 0.02])
        df_classes = pd.DataFrame({'txId': tx_ids, 'class': classes})

    return df_features, df_classes

df_features, df_classes = load_or_generate_data()
print(f"資料載入完成。Features shape: {df_features.shape}, Classes shape: {df_classes.shape}")

# ==============================================================================
# 2. Data Cleaning & Merging
# ==============================================================================
# [Source 70] The proposal treats this as supervised learning, so transactions
# whose class is unknown must be removed before training.

# Join features and labels on the transaction id; the inner join keeps only
# transactions present in both tables.
df_merged = pd.merge(df_features, df_classes, on='txId', how='inner')

# Coerce the class column to numbers before comparing. If it arrives as text
# ("unknown", "1", "2"), a plain `!= 0` / `== 1` would keep every unknown row
# and label nothing as illicit. Non-numeric values become NaN and are treated
# as unknown.
class_codes = pd.to_numeric(df_merged['class'], errors='coerce')
is_labelled = class_codes.notna() & (class_codes != 0)
df_clean = df_merged[is_labelled].copy()

# Binary target: original 1 (illicit) -> 1, original 2 (licit) -> 0.
df_clean['label'] = (class_codes[is_labelled] == 1).astype(int)

print(f"清洗後資料量: {len(df_clean)} (移除 Unknown 交易)")
print(f"非法交易數量 (Label 1): {df_clean['label'].sum()}")
print(f"合法交易數量 (Label 0): {len(df_clean) - df_clean['label'].sum()}")

# ==============================================================================
# 3. Temporal Split
# ==============================================================================
# [Source 49, 110] Split strictly by time: training set (T1-T35), test set (T36-T49).
# Keeping later time steps out of training avoids leaking future information.

TRAIN_LAST_TIME_STEP = 35

train_mask = df_clean['time_step'].le(TRAIN_LAST_TIME_STEP)
test_mask = df_clean['time_step'].gt(TRAIN_LAST_TIME_STEP)

train_data, test_data = df_clean.loc[train_mask], df_clean.loc[test_mask]

print("\n[時序切分結果]")
print(f"訓練集 (T1-35): {len(train_data)} 筆")
print(f"測試集 (T36-49): {len(test_data)} 筆")

# ==============================================================================
# 4. Handling Class Imbalance with SMOTE
# ==============================================================================
# [Source 50, 111, 183] Synthetic Minority Over-sampling Technique (SMOTE).
# Applied to the training set only, to counter the very small share of
# illicit samples.

print("\n[執行 SMOTE 資料增強]")
# Identifier, raw class, target and time step are not model features.
X_train = train_data.drop(columns=['txId', 'class', 'label', 'time_step'])
y_train = train_data['label']

# SMOTE interpolates between a minority sample and its k nearest minority
# neighbours, so it needs both classes present and at least k + 1 minority
# samples. The library default is k_neighbors=5; clamp it so that a tiny
# minority (2-5 samples, likely with the demo data) no longer raises.
n_illicit = int(y_train.sum())
n_licit = len(y_train) - n_illicit
n_minority = min(n_illicit, n_licit)

if n_minority >= 2:
    smote = SMOTE(random_state=42, k_neighbors=min(5, n_minority - 1))
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

    print(f"SMOTE 前訓練集形狀: {X_train.shape}")
    print(f"SMOTE 後訓練集形狀: {X_resampled.shape}")
    print(f"增強後非法交易佔比: {y_resampled.mean():.2%}")
else:
    # Too few samples in one class to synthesize from: pass the training
    # data through unchanged.
    print("⚠️ 警告：模擬數據中非法交易過少，跳過 SMOTE 步驟。")
    X_resampled, y_resampled = X_train, y_train

# ==============================================================================
# 5. Save Processed Data
# ==============================================================================
# Persist the processed splits as tensors for the downstream model notebooks.

# Training split (after the SMOTE step) -> float32 tensors.
train_features_tensor = torch.tensor(X_resampled.values, dtype=torch.float32)
train_labels_tensor = torch.tensor(y_resampled.values, dtype=torch.float32)

torch.save({'x': train_features_tensor, 'y': train_labels_tensor},
           os.path.join(PROCESSED_DATA_PATH, 'train_data.pt'))

# The held-out temporal test split was previously computed but never written,
# so later stages had nothing to evaluate on. It is saved in the same layout
# and is never resampled, so evaluation sees the original class balance.
X_test = test_data.drop(columns=['txId', 'class', 'label', 'time_step'])
y_test = test_data['label']
test_features_tensor = torch.tensor(X_test.values, dtype=torch.float32)
test_labels_tensor = torch.tensor(y_test.values, dtype=torch.float32)

torch.save({'x': test_features_tensor, 'y': test_labels_tensor},
           os.path.join(PROCESSED_DATA_PATH, 'test_data.pt'))

print(f"\n✅ 資料預處理完成！檔案已儲存至 {PROCESSED_DATA_PATH}")

Libraries imported successfully.
⚠️ 未檢測到原始資料集，正在生成「模擬數據」以供演示...
資料載入完成。Features shape: (1000, 167), Classes shape: (1000, 2)
清洗後資料量: 1000 (移除 Unknown 交易)
非法交易數量 (Label 1): 23
合法交易數量 (Label 0): 977

[時序切分結果]
訓練集 (T1-35): 698 筆
測試集 (T36-49): 302 筆

[執行 SMOTE 資料增強]
SMOTE 前訓練集形狀: (698, 165)
SMOTE 後訓練集形狀: (1358, 165)
增強後非法交易佔比: 50.00%

✅ 資料預處理完成！檔案已儲存至 ../data/processed/
