### **PCA商品分群**
- Motivation: 此Project的目的為將性質類似之商品分類，在訓練模型時用同性質商品而非單一商品資料進行訓練，降低訓練時overfitting的機率
- 過程如下:
    1. 讀取29種商品資料，包含貨幣兌、黃金、原油、指數等商品
    2. 將資料轉換為dollar bars(以固定金額為基準的bar，性質較time bars穩定)
    3. 計算指標，並對非平穩指標進行Fractional Difference，讓數據平穩化的同時保持記憶性
    4. 使用PCA(主成分分析)將資料正交化
    5. 使用Kmeans將PCA的結果分群，並將相似的商品歸類在同一群組

### **讀取資料**

In [None]:
import sys
import os


project_root = os.path.abspath(os.path.join("..", "QuantCommon"))
if project_root not in sys.path:
    sys.path.append(project_root)

from utils.tools import read_file
from utils.processing import get_dollar_bars, apply_cusum_filter, getDailyVol
import numpy as np

# 調整資料路徑，對應 common 裡的資料
filepath = os.path.join(project_root, "data", "FI", "M1")
file = read_file(filepath)


讀取並篩選進度: 100%|██████████| 29/29 [02:43<00:00,  5.63s/it]

有 29 個 DataFrame。





### **計算指標**
- 計算多項指標並對非平穩之指標進行Fractional Difference，讓指標在轉換為平穩序列的同時保持記憶性

In [2]:
import talib
import pandas as pd
from statsmodels.tsa.stattools import adfuller
# --- 以下 FFD helper functions 跟之前一樣 --- #
def get_ffd_weights(d: float, size: int, thresh: float = 1e-5) -> np.ndarray:
    w = [1.0]
    for k in range(1, size):
        w.append(w[-1] * ((-d + k - 1) / k))
    w = np.array(w)
    M = np.where(np.abs(w) > thresh)[0].max() + 1
    return w[:M]

def fractional_diff(series: pd.Series, d: float, thresh: float = 1e-5) -> pd.Series:
    x = series.values
    w = get_ffd_weights(d, len(x), thresh)
    if w.size == 1:
        return series.copy().rename(series.name)
    conv = np.convolve(x, w, mode='valid')
    idx = series.index[w.size-1:]
    return pd.Series(conv, index=idx, name=series.name)

def find_min_d(series: pd.Series, d_grid: np.ndarray) -> float:
    for d in d_grid:
        ffd = fractional_diff(series, d)
        pval = adfuller(ffd.dropna(), maxlag=1, regression='c')[1]
        if pval < 0.05:
            return d
    return d_grid[-1]

def safe_log(series: pd.Series, eps: float = 1e-8) -> pd.Series:
    return np.log(np.clip(series, a_min=eps, a_max=None))

def compute_talib_features(data: pd.DataFrame,
                           periods: list = None,
                           apply_ffd: bool = True,
                           d_vals: np.ndarray = np.linspace(0, 1, 51),
                           ffd_thresh: float = 1e-5) -> pd.DataFrame:
    """
    接收含 open, high, low, close, volume 的 DataFrame，
    針對 periods 裡每個週期，計算一批 TA-Lib 指標，
    並回傳一個新的 DataFrame，裡面是所有這些技術指標特徵。
    """
    if periods is None:
        periods = [7, 14, 28, 50, 100]

    
    high, low, close, volume, log_ret = data['high'], data['low'], data['close'], data['volume'], np.log(data['close']).diff()
    # Log Returns
    features_df = pd.DataFrame(index=data.index)
    features_df['log_ret'] = log_ret
    for p in periods:
        # —— 波動率類 —— #
        features = dict({})
        features[f'atr_{p}']      = talib.ATR(high, low, close, timeperiod=p)
        upper, mid, lower         = talib.BBANDS(close, timeperiod=p, nbdevup=2, nbdevdn=2)
        features[f'bb_width_{p}']  = (upper - lower) / mid
        features[f'volatility_{p}'] = log_ret.rolling(window=p, min_periods=p, center=False).std()

        # —— 趨勢類 —— #
        features[f'sma_{p}']       = talib.SMA(close, timeperiod=p)
        features[f'ema_{p}']       = talib.EMA(close, timeperiod=p)
        features[f'adx_{p}']       = talib.ADX(high, low, close, timeperiod=p)
        features[f'plus_di_{p}']   = talib.PLUS_DI(high, low, close, timeperiod=p)
        features[f'minus_di_{p}']  = talib.MINUS_DI(high, low, close, timeperiod=p)
        features[f'dx_{p}']        = talib.DX(high, low, close, timeperiod=p)
        features[f'adxr_{p}']      = talib.ADXR(high, low, close, timeperiod=p)

        # —— 動量／均值回歸類 —— #
        features[f'rsi_{p}']       = talib.RSI(close, timeperiod=p)
        features[f'roc_{p}']       = talib.ROC(close, timeperiod=p)
        features[f'mom_{p}']       = talib.MOM(close, timeperiod=p)
        # features[f'autocorr_{p}'] = log_ret.rolling(window=100, min_periods=100, center=False).apply(lambda x: x.autocorr(lag=p), raw=False)
        # PPO = (EMA_fast - EMA_slow)/EMA_slow * 100
        fast = p
        slow = max(2*p, p+1)
        features[f'ppo_{p}'], features[f'ppo_signal_{p}'], features[f'ppo_hist_{p}'] = \
            talib.MACDEXT(close,
                          fastperiod=fast, fastmatype=0,
                          slowperiod=slow, slowmatype=0,
                          signalperiod=int(p/2), signalmatype=0)

        # KAMA
        features[f'kama_{p}']      = talib.KAMA(close, timeperiod=p)

        # Williams %R
        features[f'willr_{p}']     = talib.WILLR(high, low, close, timeperiod=p)

        # Stochastic
        slowk, slowd = talib.STOCH(
            high, low, close,
            fastk_period=p,
            slowk_period=max(3, p//3), slowk_matype=0,
            slowd_period=max(3, p//3), slowd_matype=0
        )
        features[f'stoch_k_{p}']   = slowk
        features[f'stoch_d_{p}']   = slowd
        features_df = pd.concat([features_df, pd.DataFrame(features)], axis=1)

    # —— 不需 timeperiod 的指標 —— #
    features_df['obv'] = talib.OBV(close, volume)
    features_df['adl'] = talib.AD(high, low, close, volume)
    features_df['sar'] = talib.SAR(high, low, acceleration=0.02, maximum=0.2)



    if apply_ffd:
        ffd_dict = {}
        for col in features_df.columns:
            # 1) 做微分前，自動檢定是否需要平穩化
            series = features_df[col].dropna()
            # 只對非平穩序列跑 FFD
            pval = adfuller(series, maxlag=1, regression='c')[1]
            if pval < 0.05:
                # 平穩就不動，直接填回原序列
                ffd_series = series
            else:
                # 非平穩就找 d* 並做分數階微分
                log_series = safe_log(series)
                d_star     = find_min_d(log_series, d_vals)
                ffd_series = fractional_diff(log_series, d_star, thresh=ffd_thresh)
            # ffd_dict[f'{col}_ffd'] = ffd_series
            ffd_dict[f'{col}'] = ffd_series

        # 合併並對齊 index
        ffd_df = pd.DataFrame(ffd_dict)
        features_df = pd.concat([features_df, ffd_df], axis=1)
        features_df = features_df.dropna()
    return features_df



### **建立PCA Pipeline**
- 先把指標用rolling window的方式轉換為precentile
- 再把資料標準化後進行PCA

In [3]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline


class RollingPercentileTransformer(BaseEstimator, TransformerMixin):
    """
    對每一欄做滑動 percentile 計算，回傳每個時間點 t 
    欄位值在過去 window 期內的百分位 (0~1)。
    """
    def __init__(self, window: int = 252, min_periods: int = 1):
        self.window = window
        self.min_periods = min_periods

    def fit(self, X, y=None):
        # 不需要學任何東西
        return self

    def transform(self, X):
        # 假設 X 是 DataFrame
        X = pd.DataFrame(X).copy()
        for col in X.columns:
            # 每個 col 分別做 rolling.apply
            X[col] = (
                X[col]
                .rolling(window=self.window, min_periods=self.min_periods)
                .apply(lambda arr: (arr <= arr[-1]).sum() / len(arr), raw=True)
            )
        return X.values  # 回傳 numpy array 給後續 scaler


# === Pipeline : z-score → PCA ===
pipe = Pipeline([
    ('roll_pct', RollingPercentileTransformer(window=252)),
    ('scaler',  StandardScaler()),
])



In [4]:
from tqdm import tqdm
# 假設有多個商品，每個商品都有自己的 features_df
all_pc1_loadings = []

for product in tqdm(file):
    data = file[product]
    data = get_dollar_bars(data)
    data = data.iloc[-50000:]
    data = compute_talib_features(data,
                                 periods=[7,14,28,50,100],
                                 apply_ffd=True)
    data = data.dropna()
    X_scaled = pipe.fit_transform(data)
    pca = PCA()
    pca.fit(X_scaled)
    pc1 = pd.Series(pca.components_[0], index=data.columns)
    all_pc1_loadings.append(pc1)

loadings_df = pd.DataFrame(np.array(all_pc1_loadings), index=file.keys())


  0%|          | 0/29 [00:00<?, ?it/s]

Filtered Dollar Bars Count: 184439


  3%|▎         | 1/29 [03:23<1:35:07, 203.85s/it]

Filtered Dollar Bars Count: 11064


  7%|▋         | 2/29 [04:01<47:49, 106.26s/it]  

Filtered Dollar Bars Count: 182677


 10%|█         | 3/29 [07:09<1:02:09, 143.44s/it]

Filtered Dollar Bars Count: 188128


 14%|█▍        | 4/29 [10:46<1:11:54, 172.59s/it]

Filtered Dollar Bars Count: 186053


 17%|█▋        | 5/29 [14:15<1:14:19, 185.80s/it]

Filtered Dollar Bars Count: 12072


 21%|██        | 6/29 [14:52<51:46, 135.06s/it]  

Filtered Dollar Bars Count: 10731


 24%|██▍       | 7/29 [15:24<37:10, 101.39s/it]

Filtered Dollar Bars Count: 189937


 28%|██▊       | 8/29 [18:50<47:05, 134.56s/it]

Filtered Dollar Bars Count: 183083


 31%|███       | 9/29 [22:19<52:37, 157.90s/it]

Filtered Dollar Bars Count: 11543


 34%|███▍      | 10/29 [22:53<37:53, 119.64s/it]

Filtered Dollar Bars Count: 10738


 38%|███▊      | 11/29 [23:17<27:06, 90.33s/it] 

Filtered Dollar Bars Count: 12747


 41%|████▏     | 12/29 [23:54<21:00, 74.13s/it]

Filtered Dollar Bars Count: 12494


 45%|████▍     | 13/29 [24:31<16:47, 62.95s/it]

Filtered Dollar Bars Count: 10645


 48%|████▊     | 14/29 [24:59<13:07, 52.48s/it]

Filtered Dollar Bars Count: 67499


 52%|█████▏    | 15/29 [28:17<22:29, 96.38s/it]

Filtered Dollar Bars Count: 12093


 55%|█████▌    | 16/29 [28:53<16:55, 78.09s/it]

Filtered Dollar Bars Count: 10534


 59%|█████▊    | 17/29 [29:21<12:36, 63.06s/it]

Filtered Dollar Bars Count: 11065


 62%|██████▏   | 18/29 [29:52<09:48, 53.48s/it]

Filtered Dollar Bars Count: 11109


 66%|██████▌   | 19/29 [30:22<07:41, 46.17s/it]

Filtered Dollar Bars Count: 12030


 69%|██████▉   | 20/29 [30:52<06:13, 41.55s/it]

Filtered Dollar Bars Count: 12056


 72%|███████▏  | 21/29 [31:24<05:09, 38.70s/it]

Filtered Dollar Bars Count: 68478


 76%|███████▌  | 22/29 [34:24<09:26, 80.98s/it]

Filtered Dollar Bars Count: 68549


 79%|███████▉  | 23/29 [37:46<11:43, 117.32s/it]

Filtered Dollar Bars Count: 69163


 83%|████████▎ | 24/29 [41:01<11:42, 140.56s/it]

Filtered Dollar Bars Count: 9979


 86%|████████▌ | 25/29 [41:31<07:09, 107.36s/it]

Filtered Dollar Bars Count: 10038


 90%|████████▉ | 26/29 [42:03<04:14, 84.69s/it] 

Filtered Dollar Bars Count: 55369


 93%|█████████▎| 27/29 [45:05<03:48, 114.16s/it]

Filtered Dollar Bars Count: 54868


 97%|█████████▋| 28/29 [48:10<02:15, 135.30s/it]

Filtered Dollar Bars Count: 55850


100%|██████████| 29/29 [51:21<00:00, 106.25s/it]


In [5]:
loadings_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,198,199,200,201,202,203,204,205,206,207
AUDUSD_M1,-0.021945,0.003266,0.000512,5.8e-05,-0.093463,-0.095009,0.00322,-0.076904,0.074523,0.001978,...,-0.050954,-0.026251,-0.059753,-0.074989,-0.10964,-0.089519,-0.065343,-0.008174,-0.015783,-0.068945
AUS200_M1,-0.021114,0.01174,0.004584,0.009096,-0.092561,-0.094327,0.001978,-0.073124,0.074274,-0.001083,...,-0.050176,-0.02469,-0.053266,-0.092749,-0.104583,-0.085845,-0.064149,-0.069968,-0.071407,-0.076001
EURGBP_M1,-0.020929,-0.008563,-0.004486,-0.005588,-0.096082,-0.097831,0.007834,-0.07593,0.07206,0.007351,...,-0.053374,-0.030474,-0.053062,-0.061482,-0.109551,-0.089493,-0.064544,-0.070102,-0.056523,-0.0678
EURJPY_M1,-0.020743,0.028144,0.015002,0.022638,-0.091026,-0.092545,0.008642,-0.072224,0.073583,0.004482,...,-0.045682,-0.023301,-0.04943,-0.083224,-0.105379,-0.087746,-0.062731,-0.069081,-0.066441,-0.071665
EURUSD_M1,-0.01988,0.000275,0.000726,0.000372,-0.094499,-0.096131,0.010231,-0.076883,0.074795,0.00693,...,-0.049146,-0.02329,-0.05588,-0.082022,-0.108179,-0.089326,-0.063995,-0.005812,-0.022095,-0.072861


### **Kmeans分群**
- 將PCA的結果先用silhouette_score找到最適合的群數，再用Kmeans分群

In [6]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def find_best_k_by_silhouette(X, k_range=range(3, 10)):
    best_k = k_range[0]
    best_score = -1
    print(X.shape)
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
        labels = kmeans.labels_
        score = silhouette_score(X, labels)
        if score > best_score:
            best_score = score
            best_k = k
    return best_k, best_score
best_k, best_score = find_best_k_by_silhouette(loadings_df.values, k_range=range(3, 10))

(29, 208)


In [7]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=best_k, random_state=0)
labels = kmeans.fit_predict(loadings_df)

loadings_df['cluster'] = labels
loadings_df['cluster'].to_csv('clusters.csv')