### **讀取資料**

In [1]:
import sys
import os

# 新的 project_root 指向 common 的上一層
project_root = os.path.abspath(os.path.join("..", "QuantCommon"))
if project_root not in sys.path:
    sys.path.append(project_root)

from utils.tools import read_file
from utils.processing import get_dollar_bars
import numpy as np
import pandas as pd
import numpy as np


clusters = pd.read_csv("clusters.csv", index_col=0)
print(f'XAUUSD 在第{clusters.loc["XAUUSD_M1", "cluster"]}群')

XAUUSD 在第2群


In [2]:
group = clusters[clusters["cluster"] == clusters.loc["XAUUSD_M1", "cluster"]]

data = dict({})
for i in group.index:
    filepath = os.path.join(project_root, "data", "FI", "M1",f"{i}.csv")
    df = pd.read_csv(filepath, parse_dates=True)
    df['time'] = pd.to_datetime(df['time'])
    df = get_dollar_bars(df)
    data[i] = df

Filtered Dollar Bars Count: 184442
Filtered Dollar Bars Count: 182676
Filtered Dollar Bars Count: 186054
Filtered Dollar Bars Count: 183075
Filtered Dollar Bars Count: 10739
Filtered Dollar Bars Count: 67499
Filtered Dollar Bars Count: 11067
Filtered Dollar Bars Count: 12016
Filtered Dollar Bars Count: 68479
Filtered Dollar Bars Count: 55366
Filtered Dollar Bars Count: 54861


In [None]:
from utils.metalabeling import add_vertical_barrier, get_events, get_bins
from utils.processing import apply_cusum_filter, getDailyVol

for _,df in data:
    vol = getDailyVol(df["close"], span0=20)
    cusum_events  = apply_cusum_filter(df, volatility=vol).index
    vertical_barriers = add_vertical_barrier(cusum_events, df, num_days=2)
    pt_sl = [1, 1]
    min_ret = 0.003
    triple_barrier_events = get_events(close=df["close"],
                                                t_events=cusum_events,
                                                pt_sl=pt_sl,
                                                target=vol,
                                                min_ret=min_ret,
                                                num_threads=4,
                                                vertical_barrier_times=vertical_barriers,
                                                side_prediction=None)
    labels  = get_bins(triple_barrier_events, df["close"])

TypeError: string indices must be integers

In [3]:
from utils.metalabeling import add_vertical_barrier, get_events, get_bins
from utils.processing import apply_cusum_filter, getDailyVol

tmp = data["XAUUSD_M1"].copy()

vol = getDailyVol(tmp["close"], span0=20)
cusum_events  = apply_cusum_filter(tmp, volatility=vol).index
vertical_barriers = add_vertical_barrier(cusum_events, tmp, num_days=2)
pt_sl = [1, 1]
min_ret = 0.003
triple_barrier_events = get_events(close=tmp["close"],
                                               t_events=cusum_events,
                                               pt_sl=pt_sl,
                                               target=vol,
                                               min_ret=min_ret,
                                               num_threads=4,
                                               vertical_barrier_times=vertical_barriers,
                                               side_prediction=None)
labels  = get_bins(triple_barrier_events, tmp["close"])

time
2020-06-22 00:25:00    0.000000
2020-06-22 01:23:00   -0.001183
2020-06-22 02:19:00    0.000383
2020-06-22 02:53:00   -0.000520
2020-06-22 03:11:00    0.001200
                         ...   
2023-12-29 19:53:00   -0.003922
2023-12-29 20:25:00   -0.000900
2023-12-29 21:00:00   -0.000899
2023-12-29 21:38:00   -0.000290
2023-12-29 22:10:00   -0.001326
Name: close, Length: 54854, dtype: float64
CUSUM Bars Count: 13443


In [4]:
from utils.processing import cal_weights

weights = cal_weights(triple_barrier_events, tmp["close"])

In [6]:
from utils.processing import compute_talib_features
feats = compute_talib_features(tmp,
                               periods=[7,14,28,50,100],
                               apply_ffd=True)

## Make Features

In [7]:
idx = feats.index.intersection(labels.index)
feats = feats.loc[idx]
labels = labels.loc[idx]["bin"]
weights =  weights.loc[idx]["weight"]
t1 = triple_barrier_events.loc[idx]["t1"]
print(feats.shape, labels.shape, weights.shape, t1.shape)

(3448, 208) (3448,) (3448,) (3448,)


## PCA

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

feats = feats.dropna()
class RollingPercentileTransformer(BaseEstimator, TransformerMixin):
    """
    對每一欄做滑動 percentile 計算，回傳每個時間點 t 
    欄位值在過去 window 期內的百分位 (0~1)。
    """
    def __init__(self, window: int = 252, min_periods: int = 1):
        self.window = window
        self.min_periods = min_periods

    def fit(self, X, y=None):
        # 不需要學任何東西
        return self

    def transform(self, X):
        # 假設 X 是 DataFrame
        X = pd.DataFrame(X).copy()
        for col in X.columns:
            # 每個 col 分別做 rolling.apply
            X[col] = (
                X[col]
                .rolling(window=self.window, min_periods=self.min_periods)
                .apply(lambda arr: (arr <= arr[-1]).sum() / len(arr), raw=True)
            )
        return X.values  # 回傳 numpy array 給後續 scaler

# === Pipeline 1: rolling percentile → z-score → PCA ===
pipe1 = Pipeline([
    ("roll_pct", RollingPercentileTransformer(window=252)),
    ("scaler",  StandardScaler()),
    ("pca",     PCA(n_components=0.95, whiten=False)),
])

# === Pipeline 2: z-score → PCA ===
pipe2 = Pipeline([
    ("scaler", StandardScaler()),
    ("pca",    PCA(n_components=0.95, whiten=False)),
])

# === 使用方式 ===
# 假設 feats 是一個 DataFrame，columns 就是你的所有技術指標
# e.g. feats = compute_talib_features(data)

# 1) 第一條流水線
X1 = pipe1.fit_transform(feats)  
# 2) 第二條流水線
X2 = pipe2.fit_transform(feats)


## MDA MDI SFI

#### Purged K Fold


In [None]:
import numpy as np
import pandas as pd

class PurgedKFold:
    def __init__(self, n_splits=3, t1=None, pct_embargo=0.0):
        if not isinstance(t1, pd.Series):
            raise ValueError("t1 must be a pandas Series")
        self.n_splits = n_splits
        self.t1 = t1.sort_index()
        self.pct_embargo = pct_embargo

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

    def split(self, X, y=None, groups=None):
        if not X.index.equals(self.t1.index):
            raise ValueError("X and t1 must have the same index")
        n_samples = len(X)
        indices = np.arange(n_samples)
        # divide indices into contiguous chunks
        test_slices = np.array_split(indices, self.n_splits)
        mbrg = int(n_samples * self.pct_embargo)

        for slice_ in test_slices:
            i, j = slice_[0], slice_[-1] + 1
            test_idx = indices[i:j]

            # start‐time of test block
            t0 = self.t1.index[i]
            # end‐time of test block
            t1_max = self.t1.iloc[test_idx].max()
            # find the position just after t1_max
            max_t1_pos = self.t1.index.searchsorted(t1_max)

            # training before test block
            train_before = indices[self.t1.index < t0]
            # training after test + embargo
            train_after = indices[max_t1_pos + mbrg :]

            train_idx = np.concatenate([train_before, train_after])
            yield train_idx, test_idx


#### CVscore

In [None]:
import numpy as np
from sklearn.base import clone
from sklearn.metrics import log_loss, accuracy_score

def cv_score(clf,
             X,
             y,
             sample_weight=None,
             scoring="neg_log_loss",
             t1=None,
             cv=3,
             pct_embargo=0.01):

    if scoring not in ["neg_log_loss", "accuracy"]:
        raise ValueError('scoring must be "neg_log_loss" or "accuracy"')

    pkf = PurgedKFold(n_splits=cv, t1=t1, pct_embargo=pct_embargo)
    scores = []

    for train_idx, test_idx in pkf.split(X):
        # 複製一份新的 model
        model = clone(clf)
        # fit
        if sample_weight is None:
            model.fit(X.iloc[train_idx], y.iloc[train_idx])
        else:
            model.fit(X.iloc[train_idx],
                      y.iloc[train_idx],
                      sample_weight=sample_weight.iloc[train_idx].values)
        # predict + score
        if scoring == "neg_log_loss":
            prob = model.predict_proba(X.iloc[test_idx])
            sc = -log_loss(y.iloc[test_idx],
                           prob,
                           sample_weight=(None if sample_weight is None else sample_weight.iloc[test_idx].values),
                           labels=model.classes_)
        else:
            pred = model.predict(X.iloc[test_idx])
            sc = accuracy_score(y.iloc[test_idx],
                                pred,
                                sample_weight=(None if sample_weight is None else sample_weight.iloc[test_idx].values))
        scores.append(sc)
    return np.array(scores)




#### MDA MDI SFI 實作

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import log_loss, accuracy_score
from tqdm import tqdm

# 假設你已經有：
#   - PurgedKFold 實作  
#   - cv_score 函式  
# 並且都在 your_module 裡可以 import  


# 1) MDI Feature Importance
def feat_imp_mdi(fit, feat_names):
    """
    fit: 已訓練好的 tree‐ensemble（RandomForest, ExtraTrees…）
    feat_names: list of feature names
    return: pd.DataFrame with columns ["mean","std"] 純量化後的重要度
    """
    # 從每顆樹蒐集 feature_importances_
    df0 = pd.DataFrame(
        [tree.feature_importances_ for tree in fit.estimators_],
        columns=feat_names
    ).replace(0, np.nan)  # 如果 max_features=1，某些 tree 有 0
    imp = pd.concat({
        "median": df0.median(),
        "std" : df0.std() * df0.shape[0]**-0.5
    }, axis=1)
    # normalize to sum=1
    imp["median"] /= imp["median"].sum()
    imp.sort_values(by="median", ascending=False, inplace=True)
    return imp


# 2) MDA: 支援 X, y, sample_weight, t1 為 np.ndarray
def feat_imp_mda(clf,
                 X,
                 y,
                 sample_weight=None,
                 t1=None,
                 cv: int = 5,
                 pct_embargo: float = 0.01,
                 scoring: str = "neg_log_loss"
                ) -> (pd.DataFrame, float):
    # --- 1) numpy → pandas ---
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X)
    if not isinstance(y, pd.Series):
        y = pd.Series(y, index=X.index)
    if sample_weight is not None and not isinstance(sample_weight, pd.Series):
        sample_weight = pd.Series(sample_weight, index=X.index)
    if t1 is not None and not isinstance(t1, pd.Series):
        t1 = pd.Series(t1, index=X.index)

    feat_names = list(X.columns)

    # --- 2) baseline score ---
    base_scores = cv_score(clf, X, y,
                           sample_weight=sample_weight,
                           scoring=scoring,
                           t1=t1,
                           cv=cv,
                           pct_embargo=pct_embargo)
    base_mean = base_scores.mean()

    # --- 3) 每個 feature permutation, 加進度條 ---
    diffs = []
    for col in tqdm(feat_names, desc="MDA permuting features"):
        Xp = X.copy()
        np.random.shuffle(Xp[col].values)
        perm_scores = cv_score(clf, Xp, y,
                               sample_weight=sample_weight,
                               scoring=scoring,
                               t1=t1,
                               cv=cv,
                               pct_embargo=pct_embargo)
        diffs.append(base_scores - perm_scores)

    diffs = np.vstack(diffs)
    imp_df = pd.DataFrame({
        "mean": diffs.mean(axis=1),
        "std" : diffs.std(axis=1) * diffs.shape[1]**-0.5
    }, index=feat_names)
    imp_df.sort_values(by="mean", ascending=False, inplace=True)
    return imp_df, base_mean

# 3) SFI: 支援 X, y, sample_weight, t1 為 np.ndarray
def SFI(feat_names: list,
                 clf,
                 X: pd.DataFrame,
                 y: pd.Series,
                 sample_weight=None,
                 t1=None,
                 cv: int = 5,
                 pct_embargo: float = 0.01,
                 scoring: str = "neg_log_loss"
                ) -> pd.DataFrame:
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X, columns=feat_names)
    if not isinstance(y, pd.Series):
        y = pd.Series(y, index=X.index)
    if sample_weight is not None and not isinstance(sample_weight, pd.Series):
        sample_weight = pd.Series(sample_weight, index=X.index)
    if t1 is not None and not isinstance(t1, pd.Series):
        t1 = pd.Series(t1, index=X.index)

    imp = pd.DataFrame(columns=["mean", "std"])
    for featName in feat_names:
        dfo = cv_score(clf, X=X[[featName]],  y = y,
                      sample_weight= sample_weight,
                      scoring=scoring, t1 = t1, cv = cv)
        imp.loc[featName, "mean"] = dfo.mean()
        imp.loc[featName, "std"] = dfo.std() * dfo.shape[0]**-0.5
        imp.sort_values(by="mean", ascending=False, inplace=True)
    return imp


### RF and compute MDI MDA SFI

In [16]:
col = [f"PCA_{i}" for i in range(X2.shape[1])]

In [17]:
# 用CV不用切割資料集
X = pd.DataFrame(X2, columns= col, index=feats.index)
y = labels.values
weights = weights.values
# t1 在上面定義好了

In [18]:
X

Unnamed: 0_level_0,PCA_0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,PCA_10,PCA_11,PCA_12,PCA_13,PCA_14,PCA_15,PCA_16,PCA_17,PCA_18,PCA_19
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2021-01-22 16:00:00,-5.799092,-6.667734,2.831685,-4.376821,4.745149,4.974754,2.585366,-0.389366,4.162528,1.293853,1.544921,-1.215628,-2.283115,1.045526,-3.037757,1.378900,1.650350,-2.139362,-0.283402,-0.148872
2021-01-28 14:50:00,-12.438673,-5.509063,0.436969,-2.253595,4.779983,-4.238147,-5.008632,-0.920011,4.083533,0.551789,0.355830,0.725524,1.191631,0.931340,1.555752,0.790570,-0.931003,-1.237774,-2.571104,-0.070142
2021-01-28 15:58:00,-0.914997,-3.263566,-6.273350,5.583443,2.726280,2.424141,2.093870,1.179466,-0.341848,-0.887487,3.766684,0.869564,1.103646,1.653496,-1.123067,0.720345,0.562026,-1.178783,0.067104,-2.912669
2021-01-28 16:15:00,-4.921787,-4.013460,-4.221147,2.292808,3.155357,-1.735619,3.362364,-3.007337,-0.953520,1.139383,3.295104,-0.222116,1.022648,-0.279718,-0.456692,0.641452,-0.455475,-1.646982,-0.944321,-0.659289
2021-01-28 16:21:00,-8.490138,-4.423071,-2.284867,0.825572,4.501388,-2.716232,2.577636,-2.354038,0.865607,0.408851,2.383079,0.962312,1.365664,1.509007,-1.289653,0.346439,-0.605571,-1.815647,-1.195692,-0.297338
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-20 16:32:00,-10.078947,10.976820,-2.200420,-5.326469,3.837847,0.166117,-1.960596,-0.426028,2.010870,-1.870346,0.778321,-1.961269,1.455997,1.611769,-0.650180,-2.265725,-0.386517,3.155148,0.285252,0.630826
2024-12-20 17:18:00,-11.826614,11.489787,-0.894275,-3.255975,6.131481,1.406690,-2.766024,-0.444787,1.300437,-1.161436,-0.928581,-2.642621,1.669947,1.000449,-0.558620,-2.406410,-0.332787,3.178175,0.084099,0.079467
2024-12-23 07:34:00,-6.772874,11.546762,-5.528706,-6.310889,3.496664,-0.622073,2.408875,1.316003,1.649202,0.434992,0.721245,-2.058231,2.431842,0.244633,-1.828676,-1.167709,0.122303,2.939340,0.339328,1.005489
2024-12-30 03:22:00,-5.937019,10.048670,-1.800176,-10.201551,3.500005,0.787053,-2.101928,-0.470903,1.261134,1.725928,2.235624,0.501275,0.568966,1.236081,-0.501452,0.882439,0.391568,2.427200,0.284950,0.320626


In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

In [None]:
avgU = weights.mean()
clf = DecisionTreeClassifier(criterion="entropy", max_features="auto", class_weight="balanced")
clf = BaggingClassifier(estimator=clf, n_estimators=1000, max_samples=avgU)

In [21]:
# 1. MDI
clf_fit = clf.fit(X, y, sample_weight=weights)
mdi_imp = feat_imp_mdi(clf_fit, col)
print(mdi_imp)




          median       std
PCA_10  0.053839  0.000421
PCA_15  0.053119  0.000404
PCA_4   0.051702  0.000404
PCA_17  0.051656  0.000406
PCA_1   0.051380  0.000398
PCA_6   0.051320  0.000412
PCA_14  0.050161  0.000403
PCA_19  0.050110  0.000400
PCA_3   0.050083  0.000413
PCA_16  0.049945  0.000398
PCA_18  0.049815  0.000406
PCA_11  0.049576  0.000404
PCA_13  0.049454  0.000399
PCA_5   0.049263  0.000396
PCA_7   0.049117  0.000415
PCA_8   0.048678  0.000381
PCA_9   0.048476  0.000392
PCA_2   0.047525  0.000387
PCA_12  0.047445  0.000393
PCA_0   0.047337  0.000380


In [None]:
# 2. MDA
mda_imp, base = feat_imp_mda(
    clf, X, y, cv=5,
    sample_weight=weights,
    t1=t1, pct_embargo=0.01,
    scoring="neg_log_loss"
)
print(mda_imp, base)


MDA permuting features: 100%|██████████| 20/20 [37:45<00:00, 113.27s/it]

            mean       std
PCA_8   0.001491  0.000445
PCA_17  0.001370  0.000586
PCA_4   0.001166  0.000573
PCA_19  0.000805  0.000300
PCA_11  0.000736  0.000947
PCA_10  0.000730  0.001512
PCA_13  0.000630  0.000865
PCA_7   0.000580  0.000947
PCA_1   0.000416  0.001008
PCA_3   0.000381  0.000432
PCA_6   0.000376  0.000866
PCA_18  0.000369  0.000743
PCA_15  0.000005  0.001496
PCA_16 -0.000230  0.000801
PCA_5  -0.000251  0.000851
PCA_9  -0.000368  0.000534
PCA_12 -0.000480  0.000863
PCA_14 -0.000639  0.000500
PCA_2  -0.001096  0.001216
PCA_0  -0.001827  0.001479 -0.7009380955929598





In [None]:
# 3. SFI
sfi_imp = SFI(X.columns, clf, X, y, scoring="neg_log_loss", sample_weight=weights , cv=5, t1 = t1, pct_embargo=0.01)
print(sfi_imp)
sfi_imp.to_csv("sfi_imp.csv")

            mean         std
PCA_16 -0.836891  0.00624592
PCA_0    -0.8486   0.0191458
PCA_17 -0.851275   0.0107346
PCA_18 -0.851617   0.0136697
PCA_5  -0.853323  0.00954717
PCA_9  -0.857127   0.0134292
PCA_15 -0.861897   0.0043069
PCA_10 -0.863975   0.0094395
PCA_6  -0.865283   0.0111665
PCA_7  -0.865411   0.0114415
PCA_2  -0.866265  0.00833982
PCA_19 -0.866476  0.00921716
PCA_3  -0.872402   0.0115352
PCA_4  -0.875083  0.00595834
PCA_8  -0.875461  0.00624876
PCA_12 -0.882521  0.00575093
PCA_11  -0.88269   0.0120514
PCA_14  -0.88443   0.0143149
PCA_13 -0.897128  0.00950199
PCA_1  -0.909431   0.0287834
