### **讀取資料**

In [None]:
import sys
import os

project_root = os.path.abspath(os.path.join("..", "QuantCommon"))
if project_root not in sys.path:
    sys.path.append(project_root)

from utils.processing import get_dollar_bars
import numpy as np
import pandas as pd
import numpy as np


clusters = pd.read_csv("results/clusters.csv", index_col=0)
print(f'XAUUSD 在第{clusters.loc["XAUUSD_M1", "cluster"]}群')

XAUUSD 在第2群


In [2]:
group = clusters[clusters["cluster"] == clusters.loc["XAUUSD_M1", "cluster"]]
data = dict({})
for i in group.index:
    print(f"Processing {i[:-3]}...")
    filepath = os.path.join(project_root, "data", "FI", "M1",f"{i}.csv")
    df = pd.read_csv(filepath, parse_dates=True)
    df['time'] = pd.to_datetime(df['time'])
    df = get_dollar_bars(df)
    data[i] = df

Processing AUDUSD...
Filtered Dollar Bars Count: 184442
Processing EURGBP...
Filtered Dollar Bars Count: 182676
Processing EURUSD...
Filtered Dollar Bars Count: 186054
Processing GBPUSD...
Filtered Dollar Bars Count: 183075
Processing HK50...
Filtered Dollar Bars Count: 10739
Processing NZDUSD...
Filtered Dollar Bars Count: 67499
Processing UK100...
Filtered Dollar Bars Count: 11067
Processing US2000...
Filtered Dollar Bars Count: 12016
Processing USDCAD...
Filtered Dollar Bars Count: 68479
Processing XAGUSD...
Filtered Dollar Bars Count: 55366
Processing XAUUSD...
Filtered Dollar Bars Count: 54861


In [None]:
from utils.metalabeling import add_vertical_barrier, get_events, get_bins
from utils.processing import apply_cusum_filter, getDailyVol, cal_weights, compute_talib_features

feats_list, labels_list, weights_list, t1_list = [], [], [], []

for symbol,df in data.items():
    print(f"Processing {symbol[:-3]}...")
    vol = getDailyVol(df["close"], span0=20)
    cusum_events  = apply_cusum_filter(df, volatility=vol).index
    vertical_barriers = add_vertical_barrier(cusum_events, df, num_days=2)
    pt_sl = [1, 1]
    min_ret = 0.003
    triple_barrier_events = get_events(close=df["close"],
                                                t_events=cusum_events,
                                                pt_sl=pt_sl,
                                                target=vol,
                                                min_ret=min_ret,
                                                num_threads=4,
                                                vertical_barrier_times=vertical_barriers,
                                                side_prediction=None)
    labels  = get_bins(triple_barrier_events, df["close"])
    weights = cal_weights(triple_barrier_events, df["close"])
    feats = compute_talib_features(df,
                               periods=[7,28,50,100],
                               apply_ffd=True)
    
    # normalize features
    for col in feats.columns:
        # 每個 col 分別做 rolling.apply
        feats[col] = (
            feats[col]
            .rolling(window=200, min_periods=1)
            .apply(lambda arr: (arr <= arr[-1]).sum() / len(arr), raw=True)
        )
    idx = feats.index.intersection(labels.index)
    feats = feats.loc[idx]
    labels = labels.loc[idx]["bin"]
    weights = weights.loc[idx]["weight"]
    weights = weights / weights.mean() # normalize weights
    t1 = triple_barrier_events.loc[idx]["t1"]

    feats_list.append(feats)
    labels_list.append(labels.rename("bin"))
    weights_list.append(weights.rename("weight"))
    t1_list.append(t1.rename("t1"))

    
feats = pd.concat(feats_list)
labels = pd.concat(labels_list)
weights = pd.concat(weights_list)/len(weights)
t1 = pd.concat(t1_list)

combined_features = pd.concat(
    [feats, labels, weights, t1],
    axis=1
)
combined_features.sort_index(inplace=True)
combined_features.to_csv("intermediate_results/combined_features.csv", index=True)

labels = combined_features['bin']   
weights = combined_features['weight']
t1 = combined_features['t1']
feats = combined_features.drop(columns=['bin', 'weight', 't1'],axis=1)

Processing AUDUSD...
Processing EURGBP...
Processing EURUSD...
Processing GBPUSD...
Processing HK50...
Processing NZDUSD...
Processing UK100...
Processing US2000...
Processing USDCAD...
Processing XAGUSD...
Processing XAUUSD...


## PCA

In [4]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# === Pipeline : z-score → PCA ===
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("pca",    PCA(n_components=0.95, whiten=False)),
])


X = pipe.fit_transform(feats)


In [5]:
from joblib import dump
# 1) 存整個 Pipeline（包含 scaler + pca）
dump(pipe, "pipeline_scaler_pca.joblib")


['pipeline_scaler_pca.joblib']

## MDA MDI SFI

#### Purged K Fold


In [6]:
import numpy as np
import pandas as pd

class PurgedKFold:
    def __init__(self, n_splits=3, t1=None, pct_embargo=0.0):
        if not isinstance(t1, pd.Series):
            raise ValueError("t1 must be a pandas Series")
        self.n_splits = n_splits
        self.t1 = t1.sort_index()
        self.pct_embargo = pct_embargo

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

    def split(self, X, y=None, groups=None):
        if not X.index.equals(self.t1.index):
            raise ValueError("X and t1 must have the same index")
        n_samples = len(X)
        indices = np.arange(n_samples)
        # divide indices into contiguous chunks
        test_slices = np.array_split(indices, self.n_splits)
        mbrg = int(n_samples * self.pct_embargo)

        for slice_ in test_slices:
            i, j = slice_[0], slice_[-1] + 1
            test_idx = indices[i:j]

            # start‐time of test block
            t0 = self.t1.index[i]
            # end‐time of test block
            t1_max = self.t1.iloc[test_idx].max()
            # find the position just after t1_max
            max_t1_pos = self.t1.index.searchsorted(t1_max)

            # training before test block
            train_before = indices[self.t1.index < t0]
            # training after test + embargo
            train_after = indices[max_t1_pos + mbrg :]

            train_idx = np.concatenate([train_before, train_after])
            yield train_idx, test_idx


#### CVscore

In [7]:
import numpy as np
from sklearn.base import clone
from sklearn.metrics import log_loss, accuracy_score

def cv_score(clf,
             X,
             y,
             sample_weight=None,
             scoring="neg_log_loss",
             t1=None,
             cv=3,
             pct_embargo=0.01):

    if scoring not in ["neg_log_loss", "accuracy"]:
        raise ValueError('scoring must be "neg_log_loss" or "accuracy"')

    pkf = PurgedKFold(n_splits=cv, t1=t1, pct_embargo=pct_embargo)
    scores = []

    for train_idx, test_idx in pkf.split(X):
        # 複製一份新的 model
        model = clone(clf)
        # fit
        if sample_weight is None:
            model.fit(X.iloc[train_idx], y.iloc[train_idx])
        else:
            model.fit(X.iloc[train_idx],
                      y.iloc[train_idx],
                      sample_weight=sample_weight.iloc[train_idx].values)
        # predict + score
        if scoring == "neg_log_loss":
            prob = model.predict_proba(X.iloc[test_idx])
            sc = -log_loss(y.iloc[test_idx],
                           prob,
                           sample_weight=(None if sample_weight is None else sample_weight.iloc[test_idx].values),
                           labels=model.classes_)
        else:
            pred = model.predict(X.iloc[test_idx])
            sc = accuracy_score(y.iloc[test_idx],
                                pred,
                                sample_weight=(None if sample_weight is None else sample_weight.iloc[test_idx].values))
        scores.append(sc)
    return np.array(scores)




#### MDA MDI SFI 實作

In [8]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import log_loss, accuracy_score
from tqdm import tqdm



# 1) MDI 
def feat_imp_mdi(fit, feat_names):
    """
    fit: 已訓練好的 tree‐ensemble（RandomForest, ExtraTrees…）
    feat_names: list of feature names
    return: pd.DataFrame with columns ["mean","std"] 純量化後的重要度
    """
    # 從每顆樹蒐集 feature_importances_
    df0 = pd.DataFrame(
        [tree.feature_importances_ for tree in fit.estimators_],
        columns=feat_names
    ).replace(0, np.nan)  # 如果 max_features=1，某些 tree 有 0
    imp = pd.concat({
        "mean": df0.mean(),
        "std" : df0.std() * df0.shape[0]**-0.5
    }, axis=1)
    # normalize to sum=1
    imp["mean"] /= imp["mean"].sum()
    imp.sort_values(by="mean", ascending=False, inplace=True)
    return imp


# 2) MDA: 
def feat_imp_mda(clf,
                 X,
                 y,
                 sample_weight=None,
                 t1=None,
                 cv: int = 5,
                 pct_embargo: float = 0.01,
                 scoring: str = "neg_log_loss"
                ) -> (pd.DataFrame, float):
    # --- 1) numpy → pandas ---
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X)
    if not isinstance(y, pd.Series):
        y = pd.Series(y, index=X.index)
    if sample_weight is not None and not isinstance(sample_weight, pd.Series):
        sample_weight = pd.Series(sample_weight, index=X.index)
    if t1 is not None and not isinstance(t1, pd.Series):
        t1 = pd.Series(t1, index=X.index)

    feat_names = list(X.columns)

    # --- 2) baseline score ---
    base_scores = cv_score(clf, X, y,
                           sample_weight=sample_weight,
                           scoring=scoring,
                           t1=t1,
                           cv=cv,
                           pct_embargo=pct_embargo)
    base_mean = base_scores.mean()

    # --- 3) 每個 feature permutation, 加進度條 ---
    diffs = []
    for col in tqdm(feat_names, desc="MDA permuting features"):
        Xp = X.copy()
        np.random.shuffle(Xp[col].values)
        perm_scores = cv_score(clf, Xp, y,
                               sample_weight=sample_weight,
                               scoring=scoring,
                               t1=t1,
                               cv=cv,
                               pct_embargo=pct_embargo)
        diffs.append(base_scores - perm_scores)

    diffs = np.vstack(diffs)
    imp_df = pd.DataFrame({
        "mean": diffs.mean(axis=1),
        "std" : diffs.std(axis=1) * diffs.shape[1]**-0.5
    }, index=feat_names)
    imp_df.sort_values(by="mean", ascending=False, inplace=True)
    return imp_df

# 3) SFI
def SFI(feat_names: list,
                 clf,
                 X: pd.DataFrame,
                 y: pd.Series,
                 sample_weight=None,
                 t1=None,
                 cv: int = 5,
                 pct_embargo: float = 0.01,
                 scoring: str = "neg_log_loss"
                ) -> pd.DataFrame:
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X, columns=feat_names)
    if not isinstance(y, pd.Series):
        y = pd.Series(y, index=X.index)
    if sample_weight is not None and not isinstance(sample_weight, pd.Series):
        sample_weight = pd.Series(sample_weight, index=X.index)
    if t1 is not None and not isinstance(t1, pd.Series):
        t1 = pd.Series(t1, index=X.index)

    imp = pd.DataFrame(columns=["mean", "std"])
    for featName in feat_names:
        dfo = cv_score(clf, X=X[[featName]],  y = y,
                      sample_weight= sample_weight,
                      scoring=scoring, t1 = t1, cv = cv)
        imp.loc[featName, "mean"] = dfo.mean()
        imp.loc[featName, "std"] = dfo.std() * dfo.shape[0]**-0.5
        imp.sort_values(by="mean", ascending=False, inplace=True)
    return imp


### RF and compute MDI MDA SFI

In [9]:
col = [f"PCA_{i}" for i in range(X.shape[1])]

In [10]:
# 用CV不用切割資料集
X = pd.DataFrame(X, columns= col, index=feats.index)
y = labels.values
weights = weights.values
# t1 在上面定義好了　　　　　　　　　　　　　　　　　　　　　　　

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

In [12]:
avgU = weights.mean()
clf = DecisionTreeClassifier(criterion="entropy", max_features="sqrt", class_weight="balanced")
clf = BaggingClassifier(estimator=clf, n_estimators=1000, max_samples=avgU)

In [None]:
# 1. MDI
clf_fit = clf.fit(X, y, sample_weight=weights)
mdi_imp = feat_imp_mdi(clf_fit, col)
mdi_imp.to_csv("results/mdi.csv")


In [None]:
# 2. MDA
mda_imp = feat_imp_mda(
    clf, X, y, cv=5,
    sample_weight=weights,
    t1=t1, pct_embargo=0.01,
    scoring="neg_log_loss"
)
print(mda_imp)
mda_imp.to_csv("results/mda.csv")


MDA permuting features: 100%|██████████| 32/32 [20:09<00:00, 37.81s/it]

            mean       std
PCA_26  0.000554  0.000265
PCA_22  0.000510  0.000323
PCA_29  0.000479  0.000327
PCA_9   0.000414  0.000528
PCA_30  0.000374  0.000244
PCA_17  0.000368  0.000251
PCA_21  0.000302  0.000115
PCA_11  0.000281  0.000456
PCA_19  0.000256  0.000239
PCA_31  0.000254  0.000297
PCA_23  0.000225  0.000240
PCA_3   0.000213  0.000142
PCA_16  0.000204  0.000309
PCA_25  0.000198  0.000131
PCA_14  0.000163  0.000148
PCA_13  0.000155  0.000251
PCA_8   0.000150  0.000401
PCA_10  0.000132  0.000285
PCA_6   0.000125  0.000372
PCA_5   0.000118  0.000341
PCA_28  0.000077  0.000224
PCA_20  0.000049  0.000159
PCA_2   0.000018  0.000151
PCA_18  0.000015  0.000249
PCA_1   0.000011  0.000236
PCA_12  0.000002  0.000306
PCA_24 -0.000015  0.000293
PCA_0  -0.000030  0.000475
PCA_4  -0.000043  0.000279
PCA_27 -0.000109  0.000370
PCA_15 -0.000117  0.000259
PCA_7  -0.000269  0.000315





In [None]:
# 3. SFI
sfi_imp = SFI(
    X.columns, clf, X, y, 
    scoring="neg_log_loss", 
    sample_weight=weights , 
    cv=5, t1 = t1, pct_embargo=0.01)
print(sfi_imp)
sfi_imp.to_csv("results/sfi.csv")

            mean       std
PCA_10   -0.6933  0.000128
PCA_29 -0.693366  0.000169
PCA_28 -0.693376  0.000141
PCA_14 -0.693436  0.000162
PCA_8  -0.693591  0.000175
PCA_31 -0.693601  0.000129
PCA_30 -0.693672  0.000322
PCA_18 -0.693696  0.000249
PCA_21 -0.693717  0.000196
PCA_5  -0.693745  0.000309
PCA_22 -0.693759  0.000507
PCA_19 -0.693772  0.000115
PCA_27 -0.693784   0.00021
PCA_7  -0.693794  0.000253
PCA_9  -0.693801  0.000187
PCA_2  -0.693804   0.00017
PCA_0  -0.693808  0.000288
PCA_20 -0.693824  0.000234
PCA_26 -0.693834  0.000402
PCA_6  -0.693836   0.00028
PCA_4  -0.693848  0.000227
PCA_1   -0.69386  0.000109
PCA_16 -0.693867  0.000268
PCA_24 -0.693968  0.000133
PCA_17 -0.693972  0.000584
PCA_25 -0.693977  0.000191
PCA_15 -0.694074  0.000331
PCA_3  -0.694074  0.000271
PCA_23 -0.694118  0.000226
PCA_11 -0.694134  0.000245
PCA_13 -0.694147  0.000296
PCA_12 -0.694167  0.000199
