In [1]:
import sys
import os

# Put the sibling "QuantCommon" directory (one level above the working
# directory) on the import path so that `utils.*` can be imported below.
project_root = os.path.abspath(os.path.join(os.pardir, "QuantCommon"))
already_on_path = project_root in sys.path
if not already_on_path:
    sys.path.append(project_root)

from utils.processing import get_dollar_bars
import numpy as np
import pandas as pd
import numpy as np


# Load the cluster assignment table; its first CSV column becomes the index
# and is looked up below by labels such as "XAUUSD_M1".
clusters = pd.read_csv("clusters.csv", index_col=0)

# Report which cluster XAUUSD was assigned to.
xau_cluster = clusters.loc["XAUUSD_M1", "cluster"]
print(f"XAUUSD 在第{xau_cluster}群")

XAUUSD 在第2群


In [2]:
# Keep only the rows that share XAUUSD's cluster id.
target_cluster = clusters.loc["XAUUSD_M1", "cluster"]
group = clusters[clusters["cluster"] == target_cluster]

# Dollar bars per instrument, keyed by the index label taken from `group`.
data = {}
m1_dir = os.path.join(project_root, "data", "FI", "M1")
for key in group.index:
    # The last three characters of the label are dropped for display only
    # (for the label "XAUUSD_M1" used in this notebook that is the "_M1" part).
    print(f"Processing {key[:-3]}...")
    bars_m1 = pd.read_csv(os.path.join(m1_dir, f"{key}.csv"), parse_dates=True)
    # Make sure the "time" column holds datetimes before building the bars.
    bars_m1["time"] = pd.to_datetime(bars_m1["time"])
    data[key] = get_dollar_bars(bars_m1)

Processing AUDUSD...
Filtered Dollar Bars Count: 184442
Processing EURGBP...
Filtered Dollar Bars Count: 182676
Processing EURUSD...
Filtered Dollar Bars Count: 186054
Processing GBPUSD...
Filtered Dollar Bars Count: 183075
Processing HK50...
Filtered Dollar Bars Count: 10739
Processing NZDUSD...
Filtered Dollar Bars Count: 67499
Processing UK100...
Filtered Dollar Bars Count: 11067
Processing US2000...
Filtered Dollar Bars Count: 12016
Processing USDCAD...
Filtered Dollar Bars Count: 68479
Processing XAGUSD...
Filtered Dollar Bars Count: 55366
Processing XAUUSD...
Filtered Dollar Bars Count: 54861


In [None]:
from utils.metalabeling import add_vertical_barrier, get_events, get_bins
from utils.processing import apply_cusum_filter, getDailyVol, cal_weights, compute_talib_features
from joblib import load

# Per-symbol results are accumulated in four parallel lists: entry k of each
# list belongs to the k-th symbol of `data` (in iteration order).
feats_list, labels_list, weights_list, t1_list = [], [], [], []

# Pre-fitted transformer loaded from disk. Judging by the file name it is a
# scaler + PCA pipeline -- TODO confirm against the notebook that fitted it.
pca_pipe = load("models/pipeline_scaler_pca.joblib")

# Settings that are the same for every symbol are defined once, outside the loop.
pt_sl = [1, 1]      # forwarded to get_events(pt_sl=...)
min_ret = 0.003     # forwarded to get_events(min_ret=...)
RANK_WINDOW = 200   # length of the trailing window used for rank normalization


def _trailing_rank(window):
    """Return the fraction of entries in ``window`` that are <= its last entry.

    ``window`` is the raw NumPy array that ``rolling(...).apply(raw=True)``
    passes in, so ``window[-1]`` is the most recent observation.

    NOTE(review): if the window contains NaN, those entries compare as False
    but are still counted by ``len(window)``; a NaN last entry therefore gives
    0.0 rather than NaN. Confirm this is the intended treatment.
    """
    return (window <= window[-1]).sum() / len(window)


for symbol, df in data.items():
    print(f"Processing {symbol[:-3]}...")

    # --- Event sampling and labeling (all helpers come from the project `utils`) ---
    vol = getDailyVol(df["close"], span0=20)
    cusum_events = apply_cusum_filter(df, volatility=vol).index
    vertical_barriers = add_vertical_barrier(cusum_events, df, num_days=2)
    triple_barrier_events = get_events(
        close=df["close"],
        t_events=cusum_events,
        pt_sl=pt_sl,
        target=vol,
        min_ret=min_ret,
        num_threads=4,
        vertical_barrier_times=vertical_barriers,
        side_prediction=None,
    )
    labels = get_bins(triple_barrier_events, df["close"])
    weights = cal_weights(triple_barrier_events, df["close"])

    # --- Features ---
    feats = compute_talib_features(df, periods=[7, 28, 50, 100], apply_ffd=True)

    # Normalize every feature column independently: each value is replaced by
    # its rank fraction within the trailing RANK_WINDOW rows of that column.
    for feat_name in feats.columns:
        feats[feat_name] = (
            feats[feat_name]
            .rolling(window=RANK_WINDOW, min_periods=1)
            .apply(_trailing_rank, raw=True)
        )

    # --- Align everything on the timestamps that have both features and a label ---
    idx = feats.index.intersection(labels.index)
    feats = feats.loc[idx]
    labels = labels.loc[idx, "bin"]
    weights = weights.loc[idx, "weight"]
    weights = weights / weights.mean()  # rescale so the mean weight is 1
    t1 = triple_barrier_events.loc[idx, "t1"]

    # --- Project the normalized features with the pre-fitted pipeline ---
    pca_results = pca_pipe.transform(feats)
    pca_cols = [f"PCA_{k}" for k in range(pca_results.shape[1])]
    feats = pd.DataFrame(pca_results, columns=pca_cols, index=feats.index)

    feats_list.append(feats)
    labels_list.append(labels.rename("label"))
    weights_list.append(weights.rename("weight"))
    t1_list.append(t1.rename("t1"))


from joblib import dump

# Assumes feats_list, labels_list, weights_list and t1_list have already been
# filled by the loop above.

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists; otherwise the write fails after all the work is done.
os.makedirs("intermediate_results", exist_ok=True)

# Bundle the four lists into one tuple and write them to a single file.
dump(
    (feats_list, labels_list, weights_list, t1_list),
    "intermediate_results/events_data.joblib",
    compress=3  # optional compression level, 0-9
)




Processing AUDUSD...
Processing EURGBP...
Processing EURUSD...
Processing GBPUSD...
Processing HK50...
Processing NZDUSD...
Processing UK100...
Processing US2000...
Processing USDCAD...
Processing XAGUSD...
Processing XAUUSD...


['intermediate_results/events_data.joblib']