In [2]:
# to reload local packages when debugging
%load_ext autoreload
%autoreload 2

%cd /mnt/c/Users/resha/Documents/Github/balancing_framework/

import pickle
import pandas as pd
import numpy as np

with open('/mnt/c/Users/resha/Documents/Github/balancing_framework/spy5m_labelled_episodes.pkl', 'rb') as f:
    df_original = pickle.load(f)
with open('/mnt/c/Users/resha/Documents/Github/balancing_framework/spy5m_labelled_episodes_fracdiff.pkl', 'rb') as f:
    df_fd = pickle.load(f)
# PZ algorithm has some look ahead so remove the episode labels, will be uesd only for some kind of analysis afterwards
df = df_original.drop(columns=['episode']) 
# df



/mnt/c/Users/resha/Documents/Github/balancing_framework


In [3]:
import pandas_ta as ta
import scipy.stats as stats
import numpy_ext as npx

# 10 days of 5 minute bars: (1440/5) * 10  = 2880, 30 days of 5 minute bars: (1440/5) * 30 = 7200

def generate_ta_features(df, window=2880):
    # Momentum Indicators
    df[f'rsi_{window}'] = ta.rsi(df['close'], length=window)                      # Relative Strength Index
    df[[f'stoch_k_14_3_3_{window}',f'stoch_d_14_3_3_{window}']] = ta.stoch(df['high'], df['low'], df['close']).iloc[:, :2]        # Stochastic Oscillator
    df[f'ao_{window}'] = ta.ao(df['high'], df['low'])                            # Awesome Oscillator
    df[f'macd_{window}'] = ta.macd(df['close']).iloc[:, 0]                       # MACD line
    df[f'macd_signal_{window}'] = ta.macd(df['close']).iloc[:, 1]                # Signal line
    df[f'ppo_{window}'] = ta.ppo(df['close']).iloc[:, 0]                         # Percentage Price Oscillator
    
    # Trend Indicators
    df[f'sma_{window}'] = ta.sma(df['close'], length=window)                      # Simple Moving Average
    df[f'ema_{window}'] = ta.ema(df['close'], length=window)                      # Exponential Moving Average
    df[f'ema_{window*3}'] = ta.ema(df['close'], length=window*3)                    # Long-term EMA
    df[f'adx_{window}'] = ta.adx(df['high'], df['low'], df['close'])[f'ADX_14']   # Average Directional Index
    
    # Volatility Indicators
    df[f'atr_{window}'] = ta.atr(df['high'], df['low'], df['close'], length=window)  # Average True Range
    df[[f'bb_upper_{window}',f'bb_middle_{window}',f'bb_lower_{window}']] = ta.bbands(df['close']).iloc[:, :3] # Bollinger Bands
    df[[f'kc_upper_{window}',f'kc_middle_{window}',f'kc_lower_{window}']] = ta.kc(df['high'], df['low'], df['close']).iloc[:, :3] # Keltner Channels
    df[[f'donchian_upper_{window}',f'donchian_middle_{window}',f'donchian_lower_{window}']] = ta.donchian(df['high'], df['low'], length=window).iloc[:, :3] # Donchian Channel
    
    # Volume Indicators
    df[f'obv_{window}'] = ta.obv(df['close'], df['volume'])                      # On-Balance Volume
    df[f'cmf_{window}'] = ta.cmf(df['high'], df['low'], df['close'], df['volume'], length=window) # Chaikin Money Flow
    df[f'mfi_{window}'] = ta.mfi(df['high'], df['low'], df['close'], df['volume'], length=window).astype(float) # Money Flow Index
    df[f'vwap_{window}'] = ta.vwap(df['high'], df['low'], df['close'], df['volume'])          # Volume Weighted Average Price
    
    # Price Transformations & Statistical Indicators
    df[f'log_ret_{window}'] = np.log(df['close'] / df['close'].shift(1))         # Log returns
    df[f'std_{window}'] = df['close'].rolling(window=window).std()                # Rolling 20-period standard deviation
    df[f'zscore_{window}'] = (df['close'] - df['close'].rolling(window=window).mean()) / df[f'std_{window}'] # Z-score
    
    # Price Difference Indicators
    df[f'delta_close_{window}'] = df['close'] - df['close'].shift(1)             # Price change
    df[f'delta_high_{window}'] = df['high'] - df['high'].shift(1)                # High price change
    df[f'delta_low_{window}'] = df['low'] - df['low'].shift(1)                   # Low price change
    
    # Custom Combinations
    df[f'macd_diff_{window}'] = df[f'macd_{window}'] - df[f'macd_signal_{window}']                   # MACD Difference
    df[f'close_over_sma_{window}'] = df['close'] / df[f'sma_{window}']               # Close price to SMA ratio
    df[f'volatility_ratio_{window}'] = df[f'atr_{window}'] / df['close']                # ATR-based volatility ratio

    # Acceleration Indicators - standard deviation of second deriv aka return acceleration
    df[f"racc_close_{window}"] = df["close"].pct_change().diff().rolling(window).std() 
    df[f"racc_close_{window*2}"] = df["close"].pct_change().diff().rolling(window*2).std() 
    df[f"racc_close_{window*3}"] = df["close"].pct_change().diff().rolling(window*3).std() 

    # AQR Momentum Indicators
    def aqr_momentum(array: np.ndarray) -> float:
        """
        Input:  Price time series.
        Output: Annualized exponential regression slope, 
                multiplied by the R2
        """    
        returns = np.diff(np.log(array))  # .diff()
        x = np.arange(len(returns))
        slope, _, rvalue, _, _ = stats.linregress(x, returns)
        return ((1 + slope) ** 252) * (rvalue ** 2)  # annualize slope and multiply by R^2
    def add_aqr_momentum(df: pd.DataFrame, column: str, window: int) -> pd.DataFrame:
        df[f"aqr_momo_{column}_{window}"] = npx.rolling_apply(
            aqr_momentum, window, df[column].values, n_jobs=10
        )
        return df
    # Add AQR momentum indicators
    df = add_aqr_momentum(df, "close", window)
    df = add_aqr_momentum(df, "close", window*2)
    df = add_aqr_momentum(df, "close", window*3)
    
    # Internal Bar Strength (IBS)
    df['ibs'] = (df['close'] - df['low']) / (df['high'] - df['low'])   # Internal Bar Strength
    

    # Drop rows with NaN values (optional)
    df.dropna(inplace=True)

    return df


In [4]:
# 10 days of 5 minute bars: (1440/5) * 10  = 2880, 30 days of 5 minute bars: (1440/5) * 30 = 7200

df2 = generate_ta_features(df, 28)
# df2 = generate_ta_features(df2, 288)
df2 = generate_ta_features(df2, 2880)
# df2 = generate_ta_features(df2, 7200)

 8663530.08333333  168436.8       ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df[f'mfi_{window}'] = ta.mfi(df['high'], df['low'], df['close'], df['volume'], length=window).astype(float) # Money Flow Index
 2468903.24        978036.58666667]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df[f'mfi_{window}'] = ta.mfi(df['high'], df['low'], df['close'], df['volume'], length=window).astype(float) # Money Flow Index
  df[f'vwap_{window}'] = ta.vwap(df['high'], df['low'], df['close'], df['volume'])          # Volume Weighted Average Price
 1.84722661e+06 8.66353008e+06]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df[f'mfi_{window}'] = ta.mfi(df['high'], df['low'], df['close'], df['volume'], length=window).astype(float) # Money Flow Index
  1450857.38666667  2468903.24         978036.58666667]' has dtype incompatible with int64, please explicitly cast to a c

In [5]:
df2

Unnamed: 0_level_0,volume,vwap,open,close,high,low,transactions,label,rsi_28,stoch_k_14_3_3_28,...,delta_low_2880,macd_diff_2880,close_over_sma_2880,volatility_ratio_2880,racc_close_2880,racc_close_5760,racc_close_8640,aqr_momo_close_2880,aqr_momo_close_5760,aqr_momo_close_8640
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01-08 13:25:00-05:00,115100.0,113.1185,113.0800,113.1300,113.1600,113.04,136,0,53.732218,66.028097,...,0.03,0.006613,1.039049,0.001006,0.001198,0.001158,0.001157,0.000065,0.000197,0.000061
2004-01-08 13:30:00-05:00,114700.0,113.0916,113.1000,113.1200,113.1400,113.04,150,0,53.352266,71.647510,...,0.00,0.009370,1.038933,0.001006,0.001198,0.001158,0.001157,0.000064,0.000188,0.000067
2004-01-08 13:35:00-05:00,110700.0,113.0738,113.1000,113.0600,113.1400,113.02,158,1,51.103770,66.666667,...,-0.02,0.011080,1.038358,0.001006,0.001198,0.001158,0.001157,0.000047,0.000178,0.000062
2004-01-08 13:40:00-05:00,73500.0,113.0421,113.0300,113.0800,113.0900,113.01,123,0,51.805882,58.045977,...,-0.01,0.012336,1.038517,0.001006,0.001198,0.001158,0.001157,0.000049,0.000185,0.000066
2004-01-08 13:45:00-05:00,64600.0,113.0762,113.0800,113.0700,113.1100,113.04,95,1,51.423011,48.400800,...,0.03,0.013055,1.038401,0.001006,0.001197,0.001158,0.001157,0.000032,0.000172,0.000063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-04-30 19:35:00-04:00,4929.0,500.8928,500.8900,500.8800,500.9500,500.85,71,1,38.967372,19.154907,...,-0.07,-0.157894,0.988589,0.000757,0.001044,0.000901,0.000853,0.000143,0.000083,0.000304
2024-04-30 19:40:00-04:00,10401.0,500.9938,500.9101,501.0682,501.0682,500.91,93,1,41.216741,14.264368,...,0.06,-0.147279,0.988973,0.000757,0.001044,0.000901,0.000853,0.000154,0.000079,0.000299
2024-04-30 19:45:00-04:00,3686.0,501.1517,501.1200,501.1500,501.1800,501.11,62,0,42.177301,31.505747,...,0.20,-0.135964,0.989147,0.000756,0.001044,0.000901,0.000853,0.000147,0.000080,0.000254
2024-04-30 19:50:00-04:00,1952.0,501.0709,501.1500,500.9900,501.1500,500.99,56,1,40.824141,37.827586,...,-0.12,-0.127046,0.988844,0.000756,0.001044,0.000901,0.000852,0.000141,0.000084,0.000258


In [None]:
# pickle it
with open('/mnt/c/Users/resha/Documents/Github/balancing_framework/s.pkl', 'wb') as f:
    pickle.dump(df2, f)

In [None]:
from fracdiff import frac_diff_bestd, frac_diff_ffd

X = df2.drop(columns=['label'])

X, fd_change_pct = frac_diff_bestd(X)

In [None]:
y = df2['label'][X.index]
fd_df = pd.concat([X, y], axis=1).dropna()

# pickle it
with open('/mnt/c/Users/resha/Documents/Github/balancing_framework/spy5m_labelled_episodes_ta_fd.pkl', 'wb') as f:
    pickle.dump(fd_df, f)

In [6]:
from framework import run_measurements, viz

X = df2.drop(columns=['label'])
y = df2['label']
chunk_size = 500_000
cold_start_size = 10_000
dataset_name = 'sp500'
model_name = 'random_forest'

a,c,p = run_measurements(X, y, chunk_size, cold_start_size, dataset_name, model_name, num_runs=1, frac_diff=False)

RUNNING ADAPTATION MEASURE


  0%|          | 0/2 [00:00<?, ?it/s][I 2024-11-15 12:22:38,798] A new study created in memory with name: no-name-47d2a962-a0d9-44b8-a46f-5e355321efef


Tuning run 0.0 of 1.60245


[I 2024-11-15 12:23:00,065] Trial 0 finished with value: 0.5344673913043478 and parameters: {'n_estimators': 6, 'max_depth': 14}. Best is trial 0 with value: 0.5344673913043478.


Optimization Time: 0.3669894233833323 minutes
Training run 0.0 of 1.60245


 50%|█████     | 1/2 [00:44<00:44, 44.26s/it]

Run 0 Accuracy: 0.5372


[I 2024-11-15 12:23:22,859] A new study created in memory with name: no-name-6a3f9c68-e744-4254-b472-45e03c081f58


Tuning run 1.0 of 1.60245


[I 2024-11-15 12:26:12,349] Trial 0 finished with value: 0.5379665027595215 and parameters: {'n_estimators': 34, 'max_depth': 11}. Best is trial 0 with value: 0.5379665027595215.


Optimization Time: 2.899540299450003 minutes
Training run 1.0 of 1.60245


100%|██████████| 2/2 [07:09<00:00, 214.89s/it]


Run 0 Accuracy: 0.5268
RUNNING CONSOLIDATION MEASURE


  0%|          | 0/2 [00:00<?, ?it/s][I 2024-11-15 12:29:48,572] A new study created in memory with name: no-name-67cdc188-437a-4eb6-a825-4532243d153f


Tuning run 0.0 of 1.60245


[W 2024-11-15 12:30:01,162] Trial 0 failed with parameters: {'n_estimators': 31, 'max_depth': 5} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/reshawn/miniconda3/envs/clfr/lib/python3.11/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/mnt/c/Users/resha/Documents/Github/balancing_framework/training.py", line 57, in objective
    clf.fit(X_train, y_train)
  File "/home/reshawn/miniconda3/envs/clfr/lib/python3.11/site-packages/sklearn/ensemble/_forest.py", line 473, in fit
    trees = Parallel(
            ^^^^^^^^^
  File "/home/reshawn/miniconda3/envs/clfr/lib/python3.11/site-packages/sklearn/utils/parallel.py", line 63, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/reshawn/miniconda3/envs/clfr/lib/python3.11/site-packages/joblib/parallel.py", line 1088, in __

KeyboardInterrupt: 

In [None]:
viz(a, c, metric='accuracy', title='tatest') # Frac Diff , First Order Diff
viz(a, c, metric='f1', title='tatest') # Frac Diff , First Order Diff