In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from scipy.conftest import num_parallel_threads

import my_afml_fncs as my_afml
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
filename_end = "2019_2024"

dollar_bars_df = pd.read_csv(f'tick_data/bars/dollar_bars_barchart_{filename_end}.csv')
dollar_bars_df['bar_number'] = range(len(dollar_bars_df))
dollar_bars_df['Date'] = pd.to_datetime(dollar_bars_df['Date'])
dollar_bars_df.set_index('Date', inplace=True)
dollar_bars_df.sort_index(inplace=True)

In [3]:
t1 = pd.Series([2, 3, 5], index=[0, 2, 4])  # t0,t1 for each feature obs
barIx = range(t1.max() + 1)  # index of bars
indM = my_afml.getIndMatrix(barIx, t1)
phi = np.random.choice(indM.columns, size=indM.shape[1])
print(phi)
print('Standard uniqueness:', my_afml.getAvgUniqueness(indM[phi]).mean())
phi = my_afml.seqBootstrap(indM)
print(phi)
print('Sequential uniqueness:',my_afml.getAvgUniqueness(indM[phi]).mean())


[1 0 1]
Standard uniqueness: 0.5370370370370371


Sequential Bootstrap: 100%|██████████| 3/3 [00:00<00:00, 285.11it/s]

[0, 2, 0]
Sequential uniqueness: 0.6666666666666666





In [4]:
dollar_vol=my_afml.getDailyVol(dollar_bars_df.Close)

In [5]:
# 3.1(a) apply CUSUM filet usings daily vols
dollar_es_cusum_events=my_afml.getTEvents(dollar_bars_df.Close,dollar_vol.mean())
print('Dollar bar CUSUM events:',dollar_es_cusum_events.shape)

Dollar bar CUSUM events: (48245,)


In [6]:
# Define moving average windows
short_window = 50  # e.g., 50 periods
long_window = 1000  # e.g., 200 periods

In [7]:
# create events where the moving averages cross over
dollar_events = my_afml.movingAverageCrossover(dollar_bars_df.Close, short_window, long_window)
# try this with a crossover sign change
dollar_events['signal_switch'] = dollar_events.signal.diff() / 2
# drop rows with any NaN values
dollar_events = dollar_events.dropna()
# filter rows where signal_switch is not zero
dollar_events = dollar_events[dollar_events['signal_switch'] != 0]

In [8]:
dollar_t1=my_afml.getVb(dollar_bars_df.Close,dollar_events.index)

In [9]:
indM=my_afml.getIndMatrix(dollar_events.index,dollar_t1)

In [10]:
# phi = np.random.choice(indM.columns, size=indM.shape[1])
# print(phi)
# print('Standard uniqueness:', my_afml.getAvgUniqueness(indM[phi]).mean())
# phi = my_afml.seqBootstrap(indM)
# print(phi)

In [11]:
# out['w']=mpPandasObj(mpSampleW,('molecule',events.index),numThreads, \
# t1=events['t1'],numCoEvents=numCoEvents,close=close)
# out['w']*=out.shape[0]/out['w'].sum()

In [12]:
dollar_bars_2024 = dollar_bars_df[dollar_bars_df.index.year == 2024]
dollar_events_2024 = dollar_events[dollar_events.index.year == 2024]

In [13]:
dollar_t1_2024=my_afml.getVb(dollar_bars_2024.Close,dollar_events_2024.index)

In [14]:
dollar_vol_2024=my_afml.getDailyVol(dollar_bars_2024.Close)

In [15]:
ptsl=[1,1]
# target is set to be standard deviation
dollar_trgt_2024=dollar_vol*dollar_bars_2024.Close
# trgt is variable but we can set a fixed minimum return minRet=0.00003
numThread=128
minRet=0.00003
# get events will find time of first touch after each event
dollar_trigger_secondary_model_2024=my_afml.getEvents(dollar_bars_2024.Close, dollar_events_2024.index, ptsl, dollar_trgt_2024, minRet, numThread, dollar_t1)

Running processJobs


2025-03-18 14:11:56.078292 100.0% applyPtSlOnT1 done after 0.03 minutes. Remaining 0.0 minutes.


In [16]:
numThreads=128
out=pd.DataFrame()
close=dollar_bars_2024.Close
events=dollar_trigger_secondary_model_2024

Calculate number of co-events at each point

In [17]:
numCoEvents=my_afml.mpPandasObj(my_afml.mpNumCoEvents,('molecule',events.index),numThreads, \
closeIdx=close.index,t1=events['t1'])

Running processJobs


2025-03-18 14:11:58.881692 100.0% mpNumCoEvents done after 0.01 minutes. Remaining 0.0 minutes.


Calculate average uniqueness of each label

In [18]:
out['tW']=my_afml.mpPandasObj(my_afml.mpSampleTW,('molecule',events.index),numThreads, \
t1=events['t1'],numCoEvents=numCoEvents)

Running processJobs


2025-03-18 14:12:03.397147 100.0% mpSampleTW done after 0.04 minutes. Remaining 0.0 minutes.


Calculate mean average uniqueness

In [19]:
mean_avergage_uniqueness=out['tW'].mean()

0.05 << 1

In [20]:
serial_correlation = out['tW'].autocorr()
print(f"Serial Correlation of 'tW': {serial_correlation}")

Serial Correlation of 'tW': 0.5239922155080112


In [21]:
from scipy.stats import t

# Number of observations in 'tW' (degrees of freedom = n-2 for correlation)
n = out['tW'].dropna().shape[0]

# Compute statistical significance (two-tailed test for correlation)
t_stat = serial_correlation * ((n - 2) ** 0.5) / ((1 - serial_correlation ** 2) ** 0.5)
p_value = 2 * t.sf(abs(t_stat), n - 2)  # Two-tailed p-value

# Print the results
print(f"Serial Correlation of 'tW': {serial_correlation}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_value}")

# Determine significance level
if p_value < 0.05:
    print("The serial correlation is statistically significant (p < 0.05).")
else:
    print("The serial correlation is not statistically significant (p >= 0.05).")

Serial Correlation of 'tW': 0.5239922155080112
t-statistic: 14.68804048988226
p-value: 1.1819331953049815e-41
The serial correlation is statistically significant (p < 0.05).


4.3

In [22]:
dollar_bins_secondary_model_2024=my_afml.getBins(dollar_trigger_secondary_model_2024,dollar_bars_2024.Close)


In [23]:
# now fit same data using these labels
dollar_X=dollar_bars_df.loc[dollar_bins_secondary_model_2024.index,:]
dollar_y=dollar_bins_secondary_model_2024['bin']

# Prepare the data
features = dollar_X
features.reset_index(drop=True, inplace=True)
features = features.drop(columns=['Symbol'])
labels = dollar_y  # Target variable
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)

# Train RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


#4.3(a)
# Predict and evaluate
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

Accuracy: 0.7660818713450293
              precision    recall  f1-score   support

        -1.0       0.81      0.65      0.72        78
         0.0       0.00      0.00      0.00         0
         1.0       0.75      0.86      0.80        93

    accuracy                           0.77       171
   macro avg       0.52      0.50      0.51       171
weighted avg       0.78      0.77      0.77       171



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
#4.3(b)
from sklearn.model_selection import KFold, cross_val_score

kf = KFold(n_splits=5, shuffle=False)
clf = RandomForestClassifier(n_estimators=100, random_state=42)

scores = cross_val_score(clf, features, labels, cv=kf, scoring='accuracy')

print(f"K-Fold Cross-Validation Scores: {scores}")
print(f"Mean Accuracy: {scores.mean()}")

K-Fold Cross-Validation Scores: [0.62280702 0.4122807  0.4122807  0.46491228 0.57017544]
Mean Accuracy: 0.49649122807017554
