# Coursework for Rough Paths and Applications to Machine Learning

CID:  02022635,  02051786

# Testing

In [2]:
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score as r2d2
from sklearn.linear_model import ElasticNet

In [3]:
#Read the gzipped CSV, cast every column to float32, then zero-fill missing values
path = 'data/data.csv.gz'
df = (
    pd.read_csv(path, compression="gzip")
    .astype('float32')
    .fillna(0)
)

In [4]:
#Restore one pre-trained lagged LGBM per entry of `lags`; `models[i]` belongs to `lags[i]`
lags = [50, 55, 60, 65, 70, 75, 80, 85, 87]
models = []
for lag in lags:
    models.append(joblib.load(f'models/lgb_lag{lag}.pkl'))

In [5]:
#Generate lagged targets
def y_preprocess(y):
    """Split a target series into train, validation, blend and test arrays.

    The split boundaries are fixed positional row offsets. The train and
    validation targets are clipped to [-3, 3]; the blend and test targets
    are returned unclipped. The input series is not modified (each slice is
    copied before use).

    Parameters
    ----------
    y : pandas.Series
        Target values; sliced positionally with integer slices.

    Returns
    -------
    tuple of numpy.ndarray
        ``(y_train, y_valid, y_blend, y_test)`` in that order.
    """
    set_start = 270
    valid_start = 1679079
    blend_start = 2238706
    test_start = 2798332
    set_end = 3497465
    #The blend split stops 1695 rows before the test split starts, leaving a gap between them.
    blend_end = test_start - 1695

    def _segment(start, stop):
        #Copy first so that nothing downstream can write back into `y`.
        return y[start:stop].copy().values

    y_train = _segment(set_start, valid_start).clip(-3, 3)
    y_valid = _segment(valid_start, blend_start).clip(-3, 3)
    y_blend = _segment(blend_start, blend_end)
    y_test = _segment(test_start, set_end)
    return y_train, y_valid, y_blend, y_test
#Shift the target by (87 - lag) rows for each lag; lag 87 is therefore the unshifted target
y_dict = {lag: df.y.shift(87 - lag).copy() for lag in lags}

#Split every shifted target into its train / valid / blend / test arrays, keyed by lag
y_train_dict, y_valid_dict, y_blend_dict, y_test_dict = {}, {}, {}, {}
for lag in lags:
    (
        y_train_dict[lag],
        y_valid_dict[lag],
        y_blend_dict[lag],
        y_test_dict[lag],
    ) = y_preprocess(y_dict[lag])

In [6]:
#Load in augmented test dataset - this has the raw data, financial features, and their signature kernel statistics
#NOTE(review): presumably one row per test sample, in the same order as the test targets - confirm
x_scaled_test = np.load("data/test_finfeat_sigkern.npy")

In [7]:
#Collect each lagged model's test predictions as one row of `test_predicts`
#and report its R^2 against the test target shifted for the same lag.
n_test_rows = x_scaled_test.shape[0]
test_predicts = np.zeros((len(lags), n_test_rows))
#`models` was built from `lags`, so the two sequences pair up one-to-one
for i, (lag, model) in enumerate(zip(lags, models)):
    predictions_test = model.predict(x_scaled_test)
    test_predicts[i, :] = predictions_test
    score = r2d2(y_test_dict[lag], predictions_test)
    print(f"lag = {lag}, r2 = {score:.4f}")



lag = 50, r2 = 0.1308




lag = 55, r2 = 0.0930




lag = 60, r2 = 0.0623




lag = 65, r2 = 0.0377




lag = 70, r2 = 0.0206




lag = 75, r2 = 0.0075




lag = 80, r2 = 0.0071




lag = 85, r2 = 0.0092




lag = 87, r2 = 0.0212


In [8]:
#Load LightGBMs trained on signature transforms and append their test predictions
#as extra rows of `test_predicts`, in the order of `signature_configs`.
signature_configs = [(depth, window) for depth in [1, 2] for window in [10, 20, 50, 87]]
signature_configs.append((1, 100))  #one additional depth-1 model with window 100

signature_predicts = []
for depth, window in signature_configs:
    #np.load on an .npz returns an NpzFile that holds the archive open;
    #the context manager closes it as soon as the array has been read.
    with np.load(f"data/TEST_d{depth}_levels_5_window_{window}.npz") as f_test:
        x_test = f_test["arr_0"]

    #NOTE: joblib.load unpickles arbitrary objects - only load trusted model files.
    lgbm = joblib.load(f'models/regen_lgb_depth_{depth}_levels_5_window_{window}.pkl')

    y_test_trans = lgbm.predict(x_test)
    signature_predicts.append(y_test_trans)

#Rebuild from the first len(lags) rows (the lagged-model predictions) so that
#re-running this cell does not stack duplicate signature rows onto `test_predicts`.
test_predicts = np.vstack([test_predicts[:len(lags)], *signature_predicts])




In [9]:
#Fetch target at lag 87: this is the test target with a shift of 0 (87 - 87), i.e. unshifted
y_test = y_test_dict[87]

In [10]:
#Restore the already-fitted ElasticNet blender; nothing is re-trained in this cell
best_model = joblib.load("models/elasticnet_blend.pkl")

#Display R^2 performance on last 5%, 10%, 15% and 20% of data as requested,
#i.e. on the trailing quarter, half, three quarters and all of the test rows.
N = test_predicts.shape[1]
tail_sizes = [(N * quarters) // 4 for quarters in (1, 2, 3, 4)]
for ind in tail_sizes:
    #Rows of `test_predicts` are models, so transpose to (samples, models) for predict
    tail_features = test_predicts[:, -ind:].T
    y_pred_test = best_model.predict(tail_features)
    final_r2 = r2d2(y_test[-ind:], y_pred_test)
    print(f"R² Score on last {ind} rows:", final_r2)



R² Score on last 174783 rows: 0.01604947401436796
R² Score on last 349566 rows: 0.02211753533319971
R² Score on last 524349 rows: 0.022629527383128845
R² Score on last 699133 rows: 0.024281436587600846
