In [1]:
import numpy as np

from data_utils import set_seed, read_and_merge, build_features_5_21, split_by_date, fit_transform_scalers, make_sequences_generic, time_aware_train_val_split
from model_utils import build_lstm, train
from eval_utils import regression_report_log_and_orig, plot_history, scatter_actual_vs_pred, plot_timeseries

# --- Run configuration: one tunable per line ---
WINDOW = 30        # sequence length passed to make_sequences_generic / build_lstm
BATCH_SIZE = 64
EPOCHS = 30
LR = 1e-3
SEED = 42
set_seed(SEED)

# Columns handed to the scaler, the sequence builder and (via len) the model.
feature_cols = [
    "log_vol",
    "vix_close",
    "vix_lag1",
    "vix_change",
    "vix_5d_ma",
]

# --- Load the two CSVs and derive the modelling frame ---
# read_and_merge also returns the date used below to separate train from test.
raw_df, split_date = read_and_merge("5_21_train.csv", "5_21_test.csv")
df = build_features_5_21(raw_df, use_vix=True, target="target_5d")

# --- Split by date, then scale the feature columns ---
# NOTE(review): presumably the scaler is fit on train_df only — confirm in data_utils.
train_df, test_df = split_by_date(df, split_date)
train_s, test_s, scaler = fit_transform_scalers(train_df, test_df, feature_cols)

# --- Build model inputs/targets for each partition ---
# The third return value is only kept for the test partition (used for plotting).
X_train, y_train, _ = make_sequences_generic(train_s, WINDOW, feature_cols)
X_test, y_test, idx_test = make_sequences_generic(test_s, WINDOW, feature_cols)

# Carve 10% of the training sequences off as a validation set.
X_tr, y_tr, X_val, y_val = time_aware_train_val_split(
    X_train,
    y_train,
    val_frac=0.1,
)


2025-11-13 09:25:49.776742: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Quick sanity check on the training targets before fitting.
target_mean = y_train.mean()
target_std = y_train.std()
print("y_train mean:", target_mean, "std:", target_std)

# Peek at the first few target values.
first_targets = y_train[:10]
print("Example target values:", first_targets)

# Last expression: summary statistics of the y_log column over the full frame.
df["y_log"].describe()


y_train mean: 24.1018 std: 0.29409042
Example target values: [23.672213 23.660542 23.641653 23.637873 23.62909  23.630688 23.63696
 23.652788 23.699932 23.762552]


count    5260.000000
mean       24.141615
std         0.312005
min        23.248759
25%        23.933331
50%        24.089335
75%        24.397679
max        25.047560
Name: y_log, dtype: float64

In [8]:
# Sanity check: show the feature columns whose count sets the model's n_features.
print(feature_cols)


['log_vol', 'vix_close', 'vix_lag1', 'vix_change', 'vix_5d_ma']


In [None]:

# Build the LSTM for WINDOW-length sequences of len(feature_cols) features and
# fit it on the training split, monitoring the validation split.
model = build_lstm(WINDOW, n_features=len(feature_cols), lr=LR)
hist = train(model, X_tr, y_tr, X_val, y_val, epochs=EPOCHS, batch_size=BATCH_SIZE)

# Predict on the test sequences; ravel() flattens the predictions to 1-D so they
# line up with y_test in the report and plots.
y_pred_log = model.predict(X_test).ravel()
report = regression_report_log_and_orig(y_test, y_pred_log)
print(report)

# Back-transform once and reuse. Requires `import numpy as np` in the imports
# cell — it was previously missing, so this cell failed on a fresh kernel.
# NOTE(review): assumes the target is a natural log (not log1p) — confirm in data_utils.
y_test_orig = np.exp(y_test)
y_pred_orig = np.exp(y_pred_log)

plot_history(hist, title="LSTM Training History — 5-day (VIX)")
scatter_actual_vs_pred(y_test, y_pred_log, title="LSTM 5-day VIX — Actual vs Pred (log)")
plot_timeseries(
    idx_test,
    y_test_orig,
    y_pred_orig,
    ylabel="5-day accumulated volume",
    title="LSTM 5-day (VIX) — Original scale",
)