In [None]:
from data_utils import set_seed, read_and_merge, build_features_nextday, split_by_date, fit_transform_scalers, make_sequences_generic, time_aware_train_val_split
from model_utils import build_lstm, train
from eval_utils import regression_report_log_and_orig, plot_history, scatter_actual_vs_pred, plot_timeseries
import numpy as np

# Experiment hyperparameters, one per line so each can be changed/diffed on its own.
WINDOW = 30
BATCH_SIZE = 64
EPOCHS = 80
LR = 1e-3
SEED = 42
set_seed(SEED)


# 1) Load data
# read_and_merge returns the combined frame plus the date later used to split
# it back into train and test portions.
df, split_date = read_and_merge("train_volume_vix.csv", "test_volume_vix.csv")


# 2) Build only volume-based features (ignore VIX columns)
df = df.copy()
# np.log(0) evaluates to -inf, and dropna() does not treat +/-inf as missing,
# so a single zero-volume row would otherwise survive the filter below and
# feed non-finite values into scaling and training. Masking non-positive
# volumes to NaN first lets the existing dropna() remove those rows; rows with
# strictly positive volumes are unaffected.
df["log_volume_t"] = np.log(df["sh_volume"].where(df["sh_volume"] > 0))
df["y_log"] = np.log(df["target_volume"].where(df["target_volume"] > 0))
df = df.dropna(subset=["log_volume_t", "y_log"]).reset_index(drop=True)


# 3) Split & scale
train_df, test_df = split_by_date(df, split_date)
feature_cols = ["log_volume_t"]
# NOTE(review): presumably the scaler is fit on train_df only and then applied
# to test_df (to avoid leakage) -- confirm in data_utils.fit_transform_scalers.
train_s, test_s, scaler = fit_transform_scalers(train_df, test_df, feature_cols)


# 4) Build sequences
# The third return value is discarded for the training set and kept for the
# test set, where it is used as the x-axis of the time-series plot below.
X_train, y_train, _ = make_sequences_generic(train_s, WINDOW, feature_cols)
X_test, y_test, idx_test = make_sequences_generic(test_s, WINDOW, feature_cols)
# Hold out 15% of the training sequences for validation.
# NOTE(review): presumably the held-out part is the chronological tail -- confirm
# in data_utils.time_aware_train_val_split.
X_tr, y_tr, X_val, y_val = time_aware_train_val_split(X_train, y_train, val_frac=0.15)


# 5) Model
model = build_lstm(WINDOW, n_features=len(feature_cols), lr=LR)
hist = train(model, X_tr, y_tr, X_val, y_val, epochs=EPOCHS, batch_size=BATCH_SIZE)


# 6) Evaluate
# Flatten the model output to 1-D so it lines up element-wise with y_test.
y_pred_log = model.predict(X_test).ravel()
report = regression_report_log_and_orig(y_test, y_pred_log)
print(report)


# 7) Plots
plot_history(hist, title="Training History — Next-day (No VIX)")
scatter_actual_vs_pred(y_test, y_pred_log, title="Next-day (No VIX) — Actual vs Pred (log)")
# Targets and predictions are in log space; exponentiate to plot raw volumes.
# NOTE(review): assumes y_test is the unscaled y_log column -- confirm that
# make_sequences_generic does not return a scaled target.
plot_timeseries(idx_test, np.exp(y_test), np.exp(y_pred_log), ylabel="Next-day volume", title="Next-day (No VIX) — Original scale")