In [None]:
# ===== LSTM for Next-Day Trading Volume (from your CSVs) =====
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks


2025-11-05 16:56:05.712691: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:

# -----------------------------
# 0) Reproducibility
# -----------------------------
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)


In [4]:

# -----------------------------
# 1) Load & merge data (keep your split)
# -----------------------------
train = pd.read_csv("train_volume_vix.csv", parse_dates=["date"])
test  = pd.read_csv("test_volume_vix.csv",  parse_dates=["date"])

train = train.sort_values("date").reset_index(drop=True)
test  = test.sort_values("date").reset_index(drop=True)

split_date = test["date"].min()
df = pd.concat([train, test], ignore_index=True).sort_values("date").reset_index(drop=True)


In [5]:

# -----------------------------
# 2) Feature engineering (no leakage)
#    We build features that are known up to day t to predict target at t+1
# -----------------------------
# Basic transforms
df["log_volume_t"] = np.log(df["sh_volume"])

# VIX dynamics
df["vix_change"] = df["vix_close"].diff()           # today's change vs yesterday
df["vix_pct"]    = df["vix_close"].pct_change()


# Drop initial NaNs from diff/pct_change
df = df.dropna(subset=["vix_change", "vix_pct"]).reset_index(drop=True)

# Target: predict next-day volume (already provided as target_volume)
# We'll model y_log = log(target_volume)
df["y_log"] = np.log(df["target_volume"])


In [8]:
# NO VIX
# -----------------------------
# 3) Select feature columns
#    (sequence model learns temporal patterns; no manual lags needed)
# -----------------------------
feature_cols = ["log_volume_t"]

feat_df = df[["date"] + feature_cols + ["y_log"]].copy()


In [7]:
# VIX
# -----------------------------
# 3) Select feature columns
#    (sequence model learns temporal patterns; no manual lags needed)
# -----------------------------
feature_cols = ["log_volume_t", "vix_close", "vix_change"]

feat_df = df[["date"] + feature_cols + ["y_log"]].copy()

In [9]:

# -----------------------------
# 4) Split back to train/test by date
#    (windows for earliest test targets will include history from train, which is OK)
# -----------------------------
train_df = feat_df[feat_df["date"] < split_date].copy()
test_df  = feat_df[feat_df["date"] >= split_date].copy()


In [10]:

# -----------------------------
# 5) Scale features and target using train only
# -----------------------------
X_scaler = StandardScaler()
y_scaler = StandardScaler()

X_train_2d = X_scaler.fit_transform(train_df[feature_cols].values)
X_test_2d  = X_scaler.transform(test_df[feature_cols].values)

y_train_1d = y_scaler.fit_transform(train_df[["y_log"]].values).ravel()
y_test_1d  = y_scaler.transform(test_df[["y_log"]].values).ravel()


In [11]:

# -----------------------------
# 6) Build sequences (rolling windows)
#    LOOKBACK = number of past days the LSTM sees to predict t+1
# -----------------------------
LOOKBACK = 30  # try 30 trading days; adjust as you like

def make_sequences(X_2d, y_1d, dates, lookback):
    X_list, y_list, d_list = [], [], []
    for i in range(lookback, len(X_2d)):
        X_list.append(X_2d[i-lookback:i, :])   # window [i-lookback, ..., i-1]
        y_list.append(y_1d[i])                 # target aligned at i (predict next-day log vol)
        d_list.append(dates.iloc[i])
    return np.array(X_list), np.array(y_list), pd.Series(d_list)

# Build sequences on the FULL timeline so test windows can include train history,
# then split sequences by their TARGET DATE (d_list) relative to split_date.
X_all_2d = X_scaler.transform(feat_df[feature_cols].values)  # transform entire series
y_all_1d = y_scaler.transform(feat_df[["y_log"]].values).ravel()
dates_all = feat_df["date"].reset_index(drop=True)

X_seq, y_seq, d_seq = make_sequences(X_all_2d, y_all_1d, dates_all, LOOKBACK)

train_mask = d_seq < split_date
test_mask  = d_seq >= split_date

X_train, y_train_seq = X_seq[train_mask], y_seq[train_mask]
X_test,  y_test_seq  = X_seq[test_mask],  y_seq[test_mask]
dates_test_seq = d_seq[test_mask].reset_index(drop=True)

print("Train sequences:", X_train.shape, " | Test sequences:", X_test.shape)


Train sequences: (4752, 30, 1)  | Test sequences: (501, 30, 1)


In [15]:
print(len(X_train))

4752


In [12]:

# -----------------------------
# 7) Define LSTM model
# -----------------------------
n_features = X_train.shape[-1]

model = models.Sequential([
    layers.Input(shape=(LOOKBACK, n_features)),
    layers.LSTM(64, return_sequences=True),
    layers.Dropout(0.2),
    layers.LSTM(32),
    layers.Dropout(0.2),
    layers.Dense(1)
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss="mse",
              metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse")])

es = callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
rlr = callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5)


In [None]:

# -----------------------------
# 8) Train/validation split (time-aware: last slice of train as val)
# -----------------------------
val_frac = 0.15
val_size = int(len(X_train) * val_frac)
X_tr, X_val = X_train[:-val_size], X_train[-val_size:]
y_tr, y_val = y_train_seq[:-val_size], y_train_seq[-val_size:]

hist = model.fit(
    X_tr, y_tr,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=64,
    callbacks=[es, rlr],
    verbose=1
)


In [None]:

# -----------------------------
# 9) Inference & metrics
#    (invert scaling to log-space, then to original volume)
# -----------------------------
# Predict (scaled y)
y_pred_scaled = model.predict(X_test).ravel()

# Back to log scale
y_pred_log = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
y_test_log = y_scaler.inverse_transform(y_test_seq.reshape(-1, 1)).ravel()

# Back to original scale
pred = np.exp(y_pred_log)
actual = np.exp(y_test_log)

# Metrics
mse_log = mean_squared_error(y_test_log, y_pred_log)
rmse_log = np.sqrt(mse_log)
mae_log = mean_absolute_error(y_test_log, y_pred_log)
r2_log = r2_score(y_test_log, y_pred_log)

mse_orig = mean_squared_error(actual, pred)
rmse_orig = np.sqrt(mse_orig)
mae_orig = mean_absolute_error(actual, pred)
mape = (np.abs(pred - actual) / actual).mean() * 100

print("\n=== LSTM Performance ===")
print(f"Log scale -> RMSE: {rmse_log:.4f}  MAE: {mae_log:.4f}  RÂ²: {r2_log:.3f}")
print(f"Original  -> RMSE: {rmse_orig:,.0f}  MAE: {mae_orig:,.0f}  MAPE: {mape:.2f}%")


In [None]:

# -----------------------------
# 10) Plots
# -----------------------------
# Loss curve
plt.figure(figsize=(6,4))
plt.plot(hist.history["loss"], label="train")
plt.plot(hist.history["val_loss"], label="val")
plt.xlabel("Epoch"); plt.ylabel("MSE (scaled)")
plt.title("Training History")
plt.legend(); plt.tight_layout(); plt.show()

# Actual vs Predicted (log scale)
plt.figure(figsize=(6,4))
plt.scatter(y_test_log, y_pred_log, alpha=0.6)
lo, hi = min(y_test_log.min(), y_pred_log.min()), max(y_test_log.max(), y_pred_log.max())
plt.plot([lo, hi], [lo, hi], 'r--', lw=2)
plt.xlabel("Actual log(Next-Day Volume)")
plt.ylabel("Predicted log(Next-Day Volume)")
plt.title("LSTM: Actual vs Predicted (log scale)")
plt.tight_layout(); plt.show()

# Time series (original scale)
plot_df = pd.DataFrame({
    "date": dates_test_seq,
    "actual_volume": actual,
    "predicted_volume": pred
})
plt.figure(figsize=(12,6))
plt.plot(plot_df["date"], plot_df["actual_volume"], label="Actual", alpha=0.85)
plt.plot(plot_df["date"], plot_df["predicted_volume"], label="Predicted", alpha=0.85)
plt.xlabel("Date"); plt.ylabel("Trading Volume")
plt.title("LSTM: Actual vs Predicted (original scale)")
plt.legend(); plt.tight_layout(); plt.show()
