In [None]:
# =====================================================
# LSTM for 5-day accumulated volume using only lagged volume sequences
# =====================================================
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, optimizers


In [None]:

# -----------------------------
# 0) CONFIG
# -----------------------------
# Reproducibility
SEED = 42

# Sequence geometry
WINDOW = 30          # number of past trading days in each LSTM input

# Optimisation
LR = 1e-3
BATCH_SIZE = 64
EPOCHS = 30
PATIENCE = 10        # early stopping

# Seed both RNGs used in this notebook (NumPy first, then TensorFlow).
np.random.seed(SEED)
tf.random.set_seed(SEED)


In [None]:

# -----------------------------
# 1) LOAD & MERGE
# -----------------------------
# Read each CSV with a parsed "date" column and put it in chronological order.
train = (
    pd.read_csv("5_21_train.csv", parse_dates=["date"])
    .sort_values("date")
    .reset_index(drop=True)
)
test = (
    pd.read_csv("5_21_test.csv", parse_dates=["date"])
    .sort_values("date")
    .reset_index(drop=True)
)

# The earliest test date is the train/test boundary used when re-splitting later.
split_date = test["date"].min()

# Stack both parts into one date-ordered frame so preprocessing is applied once.
df = (
    pd.concat([train, test], ignore_index=True)
    .sort_values("date")
    .reset_index(drop=True)
)


In [None]:

# -----------------------------
# 2) KEEP ONLY WHAT WE NEED (no VIX yet)
# -----------------------------
# The only LSTM feature is log(sh_volume); the label is log(target_5d)
# (per the notebook's notes, target_5d is the precomputed volume accumulated
# over the next 5 trading days, predicted on log scale like the RF code).
#
# np.log maps 0 to -inf, and dropna() does NOT remove infinities, so a single
# zero-volume row would poison the scaler and the loss. Mask non-positive
# values to NaN before taking the log so the dropna() below removes them.
for raw_col, log_col in (("sh_volume", "log_vol"), ("target_5d", "y_log")):
    positive_only = df[raw_col].where(df[raw_col] > 0)
    df[log_col] = np.log(positive_only)

# Drop rows without a usable feature or label (missing or non-positive inputs,
# e.g. the very beginning or end of the series).
df = df.dropna(subset=["log_vol", "y_log"]).reset_index(drop=True)


In [None]:

# -----------------------------
# 3) TRAIN / TEST SPLIT BY DATE (no leakage)
# -----------------------------
# Discard everything before the start of the modelling period.
start_date = pd.Timestamp("2020-01-01")
on_or_after_start = df["date"] >= start_date
df = df.loc[on_or_after_start].reset_index(drop=True)

# Re-split at the stored boundary: strictly earlier rows train, the rest test.
before_split = df["date"] < split_date
on_or_after_split = df["date"] >= split_date
train_df = df.loc[before_split].copy()
test_df = df.loc[on_or_after_split].copy()


In [None]:

# -----------------------------
# 4) SCALE FEATURES USING TRAIN ONLY
# -----------------------------
# Standardisation statistics come from the training rows only...
scaler = StandardScaler().fit(train_df[["log_vol"]])

# ...and the same fitted transform is then applied to both partitions.
for part in (train_df, test_df):
    part["log_vol_scaled"] = scaler.transform(part[["log_vol"]])


In [None]:

# -----------------------------
# 5) BUILD SEQUENCES (WINDOW past days -> predict y at day t)
#    For each index t, X contains [log_vol_scaled at t-WINDOW+1 ... t]
#    and y contains y_log at t (target_5d is sum of t+1..t+5 -> safe)
# -----------------------------
def make_sequences(frame, window, feature_col="log_vol_scaled", target_col="y_log"):
    """Turn a date-ordered frame into sliding-window samples.

    For every row position t >= window - 1, one sample is emitted whose input
    is the `window` consecutive values of `feature_col` ending at t (inclusive)
    and whose label is `target_col` at t.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain `feature_col`, `target_col` and a "date" column. Rows are
        consumed in their existing order.
    window : int
        Number of timesteps per input sequence; must be >= 1.
    feature_col, target_col : str
        Column names of the single input feature and of the label.

    Returns
    -------
    X : numpy.ndarray, float32, shape (samples, window, 1)
    y : numpy.ndarray, float32, shape (samples,)
    idx : pandas.DatetimeIndex
        The "date" of the last timestep of each sample.

    When `frame` has fewer than `window` rows, empty arrays with the shapes
    above (samples == 0) are returned instead of failing on indexing.

    Raises
    ------
    ValueError
        If `window` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")

    values = frame[feature_col].to_numpy(dtype=np.float32)
    targets = frame[target_col].to_numpy(dtype=np.float32)

    first_end = window - 1                  # position of the first window's last row
    n_samples = len(frame) - first_end
    if n_samples <= 0:
        # Not enough history for even one window: keep the 3-D/1-D contract.
        empty_X = np.empty((0, window, 1), dtype=np.float32)
        empty_y = np.empty((0,), dtype=np.float32)
        return empty_X, empty_y, pd.to_datetime([])

    windows = [values[end - first_end : end + 1] for end in range(first_end, len(frame))]
    X = np.stack(windows)[:, :, None]       # (samples, window, features=1)
    y = targets[first_end:].copy()          # label aligned with each window's last row
    idx = pd.to_datetime(frame["date"].iloc[first_end:].tolist())  # for alignment/plotting
    return X, y, idx

X_train, y_train, idx_train = make_sequences(train_df, WINDOW)

# Test windows may reach back into the training period. Without that context the
# first WINDOW-1 test days would get no prediction at all. The prepended rows are
# only ever used as inputs: the first emitted sample ends at position WINDOW-1,
# which is always a test row, so every test label/date comes from test_df.
test_context = pd.concat([train_df.tail(WINDOW - 1), test_df], ignore_index=True)
X_test,  y_test,  idx_test  = make_sequences(test_context, WINDOW)

# The validation tail below needs at least one sample left over for training.
if len(X_train) < 2:
    raise ValueError(
        f"Need at least 2 training sequences to hold out a validation tail, got {len(X_train)}."
    )

# Optional: Make a small validation split from the tail of the training set (time-aware)
val_size = max( int(0.1 * len(X_train)), 1 )
X_tr, y_tr = X_train[:-val_size], y_train[:-val_size]
X_val, y_val = X_train[-val_size:], y_train[-val_size:]

print(f"Train sequences: {len(X_tr):,}  |  Val sequences: {len(X_val):,}  |  Test sequences: {len(X_test):,}")
print("Input shape:", X_tr.shape, " Target shape:", y_tr.shape)


In [None]:

# -----------------------------
# 6) DEFINE LSTM MODEL
# -----------------------------
def build_model(window):
    """Create and compile the stacked-LSTM regressor.

    The network takes inputs of shape (window, 1) and stacks LSTM(64, returning
    the full sequence) -> Dropout(0.2) -> LSTM(32) -> Dropout(0.2) -> Dense(1).
    It is compiled with Adam at the module-level learning rate `LR`, MSE loss,
    and a root-mean-squared-error metric reported under the name "rmse".

    Parameters
    ----------
    window : int
        Number of timesteps in each input sequence.

    Returns
    -------
    The compiled Keras Sequential model.
    """
    stack = [
        layers.Input(shape=(window, 1)),
        layers.LSTM(64, return_sequences=True),
        layers.Dropout(0.2),
        layers.LSTM(32),
        layers.Dropout(0.2),
        layers.Dense(1),  # single output: predicted log(target_5d)
    ]
    net = models.Sequential(stack)

    adam = optimizers.Adam(learning_rate=LR)
    rmse_metric = tf.keras.metrics.RootMeanSquaredError(name="rmse")
    net.compile(optimizer=adam, loss="mse", metrics=[rmse_metric])
    return net

# Instantiate the network for the configured window length and print its
# layer-by-layer summary.
model = build_model(WINDOW)
model.summary()


In [None]:

# -----------------------------
# 7) TRAIN
# -----------------------------
# Early stopping watches validation loss and restores the best weights seen.
es = callbacks.EarlyStopping(monitor="val_loss", patience=PATIENCE, restore_best_weights=True)

# Learning-rate schedule: halve the rate when validation loss plateaus for half
# the early-stopping patience (but at least 2 epochs), with a floor of 1e-5.
plateau_patience = max(PATIENCE // 2, 2)
lr_plateau = callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=plateau_patience,
    min_lr=1e-5,
    verbose=1,
)

# Fit on the head of the training sequences; the held-out tail is the validation set.
history = model.fit(
    x=X_tr,
    y=y_tr,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
    callbacks=[es, lr_plateau],
)


In [None]:

# -----------------------------
# 8) EVALUATION
#     a) Log scale (y_log)
#     b) Original scale (exp)
# -----------------------------
# Model output is on the log scale; exponentiate to get back to volumes.
y_pred_test_log = model.predict(X_test).ravel()
y_pred_test = np.exp(y_pred_test_log)
y_true_test = np.exp(y_test)

# a) Errors measured directly on log(target_5d)
mse_log = mean_squared_error(y_test, y_pred_test_log)
rmse_log = np.sqrt(mse_log)
mae_log = mean_absolute_error(y_test, y_pred_test_log)
r2_log = r2_score(y_test, y_pred_test_log)

print("\nLog scale metrics:")
for label, value in (
    ("RMSE (log)", rmse_log),
    ("MAE  (log)", mae_log),
    ("R²   (log)", r2_log),
):
    print(f"  {label}: {value:.4f}")

# b) Errors after mapping both series back to the original scale
mse = mean_squared_error(y_true_test, y_pred_test)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true_test, y_pred_test)
abs_pct_error = np.abs(y_pred_test - y_true_test) / y_true_test
mape = abs_pct_error.mean() * 100

print("\nOriginal scale metrics:")
print(f"  RMSE: {rmse:,.0f}")
print(f"  MAE : {mae:,.0f}")
print(f"  MAPE: {mape:.2f}%")


In [None]:

# -----------------------------
# 9) PLOTS
# -----------------------------
# Loss curves per epoch for the training and validation sets
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(history.history["loss"], label="Train")
ax.plot(history.history["val_loss"], label="Val")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss (MSE, log scale)")
ax.set_title("LSTM Training History")
ax.legend()
fig.tight_layout()
plt.show()

# Predicted vs actual on the log scale, with the y = x reference as a dashed line
lo = min(y_test.min(), y_pred_test_log.min())
hi = max(y_test.max(), y_pred_test_log.max())
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(y_test, y_pred_test_log, alpha=0.6)
ax.plot([lo, hi], [lo, hi], linestyle="--")
ax.set_xlabel("Actual log(target_5d)")
ax.set_ylabel("Predicted log(target_5d)")
ax.set_title("LSTM: Actual vs Predicted (log scale)")
fig.tight_layout()
plt.show()

# Actual and predicted 5-day volume over time (original scale)
ts_plot = pd.DataFrame(
    {
        "date": idx_test,
        "actual_5d_volume": y_true_test,
        "predicted_5d_volume": y_pred_test,
    }
).sort_values("date")

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(ts_plot["date"], ts_plot["actual_5d_volume"], label="Actual", alpha=0.9)
ax.plot(ts_plot["date"], ts_plot["predicted_5d_volume"], label="Predicted", alpha=0.9)
ax.set_xlabel("Date")
ax.set_ylabel("5-day accumulated trading volume")
ax.set_title("LSTM: Actual vs Predicted (original scale)")
ax.legend()
fig.tight_layout()
plt.show()


In [None]:

# -----------------------------
# 10) QUICK SANITY CHECK: window alignment
# -----------------------------
# Print one randomly chosen test sample so its alignment can be inspected by eye:
# the date on which the input window ends, the (scaled) feature value at the last
# timestep, and the predicted vs true log target for that same sample.
rand_idx = np.random.randint(0, len(X_test))
print("\nSanity check at one test sample:")
# idx_test is a DatetimeIndex (built with pd.to_datetime), which has no `.iloc`
# accessor; it is indexed positionally with plain brackets.
print("Sequence window ends at date:", idx_test[rand_idx])
print("Sequence shape:", X_test[rand_idx].shape, "— last timestep feature value:", X_test[rand_idx][-1, 0])
print("Pred log target:", y_pred_test_log[rand_idx], " | True log target:", y_test[rand_idx])
