In [None]:
# ===== LSTM for 5-Day Accumulated Volume (Using Your CSVs) =====
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from keras import layers, models, callbacks

# -----------------------------
# 0) Reproducibility
# -----------------------------
SEED = 42
# Seed Python's `random`, NumPy and TensorFlow with a single call. Seeding
# only NumPy and TensorFlow leaves Python's `random` module unseeded, which
# Keras may use when it generates default seeds for initializers/dropout,
# so repeated runs could still differ.
tf.keras.utils.set_random_seed(SEED)

# -----------------------------
# 1) Load & merge data
# -----------------------------
# Read each split with "date" parsed as datetime, then put it in
# chronological order with a fresh 0..n-1 index.
train = (
    pd.read_csv("5_21_train.csv", parse_dates=["date"])
    .sort_values("date")
    .reset_index(drop=True)
)
test = (
    pd.read_csv("5_21_test.csv", parse_dates=["date"])
    .sort_values("date")
    .reset_index(drop=True)
)

# The earliest test date is the train/test boundary reused further down.
split_date = test["date"].min()

# One date-ordered frame containing the rows of both files.
df = pd.concat([train, test], ignore_index=True)
df = df.sort_values("date").reset_index(drop=True)

# -----------------------------
# 2) Features (log volume + VIX features for later use)
# -----------------------------
# np.log maps 0 to -inf and negative values to NaN. -inf is NOT removed by
# dropna and would flow into the scalers, so non-positive values are masked
# to NaN before taking the log; the dropna below then discards those rows.
df["log_vol"] = np.log(
    df["total_dollar_volume"].where(df["total_dollar_volume"] > 0)
)

# VIX features (computed but NOT included in feature_cols yet).
# shift(1) / rolling(5) leave NaN in the first rows of these columns.
df["vix_lag1"] = df["vix_close"].shift(1)
df["vix_change"] = df["vix_close"] - df["vix_lag1"]
df["vix_5d_ma"] = df["vix_close"].rolling(window=5).mean()

# Target (already in the CSV), modelled in log space; same non-positive
# masking as for the volume feature.
df["y_log"] = np.log(df["target_5d"].where(df["target_5d"] > 0))

# Drop rows whose log volume or log target is missing. The VIX columns are
# deliberately not checked here because they are not active features.
df = df.dropna(subset=["log_vol", "y_log"]).reset_index(drop=True)

# -----------------------------
# 3) Train/Test split by date
# -----------------------------
# Discard everything before the chosen start of the modelling window.
start_date = pd.Timestamp("2020-03-01")
df = df.loc[df["date"].ge(start_date)].reset_index(drop=True)

# Re-split on the boundary captured from the test file: rows strictly
# before split_date are training rows, the rest are test rows.
train_df = df.loc[df["date"].lt(split_date)].copy()
test_df = df.loc[df["date"].ge(split_date)].copy()

# -----------------------------
# 4) Active feature columns
# -----------------------------
feature_cols = ["log_vol"]

# Remove rows with a missing active feature from the full frame AND from both
# splits. train_df/test_df are independent copies, so filtering only `df`
# would leave them out of sync with it as soon as a feature containing NaNs
# (e.g. one of the VIX columns) is added to feature_cols.
df = df.dropna(subset=feature_cols).reset_index(drop=True)
train_df = train_df.dropna(subset=feature_cols)
test_df = test_df.dropna(subset=feature_cols)

# Scale features and target using TRAIN only: the scalers are fitted on the
# training rows and merely applied to the test rows.
X_scaler = StandardScaler()
y_scaler = StandardScaler()

X_train_2d = X_scaler.fit_transform(train_df[feature_cols])
X_test_2d = X_scaler.transform(test_df[feature_cols])

# The scaler works on 2-D input, hence the double brackets; ravel() flattens
# the single-column result back to 1-D.
y_train_1d = y_scaler.fit_transform(train_df[["y_log"]]).ravel()
y_test_1d = y_scaler.transform(test_df[["y_log"]]).ravel()

# -----------------------------
# 5) Build sequences
# -----------------------------
LOOKBACK = 30  # number of preceding rows in each input window


def make_sequences(X, y, dates, lookback):
    """Build sliding-window samples for sequence models.

    Sample ``k`` pairs the window ``X[i - lookback:i]`` (the ``lookback`` rows
    strictly before row ``i``) with target ``y[i]`` and date ``dates.iloc[i]``,
    where ``i = lookback + k``.

    Args:
        X: 2-D array-like of features, one row per time step.
        y: 1-D array-like of targets aligned with the rows of ``X``.
        dates: pandas Series of dates aligned with the rows of ``X``.
        lookback: Number of preceding rows in each window.

    Returns:
        Tuple ``(X_seq, y_seq, d_seq)``: an array of shape
        ``(n_samples, lookback, n_features)``, an array of shape
        ``(n_samples,)`` and a Series with a fresh 0..n-1 index. When
        ``len(X) <= lookback`` the outputs are empty but keep those shapes
        and the dtype of ``dates``.

    Raises:
        IndexError: If ``y`` or ``dates`` has fewer entries than ``X``.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # Positions of the target rows; empty when there is not enough history.
    idx = np.arange(lookback, len(X))

    windows = [X[i - lookback:i, :] for i in idx]
    if windows:
        X_seq = np.stack(windows)
    else:
        # Keep the 3-D layout so downstream shape lookups still work.
        X_seq = np.empty((0, lookback) + X.shape[1:], dtype=X.dtype)

    # Integer-array indexing copies the data and raises IndexError when
    # y/dates are shorter than X.
    y_seq = y[idx]
    d_seq = dates.iloc[idx].reset_index(drop=True)
    d_seq.name = None

    return X_seq, y_seq, d_seq

# Transform the FULL frame with the existing scalers so that the earliest
# test windows can reach back into training history.
dates_all = df["date"].reset_index(drop=True)
X_all = X_scaler.transform(df[feature_cols])
y_all = y_scaler.transform(df[["y_log"]]).ravel()

X_seq, y_seq, d_seq = make_sequences(X_all, y_all, dates_all, LOOKBACK)

# Each sequence goes to a split according to the date of its target row.
test_mask = d_seq >= split_date
train_mask = d_seq < split_date

X_train = X_seq[train_mask]
y_train_seq = y_seq[train_mask]

X_test = X_seq[test_mask]
y_test_seq = y_seq[test_mask]
dates_test_seq = d_seq.loc[test_mask].reset_index(drop=True)

print("Train sequences:", X_train.shape, " | Test sequences:", X_test.shape)

# -----------------------------
# 6) LSTM model
# -----------------------------
# Width of the last axis of the training windows = features per time step.
n_features = X_train.shape[-1]

# Two stacked LSTM layers, each followed by dropout, ending in a single-unit
# Dense output. The first LSTM returns its full sequence so the second LSTM
# receives one vector per time step.
model = models.Sequential()
model.add(layers.Input(shape=(LOOKBACK, n_features)))
model.add(layers.LSTM(64, return_sequences=True))
model.add(layers.Dropout(0.2))
model.add(layers.LSTM(32))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1))

model.compile(
    loss="mse",
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

# Both callbacks monitor validation loss: the learning rate is halved after
# 5 epochs without improvement, and training stops after 15 such epochs with
# the best weights restored.
rlr = callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=5,
)
es = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=15,
    restore_best_weights=True,
)

# -----------------------------
# 7) Train with time-aware split
# -----------------------------
# Hold out the last 15% of the training sequences for validation (the tail,
# not a random sample; this is the most recent data assuming X_train is in
# date order). At least one sample is kept for validation, and the split uses
# an explicit index: a `[:-val_size]` slice with val_size == 0 is `[:0]`,
# which would silently yield an EMPTY training set.
val_size = max(1, int(len(X_train) * 0.15))
split_idx = len(X_train) - val_size

X_tr, X_val = X_train[:split_idx], X_train[split_idx:]
y_tr, y_val = y_train_seq[:split_idx], y_train_seq[split_idx:]

hist = model.fit(
    X_tr, y_tr,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=64,
    callbacks=[es, rlr],
    verbose=1
)

# -----------------------------
# 8) Evaluation
# -----------------------------
# Predictions come back in standardized log space; flatten to 1-D.
y_pred_scaled = model.predict(X_test).ravel()

# Back-transform: undo the target standardization to return to log space
# (the scaler expects a 2-D column, hence reshape(-1, 1)).
y_pred_log = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
y_test_log = y_scaler.inverse_transform(y_test_seq.reshape(-1, 1)).ravel()

# Exponentiate to get back to the original scale of target_5d.
pred = np.exp(y_pred_log)
actual = np.exp(y_test_log)

print("\n=== 5-Day LSTM Performance ===")
print("RMSE (log):", np.sqrt(mean_squared_error(y_test_log, y_pred_log)))
print("MAE  (log):", mean_absolute_error(y_test_log, y_pred_log))
# Label fixed: the previous text was a mis-encoded "R²".
print("R²   (log):", r2_score(y_test_log, y_pred_log))

print("\nOriginal scale:")
print("RMSE:", np.sqrt(mean_squared_error(actual, pred)))
print("MAE :", mean_absolute_error(actual, pred))
# Mean absolute percentage error, in percent.
print("MAPE:", (np.abs(pred - actual) / actual).mean() * 100)

# -----------------------------
# 9) Plots
# -----------------------------
# Learning curves: training and validation loss per epoch.
fig_loss, ax_loss = plt.subplots(figsize=(6, 4))
ax_loss.plot(hist.history["loss"], label="train")
ax_loss.plot(hist.history["val_loss"], label="val")
ax_loss.set_title("Training Loss (5-day LSTM)")
ax_loss.legend()
fig_loss.tight_layout()
plt.show()

# Actual vs predicted in log space, with a dashed red y = x reference line
# spanning the combined range of both series.
fig_scatter, ax_scatter = plt.subplots(figsize=(6, 4))
ax_scatter.scatter(y_test_log, y_pred_log, alpha=0.6)
lo = min(y_test_log.min(), y_pred_log.min())
hi = max(y_test_log.max(), y_pred_log.max())
ax_scatter.plot([lo, hi], [lo, hi], "r--")
ax_scatter.set_xlabel("Actual log(target_5d)")
ax_scatter.set_ylabel("Predicted log(target_5d)")
ax_scatter.set_title("Actual vs Predicted (log scale)")
fig_scatter.tight_layout()
plt.show()

# Test-period values on the original scale, keyed by target date.
ts_df = pd.DataFrame({
    "date": dates_test_seq,
    "actual_5d": actual,
    "pred_5d": pred
})

# Actual and predicted series over the test dates.
fig_ts, ax_ts = plt.subplots(figsize=(12, 5))
ax_ts.plot(ts_df["date"], ts_df["actual_5d"], label="Actual", alpha=0.85)
ax_ts.plot(ts_df["date"], ts_df["pred_5d"], label="Predicted", alpha=0.85)
ax_ts.set_title("5-Day Accumulated Volume Prediction")
ax_ts.legend()
fig_ts.tight_layout()
plt.show()
