In [3]:
import pandas as pd

# Build a next-day dollar-volume dataset from the daily volume and VIX files,
# then split it chronologically into train and test CSVs.

# Load data; the "date" column of each file is parsed as datetime.
vol = pd.read_csv("crsp_daily_volume_all.csv", parse_dates=["date"])
vix = pd.read_csv("vix_closing_prices_2004_to_today_FRED.csv", parse_dates=["date"])

# Keep only from 2004 onward
vol = vol[vol["date"] >= "2004-01-01"]

# Merge on date; the inner join keeps only dates present in both files.
df = pd.merge(vol, vix, on="date", how="inner")

# shift(-1) below is positional, so "next row" only means "next date" when the
# rows are in chronological order. Sort explicitly rather than relying on the
# row order of the input CSV (mergesort is stable, so ties keep their order).
df = df.sort_values("date", kind="mergesort").reset_index(drop=True)

# Target = total_dollar_volume of the following row, so each row pairs the
# current date's columns (including VIX) with the next available date's volume.
df["target_volume"] = df["total_dollar_volume"].shift(-1)

# Drop rows without a target: the final row always, plus any row whose
# following row has a missing total_dollar_volume.
df = df.dropna(subset=["target_volume"])

# Renumber rows 0..n-1 after the drop.
df = df.reset_index(drop=True)

df.to_csv("volume_vix_merged.csv", index=False)

# Chronological train/test split at 2023-01-01 (test = that date onward).
train = df[df["date"] < "2023-01-01"]
test = df[df["date"] >= "2023-01-01"]

print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Save for modeling
train.to_csv("train_volume_vix.csv", index=False)
test.to_csv("test_volume_vix.csv", index=False)


Train shape: (4783, 5)
Test shape: (501, 5)
