In [4]:
import pandas as pd

# --- Load data ---
vix = pd.read_csv("vix_closing_prices_2004_to_today_FRED.csv", parse_dates=["date"])
volume = pd.read_csv("crsp_daily_volume_all.csv", parse_dates=["date"])

# --- Merge VIX and Volume data ---
df = pd.merge(volume, vix, on="date", how="left").sort_values("date")

# --- Create accumulated future volume targets ---
df["target_5d"] = df["sh_volume"].shift(-1).rolling(window=5).sum()
df["target_21d"] = df["sh_volume"].shift(-1).rolling(window=21).sum()

# --- Drop rows where targets are NaN (end of dataset) ---
df = df.dropna()

# --- Train/test split based on date ---
train = df[df["date"] < "2023-01-01"]
test = df[df["date"] >= "2023-01-01"]

print(f"Train period: {train['date'].min().date()} to {train['date'].max().date()}")
print(f"Test period:  {test['date'].min().date()} to {test['date'].max().date()}")
print(f"Train size: {len(train):,} rows, Test size: {len(test):,} rows")

# --- Quick check ---
df.head(10)

train.to_csv("5_21_train.csv", index=False)
test.to_csv("5_21_test.csv", index=False)


Train period: 2004-01-02 to 2022-12-30
Test period:  2023-01-03 to 2024-12-30
Train size: 4,783 rows, Test size: 501 rows
