In [1]:
import pandas as pd

# --- Load the two input files (the "date" column is parsed as datetimes) ---
vix = pd.read_csv("vix_closing_prices_2004_to_today_FRED.csv", parse_dates=["date"])
volume = pd.read_csv("crsp_daily_volume_all.csv", parse_dates=["date"])

# --- Attach VIX to every volume row (left join), then order chronologically ---
df = volume.merge(vix, on="date", how="left").sort_values("date")


def _forward_sum(series, horizon):
    """Return, per row, the sum of the next `horizon` values of `series`.

    The current row is excluded: the window is t+1 ... t+horizon. Any row whose
    window contains a missing value, or runs past the end of the series, comes
    out as NaN because the shifted terms are added with plain `+`.
    """
    total = 0
    for step in range(1, horizon + 1):
        total = total + series.shift(-step)
    return total


# Forward-looking targets; the shifts are positional, so they rely on the
# chronological sort above.
dollar_volume = df["total_dollar_volume"]
df["target_5d"] = _forward_sum(dollar_volume, 5)    # t+1 ... t+5
df["target_21d"] = _forward_sum(dollar_volume, 21)  # t+1 ... t+21

# Drop rows lacking any required field. This happens only after the targets are
# built, so the forward windows were computed over the full set of rows.
required_cols = ["total_dollar_volume", "vix_close", "target_21d", "target_5d"]
df = df.dropna(subset=required_cols)
df.head()


Unnamed: 0,date,total_sh_volume,total_dollar_volume,vix_close,target_5d,target_21d
1004,2004-01-02,3213291000.0,71773100000.0,18.22,522756400000.0,2210464000000.0
1005,2004-01-05,4529997000.0,102617700000.0,17.49,515919300000.0,2216186000000.0
1006,2004-01-06,4308225000.0,96439810000.0,16.73,521722400000.0,2219877000000.0
1007,2004-01-07,4697461000.0,104191800000.0,15.5,511452500000.0,2210390000000.0
1008,2004-01-08,5293304000.0,112087200000.0,15.61,511457900000.0,2182912000000.0


In [2]:

# --- Train/test split based on date ---
# Single source of truth for the boundary: rows dated strictly before it go to
# train, rows dated on or after it go to test.
SPLIT_DATE = pd.Timestamp("2023-01-01")

train = df[df["date"] < SPLIT_DATE]
test = df[df["date"] >= SPLIT_DATE]

# NOTE(review): target_5d / target_21d are forward sums (t+1 ... t+5 / t+21), so
# the targets of the last training rows include volume dated on/after SPLIT_DATE.
# If strict separation between train and test is required, those trailing
# training rows may need to be dropped — confirm with the modelling setup.

print(f"Train period: {train['date'].min().date()} to {train['date'].max().date()}")
print(f"Test period:  {test['date'].min().date()} to {test['date'].max().date()}")
print(f"Train size: {len(train):,} rows, Test size: {len(test):,} rows")

# --- Quick check ---
# The two masks should partition df exactly; a shortfall means some rows have a
# missing date and fell into neither subset.
assert len(train) + len(test) == len(df), "split lost rows (missing dates?)"

# Persist both subsets without the pandas index column.
train.to_csv("5_21_train.csv", index=False)
test.to_csv("5_21_test.csv", index=False)


Train period: 2004-01-02 to 2022-12-30
Test period:  2023-01-03 to 2024-11-29
Train size: 4,783 rows, Test size: 481 rows


In [4]:
import pandas as pd
import numpy as np

# Assumes df is already loaded and sorted by date, and has the columns:
# date, total_dollar_volume, target_5d, target_21d

def check_target_sum_fixed(df, n=5, target_col="target_5d", volume_col="total_dollar_volume"):
    """Recompute forward-sum targets by position and report rows that disagree.

    For each row i that has n further rows after it in `df`, the values of
    `volume_col` in rows i+1 ... i+n are summed and compared with the value
    stored in `target_col` at row i.

    The windows are positional (`iloc`), so `df` must be in chronological order
    with no rows missing between a row and its window; otherwise rows will be
    reported as mismatches even if the stored targets are right. The last n
    rows are never checked because their windows extend beyond `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `volume_col` and `target_col`, plus a "date" column that
        is read when a mismatch is recorded.
    n : int
        Number of forward rows in each window.
    target_col : str
        Column holding the stored forward sums.
    volume_col : str
        Column whose forward values are summed. Defaults to
        "total_dollar_volume".

    Returns
    -------
    list of tuple
        One `(date, stored_target, recomputed_sum)` per disagreeing row; empty
        when every checked row matches.
    """
    # Look the columns up once instead of on every iteration.
    volumes = df[volume_col]
    targets = df[target_col]
    n_rows = len(df)

    mismatches = []
    # Only rows with a complete n-row forward window can be verified. Slicing
    # the range clamps the bound, so an n larger than the frame checks nothing.
    for i in range(n_rows)[: max(n_rows - n, 0)]:
        actual_sum = volumes.iloc[i + 1 : i + 1 + n].sum()
        expected = targets.iloc[i]
        # atol guards tiny absolute differences; np.isclose also applies its
        # default relative tolerance, scaled by the stored target.
        if not np.isclose(actual_sum, expected, atol=1e-2):
            mismatches.append((df["date"].iloc[i], expected, actual_sum))
    return mismatches

# Re-derive both horizons from the volume column and collect any disagreements.
mismatch_5d = check_target_sum_fixed(df, n=5, target_col="target_5d")
mismatch_21d = check_target_sum_fixed(df, n=21, target_col="target_21d")

# Headline counts first ...
print(f"5-day mismatches: {len(mismatch_5d)}")
print(f"21-day mismatches: {len(mismatch_21d)}")

# ... then up to two offending rows per horizon, only where something was found.
for label, found in (
    ("Example mismatch (5d):", mismatch_5d),
    ("Example mismatch (21d):", mismatch_21d),
):
    if found:
        print(label, found[:2])


5-day mismatches: 0
21-day mismatches: 0
