# Data prep: deterministic train/val/test splits


Load `data/master_dataset.parquet`, sort by date, do light exploration, and make deterministic train/val/test splits for other notebooks.

Artifacts live under `notebooks/model_evaluation_final/artifacts/`:
- `data/` for splits, indices, config, split summary, feature list, and training-set feature means

Each model notebook should handle its own scaling or preprocessing.


In [1]:
from pathlib import Path
import json
import pandas as pd


#paths
# Locate the repository root: walk up from the working directory until a directory
# containing data/master_dataset.parquet is found.
_start = Path.cwd().resolve()
_candidates = [_start] + list(_start.parents)
_repo = next(
    (p for p in _candidates if (p / "data/master_dataset.parquet").exists()),
    None,
)
if _repo is None:
    # Fail before touching the filesystem. Falling back to the working directory would
    # create an artifacts tree and write split_config.json wherever the kernel happened
    # to be started, and the problem would only surface in the loading cell.
    raise FileNotFoundError(
        f"Could not find data/master_dataset.parquet in {_start} or any parent directory"
    )
REPO_ROOT = _repo

MASTER_PATH = REPO_ROOT / "data/master_dataset.parquet"
ARTIFACTS_DIR = REPO_ROOT / "notebooks/model_evaluation_final/artifacts"
DATA_DIR = ARTIFACTS_DIR / "data"
# parents=True also creates ARTIFACTS_DIR, so a single call covers both directories.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Split configuration; written verbatim to split_config.json below.
CONFIG = {
    "test_days": 30,  #number of most recent ROWS held out as test (sliced by position, not by calendar day)
    "val_fraction_pre_test": 0.2,  #fraction of pre-test data for validation
    "target_col": "rv_5d",
    "date_col": "date",
    "drop_cols": ["symbol"],  #dropped if present
    "scaler": "standard"  #only recorded in the config file; no scaling is applied in this notebook
}

with open(DATA_DIR / "split_config.json", "w") as f:
    json.dump(CONFIG, f, indent=2)

MASTER_PATH


PosixPath('/Users/aayushrijal/Documents/GitHub/volatility_forecast/data/master_dataset.parquet')

In [9]:
# Load the master dataset and put it in chronological order.
assert MASTER_PATH.exists(), f"Missing master dataset at {MASTER_PATH}"
df = pd.read_parquet(MASTER_PATH)

#normalize column types
df[CONFIG["date_col"]] = pd.to_datetime(df[CONFIG["date_col"]])
# Use a stable sort: the default quicksort gives no guarantee about the relative order
# of rows that share the same date, and the splits below are taken by row position.
# With mergesort, tied rows always keep their order from the file.
df = df.sort_values(CONFIG["date_col"], kind="mergesort").reset_index(drop=True)

df.head(), df.tail(), df.shape


(        date  spy_ret_1d  spy_ret_5d  spy_ret_10d  spy_ret_20d  spy_ret_60d  \
 0 2010-01-04         NaN         NaN          NaN          NaN          NaN   
 1 2010-01-05    0.002643         NaN          NaN          NaN          NaN   
 2 2010-01-06    0.000704         NaN          NaN          NaN          NaN   
 3 2010-01-07    0.004212         NaN          NaN          NaN          NaN   
 4 2010-01-08    0.003322         NaN          NaN          NaN          NaN   
 
    spy_vol_5d  spy_vol_10d  spy_vol_20d  spy_vol_60d  ...      vix3m  \
 0         NaN          NaN          NaN          NaN  ...  22.770000   
 1         NaN          NaN          NaN          NaN  ...  22.389999   
 2         NaN          NaN          NaN          NaN  ...  21.799999   
 3         NaN          NaN          NaN          NaN  ...  21.600000   
 4         NaN          NaN          NaN          NaN  ...  21.000000   
 
    vix_term  rsi_spy_14  corr_spy_tlt_20d  corr_spy_hyg_20d  corr_spy_tlt_60d

In [10]:
#drop rows with a null target (labels not available yet)
# The target column comes from CONFIG, so the log messages name it dynamically instead
# of hard-coding "rv_5d"; with the current config the output is unchanged.
_target = CONFIG["target_col"]
print("\n" + "="*60)
print(f"STEP 1: Drop rows with null {_target}")
print("="*60)
original_rows = len(df)
# .copy() gives an independent frame so later cells never write through to df.
df_clean = df.dropna(subset=[_target]).copy()
rv_dropped = original_rows - len(df_clean)
print(f"Dropped {rv_dropped} rows with null {_target}: {len(df_clean)} rows remaining")

df_clean.shape


STEP 1: Drop rows with null rv_5d
Dropped 5 rows with null rv_5d: 4043 rows remaining


(4043, 22)

In [4]:
_rule = "="*60
print("\n" + _rule)
print("STEP 2: Create train/val/test splits")
print(_rule)

date_col, target_col = CONFIG["date_col"], CONFIG["target_col"]
drop_cols = [name for name in CONFIG["drop_cols"] if name in df_clean.columns]
# Every column that is not the date, the target, or an explicitly dropped column is a feature.
_non_features = {date_col, target_col, *drop_cols}
feature_cols = [name for name in df_clean.columns if name not in _non_features]

#split after dropping null rv_5d
# Splits are positional slices of the date-sorted frame: the last `test_days` rows are
# the test set, the tail of the remainder is validation, and the rest is train.
# NOTE(review): the target looks forward-looking (the most recent rows have null labels),
# so labels at the end of train/val may overlap the following split. Confirm against how
# the target is built and consider a purge gap.
test_days = CONFIG["test_days"]
if test_days > 0:
    pre_test = df_clean.iloc[:-test_days]
    test = df_clean.iloc[-test_days:]
else:
    pre_test = df_clean
    test = df_clean.iloc[0:0]

# At least one validation row is always requested.
val_size = max(1, int(len(pre_test) * CONFIG["val_fraction_pre_test"]))
train, val = pre_test.iloc[:-val_size], pre_test.iloc[-val_size:]

assert len(train) > 0, "Train set empty; adjust split parameters."
assert len(val) > 0, "Val set empty; adjust split parameters."
assert len(test) >= 0, "Test set negative size."

_n_train, _n_val, _n_test = len(train), len(val), len(test)
print("Split sizes (before dropping null features):")
print(f"  Train: {_n_train} rows")
print(f"  Val:   {_n_val} rows")
print(f"  Test:  {_n_test} rows")
print(f"  Total: {_n_train + _n_val + _n_test} rows")

# The test set may be empty (test_days <= 0); its date range is then reported as None.
if _n_test:
    _test_start, _test_end = test[date_col].min().date(), test[date_col].max().date()
else:
    _test_start, _test_end = None, None

summary = {
    "train_rows_before_null_drop": _n_train,
    "val_rows_before_null_drop": _n_val,
    "test_rows": _n_test,
    "train_start": train[date_col].min().date(),
    "train_end": train[date_col].max().date(),
    "val_start": val[date_col].min().date(),
    "val_end": val[date_col].max().date(),
    "test_start": _test_start,
    "test_end": _test_end,
    "n_features": len(feature_cols)
}
summary


STEP 2: Create train/val/test splits
Split sizes (before dropping null features):
  Train: 3211 rows
  Val:   802 rows
  Test:  30 rows
  Total: 4043 rows


{'train_rows_before_null_drop': 3211,
 'val_rows_before_null_drop': 802,
 'test_rows': 30,
 'train_start': datetime.date(2010, 1, 4),
 'train_end': datetime.date(2022, 10, 4),
 'val_start': datetime.date(2022, 10, 5),
 'val_end': datetime.date(2025, 12, 15),
 'test_start': datetime.date(2025, 12, 16),
 'test_end': datetime.date(2026, 1, 29),
 'n_features': 20}

In [5]:
#drop rows with null features from train and val (keep test realistic)
print("\n" + "="*60)
print("STEP 3: Drop rows with null features from train/val")
print("="*60)

# This cell rebinds `train` and `val`, so len(train)/len(val) equal the pre-drop sizes
# only on the first run. The pre-drop sizes are therefore read from `summary`, which the
# split cell records. Re-running this cell then still reports the true number of dropped
# rows instead of 0 (these counts are persisted to split_summary.json later on).
train_before_drop = summary["train_rows_before_null_drop"]
train = train.dropna(subset=feature_cols).copy()
train_feature_dropped = train_before_drop - len(train)
print(f"Train: dropped {train_feature_dropped} rows with null features: {len(train)} rows remaining")

val_before_drop = summary["val_rows_before_null_drop"]
val = val.dropna(subset=feature_cols).copy()
val_feature_dropped = val_before_drop - len(val)
print(f"Val:   dropped {val_feature_dropped} rows with null features: {len(val)} rows remaining")

# The test split is deliberately left untouched, nulls included.
print(f"Test:  keeping all {len(test)} rows (realistic nulls for evaluation)")

print(f"\nFinal split sizes:")
print(f"  Train: {len(train)} rows (clean)")
print(f"  Val:   {len(val)} rows (clean)")
print(f"  Test:  {len(test)} rows (may have nulls)")

len(train), len(val), len(test)


STEP 3: Drop rows with null features from train/val
Train: dropped 60 rows with null features: 3151 rows remaining
Val:   dropped 0 rows with null features: 802 rows remaining
Test:  keeping all 30 rows (realistic nulls for evaluation)

Final split sizes:
  Train: 3151 rows (clean)
  Val:   802 rows (clean)
  Test:  30 rows (may have nulls)


(3151, 802, 30)

In [6]:
print("\n" + "="*60)
print("STEP 4: Compute feature means and save artifacts")
print("="*60)

# Feature matrices and single-column target frames, one pair per split.
X_train, y_train = train[feature_cols].copy(), train[[target_col]].copy()
X_val, y_val = val[feature_cols].copy(), val[[target_col]].copy()
X_test = test[feature_cols].copy()
if len(test):
    y_test = test[[target_col]].copy()
else:
    # No test rows: fall back to an empty frame that still carries the target column.
    y_test = pd.DataFrame(columns=[target_col])

#compute feature means from training data only (for prediction imputation)
feature_means = X_train.mean().to_dict()
print(f"Computed means for {len(feature_means)} features from training data")

#persist indices(by date)
for _name, _split in (
    ("train_indices.csv", train),
    ("val_indices.csv", val),
    ("test_indices.csv", test),
):
    _split[[date_col]].to_csv(DATA_DIR / _name, index=False)

#persist unscaled features/targets
for _name, _frame in (
    ("X_train.parquet", X_train),
    ("X_val.parquet", X_val),
    ("X_test.parquet", X_test),
    ("y_train.parquet", y_train),
    ("y_val.parquet", y_val),
    ("y_test.parquet", y_test),
):
    _frame.to_parquet(DATA_DIR / _name, index=False)

#persist feature means (for prediction imputation) and the ordered feature list
for _name, _payload in (
    ("feature_means.json", feature_means),
    ("feature_columns.json", feature_cols),
):
    with open(DATA_DIR / _name, "w") as f:
        json.dump(_payload, f, indent=2)

print(f"\nSaved artifacts to {DATA_DIR}:")
for _line in (
    "  - Split indices (train/val/test_indices.csv)",
    "  - Features and targets (X/y parquet files)",
    f"  - feature_means.json ({len(feature_means)} features)",
    f"  - feature_columns.json ({len(feature_cols)} features)",
):
    print(_line)

len(X_train), len(X_val), len(X_test), len(feature_cols)


STEP 4: Compute feature means and save artifacts
Computed means for 20 features from training data

Saved artifacts to /Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/data:
  - Split indices (train/val/test_indices.csv)
  - Features and targets (X/y parquet files)
  - feature_means.json (20 features)
  - feature_columns.json (20 features)


(3151, 802, 30, 20)

In [7]:
#updated summary with final counts
# Date columns of each split; train/val ranges are recomputed here because dropping
# null-feature rows can move the first/last date.
_train_dates, _val_dates, _test_dates = train[date_col], val[date_col], test[date_col]
_has_test = len(test) > 0

summary_final = {
    "original_rows": original_rows,
    "dropped_null_rv_5d": rv_dropped,
    "after_rv_5d_drop": len(df_clean),
    "train_rows_final": len(train),
    "train_feature_dropped": train_feature_dropped,
    "val_rows_final": len(val),
    "val_feature_dropped": val_feature_dropped,
    "test_rows": len(test),
    "train_start": _train_dates.min().date(),
    "train_end": _train_dates.max().date(),
    "val_start": _val_dates.min().date(),
    "val_end": _val_dates.max().date(),
    "test_start": _test_dates.min().date() if _has_test else None,
    "test_end": _test_dates.max().date() if _has_test else None,
    "n_features": len(feature_cols)
}

# default=str serialises the date objects, which json cannot encode natively.
with open(DATA_DIR / "split_summary.json", "w") as f:
    json.dump(summary_final, f, indent=2, default=str)

_report_lines = [
    "\n" + "="*60,
    "FINAL SUMMARY",
    "="*60,
    "Original rows:        {original_rows}",
    "Dropped null rv_5d:   {dropped_null_rv_5d}",
    "After rv_5d drop:     {after_rv_5d_drop}",
    "\nTrain rows (final):   {train_rows_final} (dropped {train_feature_dropped} with null features)",
    "Val rows (final):     {val_rows_final} (dropped {val_feature_dropped} with null features)",
    "Test rows:            {test_rows} (kept realistic nulls)",
    "\nFeatures:             {n_features}",
]
for _line in _report_lines:
    # Each template pulls its values from summary_final by key.
    print(_line.format_map(summary_final))

summary_final


FINAL SUMMARY
Original rows:        4048
Dropped null rv_5d:   5
After rv_5d drop:     4043

Train rows (final):   3151 (dropped 60 with null features)
Val rows (final):     802 (dropped 0 with null features)
Test rows:            30 (kept realistic nulls)

Features:             20


{'original_rows': 4048,
 'dropped_null_rv_5d': 5,
 'after_rv_5d_drop': 4043,
 'train_rows_final': 3151,
 'train_feature_dropped': 60,
 'val_rows_final': 802,
 'val_feature_dropped': 0,
 'test_rows': 30,
 'train_start': datetime.date(2010, 3, 31),
 'train_end': datetime.date(2022, 10, 4),
 'val_start': datetime.date(2022, 10, 5),
 'val_end': datetime.date(2025, 12, 15),
 'test_start': datetime.date(2025, 12, 16),
 'test_end': datetime.date(2026, 1, 29),
 'n_features': 20}

In [8]:
#quick stats for sanity
# Descriptive statistics over df_clean, i.e. all rows that have a target (train, val and
# test together), so these are for eyeballing only.
stats_target = df_clean[[target_col]].describe()
# Transposed so each feature is a row; only the first 10 features are shown.
stats_features = df_clean[feature_cols].describe().T.head(10)

print("\nTarget variable stats:")
print(stats_target)
print("\nSample feature stats (first 10):")
print(stats_features)
# Deliberately no trailing expression: both tables are already printed above, and
# echoing them as a tuple would render the same output a second time.


Target variable stats:
             rv_5d
count  4043.000000
mean      0.019483
std       0.014487
min       0.001851
25%       0.010543
50%       0.016279
75%       0.024170
max       0.189043

Sample feature stats (first 10):
               count      mean       std       min       25%       50%  \
spy_ret_1d    4042.0  0.000519  0.010847 -0.115886 -0.003709  0.000706   
spy_ret_5d    4038.0  0.002593  0.022585 -0.198077 -0.006985  0.004582   
spy_ret_10d   4033.0  0.005176  0.030680 -0.265117 -0.007947  0.008335   
spy_ret_20d   4023.0  0.010458  0.042544 -0.370872 -0.007856  0.016789   
spy_ret_60d   3983.0  0.031455  0.064320 -0.359347  0.004383  0.040782   
spy_vol_5d    4038.0  0.019494  0.014492  0.001851  0.010550  0.016283   
spy_vol_10d   4033.0  0.028456  0.019267  0.003834  0.017027  0.023628   
spy_vol_20d   4023.0  0.041302  0.025641  0.009232  0.026084  0.034892   
spy_vol_60d   3983.0  0.074675  0.039231  0.024900  0.052995  0.062331   
drawdown_60d  3984.0  0.027855 

(             rv_5d
 count  4043.000000
 mean      0.019483
 std       0.014487
 min       0.001851
 25%       0.010543
 50%       0.016279
 75%       0.024170
 max       0.189043,
                count      mean       std       min       25%       50%  \
 spy_ret_1d    4042.0  0.000519  0.010847 -0.115886 -0.003709  0.000706   
 spy_ret_5d    4038.0  0.002593  0.022585 -0.198077 -0.006985  0.004582   
 spy_ret_10d   4033.0  0.005176  0.030680 -0.265117 -0.007947  0.008335   
 spy_ret_20d   4023.0  0.010458  0.042544 -0.370872 -0.007856  0.016789   
 spy_ret_60d   3983.0  0.031455  0.064320 -0.359347  0.004383  0.040782   
 spy_vol_5d    4038.0  0.019494  0.014492  0.001851  0.010550  0.016283   
 spy_vol_10d   4033.0  0.028456  0.019267  0.003834  0.017027  0.023628   
 spy_vol_20d   4023.0  0.041302  0.025641  0.009232  0.026084  0.034892   
 spy_vol_60d   3983.0  0.074675  0.039231  0.024900  0.052995  0.062331   
 drawdown_60d  3984.0  0.027855  0.040243  0.000000  0.001600  0.0111

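Next: other notebooks load these artifacts. No scaler is fitted or saved here; `split_config.json` only names the scaler type (`"standard"`). Apply scaling inside the model notebooks, and only for linear models.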
