# Data prep: deterministic train/val/test splits


Load `data/master_dataset.parquet`, sort by date, do light exploration, and make deterministic train/val/test splits for other notebooks.

Artifacts live under `notebooks/model_evaluation_final/artifacts/`:
- `data/` for splits, indices, config, feature list

Each model notebook should handle its own scaling or preprocessing.


In [13]:
from pathlib import Path
import json
import pandas as pd


# --- Paths -------------------------------------------------------------------
# Look for the master dataset in the working directory first, then in each of
# its ancestors; the first directory that contains it becomes REPO_ROOT. When
# none does, REPO_ROOT falls back to the working directory itself.
_DATASET_REL = "data/master_dataset.parquet"
_cwd = Path.cwd().resolve()
REPO_ROOT = next(
    (folder for folder in (_cwd, *_cwd.parents) if (folder / _DATASET_REL).exists()),
    _cwd,
)

MASTER_PATH = REPO_ROOT / _DATASET_REL
ARTIFACTS_DIR = REPO_ROOT / "notebooks/model_evaluation_final/artifacts"
DATA_DIR = ARTIFACTS_DIR / "data"
# DATA_DIR is a direct child of ARTIFACTS_DIR, so parents=True creates both.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# --- Split configuration -----------------------------------------------------
CONFIG = dict(
    test_days=30,  # the most recent n rows are held out as the test split
    val_fraction_pre_test=0.2,  # share of the remaining (pre-test) rows used for validation
    target_col="rv_5d",
    date_col="date",
    drop_cols=["symbol"],  # removed from the feature list when present
    scaler="standard",  # recorded in the config only; no cell shown here reads it
)

# Save the configuration next to the split artifacts.
(DATA_DIR / "split_config.json").write_text(json.dumps(CONFIG, indent=2))

MASTER_PATH


PosixPath('/Users/aayushrijal/Documents/GitHub/volatility_forecast/data/master_dataset.parquet')

In [14]:
# Load the master dataset; stop with a clear message if the file is missing.
assert MASTER_PATH.exists(), f"Missing master dataset at {MASTER_PATH}"
df = pd.read_parquet(MASTER_PATH)

#normalize column types
df[CONFIG["date_col"]] = pd.to_datetime(df[CONFIG["date_col"]])

# Sort chronologically with a *stable* algorithm. The default quicksort may
# reorder rows that share the same date, and the splits are later cut by row
# position, so tie order must be reproducible for the splits to be deterministic.
df = df.sort_values(CONFIG["date_col"], kind="mergesort").reset_index(drop=True)

df.head(), df.tail(), df.shape


(        date  spy_ret_1d  spy_ret_5d  spy_ret_10d  spy_ret_20d  spy_ret_60d  \
 0 2010-03-31   -0.003413    0.001369     0.003255     0.045110     0.035979   
 1 2010-04-01    0.006814    0.009810     0.010582     0.048901     0.040150   
 2 2010-04-05    0.008116    0.018527     0.023773     0.042825     0.047563   
 3 2010-04-06    0.002355    0.014554     0.020796     0.045005     0.045705   
 4 2010-04-07   -0.005729    0.008144     0.008059     0.037615     0.036654   
 
    spy_vol_5d  spy_vol_10d  spy_vol_20d  spy_vol_60d  ...      vix3m  \
 0    0.007427     0.013506     0.023040     0.068533  ...  19.920000   
 1    0.009947     0.015119     0.023836     0.068820  ...  19.900000   
 2    0.012824     0.016392     0.020799     0.069294  ...  19.350000   
 3    0.011400     0.015679     0.020931     0.069206  ...  18.840000   
 4    0.012740     0.015150     0.021637     0.069363  ...  19.190001   
 
    vix_term  rsi_spy_14  corr_spy_tlt_20d  corr_spy_hyg_20d  corr_spy_tlt_60d

In [15]:
date_col = CONFIG["date_col"]
target_col = CONFIG["target_col"]
# Only drop columns that actually exist in the frame.
drop_cols = [c for c in CONFIG["drop_cols"] if c in df.columns]
# Features are every column except the date, the target and the dropped columns.
feature_cols = [c for c in df.columns if c not in [date_col, target_col] + drop_cols]

#determines split sizes
# The test split is the last `test_days` ROWS of the date-sorted frame (a row
# count, not a calendar span); a non-positive value yields an empty test split.
test_days = CONFIG["test_days"]
pre_test = df.iloc[:-test_days] if test_days > 0 else df
test = df.iloc[-test_days:] if test_days > 0 else df.iloc[0:0]

# Validation is the tail of the pre-test rows (at least one row); train is the rest.
val_size = max(1, int(len(pre_test) * CONFIG["val_fraction_pre_test"]))
val = pre_test.iloc[-val_size:]
train = pre_test.iloc[:-val_size]

assert len(train) > 0, "Train set empty; adjust split parameters."
assert len(val) > 0, "Val set empty; adjust split parameters."
# Every row must land in exactly one split.
assert len(train) + len(val) + len(test) == len(df), "Splits do not partition the dataset."
# Splits must be in chronological order (this relies on df being sorted by date).
assert train[date_col].max() <= val[date_col].min(), "Train rows extend past the start of val."
if len(test):
    assert val[date_col].max() <= test[date_col].min(), "Val rows extend past the start of test."
# NOTE(review): if the target is computed over a multi-day window, rows next to
# a split boundary may share days across splits - TODO confirm whether a gap
# between splits is needed.

summary = {
    "train_rows": len(train),
    "val_rows": len(val),
    "test_rows": len(test),
    "train_start": train[date_col].min().date(),
    "train_end": train[date_col].max().date(),
    "val_start": val[date_col].min().date(),
    "val_end": val[date_col].max().date(),
    "test_start": test[date_col].min().date() if len(test) else None,
    "test_end": test[date_col].max().date() if len(test) else None,
    "n_features": len(feature_cols)
}
summary


{'train_rows': 3154,
 'val_rows': 788,
 'test_rows': 30,
 'train_start': datetime.date(2010, 3, 31),
 'train_end': datetime.date(2022, 10, 7),
 'val_start': datetime.date(2022, 10, 10),
 'val_end': datetime.date(2025, 11, 28),
 'test_start': datetime.date(2025, 12, 1),
 'test_end': datetime.date(2026, 1, 9),
 'n_features': 20}

In [16]:
# Feature / target frames per split. Targets stay single-column DataFrames so
# they can be written to parquet directly.
X_train = train[feature_cols].copy()
y_train = train[[target_col]].copy()
X_val = val[feature_cols].copy()
y_val = val[[target_col]].copy()
X_test = test[feature_cols].copy()
# Slicing an empty `test` still yields an empty frame that keeps the target's
# dtype, so y_test.parquet has the same schema as y_train / y_val. A freshly
# built pd.DataFrame(columns=[...]) would be object-typed instead.
y_test = test[[target_col]].copy()

# Persist, for every split: the dates of its rows (CSV) and the unscaled
# features / targets (parquet, without the positional index).
_splits = {
    "train": (train, X_train, y_train),
    "val": (val, X_val, y_val),
    "test": (test, X_test, y_test),
}
for split_name, (split_rows, X_split, y_split) in _splits.items():
    split_rows[[date_col]].to_csv(DATA_DIR / f"{split_name}_indices.csv", index=False)
    X_split.to_parquet(DATA_DIR / f"X_{split_name}.parquet", index=False)
    y_split.to_parquet(DATA_DIR / f"y_{split_name}.parquet", index=False)

# Record the feature column order alongside the data.
with open(DATA_DIR / "feature_columns.json", "w") as f:
    json.dump(feature_cols, f, indent=2)

len(X_train), len(X_val), len(X_test), len(feature_cols)


(3154, 788, 30, 20)

In [17]:
# Sanity check: summary statistics computed over the full, unsplit frame.
stats_target = df.loc[:, [target_col]].describe()
# Transposed so each feature is one row; only the first five features are kept.
stats_features = df.loc[:, feature_cols].describe().transpose().head(5)

stats_target, stats_features


(             rv_5d
 count  3972.000000
 mean      0.019514
 std       0.014571
 min       0.001851
 25%       0.010529
 50%       0.016283
 75%       0.024191
 max       0.189043,
               count      mean       std       min       25%       50%  \
 spy_ret_1d   3972.0  0.000520  0.010881 -0.115886 -0.003711  0.000697   
 spy_ret_5d   3972.0  0.002599  0.022634 -0.198077 -0.007006  0.004582   
 spy_ret_10d  3972.0  0.005181  0.030702 -0.265117 -0.007926  0.008302   
 spy_ret_20d  3972.0  0.010396  0.042515 -0.370872 -0.007888  0.016752   
 spy_ret_60d  3972.0  0.031478  0.064405 -0.359347  0.004195  0.040915   
 
                   75%       max  
 spy_ret_1d   0.005776  0.099863  
 spy_ret_5d   0.014785  0.160060  
 spy_ret_10d  0.022695  0.172254  
 spy_ret_20d  0.034642  0.207604  
 spy_ret_60d  0.071479  0.334965  )

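Next: other notebooks load these artifacts. This notebook saves unscaled data and does not fit a scaler; the `scaler` entry in `split_config.json` (`"standard"`) only names the scaler type. Apply it in the model notebooks, and only for linear models.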
