# Feature Validation - Level 1

This notebook validates the Level 1 features generated by the features service.

**Objectives:**
1. Load and inspect feature data
2. Check for missing values and their patterns
3. Visualize key features over time
4. Verify no lookahead bias
5. Check feature distributions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Project root detection: starting at the current working directory, walk up
# the directory tree and pick the nearest directory named 'volatility_forecast'.
PROJECT_DIR_NAME = 'volatility_forecast'
start_dir = Path.cwd()
project_root = next(
    (candidate for candidate in (start_dir, *start_dir.parents)
     if candidate.name == PROJECT_DIR_NAME),
    None,
)

if project_root is None:
    # Without this guard the search silently ended at the filesystem root,
    # and every path built from project_root pointed at a non-existent
    # location. Fall back to the working directory and say so explicitly.
    project_root = start_dir
    print(
        f"WARNING: no '{PROJECT_DIR_NAME}' directory found at or above {start_dir}; "
        "falling back to the current working directory"
    )

print(f"Project root: {project_root}")

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's "whitegrid" style for all subsequent plots.
sns.set_style("whitegrid")

## 1. Load Features Data

In [None]:
# Load every feature partition (data/features.L1/date=*/features.parquet)
# and combine them into one frame ordered by date.
features_path = project_root / "data/features.L1"
feature_files = sorted(features_path.glob("date=*/features.parquet"))

print(f"Found {len(feature_files)} feature partitions")

# Fail early with an actionable message: pd.concat on an empty list raises an
# opaque "No objects to concatenate" ValueError that hides the real cause.
if not feature_files:
    raise FileNotFoundError(
        f"No feature partitions matching 'date=*/features.parquet' under {features_path}. "
        "Check the detected project root and that the features have been generated."
    )

features_df = pd.concat([pd.read_parquet(f) for f in feature_files], ignore_index=True)
features_df['date'] = pd.to_datetime(features_df['date'])
# Sort chronologically and reset the index so the row label equals the row position.
features_df = features_df.sort_values('date').reset_index(drop=True)

print(f"\nShape: {features_df.shape}")
print(f"Date range: {features_df['date'].min().date()} to {features_df['date'].max().date()}")
print(f"\nColumns ({len(features_df.columns)}):")
print(features_df.columns.tolist())

In [18]:
# Preview the first 10 rows (the earliest dates, since the frame is sorted by date).
features_df.head(10)

Unnamed: 0,date,spy_ret_1d,spy_ret_5d,spy_ret_10d,spy_ret_20d,spy_ret_60d,spy_vol_5d,spy_vol_10d,spy_vol_20d,spy_vol_60d,...,vix,vix3m,vix_term,rsi_spy_14,corr_spy_tlt_20d,corr_spy_hyg_20d,corr_spy_tlt_60d,corr_spy_hyg_60d,hyg_tlt_spread,rv_vix_spread_20d
0,2015-11-18,,,,,,,,,,...,16.85,19.219999,1.140653,,,,,,,
1,2015-11-19,-0.000863,,,,,,,,,...,16.99,19.639999,1.155974,,,,,,-0.015636,
2,2015-11-20,0.003638,,,,,,,,,...,15.47,18.940001,1.224305,,,,,,0.002822,
3,2015-11-23,-0.001147,,,,,,,,,...,15.62,18.629999,1.192702,,,,,,,
4,2015-11-24,0.001338,,,,,,,,,...,15.93,18.709999,1.174513,,,,,,0.001411,
5,2015-11-25,-0.000143,0.002823,,,,,,,,...,15.19,18.190001,1.197498,,,,,,-0.001232,
6,2015-11-27,0.001146,0.004831,,,,,,,,...,15.12,17.35,1.147487,,,,,,0.002558,
7,2015-11-30,-0.00416,-0.002967,,,,,,,,...,16.129999,18.49,1.146311,,,,,,-0.004569,
8,2015-12-01,0.00949,0.007671,,,,0.010549,,,,...,14.67,17.58,1.198364,,,,,,-0.006393,
9,2015-12-02,-0.010257,-0.003925,,,,0.014616,,,,...,15.91,18.33,1.152106,,,,,,-0.002368,


In [None]:
# Preview the last 10 rows (the most recent dates, since the frame is sorted by date).
features_df.tail(10)

## 2. Missing Values Analysis

In [None]:
# Per-column NaN counts and their share of all rows (percent, 2 decimals),
# ordered from the most to the least affected column.
n_rows = len(features_df)
nan_counts = features_df.isnull().sum()

missing_df = pd.DataFrame({
    'Missing Count': nan_counts,
    'Missing %': (nan_counts / n_rows * 100).round(2),
}).sort_values('Missing Count', ascending=False)

# Only report columns that actually contain missing values.
print("Missing Values Summary:")
print(missing_df.loc[missing_df['Missing Count'].gt(0)])

In [None]:
# Heatmap of NaN positions in the first 100 rows: one heatmap row per
# feature, one heatmap column per data row.
first_rows_nan = features_df.iloc[:100].isna()

fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(first_rows_nan.transpose(), cmap='RdYlGn_r', cbar=False, ax=ax)
ax.set(
    xlabel='Row Index (First 100 days)',
    ylabel='Features',
    title='Missing Value Pattern (First 100 Days; expected due to rolling windows)',
)
plt.tight_layout()
plt.show()

## 3. Feature Distributions

In [None]:
# Histograms for a representative subset of features:
#   spy_ret_1d / spy_ret_20d : log returns
#   spy_vol_5d / spy_vol_20d : realized volatility
#   vix                      : implied volatility index
#   vix_term                 : VIX3M / VIX ratio (volatility term structure)
#   rsi_spy_14               : relative strength index (momentum indicator)
#   drawdown_60d             : decline from the recent high
features_to_plot = [
    'spy_ret_1d', 'spy_ret_20d',
    'spy_vol_5d', 'spy_vol_20d',
    'vix', 'vix_term',
    'rsi_spy_14', 'drawdown_60d',
]

fig, axes = plt.subplots(4, 2, figsize=(14, 12))

# One panel per feature, filled row by row; NaNs are dropped before binning.
for panel, col in zip(axes.flatten(), features_to_plot):
    values = features_df[col].dropna()
    avg = values.mean()
    panel.hist(values, bins=50, edgecolor='black', alpha=0.7)
    panel.set_xlabel(col)
    panel.set_ylabel('Frequency')
    panel.set_title(f'{col} Distribution')
    # Dashed red line marks the sample mean.
    panel.axvline(avg, color='red', linestyle='--', linewidth=2, label=f'Mean: {avg:.4f}')
    panel.legend()
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Time Series of Key Features

In [None]:
# SPY log returns over 1-, 5- and 20-day horizons, one stacked panel each.
# Longer horizons smooth out the daily noise (1d: daily, 5d: weekly,
# 20d: monthly moves).
return_panels = [
    ('spy_ret_1d', '1-Day Returns'),
    ('spy_ret_5d', '5-Day Returns'),
    ('spy_ret_20d', '20-Day Returns'),
]

fig, axes = plt.subplots(3, 1, figsize=(14, 10))
dates = features_df['date']

for panel, (col, title) in zip(axes, return_panels):
    panel.plot(dates, features_df[col], linewidth=0.8, alpha=0.7)
    panel.set_ylabel('Log Return')
    panel.set_title(f'SPY {title}')
    # Zero reference line.
    panel.axhline(0, color='red', linestyle='--', linewidth=1)
    panel.grid(alpha=0.3)

# Only the bottom panel carries the x-axis label.
axes[-1].set_xlabel('Date')
plt.tight_layout()
plt.show()

In [None]:
# Overlay SPY realized volatility for the 5/10/20/60-day windows on one axis.
fig, ax = plt.subplots(figsize=(14, 6))
dates = features_df['date']

for col in ('spy_vol_5d', 'spy_vol_10d', 'spy_vol_20d', 'spy_vol_60d'):
    ax.plot(dates, features_df[col], label=col, linewidth=1.5, alpha=0.7)

ax.set(
    xlabel='Date',
    ylabel='Realized Volatility',
    title='SPY Realized Volatility (Multiple Windows)',
)
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# VIX term structure.
#   VIX      : spot (near-term) implied volatility index
#   VIX3M    : 3-month implied volatility index
#   vix_term : VIX3M / VIX
#     = 1.0 -> flat term structure
#     > 1.0 -> contango (VIX3M above VIX; the normal regime)
#     < 1.0 -> backwardation (VIX above VIX3M; near-term stress)
fig, (ax_level, ax_term) = plt.subplots(2, 1, figsize=(14, 8))
dates = features_df['date']
term = features_df['vix_term']

# Top panel: the two index levels.
ax_level.plot(dates, features_df['vix'], label='VIX', linewidth=1)
ax_level.plot(dates, features_df['vix3m'], label='VIX3M', linewidth=1, alpha=0.7)
ax_level.set(ylabel='VIX Level', title='VIX and VIX3M')
ax_level.legend()
ax_level.grid(alpha=0.3)

# Bottom panel: the ratio, shaded by regime relative to the flat line at 1.0.
ax_term.plot(dates, term, linewidth=1, color='purple')
ax_term.axhline(1.0, color='red', linestyle='--', linewidth=2, label='Flat (=1.0)')
ax_term.fill_between(dates, 1.0, term, where=(term > 1.0),
                     alpha=0.3, color='green', label='Contango')
ax_term.fill_between(dates, 1.0, term, where=(term < 1.0),
                     alpha=0.3, color='red', label='Backwardation')
ax_term.set(xlabel='Date', ylabel='VIX3M / VIX', title='VIX Term Structure')
ax_term.legend()
ax_term.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# SPY 14-day RSI (Relative Strength Index), a momentum indicator.
# Readings above 70 are treated as overbought (strong buying pressure,
# possible reversal down); readings below 30 as oversold (strong selling
# pressure, possible reversal up).
OVERBOUGHT, OVERSOLD = 70, 30

fig, ax = plt.subplots(figsize=(14, 6))
dates = features_df['date']

ax.plot(dates, features_df['rsi_spy_14'], linewidth=1)
ax.axhline(OVERBOUGHT, color='red', linestyle='--', linewidth=1, label='Overbought (70)')
ax.axhline(OVERSOLD, color='green', linestyle='--', linewidth=1, label='Oversold (30)')
# Shade the overbought (70-100) and oversold (0-30) bands.
ax.fill_between(dates, OVERBOUGHT, 100, alpha=0.2, color='red')
ax.fill_between(dates, 0, OVERSOLD, alpha=0.2, color='green')
ax.set(xlabel='Date', ylabel='RSI', title='SPY 14-Day RSI')
ax.legend()
ax.grid(alpha=0.3)
# Pin the y-axis to the 0-100 range.
ax.set_ylim(0, 100)
plt.tight_layout()
plt.show()

In [None]:
# 60-day drawdown: how far the price has fallen from its recent peak.
# Larger drawdowns mean bigger losses and higher risk.
# NOTE(review): the series is negated AND the y-axis is inverted, while the
# summary below uses max(); confirm the sign convention of drawdown_60d so
# that the plot orientation and the "Max Drawdown" figure agree.
fig, ax = plt.subplots(figsize=(14, 6))
dates = features_df['date']
drawdown = features_df['drawdown_60d']
plotted_pct = -drawdown * 100

ax.fill_between(dates, 0, plotted_pct, alpha=0.5, color='red')
ax.plot(dates, plotted_pct, linewidth=1, color='darkred')
ax.set(xlabel='Date', ylabel='Drawdown (%)', title='SPY 60-Day Drawdown')
ax.grid(alpha=0.3)
ax.invert_yaxis()
plt.tight_layout()
plt.show()

# Summary statistics in percent.
print(f"Max Drawdown: {drawdown.max() * 100:.2f}%")
print(f"Mean Drawdown: {drawdown.mean() * 100:.2f}%")

## 5. Cross-Asset Correlations

In [None]:
# Cross-asset correlations and spreads on a 2x2 grid.
#   SPY-TLT correlation : stocks vs bonds (negative = flight to safety)
#   SPY-HYG correlation : stocks vs high-yield bonds (positive = risk assets move together)
#   HYG-TLT spread      : high-yield vs treasury daily return spread (credit risk premium)
#   RV-VIX spread       : realized (20d) minus implied volatility (negative = VIX overpricing risk)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
dates = features_df['date']

# Top row: 20-day and 60-day rolling correlations against SPY.
corr_panels = [
    (axes[0, 0], 'corr_spy_tlt_20d', 'corr_spy_tlt_60d', 'SPY-TLT Correlation'),
    (axes[0, 1], 'corr_spy_hyg_20d', 'corr_spy_hyg_60d', 'SPY-HYG Correlation'),
]
for panel, col_20d, col_60d, title in corr_panels:
    panel.plot(dates, features_df[col_20d], label='20-day', linewidth=1)
    panel.plot(dates, features_df[col_60d], label='60-day', linewidth=1, alpha=0.7)
    panel.axhline(0, color='red', linestyle='--', linewidth=1)
    panel.set_ylabel('Correlation')
    panel.set_title(title)
    panel.legend()
    panel.grid(alpha=0.3)

# Bottom row: spread series, each with a zero reference line.
spread_panels = [
    (axes[1, 0], 'hyg_tlt_spread', 'Return Spread', 'HYG-TLT Daily Return Spread'),
    (axes[1, 1], 'rv_vix_spread_20d', 'Volatility Spread', 'Realized Vol (20d) - VIX Spread'),
]
for panel, col, y_label, title in spread_panels:
    panel.plot(dates, features_df[col], linewidth=0.8, alpha=0.7)
    panel.axhline(0, color='red', linestyle='--', linewidth=1)
    panel.set_xlabel('Date')
    panel.set_ylabel(y_label)
    panel.set_title(title)
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Feature Statistics Summary

In [None]:
# Descriptive statistics (one row per feature) extended with NaN counts and
# the NaN share of all rows in percent.
null_counts = features_df.isnull().sum()
summary = features_df.describe().T.assign(missing_count=null_counts)
summary['missing_pct'] = (summary['missing_count'] / len(features_df) * 100).round(2)

report_cols = ['count', 'mean', 'std', 'min', 'max', 'missing_count', 'missing_pct']
print("Feature Statistics Summary:")
print(summary.loc[:, report_cols])

## 7. Lookahead Bias Check

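Sanity-check that features only use past data (no future information). A rolling feature cannot be computed until enough history exists, so each one should start with roughly one window's worth of NaNs. This is a necessary condition for the absence of lookahead, not a proof of it.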

In [None]:
# Check that the leading-NaN (warm-up) pattern of each rolling feature is
# consistent with its window length. Rows are in date order, so a feature
# with a window of w rows cannot have a complete window before row w-1;
# fewer leading NaNs than that means values exist before enough history is
# available and the feature needs a closer look.
# NOTE(review): assumes the features service only emits values for full
# windows (no min_periods shortcut) -- confirm against the service.
# This is a necessary condition only; it cannot prove the absence of lookahead.
print("Expected NaN patterns for rolling features:\n")

# Feature name -> rolling window length in rows.
windows = {
    'spy_ret_5d': 5,
    'spy_ret_10d': 10,
    'spy_ret_20d': 20,
    'spy_ret_60d': 60,
    'spy_vol_5d': 5,
    'spy_vol_10d': 10,
    'spy_vol_20d': 20,
    'spy_vol_60d': 60,
    'drawdown_60d': 60,
    'rsi_spy_14': 14,
    'corr_spy_tlt_20d': 20,
    'corr_spy_tlt_60d': 60,
    'corr_spy_hyg_20d': 20,
    'corr_spy_hyg_60d': 60,
}

suspicious = []
for feature, expected_nan in windows.items():
    series = features_df[feature]
    has_value = series.notna().to_numpy()
    actual_nan = int((~has_value).sum())
    first_valid_idx = series.first_valid_index()

    # Rows before the first valid observation (every row if none is valid).
    leading_nan = int(has_value.argmax()) if has_value.any() else len(series)
    passed = bool(has_value.any()) and leading_nan >= expected_nan - 1
    if not passed:
        suspicious.append(feature)

    status = 'OK' if passed else 'CHECK'
    print(f"{feature:20s}: Expected ~{expected_nan:2d} NaNs, Got {actual_nan:3d}, "
          f"First valid at row {first_valid_idx} [{status}]")

# Report a verdict that reflects the checks instead of always claiming success.
if suspicious:
    print("\n❌ Unexpected NaN patterns (too few warm-up NaNs or no valid data): "
          + ", ".join(suspicious))
else:
    print("\n✅ All NaN patterns look correct. no lookahead bias detected.")

## 8. Correlation Matrix

Check for multicollinearity among features.

In [None]:
# Pairwise correlation of every feature (the date column is excluded).
corr_matrix = features_df.drop('date', axis=1).corr()

fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8}, ax=ax)
ax.set_title('Feature Correlation Matrix', fontsize=14)
plt.tight_layout()
plt.show()

# Collect each unordered feature pair (upper triangle only) whose absolute
# correlation exceeds 0.9, then list them from strongest to weakest.
print("\nHighly Correlated Features (|corr| > 0.9):")
feature_names = corr_matrix.columns
n_features = len(feature_names)
high_corr = [
    (feature_names[i], feature_names[j], corr_matrix.iloc[i, j])
    for i in range(n_features)
    for j in range(i + 1, n_features)
    if abs(corr_matrix.iloc[i, j]) > 0.9
]
high_corr.sort(key=lambda pair: abs(pair[2]), reverse=True)

for feat1, feat2, corr in high_corr:
    print(f"  {feat1:20s} <-> {feat2:20s}: {corr:6.3f}")