In [1]:
# phase2_feature_engineering_and_target_construction.ipynb

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# === Load from Phase 1 ===
df = pd.read_pickle("data/processed/underlying_price.pkl")
print(f"Loaded data: shape={df.shape}, columns={list(df.columns)}")

# === Target construction: forward realized volatility ===
df['hv10_fwd'] = df['log_ret'].rolling(10).std().shift(-9) * np.sqrt(252)
df['hv20_fwd'] = df['log_ret'].rolling(20).std().shift(-19) * np.sqrt(252)
df['hv30_fwd'] = df['log_ret'].rolling(30).std().shift(-29) * np.sqrt(252)

# === Feature engineering ===
df['abs_log_ret'] = np.abs(df['log_ret'])
df['abs_log_ret_lag1'] = df['abs_log_ret'].shift(1)
df['abs_log_ret_lag5'] = df['abs_log_ret'].shift(1).rolling(5).mean()

df['ret_3d'] = np.log(df['Close'] / df['Close'].shift(3))
df['ret_std_5d'] = df['log_ret'].rolling(5).std()

df['hv_diff'] = df['hv10'] - df['hv30']

df['day_of_week'] = df.index.dayofweek
df['days_to_expiry'] = (3 - df.index.dayofweek) % 7  # e.g., expiry on Wednesday

# === Scale selected features ===
features_to_scale = [
    'abs_log_ret', 'abs_log_ret_lag1', 'abs_log_ret_lag5',
    'ret_3d', 'ret_std_5d', 'hv20', 'hv_diff', 'days_to_expiry'
]

scaler = StandardScaler()

# Only scale rows where all features are present
df_scaled = df[features_to_scale].dropna()
scaled_values = scaler.fit_transform(df_scaled)

for i, col in enumerate(features_to_scale):
    df[col + '_z'] = np.nan
    df.loc[df_scaled.index, col + '_z'] = scaled_values[:, i]

print(f"Feature scaling complete. Added {len(features_to_scale)} standardized columns.")

# === Drop rows where any target is missing (to make ML step clean) ===
df = df.dropna(subset=['log_ret', 'hv10_fwd'])

print(f"After target NaN drop: shape={df.shape}")

# === Save for Phase 3 ===
df.to_pickle("data/processed/underlying_price.pkl")
print("✅ Saved updated dataset with features & targets to data/processed/underlying_price.pkl")

# Optional quick check
print(df[['log_ret', 'hv10', 'hv10_fwd']].head(10))


Loaded data: shape=(1867, 12), columns=['Close', 'High', 'Low', 'Open', 'Volume', 'log_ret', 'vol_band', 'hv10', 'hv20', 'hv30', 'hv10_z', 'hv30_z']
Feature scaling complete. Added 8 standardized columns.
After target NaN drop: shape=(1858, 31)
✅ Saved updated dataset with features & targets to data/processed/underlying_price.pkl
Price        log_ret      hv10  hv10_fwd
Date                                    
2018-01-03  0.000096       NaN  0.052365
2018-01-04  0.005881       NaN  0.057722
2018-01-05  0.005132       NaN  0.055854
2018-01-08  0.006114       NaN  0.058932
2018-01-09  0.001261       NaN  0.059574
2018-01-10 -0.000451       NaN  0.069047
2018-01-11  0.001785       NaN  0.067897
2018-01-12  0.002817       NaN  0.072906
2018-01-15  0.005630       NaN  0.073068
2018-01-16 -0.003834  0.052365  0.091910
