# 2 Feature Engineering

In [2]:
import os

from config import set_project_root

# Root of the local repository checkout.
# The default is the original author's path, so behaviour is unchanged when the
# environment variable is not set. On any other machine, export
# OPTIONS_FORECASTING_ROOT instead of editing this cell.
PROJECT_ROOT = os.environ.get(
    "OPTIONS_FORECASTING_ROOT",
    "/Users/ryant/Documents/GitHub/options-driven-forecasting",
)

# NOTE(review): set_project_root is defined in the project's `config` module and
# is not visible here; presumably it makes the relative "./src/..." paths used
# later in this notebook resolve against PROJECT_ROOT -- confirm.
set_project_root(PROJECT_ROOT)

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import pytz
import os

In [4]:
# Read the forecasting dataset from its parquet file. The path is relative, so
# this cell depends on the working directory the notebook is run from.
df = pd.read_parquet(
    path="./src/data/forecast/options_forecasting_data.parquet",
)

# Bare last expression: render the frame (pandas truncates the preview).
df

Unnamed: 0,timestamp,open,returns_log_lag1,volume,rv_3d,rv_7d,rv_30d,rv_60d,rv_90d,rv_180d,...,vix_90d,vix_180d,volume_pcratio,volume_put,volume_call,volume_premium_ratio,volume_premium_put,volume_premium_call,volume_put_notional,volume_call_notional
0,1.622506e+12,37253.82,0.014510,5234.072735,104.33,110.48,127.47,103.25,94.67,100.02,...,140.30,148.79,1.960,8141.0,4143.4,1.740,8987390.21,5174252.89,3.051952e+08,1.554130e+08
1,1.622509e+12,37798.32,-0.016046,3154.434156,103.27,110.51,127.53,103.27,94.66,100.02,...,141.52,149.73,2.020,8668.7,4300.1,1.650,10297889.51,6237479.51,3.292732e+08,1.634039e+08
2,1.622513e+12,37196.64,-0.009363,3078.850158,103.12,110.46,127.62,103.34,94.70,100.05,...,140.18,148.58,1.950,8553.9,4385.4,1.940,10919268.30,5640726.65,3.200346e+08,1.641032e+08
3,1.622516e+12,36850.00,-0.003894,3179.701359,104.65,110.55,127.73,103.41,94.75,100.06,...,139.44,148.11,1.740,7572.6,4362.4,2.220,11105704.11,5001885.47,2.803137e+08,1.614490e+08
4,1.622520e+12,36706.80,0.001122,1767.107527,105.18,109.93,127.76,103.37,94.76,100.07,...,139.34,147.86,1.710,6853.9,4014.5,2.130,9382251.42,4397727.74,2.529243e+08,1.481317e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22652,1.704053e+12,42619.04,-0.001433,1015.140350,35.78,36.14,40.25,39.98,40.45,36.92,...,64.80,66.20,0.669,2270.2,3394.9,0.382,1190537.05,3117797.15,9.767444e+07,1.465480e+08
22653,1.704056e+12,42558.02,-0.000877,923.438130,35.74,36.11,40.26,39.56,40.44,36.91,...,64.88,66.10,0.659,2317.7,3516.1,0.395,1244788.73,3154461.77,9.962486e+07,1.516300e+08
22654,1.704060e+12,42520.73,-0.006201,1811.594110,35.74,36.12,40.26,39.56,40.35,36.91,...,64.87,66.79,0.663,2317.5,3493.8,0.404,1271547.07,3148189.24,9.943740e+07,1.505204e+08
22655,1.704064e+12,42257.89,0.000608,1710.137210,36.10,35.45,40.30,39.58,40.36,36.92,...,64.84,66.82,0.622,2291.9,3685.6,0.477,1438316.00,3017274.86,9.771319e+07,1.577420e+08


## 2.1 Feature Reduction / PCA

In [None]:
def _build_options_metrics():
    """Return the mapping of feature-group name -> list of column names.

    Column families that follow a regular ``<stem><delta>_<tenor>`` or
    ``<stem>_<tenor>`` naming scheme are generated instead of being typed
    out; irregular groups are listed literally. Group order and the order
    of columns inside each group are significant and kept stable.

    Groups that are intentionally NOT part of the mapping (they were
    disabled in the hand-written version of this cell):
      * price_returns: open, returns_log_lag1, volume
      * gamma_bands: gammaband{1D,7D,30D}_{upper,lower}_{1_4,1_2,1,2}
    The realized-volatility group is kept as a key but is empty: every
    rv_* column (rv_3d, rv_7d, rv_30d, rv_60d, rv_90d, rv_180d, rv_270d,
    rv_365d) is currently disabled.
    """
    # Tenors shared by the ATM-IV, butterfly, risk-reversal and skew families.
    surface_tenors = ("1d", "7d", "14d", "30d", "60d", "90d", "180d", "365d")
    # Delta buckets, in the order they appear in each family (all 10D, then all 25D).
    deltas = ("10D", "25D")

    def per_delta_and_tenor(stem):
        # e.g. stem="skew" -> skew10D_1d, ..., skew10D_365d, skew25D_1d, ...
        return [
            f"{stem}{delta}_{tenor}"
            for delta in deltas
            for tenor in surface_tenors
        ]

    return {
        # Realized volatility metrics -- all candidate columns disabled for now.
        "realized_volatility": [],

        # Perpetual futures metrics
        "perpetual_futures": [
            f"perpetual_{field}"
            for field in ("basis", "funding", "yield", "volume", "oi")
        ],

        # At-the-money implied volatility per tenor
        "atm_implied_volatility": [f"atmiv_{tenor}" for tenor in surface_tenors],

        # General volatility metrics: dvol plus the vix_* term structure
        "volatility_metrics": ["dvol"] + [
            f"vix_{tenor}" for tenor in ("15d", "30d", "60d", "90d", "180d")
        ],

        # Options positioning metrics
        "options_exposure": ["gex", "vex", "gex_plus"],

        # Open interest metrics for options
        "open_interest": [
            "oi_pcratio", "oi_put", "oi_call",
            "oi_premium_pcratio", "oi_premium_put", "oi_premium_call",
            "oi_put_notional", "oi_call_notional",
        ],

        # Volume metrics for options.
        # Note the column is "volume_premium_ratio" (not "..._pcratio"),
        # unlike its open-interest counterpart.
        "volume_metrics": [
            "volume_pcratio", "volume_put", "volume_call",
            "volume_premium_ratio", "volume_premium_put", "volume_premium_call",
            "volume_put_notional", "volume_call_notional",
        ],

        # Butterfly metrics (10D and 25D) per tenor
        "butterfly_metrics": per_delta_and_tenor("butterfly"),

        # Risk reversal metrics (10D and 25D) per tenor
        "risk_reversal": per_delta_and_tenor("riskreversal"),

        # Skew metrics (10D and 25D) per tenor
        "skew_metrics": per_delta_and_tenor("skew"),
    }


options_metrics = _build_options_metrics()

## 2.2 Feature Analysis