# 01 — Feature Engineering (Market + Options Sentiment)

This notebook builds a **feature-rich dataset** by combining:
- technical indicators from market data (via `ta`), and
- options-based sentiment features (put/call, OI skew, rolling stats).

We also create lagged features, plot a correlation heatmap, and fit quick baseline models (random forest, gradient boosting, ridge) to obtain R² scores and model-based feature importances.

In [1]:
# Bootstrap: ensure the 'src' package path is visible from notebooks directory
import os
import sys

# Resolve ../src against the current working directory. Guard the append so
# that re-running this cell does not keep adding duplicate sys.path entries.
_src_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

from oami.config import initialize_environment
api_key = initialize_environment()
# Report a missing/empty key the same way as the placeholder value.
# NOTE(review): assumes initialize_environment() returns a falsy value or the
# 'YOUR_KEY_HERE' placeholder when no key is configured — confirm in oami.config.
_key_is_set = bool(api_key) and api_key != 'YOUR_KEY_HERE'
print('Environment initialized. API key status:', 'set' if _key_is_set else 'not set')

In [2]:
import pandas as pd
import numpy as np
import logging
from pathlib import Path
from oami.data_layer import get_market_data, get_options_data

# Cached daily CSV inputs are looked up here (path is relative to the working directory).
DATA_DIR = Path('..', 'data', 'csv', 'day')

# Destination folder for engineered feature tables; created if it does not exist yet.
FEATURES_OUT = Path('..', 'data', 'features', 'day')
FEATURES_OUT.mkdir(parents=True, exist_ok=True)

# Instrument and date window used by the cells below.
SYMBOL = 'SPY'
start = '2024-01-01'
end = '2025-10-01'

def load_or_fetch_market(symbol: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Return market data for ``symbol``, preferring the local CSV cache.

    The cache file is ``DATA_DIR / '<symbol>.csv'``. When it exists it is read
    in full and sorted by ``Date``; ``start_date`` and ``end_date`` are only
    used when the data has to be fetched through ``get_market_data``.

    Parameters
    ----------
    symbol : str
        Ticker symbol (e.g., 'SPY').
    start_date : str
        ISO date string for start (fetch path only).
    end_date : str
        ISO date string for end (fetch path only).

    Returns
    -------
    pandas.DataFrame
        Either the cached table (with ``Date`` parsed as dates, ascending) or
        whatever ``get_market_data`` returns.
    """
    cache_file = DATA_DIR / f'{symbol}.csv'

    # Cache miss: delegate to the data layer and return its result untouched.
    if not cache_file.exists():
        logging.info('Fetching market data via Polygon: %s', symbol)
        return get_market_data(symbol, start_date, end_date)

    logging.info('Loading market data from cache: %s', cache_file)
    cached = pd.read_csv(cache_file, parse_dates=['Date'])
    return cached.sort_values('Date')

def load_or_fetch_options(symbol: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Return the options summary for ``symbol``, preferring the local CSV cache.

    The cache file is ``DATA_DIR / '<symbol>_options.csv'``. When it exists it
    is read in full and sorted by ``Date``; ``start_date`` and ``end_date`` are
    only used when the data has to be fetched through ``get_options_data``.

    Parameters
    ----------
    symbol : str
        Underlying ticker symbol.
    start_date : str
        Start date (ISO), fetch path only.
    end_date : str
        End date (ISO), fetch path only.

    Returns
    -------
    pandas.DataFrame
        Either the cached table (with ``Date`` parsed as dates, ascending) or
        whatever ``get_options_data`` returns.
    """
    cache_file = DATA_DIR / f'{symbol}_options.csv'

    # Cache miss: delegate to the data layer and return its result untouched.
    if not cache_file.exists():
        logging.info('Fetching options data via Polygon: %s', symbol)
        return get_options_data(symbol, start_date, end_date)

    logging.info('Loading options data from cache: %s', cache_file)
    cached = pd.read_csv(cache_file, parse_dates=['Date'])
    return cached.sort_values('Date')

# Load both raw inputs for the configured symbol and window (cache first, fetch otherwise).
market_df = load_or_fetch_market(symbol=SYMBOL, start_date=start, end_date=end)
options_df = load_or_fetch_options(symbol=SYMBOL, start_date=start, end_date=end)
# Trailing expression: the notebook shows this pair of previews as the cell output.
(market_df.head(), options_df.head())

In [10]:
from typing import Iterable, List
import ta

def build_ta_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive trend, momentum, volatility and volume indicators from daily bars.

    Parameters
    ----------
    df : pandas.DataFrame
        Market data holding at least the columns Date, High, Low, Close and Volume.

    Returns
    -------
    pandas.DataFrame
        A Date-sorted copy of ``df`` with the indicator columns appended, plus
        ``ret_1`` (one-row percentage change of Close) and ``next_return``
        (``ret_1`` of the following row, used as the prediction target).
    """
    frame = df.copy().sort_values('Date')
    close = frame['Close']
    high = frame['High']
    low = frame['Low']
    volume = frame['Volume']

    # --- Trend & momentum -------------------------------------------------
    for span in (5, 10, 20):
        frame[f'sma_{span}'] = close.rolling(span).mean()
    for span in (10, 20):
        frame[f'ema_{span}'] = close.ewm(span=span, adjust=False).mean()
    frame['rsi_14'] = ta.momentum.RSIIndicator(close=close, window=14).rsi()

    macd_ind = ta.trend.MACD(close=close, window_fast=12, window_slow=26, window_sign=9)
    macd_line = macd_ind.macd()
    macd_signal = macd_ind.macd_signal()
    macd_hist = macd_ind.macd_diff()
    frame['macd'] = macd_line
    frame['macd_signal'] = macd_signal
    frame['macd_hist'] = macd_hist

    # --- Volatility -------------------------------------------------------
    bands = ta.volatility.BollingerBands(close=close, window=20, window_dev=2)
    frame['bb_width'] = bands.bollinger_wband()
    atr = ta.volatility.AverageTrueRange(high=high, low=low, close=close, window=14)
    frame['atr_14'] = atr.average_true_range()
    for span in (10, 20):
        frame[f'roll_std_{span}'] = close.rolling(span).std()

    # --- Volume -----------------------------------------------------------
    frame['vol_ma_10'] = volume.rolling(10).mean()
    # z-score of today's volume against its trailing 20-row mean and std
    vol_window = volume.rolling(20)
    frame['vol_z_20'] = (volume - vol_window.mean()) / (vol_window.std())

    # --- Returns & target -------------------------------------------------
    frame['ret_1'] = close.pct_change(1)
    # shift(-1) pulls the next row's return onto the current row
    frame['next_return'] = frame['ret_1'].shift(-1)
    return frame

def build_options_sentiment(df_opt: pd.DataFrame) -> pd.DataFrame:
    """Turn daily options aggregates into put/call sentiment features.

    Parameters
    ----------
    df_opt : pandas.DataFrame
        Options summary with a Date column and, where available, PutVol,
        CallVol, PutOI, CallOI, SentimentIndex and IV. Any of the four
        volume / open-interest columns that is absent is added as all-NaN.

    Returns
    -------
    pandas.DataFrame
        A Date-sorted copy of the input with ``pcr_vol``, ``oi_skew``,
        ``opt_total_vol``, ``sentiment``, rolling mean/std columns for windows
        5, 10 and 20, and ``iv_rank_1y`` when an IV column exists. For ``None``
        or an empty input, an empty frame with only a ``Date`` column.
    """
    if df_opt is None or df_opt.empty:
        return pd.DataFrame(columns=['Date'])

    feats = df_opt.copy().sort_values('Date')

    # Backfill absent raw inputs with NaN so the ratio arithmetic below still runs.
    for required in ('PutVol', 'CallVol', 'PutOI', 'CallOI'):
        if required not in feats.columns:
            feats[required] = np.nan

    put_vol, call_vol = feats['PutVol'], feats['CallVol']
    put_oi, call_oi = feats['PutOI'], feats['CallOI']

    # Core ratios; zero denominators become NaN so the division cannot yield inf.
    feats['pcr_vol'] = put_vol / call_vol.replace({0: np.nan})
    total_oi = (call_oi + put_oi).replace({0: np.nan})
    feats['oi_skew'] = (call_oi - put_oi) / total_oi
    feats['opt_total_vol'] = put_vol.fillna(0) + call_vol.fillna(0)

    # Sentiment: use the supplied index when there is one, otherwise 1 - put/call ratio.
    has_index = 'SentimentIndex' in feats.columns
    feats['sentiment'] = feats['SentimentIndex'] if has_index else 1 - feats['pcr_vol']

    # Rolling mean / std of both series over each window length.
    for span in (5, 10, 20):
        for prefix, source in (('pcr', 'pcr_vol'), ('sent', 'sentiment')):
            window = feats[source].rolling(span)
            feats[f'{prefix}_rollmean_{span}'] = window.mean()
            feats[f'{prefix}_rollstd_{span}'] = window.std()

    # Optional: position of IV inside its trailing 252-row min/max range.
    if 'IV' in feats.columns:
        iv = feats['IV']
        iv_window = iv.rolling(252)
        iv_low, iv_high = iv_window.min(), iv_window.max()
        feats['iv_rank_1y'] = (iv - iv_low) / (iv_high - iv_low)

    return feats

# Derive the two feature tables from the raw inputs.
mkt = build_ta_features(market_df)
opt = build_options_sentiment(options_df)

# Left join on Date keeps every market row; dates without an options row get NaN there.
features = mkt.merge(opt, how='left', on='Date')
features.head()

In [11]:
# Direct fetch that bypasses the CSV cache. Reuses the notebook-level SYMBOL and
# date window so this cell cannot drift from the configuration cell.
get_market_data(SYMBOL, start, end)

In [4]:
def make_lags(df: pd.DataFrame, cols: list, lags=(1,3,5,10)) -> pd.DataFrame:
    """Append row-shifted (lagged) copies of the chosen columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing a `Date` column; it is not modified.
    cols : list
        Names of the columns to lag. Names missing from the table are skipped.
    lags : tuple, optional
        Row offsets to shift by, by default (1, 3, 5, 10).

    Returns
    -------
    pandas.DataFrame
        Date-sorted copy of ``df`` with one ``<col>_lag<k>`` column per
        (column, lag) pair, in the order the pairs are visited.
    """
    lagged = df.copy()
    lagged = lagged.sort_values('Date')
    for name in cols:
        if name in lagged.columns:
            # Build all lags of this column first, then attach them in one go.
            shifted = {f'{name}_lag{step}': lagged[name].shift(step) for step in lags}
            lagged = lagged.assign(**shifted)
    return lagged

# Choose a focused set of columns for lagging
lag_columns = [
    'ret_1','rsi_14','macd','macd_signal','bb_width','atr_14',
    'pcr_vol','oi_skew','sentiment'
]
features = make_lags(features, lag_columns)

# A column with no data at all (e.g. an options input that was missing and got
# filled with NaN upstream) would make the row-wise dropna() below discard every
# row. Remove such columns first so one absent input cannot empty the dataset.
empty_cols = features.columns[features.isna().all()].tolist()
if empty_cols:
    print('Dropping all-NaN feature columns:', empty_cols)
    features = features.drop(columns=empty_cols)

# Remaining NaNs come from indicator warm-up, lags and the last row's target.
features = features.dropna().reset_index(drop=True)
features.head()

In [5]:
# Write the engineered table to <FEATURES_OUT>/<SYMBOL>_features.csv (row index omitted)
out_path = FEATURES_OUT.joinpath(f'{SYMBOL}_features.csv')
features.to_csv(out_path, index=False)
print('Saved features to:', out_path)

In [6]:
import matplotlib.pyplot as plt
import numpy as np

# Remove the Date key, keep only numeric columns, and compute the pairwise
# correlation matrix with DataFrame.corr() defaults.
numeric = features.drop(columns=['Date']).select_dtypes(include=[np.number])
corr = numeric.corr()

# Plot a correlation heatmap (matplotlib only; single chart; default colormap)
plt.figure(figsize=(10, 8))
# aspect='auto' lets the cells stretch to fill the figure instead of staying square.
plt.imshow(corr, aspect='auto')
plt.title('Feature Correlation Heatmap')
# One tick per feature on each axis, labelled with the feature name;
# x labels are rotated 90 degrees.
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.index)), corr.index)
plt.colorbar()
plt.tight_layout()
plt.show()

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np

# Clean feature matrix: drop raw OHLCV, the Date key and the target; keep numeric columns only
drop_cols = ['Open','High','Low','Close','Volume']
X = features.drop(columns=[c for c in drop_cols if c in features.columns])
X = X.drop(columns=['Date','next_return'], errors='ignore').select_dtypes(include=[np.number])

# Target (optional: try next_5d_return instead)
y = features['next_return']

# Split first, without shuffling, so the last 20% of rows form the test set and
# nothing from that held-out tail can influence the preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Scale: fit the scaler on the training rows only, then apply the same transform
# to both parts. Fitting on the full matrix would leak test-set statistics.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index)
# Full scaled matrix, kept under its previous name for any cell that still refers to it.
X_scaled = pd.concat([X_train, X_test])

# Models
models = {
    'RandomForest': RandomForestRegressor(n_estimators=500, max_depth=10, min_samples_leaf=5, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=500, learning_rate=0.05, max_depth=5, random_state=42),
    'Ridge': Ridge(alpha=0.1, fit_intercept=True)
}

# Fit each model on the training part and score it (R²) on the held-out part.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results[name] = r2_score(y_test, pred)
    print(f"{name:<18} R² = {results[name]:.6f}")

# One-column summary table indexed by model name (rendered as the cell output).
pd.DataFrame({'R2': results})


In [8]:
import pandas as pd
from IPython.display import display

# `importances` is not created by any other cell, so build it here from the
# fitted `models`: impurity-based importances for the tree ensembles and
# absolute coefficients for the linear model. The two kinds of score are on
# different scales, so compare rankings within a model, not across models.
importances = {}
for name, model in models.items():
    if hasattr(model, 'feature_importances_'):
        scores = model.feature_importances_
    elif hasattr(model, 'coef_'):
        scores = np.abs(np.ravel(model.coef_))
    else:
        # Estimator exposes neither attribute; nothing to rank.
        continue
    importances[name] = pd.Series(scores, index=X_train.columns).sort_values(ascending=False)

for name, imp in importances.items():
    print(f'\nTop features — {name}')
    display(imp.to_frame('importance'))

## ✅ Summary

- Built technical indicators (SMA/EMA/RSI/MACD/BB/ATR) and rolling stats.
- Engineered options sentiment features (put/call ratio, OI skew, rolling means/stds).
- Created lagged predictors and saved the unified dataset.
- Computed correlations and model-based feature importances (RF/GB/Ridge).

Next step: train, evaluate, and compare ML models in `02_modeling.ipynb` (train/validation splits, CV, and robust importance aggregation).