# 01 — Feature Engineering (Market + Options Sentiment)

This notebook builds a **feature-rich dataset** by combining:
- technical indicators from market data (via `ta`), and
- options-based sentiment features (put/call, OI skew, rolling stats).

We also create lagged features and compute quick correlation & model-based feature importance scores.

In [1]:
# Bootstrap: make the sibling 'src' directory (resolved relative to the current
# working directory) importable so the 'oami' package can be loaded from here.
import sys, os

SRC_PATH = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
# Guard the append so re-running this cell does not pile duplicate entries onto sys.path.
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

from oami.config import initialize_environment

api_key = initialize_environment()
# A missing or empty key (None / '') is reported as 'not set', same as the placeholder value.
api_key_is_set = bool(api_key) and api_key != 'YOUR_KEY_HERE'
print('Environment initialized. API key status:', 'set' if api_key_is_set else 'not set')

In [2]:
import pandas as pd
import numpy as np

from oami.data_layer import get_market_data, get_options_data
from oami.options_features import build_options_sentiment, compute_sentiment_scores

# --- Configuration: instrument and sample window used throughout the notebook ---
SYMBOL = 'SPY'
START_DATE = '2024-01-01'
END_DATE = '2024-06-30'

# Daily bars for the underlying.
# NOTE(review): the meaning of look_forward=14 is defined inside oami.data_layer — confirm there.
market_df = get_market_data(
    symbol=SYMBOL, start=START_DATE, end=END_DATE,
    interval='1D', use_cache=True, look_forward=14,
)

# Option contract data over the same window (note the different keyword names:
# start_date / end_date here versus start / end above).
options_contracts = get_options_data(
    symbol=SYMBOL, start_date=START_DATE, end_date=END_DATE,
    interval='1D', use_cache=True, look_forward=14,
)

# Aggregate contracts into sentiment features, then attach the sentiment scores by Date.
options_features = build_options_sentiment(options_contracts)
if options_features.empty:
    # Nothing to score: keep an empty placeholder that still carries the score column names.
    sentiment_scores = pd.DataFrame(columns=['Date', 'option_sentiment_score', 'fear_greed_skew'])
else:
    sentiment_scores = compute_sentiment_scores(options_features)
    options_features = pd.merge(options_features, sentiment_scores, on='Date', how='left')

# Put both frames in chronological order with a fresh 0..n-1 index.
options_feature_set = options_features.sort_values(by='Date').reset_index(drop=True)
market_df = market_df.sort_values(by='Date').reset_index(drop=True)

(market_df.head(), options_feature_set.head())

In [10]:
from typing import Iterable, List
import ta


def build_ta_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a date-sorted copy of ``df`` with technical-indicator columns appended.

    The input must provide ``Date``, ``Close``, ``High``, ``Low`` and ``Volume``
    columns; it is copied, so the caller's frame is not modified.

    Columns added, in order: ``sma_5/10/20``, ``ema_10/20``, ``rsi_14``, ``macd``,
    ``macd_signal``, ``macd_hist``, ``bb_width``, ``atr_14``, ``roll_std_10/20``,
    ``vol_ma_10``, ``vol_z_20``, ``ret_1`` and the target ``next_return``
    (``ret_1`` shifted back one row, hence NaN on the last row).
    """
    frame = df.copy().sort_values('Date')
    close = frame['Close']
    high = frame['High']
    low = frame['Low']
    volume = frame['Volume']

    # --- Trend & momentum ---
    for window in (5, 10, 20):
        frame[f'sma_{window}'] = close.rolling(window).mean()
    for span in (10, 20):
        frame[f'ema_{span}'] = close.ewm(span=span, adjust=False).mean()

    rsi_indicator = ta.momentum.RSIIndicator(close=close, window=14)
    frame['rsi_14'] = rsi_indicator.rsi()

    macd_indicator = ta.trend.MACD(close=close, window_fast=12, window_slow=26, window_sign=9)
    frame['macd'] = macd_indicator.macd()
    frame['macd_signal'] = macd_indicator.macd_signal()
    frame['macd_hist'] = macd_indicator.macd_diff()

    # --- Volatility ---
    bands = ta.volatility.BollingerBands(close=close, window=20, window_dev=2)
    frame['bb_width'] = bands.bollinger_wband()

    atr_indicator = ta.volatility.AverageTrueRange(high=high, low=low, close=close, window=14)
    frame['atr_14'] = atr_indicator.average_true_range()

    for window in (10, 20):
        frame[f'roll_std_{window}'] = close.rolling(window).std()

    # --- Volume ---
    frame['vol_ma_10'] = volume.rolling(10).mean()
    volume_window = volume.rolling(20)
    # z-score of today's volume against its trailing 20-row mean and standard deviation
    frame['vol_z_20'] = (volume - volume_window.mean()) / volume_window.std()

    # --- Returns & target ---
    frame['ret_1'] = close.pct_change(1)
    # shift(-1) pulls the following row's return onto the current row
    frame['next_return'] = frame['ret_1'].shift(-1)
    return frame


# Market side: price/volume indicators. Options side: an independent copy of the feature set.
mkt = build_ta_features(market_df)
opt = options_feature_set.copy()

# Left join on Date keeps every market row; dates with no options row get NaN option columns.
features = mkt.merge(opt, on='Date', how='left')
features.head()

In [4]:
def make_lags(df: pd.DataFrame, cols: list, lags=(1, 3, 5, 10)) -> pd.DataFrame:
    """Return a date-sorted copy of ``df`` with shifted copies of ``cols`` appended.

    For every name in ``cols`` that exists in the frame and every step in ``lags``,
    a column ``<name>_lag<step>`` is added holding the value from ``step`` rows
    earlier (NaN for the first ``step`` rows). Names missing from the frame are
    skipped silently. The input frame is left unmodified.
    """
    lagged = df.sort_values('Date').copy()
    for name in cols:
        if name in lagged.columns:
            for step in lags:
                lagged[f'{name}_lag{step}'] = lagged[name].shift(step)
    return lagged

# Columns to lag: market-derived indicators first, then options-sentiment features.
# make_lags skips any name that is not present in the frame.
market_lag_columns = ['ret_1', 'rsi_14', 'macd', 'macd_signal', 'bb_width', 'atr_14']
options_lag_columns = [
    'weighted_average_moneyness',
    'implied_direction_bias',
    'near_far_expiry_vol_ratio',
    'option_sentiment_score',
    'fear_greed_skew',
]
lag_columns = market_lag_columns + options_lag_columns

# Add the lags, then keep only rows with no NaN in any column (this removes the
# indicator/lag warm-up rows and the final row, whose next_return is NaN).
# NOTE(review): `features` is reassigned from itself, so re-running this cell without
# re-running the merge cell above trims additional rows each time.
features = make_lags(features, lag_columns).dropna().reset_index(drop=True)
features.head()

In [6]:
import matplotlib.pyplot as plt
import numpy as np

# Pairwise correlations over the numeric columns (Date is removed up front).
numeric = features.drop(columns=['Date']).select_dtypes(include=[np.number])
corr = numeric.corr()

# Correlation heatmap (matplotlib only; single chart; default colormap),
# drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 8))
heatmap = ax.imshow(corr, aspect='auto')
ax.set_title('Feature Correlation Heatmap')
ax.set_xticks(range(len(corr.columns)))
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(range(len(corr.index)))
ax.set_yticklabels(corr.index)
fig.colorbar(heatmap, ax=ax)
fig.tight_layout()
plt.show()

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np

# Clean feature matrix: drop raw OHLCV, the Date key and the target; keep numeric columns only
drop_cols = ['Open','High','Low','Close','Volume']
X = features.drop(columns=[c for c in drop_cols if c in features.columns])
X = X.drop(columns=['Date','next_return'], errors='ignore').select_dtypes(include=[np.number])

# Target: the next row's one-period return (column built in build_ta_features)
y = features['next_return']

# Fail early with a readable message instead of an opaque scikit-learn error
assert len(X) > 0, 'No rows left in `features` after dropna(); check the options merge and lag warm-up.'

# Split FIRST (chronological, no shuffling) so the test window stays strictly out of sample
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Scale: fit the scaler on the training window only, then apply it to the test window.
# Fitting on the full matrix would leak test-period means/variances into training.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index)
# Kept for backward compatibility with code that expects the full scaled matrix
X_scaled = pd.concat([X_train, X_test])

# Models
models = {
    'RandomForest': RandomForestRegressor(n_estimators=500, max_depth=10, min_samples_leaf=5, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=500, learning_rate=0.05, max_depth=5, random_state=42),
    'Ridge': Ridge(alpha=0.1, fit_intercept=True)
}

# Fit each model on the training window and score it on the held-out window
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results[name] = r2_score(y_test, pred)
    print(f"{name:<18} R² = {results[name]:.6f}")

# Per-model feature ranking: native importances for tree ensembles,
# absolute coefficients for linear models (features are standardized above)
importances = {}
for name, model in models.items():
    if hasattr(model, 'feature_importances_'):
        scores = model.feature_importances_
    elif hasattr(model, 'coef_'):
        coef = model.coef_
        if coef.ndim > 1:
            # multi-output coefficient matrix: use the last row, as before
            coef = coef[-1]
        scores = np.abs(coef)
    else:
        continue
    importances[name] = pd.Series(scores, index=X.columns).sort_values(ascending=False)

# Last expression of the cell so the R² table is actually rendered
r2_table = pd.DataFrame({'R2': results})
r2_table


In [8]:
import pandas as pd
from IPython.display import display

# Number of rows shown per model; the complete ranking stays available in `importances`.
TOP_N = 15

# Each series is already sorted in descending order, so head() yields the strongest features.
for name, imp in importances.items():
    print(f'\nTop features — {name}')
    display(imp.head(TOP_N).to_frame('importance'))

## ✅ Summary

- Built technical indicators (SMA/EMA/RSI/MACD/BB/ATR) and volatility/volume stats from daily market data.
- Derived options sentiment features using OCC contract OHLCV (put/call ratio, moneyness, expiry ratios, term structure) and computed the Option Sentiment Score (OSS) and the Fear-Greed Skew.
- Created lagged predictors, visualised correlations, and benchmarked a few regression models.
- Cached all raw inputs and contract aggregates in `data/cache/oami_store.h5` for reproducible re-runs.