# Week 5 — Homework Solution: Cross-Sectional Alpha Model v2 (Trees)

**Course:** ML for Quantitative Finance  
**Status:** SOLUTION — do not distribute to students before deadline

---

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from scipy import stats
import xgboost as xgb
import lightgbm as lgb
import optuna
import shap
from pathlib import Path
import warnings
# Notebook-wide configuration.
# Suppress all Python warnings and limit Optuna logging to WARNING and above.
warnings.simplefilter('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Plot defaults: seaborn whitegrid theme and a wide default figure size.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 5)})

## Part 1: Extended Pipeline

In [None]:
# Same 200 tickers as Week 4 (abbreviated here for readability).
# Only the 80 symbols listed below are actually present in this notebook, so the
# download and every downstream result cover at most these 80 names.
# NOTE(review): symbols can be renamed or delisted over time — confirm that every
# entry still downloads; a failed download would presumably yield an empty column.
TICKERS = [
    'AAPL', 'MSFT', 'AMZN', 'GOOGL', 'META', 'NVDA', 'JPM', 'JNJ', 'V', 'PG',
    'UNH', 'HD', 'MA', 'DIS', 'BAC', 'XOM', 'CSCO', 'PFE', 'COST', 'ABT',
    'PEP', 'AVGO', 'CRM', 'NKE', 'CVX', 'WMT', 'MRK', 'LLY', 'ABBV', 'INTC',
    'T', 'VZ', 'QCOM', 'TXN', 'PM', 'UNP', 'NEE', 'LOW', 'BMY', 'AMGN',
    'MDT', 'HON', 'SBUX', 'GS', 'MS', 'BLK', 'GILD', 'MMC', 'ADP', 'AMT',
    'CME', 'CI', 'LRCX', 'MO', 'MDLZ', 'SO', 'DUK', 'CL', 'ZTS', 'BDX',
    'REGN', 'ITW', 'APD', 'SHW', 'FISV', 'NOC', 'ICE', 'CSX', 'WM', 'FDX',
    'EMR', 'PNC', 'USB', 'NSC', 'CCI', 'D', 'GM', 'F', 'TGT', 'AEP',
]

# Load daily price/volume data, using a local pickle cache so that
# Restart & Run All does not hit the network on every run.
cache_path = Path('w5_data_cache.pkl')
if cache_path.exists():
    # NOTE: unpickling executes arbitrary code — only load a cache file that this
    # notebook wrote itself; delete it and re-download if its origin is unclear.
    raw = pd.read_pickle(cache_path)
else:
    raw = yf.download(TICKERS, start='2010-01-01', end='2024-12-31', progress=True)
    raw.to_pickle(cache_path)

# Forward-fill gaps; after this the only remaining NaNs are the leading ones
# (dates before a ticker's first available observation).
prices = raw['Close'].ffill()
volume = raw['Volume'].ffill()

# Drop tickers with too much missing data BEFORE deriving returns, so that every
# derived frame (daily returns, monthly prices, monthly returns) covers exactly
# the same universe as `prices` and `volume`. Previously the derived frames were
# computed first and silently kept the dropped tickers.
good = prices.isnull().mean() < 0.2
prices = prices.loc[:, good]
volume = volume.loc[:, prices.columns]

returns_daily = prices.pct_change()
monthly_prices = prices.resample('M').last()
monthly_returns = monthly_prices.pct_change()

print(f"Universe: {prices.shape[1]} stocks")

In [None]:
# Feature computation (same as Week 4).
# Each entry is a month-end frame (dates x tickers). The insertion order of this
# dict determines the column order of the panel built below.
features = {
    # Trailing price momentum over 1, 3 and 6 months.
    'mom_1m': monthly_prices.pct_change(1),
    'mom_3m': monthly_prices.pct_change(3),
    'mom_6m': monthly_prices.pct_change(6),
    # 12-month return, lagged by one month.
    'mom_12m_skip1': monthly_prices.pct_change(12).shift(1),
    # Sign-flipped 1-month return.
    'reversal_1m': -monthly_prices.pct_change(1),
    # Rolling std of daily returns, sampled at month end.
    'vol_20d': returns_daily.rolling(20).std().resample('M').last(),
    'vol_60d': returns_daily.rolling(60).std().resample('M').last(),
    # 5-day average volume relative to 60-day average volume.
    'vol_ratio': (volume.rolling(5).mean() / volume.rolling(60).mean()).resample('M').last(),
    # Price relative to its 50- and 200-day moving averages.
    'ma_50_ratio': (prices / prices.rolling(50).mean()).resample('M').last(),
    'ma_200_ratio': (prices / prices.rolling(200).mean()).resample('M').last(),
}

# Label: the following month's return, aligned to the feature date.
target = monthly_returns.shift(-1)

# Build panel: keep only months present in every feature frame and inside the
# study window.
index_sets = [set(frame.index) for frame in features.values()]
window_start = pd.Timestamp('2012-01-01')
window_end = pd.Timestamp('2024-06-30')
months = sorted(m for m in set.intersection(*index_sets) if window_start <= m <= window_end)

X_all, y_all, dates_all = [], [], []
for month in months:
    # One cross-section: rows = tickers, columns = features.
    cross_section = pd.DataFrame(
        {name: frame.loc[month] for name, frame in features.items() if month in frame.index}
    )
    if month in target.index:
        next_ret = target.loc[month]
    else:
        next_ret = pd.Series(dtype=float)

    # Tickers with a complete feature row and a known label.
    complete = cross_section.dropna().index.intersection(next_ret.dropna().index)
    if len(complete) <= 5:
        continue  # cross-section too thin to be useful

    X_all.append(cross_section.loc[complete])
    y_all.append(next_ret.loc[complete])
    dates_all += [month] * len(complete)

X_panel = pd.concat(X_all)
y_panel = pd.concat(y_all)
dates_panel = np.array(dates_all)

print(f"Panel: {len(X_panel)} obs, {X_panel.shape[1]} features")

## Part 2: Optuna Tuning

In [None]:
def make_objective(model_type='xgb', splits=None):
    """Build an Optuna objective that scores a boosted-tree model by mean monthly rank IC.

    Parameters
    ----------
    model_type : str
        'xgb' tunes ``xgb.XGBRegressor``; any other value tunes ``lgb.LGBMRegressor``.
    splits : list of (train_end, test_end) Timestamp pairs, optional
        Walk-forward folds. Each fold trains on rows dated before ``train_end`` and
        validates on rows dated in ``[train_end, test_end)``. Defaults to three
        two-year folds covering 2016-01-31 .. 2022-01-31.

        NOTE(review): the default folds overlap the Part 3 evaluation window
        (which starts 2018-01-31), so ICs reported there for the tuned models are
        optimistic. Pass folds that end before the evaluation start for a clean
        out-of-sample comparison.

    Returns
    -------
    callable
        ``objective(trial) -> float``: the mean Spearman IC over all validation
        months that have more than 5 observations, or 0 if there are none.

    Reads the module-level ``X_panel``, ``y_panel`` and ``dates_panel``.
    """
    if splits is None:
        splits = [(pd.Timestamp('2016-01-31'), pd.Timestamp('2018-01-31')),
                  (pd.Timestamp('2018-01-31'), pd.Timestamp('2020-01-31')),
                  (pd.Timestamp('2020-01-31'), pd.Timestamp('2022-01-31'))]

    def objective(trial):
        # Both libraries share the same search space through their sklearn-style API.
        params = {
            'max_depth': trial.suggest_int('max_depth', 2, 6),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'subsample': trial.suggest_float('subsample', 0.5, 0.9),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10, log=True),
        }
        if model_type != 'xgb':
            # LightGBM only applies `subsample` (bagging_fraction) when bagging is
            # enabled via subsample_freq > 0; with the default of 0 the tuned
            # `subsample` value would have no effect. It is recorded through the
            # trial (fixed at 1) so that `study.best_params` carries it along.
            params['subsample_freq'] = trial.suggest_int('subsample_freq', 1, 1)

        X = X_panel.values
        y = y_panel.values

        ic_scores = []
        for train_end, test_end in splits:
            tr_mask = dates_panel < train_end
            te_mask = (dates_panel >= train_end) & (dates_panel < test_end)
            if te_mask.sum() < 10:
                continue

            if model_type == 'xgb':
                model = xgb.XGBRegressor(**params, verbosity=0)
            else:
                model = lgb.LGBMRegressor(**params, verbosity=-1)
            model.fit(X[tr_mask], y[tr_mask])
            pred = model.predict(X[te_mask])

            # Slice the validation labels/dates once, then score month by month.
            y_te = y[te_mask]
            test_dates = dates_panel[te_mask]
            for m in np.unique(test_dates):
                m_mask = test_dates == m
                if m_mask.sum() > 5:
                    ic = stats.spearmanr(pred[m_mask], y_te[m_mask])[0]
                    # Constant predictions give an undefined (NaN) correlation.
                    # Score that month as 0 (no ranking skill) instead of letting
                    # one NaN turn the whole objective into NaN.
                    ic_scores.append(ic if np.isfinite(ic) else 0.0)
        return np.mean(ic_scores) if ic_scores else 0
    return objective

# Hyperparameter search. The sampler is seeded so that Restart & Run All
# proposes the same sequence of trials every time (TPE is Optuna's default
# sampler; only the seed is new).

# Tune XGBoost
study_xgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(make_objective('xgb'), n_trials=30)
print(f"XGBoost best IC: {study_xgb.best_value:.4f}")

# Tune LightGBM
study_lgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lgb.optimize(make_objective('lgb'), n_trials=30)
print(f"LightGBM best IC: {study_lgb.best_value:.4f}")

## Part 3: Full Model Comparison

In [None]:
# Run all models with an expanding window: for each prediction month, refit every
# model on all strictly earlier months and score the month's cross-section by
# Spearman rank IC.
# NOTE(review): the Optuna folds in Part 2 validate on 2016-01-31 .. 2022-01-31,
# which overlaps this evaluation window; the tuned XGBoost/LightGBM numbers are
# therefore not fully out-of-sample, unlike OLS/Ridge/RF.
pred_start = pd.Timestamp('2018-01-31')
pred_months = [m for m in months if m >= pred_start]

# Factories rather than instances, so each month gets a freshly initialised model.
all_models = {
    'OLS': lambda: LinearRegression(),
    'Ridge': lambda: Ridge(alpha=1.0),
    # Fixed seed: an unseeded forest gives a different table on every run.
    'RF': lambda: RandomForestRegressor(n_estimators=100, max_depth=5, n_jobs=-1, random_state=42),
    'XGBoost': lambda: xgb.XGBRegressor(**study_xgb.best_params, verbosity=0),
    'LightGBM': lambda: lgb.LGBMRegressor(**study_lgb.best_params, verbosity=-1),
}

results = {name: [] for name in all_models}
predictions_store = {name: {} for name in all_models}

for month in pred_months:
    train_mask = dates_panel < month
    test_mask = dates_panel == month
    if test_mask.sum() < 5 or train_mask.sum() < 100:
        continue

    X_tr, y_tr = X_panel.values[train_mask], y_panel.values[train_mask]
    X_te, y_te = X_panel.values[test_mask], y_panel.values[test_mask]

    for name, model_fn in all_models.items():
        model = model_fn()
        model.fit(X_tr, y_tr)
        pred = model.predict(X_te)
        ic = stats.spearmanr(pred, y_te)[0]
        # Constant predictions yield NaN; record 0 (no ranking skill) so a single
        # degenerate month cannot turn the whole summary row into NaN.
        if not np.isfinite(ic):
            ic = 0.0
        results[name].append({'month': month, 'IC': ic})
        predictions_store[name][month] = pred

# Comparison table
comparison = []
for name in all_models:
    ic_vals = np.array([x['IC'] for x in results[name]], dtype=float)
    n_months = len(ic_vals)
    avg_ic = ic_vals.mean() if n_months else np.nan
    # The t-statistic uses the sample standard deviation (ddof=1).
    ic_std = ic_vals.std(ddof=1) if n_months > 1 else np.nan
    t_stat = avg_ic / ic_std * np.sqrt(n_months) if ic_std > 0 else np.nan
    hit_rate = (ic_vals > 0).mean() if n_months else np.nan
    comparison.append({
        'Model': name,
        'Avg IC': f"{avg_ic:.4f}",
        'IC t-stat': f"{t_stat:.2f}",
        'IC>0 %': f"{hit_rate:.0%}",
    })

pd.DataFrame(comparison).set_index('Model')

## Part 4: SHAP Analysis

In [None]:
# Final XGBoost model for SHAP: fit on the full panel with the tuned parameters.
# The attributions below therefore describe the in-sample fit, not a forecast.
xgb_final = xgb.XGBRegressor(**study_xgb.best_params, verbosity=0)
xgb_final.fit(X_panel.values, y_panel.values)

explainer = shap.TreeExplainer(xgb_final)
# Use a subsample for speed. A seeded generator keeps the sampled rows (and so
# the plots) identical across Restart & Run All.
rng = np.random.default_rng(42)
sample_idx = rng.choice(len(X_panel), size=min(1000, len(X_panel)), replace=False)
shap_values = explainer.shap_values(X_panel.values[sample_idx])

# Summary plot; show=False leaves the figure open so a title can be added.
shap.summary_plot(shap_values, X_panel.iloc[sample_idx], show=False)
plt.title('SHAP Feature Importance (XGBoost)')
plt.tight_layout()
plt.show()

In [None]:
# Feature importance bar chart: average absolute SHAP value per feature over the
# sampled rows, sorted ascending so the largest bar is drawn at the top.
mean_shap = np.mean(np.abs(shap_values), axis=0)
feat_importance = pd.Series(mean_shap, index=X_panel.columns).sort_values()

fig, ax = plt.subplots(figsize=(8, 5))
feat_importance.plot.barh(ax=ax, color='steelblue')
ax.set(title='Mean |SHAP| by Feature', xlabel='Mean |SHAP value|')
fig.tight_layout()
plt.show()

## Part 5: Model Combination (Stretch)

In [None]:
# Average predictions from Ridge, XGBoost, LightGBM (equal weights, raw scale)
# and score the blended forecast by monthly Spearman rank IC.
combo_ics = []
combo_models = ['Ridge', 'XGBoost', 'LightGBM']

for month in pred_months:
    # Use only months for which every member model stored a prediction.
    if not all(month in predictions_store[name] for name in combo_models):
        continue
    preds = [predictions_store[name][month] for name in combo_models]

    avg_pred = np.mean(preds, axis=0)
    test_mask = dates_panel == month
    y_te = y_panel.values[test_mask]
    if len(avg_pred) == len(y_te) and len(y_te) > 5:
        ic = stats.spearmanr(avg_pred, y_te)[0]
        # An undefined correlation (constant input) counts as zero skill.
        combo_ics.append(ic if np.isfinite(ic) else 0.0)

# Summary statistics, guarded against an empty or single-month result.
n_combo = len(combo_ics)
combo_mean = np.mean(combo_ics) if n_combo else np.nan
# The t-statistic uses the sample standard deviation (ddof=1).
combo_std = np.std(combo_ics, ddof=1) if n_combo > 1 else np.nan
combo_t = combo_mean / combo_std * np.sqrt(n_combo) if combo_std > 0 else np.nan

print("Ensemble (Ridge + XGB + LGBM average):")
print(f"  Avg IC: {combo_mean:.4f}")
print(f"  IC t-stat: {combo_t:.2f}")
print("\nConclusion: Ensembles often beat individual models because they average out")
print("model-specific errors while preserving shared signal.")