# Week 7 Homework SOLUTION --- Deep Cross-Sectional Model

## SOLUTION --- do not distribute

**Quantitative Finance ML Course**

---

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from scipy.stats import spearmanr, rankdata
from xgboost import XGBRegressor
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults (style first, then the figure size override).
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 5)})

# Device selection: Apple MPS is checked first, then CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    _backend = 'mps'
else:
    _backend = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_backend)
print(f'Using device: {device}')

In [None]:
# --- Data generation ---
def _simulate_panel(n_months, n_stocks):
    """Simulate a balanced stock-month panel of features and next-month returns.

    Eight standard-normal draws are used per observation, taken from NumPy's
    global legacy RNG in row-major order: the seven features in the column
    order below, followed by the return noise. Observations are ordered by
    month, then by stock within the month.

    Returns a DataFrame with columns ``date_idx``, ``stock_id``, the seven
    features, and ``ret_next``.
    """
    n_obs = n_months * n_stocks
    z = np.random.randn(n_obs, 8)

    mom_1m = z[:, 0] * 0.08
    mom_12m = z[:, 1] * 0.20
    vol_20d = np.abs(z[:, 2]) * 0.02 + 0.01
    size = z[:, 3] * 2 + 15
    bm = z[:, 4] * 0.5
    turnover = np.abs(z[:, 5]) * 0.01
    rev_1m = z[:, 6] * 0.05

    # Weak linear signal plus one smooth interaction (mom_12m x size),
    # buried under much larger idiosyncratic noise.
    ret_next = (
        -0.002 * mom_1m + 0.003 * mom_12m - 0.005 * vol_20d
        + 0.001 * bm + 0.002 * np.sin(mom_12m * size)
        + z[:, 7] * 0.08
    )

    return pd.DataFrame({
        'date_idx': np.repeat(np.arange(n_months), n_stocks),
        'stock_id': np.tile(np.arange(n_stocks), n_months),
        'mom_1m': mom_1m, 'mom_12m': mom_12m, 'vol_20d': vol_20d,
        'size': size, 'bm': bm, 'turnover': turnover, 'rev_1m': rev_1m,
        'ret_next': ret_next,
    })


np.random.seed(42)
n_stocks = 500
n_months = 240

df = _simulate_panel(n_months, n_stocks)
feature_cols = ['mom_1m', 'mom_12m', 'vol_20d', 'size', 'bm', 'turnover', 'rev_1m']

# Keep raw (unranked) features for the tree models: their splits depend on the
# ordering of feature values rather than their scale, so no rank transform is
# applied for them.
df_raw = df.copy()

# Cross-sectional rank normalization for NN: within each month, map every
# feature to its average rank rescaled to [-0.5, 0.5].
_ranks = df.groupby('date_idx')[feature_cols].rank()
_n_per_month = df.groupby('date_idx')['date_idx'].transform('size')
df[feature_cols] = (_ranks - 1).div(_n_per_month - 1, axis=0) - 0.5

print(f'Panel: {df.shape[0]:,} obs, {n_months} months, {n_stocks} stocks')

---

## Part 1: Gu-Kelly-Xiu Architecture (20 pts) --- SOLUTION

In [None]:
class CrossSectionalDataset(Dataset):
    """Map-style dataset pairing each feature row with its target value.

    Both inputs are copied into float32 tensors when the dataset is built.
    """

    def __init__(self, features, targets):
        as_float = {'dtype': torch.float32}
        self.X = torch.tensor(features, **as_float)
        self.y = torch.tensor(targets, **as_float)

    def __len__(self):
        # One sample per target entry.
        n_samples = len(self.y)
        return n_samples

    def __getitem__(self, idx):
        sample_x = self.X[idx]
        sample_y = self.y[idx]
        return sample_x, sample_y


class EarlyStopping:
    """Track the best validation loss and signal when training should stop.

    A strictly lower loss resets the patience counter and snapshots the
    model's state dict on the CPU; every other step increments the counter.
    """

    def __init__(self, patience=10):
        self.patience = patience
        self.counter = 0
        self.best_loss = float('inf')
        self.best_state = None

    def step(self, val_loss, model):
        """Record one validation result; return True once patience is used up."""
        improved = val_loss < self.best_loss
        if not improved:
            self.counter += 1
            return self.counter >= self.patience

        # New best: keep an independent CPU copy of every tensor.
        snapshot = {}
        for name, tensor in model.state_dict().items():
            snapshot[name] = tensor.cpu().clone()
        self.best_loss = val_loss
        self.counter = 0
        self.best_state = snapshot
        return False

    def restore_best(self, model):
        """Load the best snapshot into ``model``; no-op if none was recorded."""
        if self.best_state is None:
            return
        model.load_state_dict(self.best_state)


class GuKellyXiuNet(nn.Module):
    """Feed-forward net in the Gu-Kelly-Xiu NN3 style (default 32-16-8).

    Every hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout, followed
    by a single-unit linear head. ``forward`` drops the trailing unit axis.
    """

    def __init__(self, input_dim, hidden_sizes=(32, 16, 8), dropout=0.5):
        super().__init__()
        widths = [input_dim, *hidden_sizes]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        # Scalar prediction head on top of the last hidden width.
        stack.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        scores = self.net(x)
        return scores.squeeze(-1)


# Verify: instantiate the network, run one random batch through it, count weights
n_features = len(feature_cols)
model = GuKellyXiuNet(input_dim=n_features)
print(model)

x_test = torch.randn(32, n_features)
out = model(x_test)
print(f'\nOutput shape: {out.shape}')

n_params = 0
for p in model.parameters():
    if p.requires_grad:
        n_params += p.numel()
print(f'Parameters: {n_params}')

---

## Part 2: Expanding-Window CV with MPS (25 pts) --- SOLUTION

In [None]:
def expanding_window_folds(n_months, initial_train=120, val_size=24, test_size=24):
    """Generate expanding-window fold definitions.

    Each fold is a tuple ``(train_end, val_end, test_end)`` of exclusive month
    bounds: train on ``[0, train_end)``, validate on ``[train_end, val_end)``
    and test on ``[val_end, test_end)``. The training window grows by
    ``test_size`` months per fold, so consecutive test windows are adjacent.

    Args:
        n_months: Total number of months available.
        initial_train: Length of the first training window.
        val_size: Number of validation months per fold.
        test_size: Number of test months per fold; must be positive.

    Returns:
        List of ``(train_end, val_end, test_end)`` tuples; empty when not even
        one full fold fits into ``n_months``.

    Raises:
        ValueError: If ``test_size`` is not positive (the window would never
            advance and the loop would not terminate).
    """
    if test_size <= 0:
        raise ValueError(f'test_size must be positive, got {test_size}')

    folds = []
    train_end = initial_train
    while train_end + val_size + test_size <= n_months:
        val_end = train_end + val_size
        test_end = val_end + test_size
        folds.append((train_end, val_end, test_end))
        train_end += test_size  # shift by test_size
    return folds


# Build the fold schedule once and echo each fold's month ranges.
folds = expanding_window_folds(n_months)
for fold_no, bounds in enumerate(folds, start=1):
    tr, va, te = bounds
    print(f'Fold {fold_no}: Train [0, {tr})  Val [{tr}, {va})  Test [{va}, {te})')

In [None]:
def train_single_nn(model, train_loader, val_loader, n_epochs=100,
                    lr=1e-3, device='cpu'):
    """Train one NN model with early stopping.

    Minimises mean squared error with Adam (weight decay 1e-5). After every
    epoch the validation MSE is computed and passed to an ``EarlyStopping``
    tracker with patience 10; training stops as soon as it signals, or after
    ``n_epochs`` epochs.

    Args:
        model: Module mapping a feature batch to one prediction per row.
        train_loader: Iterable of ``(features, targets)`` training batches.
        val_loader: Iterable of ``(features, targets)`` validation batches.
        n_epochs: Maximum number of epochs.
        lr: Adam learning rate.
        device: Device used for training and validation.

    Returns:
        The model, moved back to the CPU, with the weights of the best
        validation epoch restored (if any epoch improved on infinity).
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    stopper = EarlyStopping(patience=10)

    for _ in range(n_epochs):
        # Train
        model.train()
        for X_b, y_b in train_loader:
            X_b, y_b = X_b.to(device), y_b.to(device)
            optimizer.zero_grad()
            loss = ((model(X_b) - y_b) ** 2).mean()
            loss.backward()
            optimizer.step()

        # Validate: accumulate squared error and sample count so the loss is
        # a per-sample mean. Averaging per-batch means would over-weight a
        # short final batch.
        model.eval()
        sq_err_sum = 0.0
        n_val = 0
        with torch.no_grad():
            for X_b, y_b in val_loader:
                X_b, y_b = X_b.to(device), y_b.to(device)
                sq_err_sum += ((model(X_b) - y_b) ** 2).sum().item()
                n_val += y_b.numel()
        # An empty validation loader yields NaN, which never counts as an
        # improvement.
        val_loss = sq_err_sum / n_val if n_val else float('nan')

        if stopper.step(val_loss, model):
            break

    stopper.restore_best(model)
    model = model.to('cpu')
    return model


def predict_nn_ensemble(models, X, device='cpu'):
    """Return the element-wise mean prediction of several NN models.

    ``X`` is converted once to a float32 tensor on ``device``. Each model is
    put in eval mode, moved to ``device`` for its forward pass, and moved to
    the CPU afterwards.
    """
    inputs = torch.tensor(X, dtype=torch.float32).to(device)

    def _score(member):
        member.eval()
        member.to(device)
        with torch.no_grad():
            scores = member(inputs).cpu().numpy()
        member.to('cpu')
        return scores

    per_model = [_score(member) for member in models]
    return np.mean(per_model, axis=0)


print('Training functions ready.')

In [None]:
def compute_monthly_ic(df_sub, pred_col='pred', ret_col='ret_next'):
    """Return one Spearman rank correlation per ``date_idx`` group.

    The correlation is taken between ``pred_col`` and ``ret_col`` within each
    group; the result is indexed by ``date_idx``.
    """
    def _rank_ic(month):
        rho = spearmanr(month[pred_col], month[ret_col])[0]
        return rho

    by_month = df_sub.groupby('date_idx')
    return by_month.apply(_rank_ic)


def compute_ls_returns(df_sub, pred_col='pred', ret_col='ret_next', n_q=5):
    """Compute long-short quantile returns per month.

    Within each ``date_idx`` group, observations are sorted into ``n_q``
    buckets by predicted value; the result is the mean ``ret_col`` of the top
    bucket minus the mean ``ret_col`` of the bottom bucket.

    Predictions are ranked with ``method='first'`` before bucketing, so tied
    predictions are split by row order instead of collapsing bucket edges.
    Without this, heavily tied predictions (common for tree models with few
    boosting rounds) produce fewer than ``n_q`` buckets and the top bucket
    label never appears, giving NaN. For tie-free predictions the bucket
    assignment is the same as bucketing the raw values.

    Returns:
        Series of long-short returns indexed by ``date_idx``.
    """
    def _ls(g):
        ranks = g[pred_col].rank(method='first')
        # duplicates='drop' only matters for degenerate single-row groups.
        q = pd.qcut(ranks, n_q, labels=False, duplicates='drop')
        top = g.loc[q == n_q - 1, ret_col].mean()
        bottom = g.loc[q == 0, ret_col].mean()
        return top - bottom
    return df_sub.groupby('date_idx').apply(_ls)


print('Evaluation functions ready.')

In [None]:
# --- Run expanding-window CV for NN ---
# For every fold: split the rank-normalized panel by month, train three
# differently seeded networks with early stopping on the validation months,
# average their test predictions, and collect the per-fold test frames.
use_device = str(device)
nn_test_dfs = []

for fold_idx, (train_end, val_end, test_end) in enumerate(folds):
    print(f'\n--- Fold {fold_idx+1}: Train [0,{train_end}) Val [{train_end},{val_end}) Test [{val_end},{test_end}) ---')

    # Split
    tr = df[df.date_idx < train_end]
    va = df[(df.date_idx >= train_end) & (df.date_idx < val_end)]
    # Copy the test slice so the prediction column is added to an independent
    # frame rather than to a slice of df.
    te = df[(df.date_idx >= val_end) & (df.date_idx < test_end)].copy()

    train_ds = CrossSectionalDataset(tr[feature_cols].values, tr['ret_next'].values)
    val_ds = CrossSectionalDataset(va[feature_cols].values, va['ret_next'].values)

    # The loaders are shared by all seeds of this fold; only training batches
    # are shuffled.
    train_loader = DataLoader(train_ds, batch_size=2048, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=4096, shuffle=False)

    # Train ensemble of 3 seeds
    models = []
    for seed in range(3):
        # Seed before constructing the model so each ensemble member starts
        # from its own reproducible initialization.
        torch.manual_seed(seed)
        m = GuKellyXiuNet(input_dim=len(feature_cols))
        m = train_single_nn(m, train_loader, val_loader, device=use_device)
        models.append(m)
        print(f'  Seed {seed} done')

    # Predict
    # No device is passed, so inference runs on predict_nn_ensemble's default
    # ('cpu').
    te['pred_nn'] = predict_nn_ensemble(models, te[feature_cols].values)
    ic = compute_monthly_ic(te, pred_col='pred_nn')
    print(f'  Test IC: {ic.mean():.4f}')
    nn_test_dfs.append(te)

# Stack all folds' test months and compute the monthly IC over the full
# out-of-sample period.
nn_test = pd.concat(nn_test_dfs)
nn_ic = compute_monthly_ic(nn_test, pred_col='pred_nn')
print(f'\nOverall NN Test IC: {nn_ic.mean():.4f} (IR: {nn_ic.mean()/nn_ic.std():.3f})')

---

## Part 3: Compare Against Week 5 Best Model (25 pts) --- SOLUTION

In [None]:
# --- XGBoost with same expanding-window splits ---
# Raw (unranked) features are used; every fold fits a fresh booster that
# early-stops on the fold's validation months.
xgb_test_dfs = []
raw_month = df_raw.date_idx

for fold_idx, (train_end, val_end, test_end) in enumerate(folds):
    print(f'Fold {fold_idx+1}...', end=' ')

    in_train = raw_month < train_end
    in_val = (raw_month >= train_end) & (raw_month < val_end)
    in_test = (raw_month >= val_end) & (raw_month < test_end)
    tr, va = df_raw[in_train], df_raw[in_val]
    te = df_raw[in_test].copy()

    xgb_model = XGBRegressor(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        early_stopping_rounds=20,
        verbosity=0,
    )
    validation_set = [(va[feature_cols], va['ret_next'])]
    xgb_model.fit(
        tr[feature_cols], tr['ret_next'],
        eval_set=validation_set,
        verbose=False,
    )

    te['pred_xgb'] = xgb_model.predict(te[feature_cols])
    ic = compute_monthly_ic(te, pred_col='pred_xgb')
    print(f'IC: {ic.mean():.4f}')
    xgb_test_dfs.append(te)

# Pool all folds' test months for the overall monthly IC.
xgb_test = pd.concat(xgb_test_dfs)
xgb_ic = compute_monthly_ic(xgb_test, pred_col='pred_xgb')
print(f'\nOverall XGBoost Test IC: {xgb_ic.mean():.4f} (IR: {xgb_ic.mean()/xgb_ic.std():.3f})')

In [None]:
# --- Comparison table ---
nn_ls = compute_ls_returns(nn_test, pred_col='pred_nn')
xgb_ls = compute_ls_returns(xgb_test, pred_col='pred_xgb')

# One row of summary statistics per model, built from its monthly IC series
# and its monthly long-short return series.
comparison_rows = []
for model_name, ic_s, ls_s in [('NN Ensemble', nn_ic, nn_ls),
                               ('XGBoost', xgb_ic, xgb_ls)]:
    comparison_rows.append({
        'Model': model_name,
        'Mean IC': ic_s.mean(),
        'IC Std': ic_s.std(),
        'IC IR': ic_s.mean()/ic_s.std(),
        'IC > 0 (%)': (ic_s > 0).mean()*100,
        'LS Sharpe (ann)': ls_s.mean()/ls_s.std()*np.sqrt(12),
    })
comparison = pd.DataFrame(comparison_rows).set_index('Model')

print('Model Comparison:')
print(comparison.round(4).to_string())

In [None]:
# --- Comparison plots ---
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ic_ax, ret_ax = axes

# Cumulative IC
for label, ic_s in (('NN Ensemble', nn_ic), ('XGBoost', xgb_ic)):
    ic_ax.plot(ic_s.cumsum().values, label=label, linewidth=2)
ic_ax.set_xlabel('Test Month')
ic_ax.set_ylabel('Cumulative IC')
ic_ax.set_title('Cumulative IC Comparison')
ic_ax.legend()

# Cumulative long-short returns (monthly returns compounded from 1)
for label, ls_s in (('NN Ensemble', nn_ls), ('XGBoost', xgb_ls)):
    growth = (1 + ls_s).cumprod()
    ret_ax.plot(growth.values, label=label, linewidth=2)
ret_ax.set_xlabel('Test Month')
ret_ax.set_ylabel('Cumulative Return')
ret_ax.set_title('Long-Short Cumulative Returns')
ret_ax.legend()

plt.tight_layout()
plt.show()

---

## Part 4: Ensemble (NN + XGBoost + LightGBM) (15 pts) --- SOLUTION

In [None]:
# --- LightGBM with same expanding-window splits ---
# Raw (unranked) features are used; every fold fits a fresh model that
# early-stops on the fold's validation months.
lgb_test_dfs = []

for fold_idx, (train_end, val_end, test_end) in enumerate(folds):
    print(f'Fold {fold_idx+1}...', end=' ')

    tr = df_raw[df_raw.date_idx < train_end]
    va = df_raw[(df_raw.date_idx >= train_end) & (df_raw.date_idx < val_end)]
    te = df_raw[(df_raw.date_idx >= val_end) & (df_raw.date_idx < test_end)].copy()

    # LightGBM ignores `subsample` unless `subsample_freq` is non-zero, so
    # row bagging has to be switched on explicitly to mirror the XGBoost
    # configuration (subsample=0.8 on every boosting round).
    lgb_model = lgb.LGBMRegressor(
        n_estimators=300, max_depth=4, learning_rate=0.05,
        subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
        verbosity=-1
    )
    lgb_model.fit(
        tr[feature_cols], tr['ret_next'],
        eval_set=[(va[feature_cols], va['ret_next'])],
        callbacks=[lgb.early_stopping(20, verbose=False)]
    )
    te['pred_lgb'] = lgb_model.predict(te[feature_cols])
    ic = compute_monthly_ic(te, pred_col='pred_lgb')
    print(f'IC: {ic.mean():.4f}')
    lgb_test_dfs.append(te)

# Pool all folds' test months for the overall monthly IC.
lgb_test = pd.concat(lgb_test_dfs)
lgb_ic = compute_monthly_ic(lgb_test, pred_col='pred_lgb')
print(f'\nOverall LightGBM Test IC: {lgb_ic.mean():.4f} (IR: {lgb_ic.mean()/lgb_ic.std():.3f})')

In [None]:
# --- Build ensemble ---
# Merge all predictions on the test set
# Align by date_idx and stock_id

merge_keys = ['date_idx', 'stock_id']
ensemble_df = nn_test[merge_keys + ['ret_next', 'pred_nn']].copy()
for other_test, pred_name in ((xgb_test, 'pred_xgb'), (lgb_test, 'pred_lgb')):
    ensemble_df = ensemble_df.merge(other_test[merge_keys + [pred_name]], on=merge_keys)


def _monthly_rank_share(x):
    # Average rank within the month, scaled by the month's observation count.
    return rankdata(x) / len(x)


# Rank-normalize within each month and average
for col in ['pred_nn', 'pred_xgb', 'pred_lgb']:
    by_month = ensemble_df.groupby('date_idx')[col]
    ensemble_df[col + '_rank'] = by_month.transform(_monthly_rank_share)

nn_rank = ensemble_df['pred_nn_rank']
xgb_rank = ensemble_df['pred_xgb_rank']
lgb_rank = ensemble_df['pred_lgb_rank']
ensemble_df['pred_ensemble'] = (nn_rank + xgb_rank + lgb_rank) / 3

ens_ic = compute_monthly_ic(ensemble_df, pred_col='pred_ensemble')
print(f'Ensemble Test IC: {ens_ic.mean():.4f} (IR: {ens_ic.mean()/ens_ic.std():.3f})')

In [None]:
# --- Final comparison table ---
ens_ls = compute_ls_returns(ensemble_df, pred_col='pred_ensemble')
lgb_ls = compute_ls_returns(lgb_test, pred_col='pred_lgb')

# One row of summary statistics per model, built from its monthly IC series
# and its monthly long-short return series.
final_rows = []
for model_name, ic_s, ls_s in [
    ('NN Ensemble', nn_ic, nn_ls),
    ('XGBoost', xgb_ic, xgb_ls),
    ('LightGBM', lgb_ic, lgb_ls),
    ('Meta-Ensemble', ens_ic, ens_ls),
]:
    final_rows.append({
        'Model': model_name,
        'Mean IC': ic_s.mean(),
        'IC Std': ic_s.std(),
        'IC IR': ic_s.mean()/ic_s.std(),
        'IC > 0 (%)': (ic_s > 0).mean()*100,
        'LS Sharpe (ann)': ls_s.mean()/ls_s.std()*np.sqrt(12),
    })
final_comparison = pd.DataFrame(final_rows).set_index('Model')

print('Final Model Comparison:')
print(final_comparison.round(4).to_string())

In [None]:
# --- Final comparison plots ---
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ic_ax, ret_ax = axes

# Each model contributes one line to both panels, in the same order.
for label, ic_s, ls_s in [
    ('NN', nn_ic, nn_ls),
    ('XGBoost', xgb_ic, xgb_ls),
    ('LightGBM', lgb_ic, lgb_ls),
    ('Ensemble', ens_ic, ens_ls),
]:
    # Cumulative IC
    ic_ax.plot(ic_s.cumsum().values, label=label, linewidth=2)
    # Cumulative long-short returns
    ret_ax.plot((1 + ls_s).cumprod().values, label=label, linewidth=2)

ic_ax.set_xlabel('Test Month')
ic_ax.set_ylabel('Cumulative IC')
ic_ax.set_title('Cumulative IC --- All Models')
ic_ax.legend()

ret_ax.set_xlabel('Test Month')
ret_ax.set_ylabel('Cumulative Return')
ret_ax.set_title('Long-Short Returns --- All Models')
ret_ax.legend()

plt.tight_layout()
plt.show()

---

## Part 5: Analysis --- Where NN Wins vs Trees (15 pts) --- SOLUTION

In [None]:
# --- Temporal analysis: rolling IC difference ---
# Month-by-month IC gap (NN minus XGBoost), taken positionally, plus its
# trailing rolling mean.
window = 6
ic_diff = nn_ic.values - xgb_ic.values
rolling_diff = pd.Series(ic_diff).rolling(window).mean()
month_pos = range(len(ic_diff))

fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(month_pos, ic_diff, alpha=0.4, color='steelblue', label='Monthly IC diff')
ax.plot(rolling_diff.values, color='red', linewidth=2, label=f'{window}-month rolling mean')
ax.axhline(y=0, color='black', linewidth=0.5)
ax.set_xlabel('Test Month')
ax.set_ylabel('IC(NN) - IC(XGBoost)')
ax.set_title('When Does NN Beat XGBoost? (positive = NN wins)')
ax.legend()
plt.tight_layout()
plt.show()

nn_win_share = (ic_diff > 0).mean()
print(f'NN wins in {nn_win_share:.1%} of months')

In [None]:
# --- Cross-sectional analysis: IC by stock characteristic ---
# Split by size (large vs small) and volatility (high vs low)

# Get raw features for grouping.
# Each fold is (train_end, val_end, test_end), so the out-of-sample period
# starts at the first fold's val_end, i.e. folds[0][1]. Using folds[0][2]
# (the END of the first test window) would drop the entire first test fold
# from this analysis.
first_test_month = folds[0][1]
raw_test = df_raw[df_raw.date_idx >= first_test_month].copy()

# Median split within each month, so group membership is cross-sectional.
raw_test['size_group'] = raw_test.groupby('date_idx')['size'].transform(
    lambda x: pd.qcut(x, 2, labels=['Small', 'Large'])
)
raw_test['vol_group'] = raw_test.groupby('date_idx')['vol_20d'].transform(
    lambda x: pd.qcut(x, 2, labels=['Low Vol', 'High Vol'])
)

# Merge with predictions (inner join keeps only stock-months that have
# out-of-sample predictions)
merged = raw_test[['date_idx', 'stock_id', 'size_group', 'vol_group']].merge(
    ensemble_df[['date_idx', 'stock_id', 'pred_nn', 'pred_xgb', 'ret_next']],
    on=['date_idx', 'stock_id']
)

# IC by group: monthly rank IC computed inside each sub-universe, then averaged
results = []
for group_col in ['size_group', 'vol_group']:
    for group_val in merged[group_col].unique():
        sub = merged[merged[group_col] == group_val]
        nn_ic_g = compute_monthly_ic(sub, pred_col='pred_nn')
        xgb_ic_g = compute_monthly_ic(sub, pred_col='pred_xgb')
        results.append({
            'Group': f'{group_col}: {group_val}',
            'NN IC': nn_ic_g.mean(),
            'XGB IC': xgb_ic_g.mean(),
            'NN - XGB': nn_ic_g.mean() - xgb_ic_g.mean()
        })

group_table = pd.DataFrame(results).set_index('Group')
print('IC by Stock Characteristic:')
print(group_table.round(4).to_string())

### Analysis

**1. By time period:**

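The rolling IC difference shows that the neural net and XGBoost take turns outperforming each other. Neither model consistently dominates. Note that in this synthetic panel every month is drawn independently from the same data-generating process, so the alternation seen here mostly reflects sampling noise rather than genuine regimes. On real data, the usual interpretation is that in periods of high cross-sectional dispersion the NN may capture nonlinear interactions better, while in stable periods XGBoost's piecewise-constant approximation is sufficient. Either way, the lack of a consistent winner motivates the ensemble approach.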

**2. By stock characteristic:**

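On real data, the NN tends to perform relatively better among large-cap stocks, where factor relationships are smoother and more stable. For small-cap and high-volatility stocks, tree models can sometimes perform better because the signal is more threshold-like (e.g., extreme momentum reversals). In this notebook all stocks share one data-generating process, so compare these expectations against the "NN - XGB" column of the table above before drawing conclusions; small differences between groups may simply be noise. The ensemble benefits from combining both perspectives.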

**3. Feature importance:**

XGBoost feature importance is straightforward to compute (gain-based). For the NN, we would need SHAP or permutation importance. The key difference is that the NN can model smooth interactions (e.g., the sin(mom_12m * size) term in our DGP), while trees approximate these with step functions. In practice, this means the NN better captures features that interact continuously rather than at discrete thresholds.