# Week 12 Seminar — Derivatives Pricing with Neural Networks (Hands-On)

**Exercises:**
1. Vectorized Black-Scholes in PyTorch, autograd Greeks (25 min)
2. Train NN to learn BS pricing from synthetic data (25 min)
3. Compare neural Greeks to analytical Greeks (20 min)
4. Discussion: Where does BS fail? Can NNs do better? (20 min)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from scipy.stats import norm

# Plot appearance: white-grid theme and a wide default figure.
plt.rcParams['figure.figsize'] = (12, 5)
plt.style.use('seaborn-v0_8-whitegrid')

# Seed both random number generators used in this notebook.
np.random.seed(42)
torch.manual_seed(42)

# Prefer CUDA when PyTorch reports it as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

---
## Exercise 1: Vectorized Black-Scholes in PyTorch + Autograd Greeks (25 min)

Implement BS in PyTorch so we can differentiate through it.

In [None]:
# Reference: BS in NumPy
def bs_call_numpy(S, K, T, sigma, r):
    """Black-Scholes call price, NumPy reference implementation.

    Args:
        S: spot price(s).
        K: strike(s).
        T: time to expiry.
        sigma: volatility.
        r: interest rate; the strike is discounted with ``exp(-r * T)``.

    Inputs may be scalars or arrays that broadcast against each other.

    Returns:
        ``S * N(d1) - K * exp(-r * T) * N(d2)`` with ``N`` the standard
        normal CDF.
    """
    vol_sqrt_t = sigma * np.sqrt(T)
    d1 = (np.log(S / K) + (r + 0.5 * sigma**2) * T) / vol_sqrt_t
    d2 = d1 - vol_sqrt_t
    discounted_strike = K * np.exp(-r * T)
    return S * norm.cdf(d1) - discounted_strike * norm.cdf(d2)

def bs_delta_numpy(S, K, T, sigma, r):
    """Analytical Black-Scholes call delta, ``N(d1)``.

    Takes the same scalar-or-array arguments as ``bs_call_numpy`` and
    returns the standard normal CDF evaluated at ``d1``.
    """
    log_moneyness = np.log(S / K)
    drift = (r + 0.5 * sigma**2) * T
    d1 = (log_moneyness + drift) / (sigma * np.sqrt(T))
    return norm.cdf(d1)

def bs_gamma_numpy(S, K, T, sigma, r):
    """Analytical Black-Scholes gamma, ``phi(d1) / (S * sigma * sqrt(T))``.

    ``phi`` is the standard normal density. Arguments are scalars or
    broadcastable arrays, as in ``bs_call_numpy``.
    """
    sqrt_t = np.sqrt(T)
    d1 = (np.log(S / K) + (r + 0.5 * sigma**2) * T) / (sigma * sqrt_t)
    density = norm.pdf(d1)
    return density / (S * sigma * sqrt_t)

def bs_vega_numpy(S, K, T, sigma, r):
    """Analytical Black-Scholes vega, ``S * phi(d1) * sqrt(T)``.

    ``phi`` is the standard normal density. Arguments are scalars or
    broadcastable arrays, as in ``bs_call_numpy``.
    """
    sqrt_t = np.sqrt(T)
    d1 = (np.log(S / K) + (r + 0.5 * sigma**2) * T) / (sigma * sqrt_t)
    return S * norm.pdf(d1) * sqrt_t

def bs_theta_numpy(S, K, T, sigma, r):
    """Analytical Black-Scholes call theta.

    Returns the sum of two negative-signed terms:
    ``-S * phi(d1) * sigma / (2 * sqrt(T))`` and
    ``-r * K * exp(-r * T) * N(d2)``, where ``phi`` and ``N`` are the
    standard normal density and CDF. Arguments are scalars or
    broadcastable arrays, as in ``bs_call_numpy``.
    """
    sqrt_t = np.sqrt(T)
    d1 = (np.log(S / K) + (r + 0.5 * sigma**2) * T) / (sigma * sqrt_t)
    d2 = d1 - sigma * sqrt_t
    diffusion_term = -S * norm.pdf(d1) * sigma / (2 * sqrt_t)
    discount_term = r * K * np.exp(-r * T) * norm.cdf(d2)
    return diffusion_term - discount_term

In [None]:
# PyTorch BS implementation
def normal_cdf(x):
    """Standard normal CDF of a tensor, differentiable through autograd.

    Uses ``0.5 * erfc(-x / sqrt(2))`` rather than the algebraically equal
    ``0.5 * (1 + erf(x / sqrt(2)))``. For strongly negative ``x`` the erf
    form subtracts two nearly equal numbers and loses all significant
    digits (it returns exactly 0 in float32 around x = -6), whereas erfc
    evaluates the small tail probability directly.

    Args:
        x: tensor of any shape.

    Returns:
        Tensor of the same shape and dtype as ``x``.
    """
    return 0.5 * torch.erfc(-x / np.sqrt(2))

def bs_call_torch(S, K, T, sigma, r):
    """Black-Scholes call price built from differentiable torch operations.

    Mirrors the NumPy reference formula, so gradients with respect to any
    tensor argument that requires grad can be obtained with autograd.
    ``S`` and ``K`` are combined inside ``torch.log`` and ``T`` is passed
    to ``torch.sqrt``, so these must be tensors.

    Returns:
        ``S * N(d1) - K * exp(-r * T) * N(d2)`` as a tensor.
    """
    vol_sqrt_t = sigma * torch.sqrt(T)
    log_moneyness = torch.log(S / K)
    drift = (r + 0.5 * sigma**2) * T
    d1 = (log_moneyness + drift) / vol_sqrt_t
    d2 = d1 - vol_sqrt_t
    discounted_strike = K * torch.exp(-r * T)
    return S * normal_cdf(d1) - discounted_strike * normal_cdf(d2)

# Verify: PyTorch vs NumPy
# The tensors are created as float64 on purpose. torch.tensor(<python float>)
# defaults to float32, whose spacing near a price of ~10 is about 1e-6, so the
# 1e-6 tolerance below could fail from rounding alone.
S_np, K_np, T_np, sigma_np, r_np = 100.0, 100.0, 1.0, 0.2, 0.05
price_np = bs_call_numpy(S_np, K_np, T_np, sigma_np, r_np)
price_pt = bs_call_torch(
    *(torch.tensor(value, dtype=torch.float64)
      for value in (S_np, K_np, T_np, sigma_np, r_np))
).item()

print(f"NumPy:   {price_np:.8f}")
print(f"PyTorch: {price_pt:.8f}")
print(f"Match:   {abs(price_np - price_pt) < 1e-6}")

In [None]:
# Vectorized autograd Greeks
def compute_all_greeks(S_vals, K_val, T_val, sigma_val, r_val):
    """Compute price and Greeks for a vector of spot prices using autograd.

    Args:
        S_vals: array-like of spot prices.
        K_val: strike.
        T_val: time to expiry (scalar, or array broadcastable to ``S_vals``).
        sigma_val: volatility (scalar, or array broadcastable to ``S_vals``).
        r_val: interest rate.

    Returns:
        Dict of NumPy arrays keyed ``'price'``, ``'delta'``, ``'gamma'``,
        ``'theta'`` and ``'vega'``, one value per spot. ``'theta'`` is
        ``-dC/dT`` so that it matches the sign of ``bs_theta_numpy``.
    """
    S = torch.tensor(S_vals, dtype=torch.float64, requires_grad=True)
    K = torch.tensor(K_val, dtype=torch.float64)
    r = torch.tensor(r_val, dtype=torch.float64)

    def _per_spot_leaf(value):
        # One independent leaf entry per spot. With a single 0-d leaf the
        # gradient of price.sum() would be one number (the sum of the
        # sensitivities over all spots) instead of a per-spot vector.
        base = torch.as_tensor(value, dtype=torch.float64)
        return (torch.zeros_like(S) + base).requires_grad_(True)

    T = _per_spot_leaf(T_val)
    sigma = _per_spot_leaf(sigma_val)

    price = bs_call_torch(S, K, T, sigma, r)

    # Each price depends only on its own S/T/sigma entry, so differentiating
    # the sum yields the element-wise first derivatives. create_graph=True
    # keeps the graph so delta can be differentiated again for gamma.
    delta, dprice_dT, vega = torch.autograd.grad(
        price.sum(), [S, T, sigma], create_graph=True
    )

    # Gamma = d2C/dS2
    gamma = torch.autograd.grad(delta.sum(), S)[0]

    # Theta convention: sensitivity to the passage of time, i.e. -dC/dT.
    theta = -dprice_dT

    return {
        'price': price.detach().numpy(),
        'delta': delta.detach().numpy(),
        'gamma': gamma.detach().numpy(),
        'theta': theta.detach().numpy(),
        'vega': vega.detach().numpy(),
    }

# Evaluate the autograd Greeks on 200 evenly spaced spots between 70 and 130
S_range = np.linspace(start=70, stop=130, num=200)
greeks = compute_all_greeks(
    S_vals=S_range, K_val=100.0, T_val=1.0, sigma_val=0.2, r_val=0.05
)

In [None]:
# Overlay autograd and analytical values: five panels plus one error panel
fig, axes = plt.subplots(2, 3, figsize=(16, 9))

# (title, autograd values, analytical values) for each panel
greeks_list = [
    ('Price', greeks['price'], bs_call_numpy(S_range, 100, 1, 0.2, 0.05)),
    ('Delta', greeks['delta'], bs_delta_numpy(S_range, 100, 1, 0.2, 0.05)),
    ('Gamma', greeks['gamma'], bs_gamma_numpy(S_range, 100, 1, 0.2, 0.05)),
    ('Vega', greeks['vega'], bs_vega_numpy(S_range, 100, 1, 0.2, 0.05)),
    ('Theta', greeks['theta'], bs_theta_numpy(S_range, 100, 1, 0.2, 0.05)),
]

# zip stops after the five entries, leaving the sixth axis for the errors
for ax, (name, autograd_vals, analytical_vals) in zip(axes.flat, greeks_list):
    ax.plot(S_range, analytical_vals, label='Analytical', linewidth=2)
    ax.plot(S_range, autograd_vals, '--', label='Autograd', linewidth=2)
    ax.set_xlabel('Spot Price S')
    ax.set_title(name)
    ax.legend()

# Bottom-right panel: absolute errors of the Greeks (price is skipped)
ax = axes[1, 2]
for name, autograd_vals, analytical_vals in greeks_list[1:]:
    error = np.abs(autograd_vals - analytical_vals)
    ax.plot(S_range, error, label=name)
ax.set_yscale('log')
ax.set_title('Absolute Errors')
ax.set_xlabel('Spot Price S')
ax.set_ylabel('|Autograd - Analytical|')
ax.legend()

plt.tight_layout()
plt.show()

---
## Exercise 2: Train NN to Learn BS Pricing from Synthetic Data (25 min)

In [None]:
# Synthetic dataset: random contract parameters priced with the analytical formula
N = 200_000

# The order of these draws is kept fixed so the seeded NumPy stream
# produces the same samples.
S_data = np.random.uniform(low=50, high=150, size=N)
K_data = np.random.uniform(low=50, high=150, size=N)
T_data = np.random.uniform(low=0.05, high=3.0, size=N)
sigma_data = np.random.uniform(low=0.05, high=0.6, size=N)
r_data = np.random.uniform(low=0.0, high=0.1, size=N)

C_data = bs_call_numpy(S_data, K_data, T_data, sigma_data, r_data)

# Network inputs are (log-moneyness, T, sigma, r); the target is the price
# divided by the strike.
log_m = np.log(S_data / K_data)
X = np.column_stack((log_m, T_data, sigma_data, r_data)).astype(np.float32)
y = (C_data / K_data).astype(np.float32)

# Contiguous 85% / 5% / remainder split into train, validation and test
n_train = int(0.85 * N)
n_val = int(0.05 * N)
X_tr, y_tr = (torch.tensor(arr[:n_train]).to(device) for arr in (X, y))
X_val, y_val = (
    torch.tensor(arr[n_train:n_train+n_val]).to(device) for arr in (X, y)
)
X_test, y_test = (
    torch.tensor(arr[n_train+n_val:]).to(device) for arr in (X, y)
)

print(f"Train: {n_train:,} | Val: {n_val:,} | Test: {N - n_train - n_val:,}")

In [None]:
class OptionPricerNN(nn.Module):
    """Feed-forward pricer mapping 4 input features to one positive output.

    Layout: Linear(4, hidden) -> act -> Linear(hidden, hidden) -> act ->
    Linear(hidden, 64) -> act -> Linear(64, 1) -> Softplus. The final
    Softplus keeps the predicted value strictly positive.

    Args:
        hidden: width of the first two hidden layers.
        activation: zero-argument factory (typically an ``nn.Module``
            class) for the hidden activations. Defaults to ``nn.ReLU``,
            which reproduces the original architecture. With ReLU the
            hidden stack is piecewise linear in its inputs, so
            second-order sensitivities taken by autograd are coarse; a
            smooth choice such as ``nn.Softplus`` or ``nn.SiLU`` is
            better suited to second derivatives like Gamma.
    """

    def __init__(self, hidden=128, activation=nn.ReLU):
        super().__init__()
        # The activation modules hold no parameters, so the layer indices
        # (and therefore state_dict keys) do not depend on `activation`.
        self.net = nn.Sequential(
            nn.Linear(4, hidden),
            activation(),
            nn.Linear(hidden, hidden),
            activation(),
            nn.Linear(hidden, 64),
            activation(),
            nn.Linear(64, 1),
            nn.Softplus(),
        )

    def forward(self, x):
        """Return predictions with the trailing size-1 dimension removed."""
        return self.net(x).squeeze(-1)

# Instantiate the pricer and move it to the selected device
model = OptionPricerNN(hidden=128)
model = model.to(device)

# Adam optimizer; the scheduler halves the learning rate after 5 epochs
# without improvement in the metric passed to scheduler.step().
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.5, patience=5
)

print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

In [None]:
# Mini-batch training with a validation pass after every epoch
batch_size = 4096
n_epochs = 60
train_losses, val_losses = [], []

for epoch in range(n_epochs):
    model.train()
    # Fresh random ordering of the training rows for this epoch
    perm = torch.randperm(n_train, device=device)
    epoch_loss = 0
    n_batches = 0

    for start in range(0, n_train, batch_size):
        batch = perm[start:start + batch_size]

        optimizer.zero_grad()
        loss = nn.MSELoss()(model(X_tr[batch]), y_tr[batch])
        loss.backward()
        optimizer.step()

        n_batches += 1
        epoch_loss += loss.item()

    # Unweighted mean of the per-batch losses
    train_losses.append(epoch_loss / n_batches)

    model.eval()
    with torch.no_grad():
        val_loss = nn.MSELoss()(model(X_val), y_val).item()
    val_losses.append(val_loss)
    # The plateau scheduler is driven by the validation loss
    scheduler.step(val_loss)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1:3d} | Train: {train_losses[-1]:.2e} | Val: {val_loss:.2e}")

# Loss curves on a logarithmic y-axis
plt.plot(train_losses, label='Train')
plt.plot(val_losses, label='Validation')
plt.yscale('log')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.title('NN Pricer Training Loss')
plt.legend()
plt.show()

In [None]:
# Score the trained model on the held-out test rows
model.eval()
with torch.no_grad():
    test_pred = model(X_test).cpu().numpy()
test_true = y_test.cpu().numpy()

errors = test_pred - test_true

fig, axes = plt.subplots(1, 3, figsize=(16, 4))

# Left: predicted vs true for the first 3000 rows, with the y = x line
axes[0].scatter(test_true[:3000], test_pred[:3000], s=1, alpha=0.3)
axes[0].plot([0, test_true.max()], [0, test_true.max()], 'r--')
axes[0].set_xlabel('True C/K')
axes[0].set_ylabel('Predicted C/K')
axes[0].set_title('NN vs True (normalized price)')

# Middle: histogram of the signed errors
axes[1].hist(errors, bins=100, edgecolor='black', linewidth=0.3)
axes[1].set_xlabel('Prediction Error')
axes[1].set_title(f'Error Distribution (std={errors.std():.2e})')

# Right: absolute error against the true value for the first 3000 rows
axes[2].scatter(test_true[:3000], np.abs(errors[:3000]), s=1, alpha=0.3)
axes[2].set_xlabel('True C/K')
axes[2].set_ylabel('|Error|')
axes[2].set_title('Absolute Error vs True Price')

plt.tight_layout()
plt.show()

print(f"Test MAE: {np.abs(errors).mean():.6f}")
print(f"Test RMSE: {np.sqrt((errors**2).mean()):.6f}")
print(f"Test R2: {1 - (errors**2).sum() / ((test_true - test_true.mean())**2).sum():.6f}")

---
## Exercise 3: Compare Neural Greeks to Analytical Greeks (20 min)

Use autograd on the trained NN pricer and compare to the BS analytical formulas.

In [None]:
def nn_greeks(model, S_vals, K, T, sigma, r):
    """Compute price, Delta, Gamma and Vega from the NN pricer via autograd.

    The model is fed rows ``[log(S / K), T, sigma, r]`` and its output is
    multiplied by ``K`` to obtain the price.

    Args:
        model: callable mapping an ``(N, 4)`` float32 tensor to ``N``
            normalized prices. If it exposes ``parameters()``, inputs are
            created on the device of its first parameter; otherwise on CPU.
        S_vals: 1-D array-like of spot prices.
        K: strike (scalar).
        T: time to expiry (scalar).
        sigma: volatility (scalar).
        r: interest rate (scalar).

    Returns:
        Dict of NumPy arrays keyed ``'price'``, ``'delta'``, ``'gamma'``
        and ``'vega'``, one value per spot.
    """
    # Use the model's own device so inputs and weights always agree.
    params = model.parameters() if hasattr(model, 'parameters') else iter(())
    first_param = next(iter(params), None)
    target = first_param.device if first_param is not None else torch.device('cpu')

    # Create leaf tensors directly on the target device.
    S_t = torch.tensor(np.asarray(S_vals, dtype=np.float32),
                       device=target, requires_grad=True)
    # torch.full_like keeps sigma as a float even when S_vals holds integers
    # (np.full_like on an integer array would truncate it).
    sigma_t = torch.full_like(S_t, sigma).requires_grad_(True)

    log_m = torch.log(S_t / K)
    T_t = torch.full_like(S_t, T)
    r_t = torch.full_like(S_t, r)

    x = torch.stack([log_m, T_t, sigma_t, r_t], dim=1)
    price_normalized = model(x)
    price = price_normalized * K

    # Delta and Vega come from one backward pass over the price graph.
    # create_graph=True keeps that graph alive and makes delta itself
    # differentiable.
    delta, vega = torch.autograd.grad(
        price.sum(), [S_t, sigma_t], create_graph=True
    )
    # Gamma is taken last: this call does not retain the graph, so no
    # further backward pass through `price` is possible afterwards.
    gamma = torch.autograd.grad(delta.sum(), S_t)[0]

    return {
        'price': price.detach().cpu().numpy(),
        'delta': delta.detach().cpu().numpy(),
        'gamma': gamma.detach().cpu().numpy(),
        'vega': vega.detach().cpu().numpy(),
    }

In [None]:
# Neural vs analytical price and Greeks on a common spot grid
S_plot = np.linspace(70, 130, 300).astype(np.float32)
K_val, T_val, sig_val, r_val = 100.0, 1.0, 0.2, 0.05

nn_g = nn_greeks(model, S_plot, K_val, T_val, sig_val, r_val)

# Analytical counterparts, stored under the same keys as nn_g
analytical = {
    greek: bs_formula(S_plot, K_val, T_val, sig_val, r_val)
    for greek, bs_formula in (
        ('price', bs_call_numpy),
        ('delta', bs_delta_numpy),
        ('gamma', bs_gamma_numpy),
        ('vega', bs_vega_numpy),
    )
}

fig, axes = plt.subplots(2, 2, figsize=(14, 9))

for ax, greek in zip(axes.flat, ['price', 'delta', 'gamma', 'vega']):
    ax.plot(S_plot, analytical[greek], label='Analytical BS', linewidth=2)
    ax.plot(S_plot, nn_g[greek], '--', label='Neural Network', linewidth=2)
    ax.set_xlabel('Spot Price S')
    ax.set_title(greek.capitalize())
    ax.legend()

plt.suptitle('Neural Greeks vs Analytical Greeks', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Signed error (NN minus analytical) for each Greek, annotated with its MAE
fig, axes = plt.subplots(1, 3, figsize=(16, 4))

for ax, greek in zip(axes, ['delta', 'gamma', 'vega']):
    error = nn_g[greek] - analytical[greek]
    mae = np.abs(error).mean()

    ax.plot(S_plot, error)
    # Zero reference line
    ax.axhline(0, color='gray', linestyle='--', linewidth=0.5)
    ax.set_xlabel('Spot Price S')
    ax.set_title(f'{greek.capitalize()} Error (NN - BS)')
    # Text box in the top-left corner, positioned in axes coordinates
    ax.text(0.05, 0.95, f'MAE: {mae:.4e}', transform=ax.transAxes,
            verticalalignment='top', fontsize=10,
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.show()

### Observations

- **Delta**: NN approximation is very close. Errors are largest near ATM where delta changes fastest.
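- **Gamma**: More noise — second derivatives amplify approximation errors. The effect is especially strong here because the ReLU hidden layers make the network piecewise linear in its inputs, so all curvature comes from the output Softplus and the log-moneyness transform; smooth activations (e.g. Softplus, SiLU) usually give cleaner Gammas.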
- **Vega**: Good match overall. Errors are small relative to the magnitude of vega.

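**Key insight**: Neural Greeks are approximations, not exact values, and how smooth they are depends on the activation function (a ReLU network gives piecewise Greeks; smooth activations give smooth ones). For risk management, smoothness is actually a feature — analytical Greeks can be discontinuous for exotic options.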

---
## Exercise 4: Discussion — Where Does BS Fail? Can NNs Do Better? (20 min)

### Where Black-Scholes fails

**1. Volatility is not constant**
- Implied volatility varies by strike (smile) and maturity (term structure)
- BS gives ONE price for a given vol — but market quotes different IVs for different strikes

**2. Returns are not normal**
- Real returns have fat tails (kurtosis > 3) and negative skew
- BS underprices OTM puts (crash protection)

**3. No jumps**
- BS assumes continuous paths
- Real stocks can gap (earnings, macro events)

**4. Exotic options**
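- The BS formula only prices European vanillas. For many exotics (arithmetic Asians, discretely monitored barriers and lookbacks, early-exercise features) there is no closed form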
- Need Monte Carlo or PDE methods (slow)

### Where NNs can do better

| Application | Why NNs Help |
|-------------|---------------|
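| **Stochastic vol models** (Heston, SABR) | No simple closed form (Heston needs numerical Fourier inversion, SABR relies on asymptotic approximations) — NN replaces slow numerical pricing or Monte Carlo |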
| **Exotic options** | NN trained on MC samples, then used for fast pricing |
| **Model-free pricing** | Train on market data directly, no model assumptions |
| **Real-time Greeks** | Autograd on NN is faster than finite differences |
| **Calibration** | NN as fast approximation for inner loop of calibration |

### Where NNs struggle

| Challenge | Issue |
|-----------|-------|
| **Extrapolation** | NN accuracy degrades outside training range |
| **No-arbitrage** | Must be explicitly enforced |
| **Interpretability** | Hard to explain prices to traders |
| **Data scarcity** | Illiquid options have few market prices |

In [None]:
# Demonstrate the extrapolation problem
# The network never sees S itself: its first feature is log(S / K). Training
# drew S and K independently from [50, 150], so the moneyness it was trained
# on is S / K in [1/3, 3]. For the K = 100 slice plotted here that means
# S in [33.3, 300] -- not S in [50, 150] -- so the grid must reach beyond
# those bounds to show genuine extrapolation. (Samples are sparse near the
# edges of that interval, so errors may already grow before leaving it.)

K_ext = 100.0
S_train_lo = K_ext * 50.0 / 150.0   # smallest trained S/K, scaled by K
S_train_hi = K_ext * 150.0 / 50.0   # largest trained S/K, scaled by K

S_extrap = np.linspace(10, 500, 400).astype(np.float32)
bs_prices_extrap = bs_call_numpy(S_extrap, K_ext, 1.0, 0.2, 0.05)

# Same feature layout as the training data: (log-moneyness, T, sigma, r)
log_m_ext = np.log(S_extrap / K_ext)
X_ext = np.column_stack([
    log_m_ext,
    np.full_like(S_extrap, 1.0),
    np.full_like(S_extrap, 0.2),
    np.full_like(S_extrap, 0.05),
]).astype(np.float32)

model.eval()
with torch.no_grad():
    # The model predicts C / K; multiply by K to get back to prices
    nn_prices_extrap = model(torch.tensor(X_ext).to(device)).cpu().numpy() * K_ext

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].plot(S_extrap, bs_prices_extrap, label='Black-Scholes')
axes[0].plot(S_extrap, nn_prices_extrap, '--', label='NN')
axes[0].axvspan(S_train_lo, S_train_hi, alpha=0.1, color='green', label='Training range')
axes[0].set_title('NN Extrapolation Test')
axes[0].set_xlabel('Spot Price S')
axes[0].set_ylabel('Call Price')
axes[0].legend()

axes[1].plot(S_extrap, np.abs(bs_prices_extrap - nn_prices_extrap))
axes[1].axvspan(S_train_lo, S_train_hi, alpha=0.1, color='green', label='Training range')
axes[1].set_title('Absolute Error')
axes[1].set_xlabel('Spot Price S')
axes[1].set_ylabel('|BS - NN|')
axes[1].legend()

plt.tight_layout()
plt.show()

in_range = (S_extrap >= S_train_lo) & (S_extrap <= S_train_hi)
print(f"MAE in training range:  ${np.abs(bs_prices_extrap[in_range] - nn_prices_extrap[in_range]).mean():.4f}")
print(f"MAE outside range:      ${np.abs(bs_prices_extrap[~in_range] - nn_prices_extrap[~in_range]).mean():.4f}")

### Discussion questions for students

1. How would you ensure the NN pricer satisfies no-arbitrage constraints?
2. If you train on BS data, you get a BS-equivalent pricer. What's the point? (Answer: speed + autograd Greeks. The real value is training on complex models like Heston.)
3. How would you handle the extrapolation problem in production?
4. Could you use the NN pricer for calibration? (Yes — it's the inner loop, so speed matters enormously.)

---
## Summary

1. BS in PyTorch is straightforward and gives us autograd Greeks that match analytical formulas
2. A simple 4-layer NN can learn BS pricing with very high accuracy (R2 > 0.9999)
3. Neural Greeks are smooth and accurate for Delta and Vega; Gamma (2nd derivative) is noisier
4. The extrapolation problem is real — always validate outside the training range
5. The real value of NN pricers is for models without closed-form solutions (Heston, SABR, exotics)