# Homework 12 — SOLUTION

# Neural Network Options Pricer

**This is the instructor solution. Do not distribute to students.**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import torch
import torch.nn as nn
from scipy.stats import norm
from scipy.optimize import brentq
import time

# Plot defaults shared by every figure below.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 5)})

# Seed both RNGs so data generation and weight initialisation are repeatable.
np.random.seed(42)
torch.manual_seed(42)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

---
## Part 1: Generate 500K Black-Scholes Training Samples (15 pts)

In [None]:
# Black-Scholes implementation
def _bs_d1_d2(S, K, T, sigma, r):
    """Return the Black-Scholes (d1, d2) pair shared by the price and every Greek.

    All arguments may be scalars or NumPy arrays that broadcast together.
    T is time to maturity in years and must be > 0 (sigma * sqrt(T) is a divisor).
    """
    vol_sqrt_T = sigma * np.sqrt(T)
    d1 = (np.log(S / K) + (r + 0.5 * sigma**2) * T) / vol_sqrt_T
    return d1, d1 - vol_sqrt_T


def bs_call(S, K, T, sigma, r):
    """European call price under Black-Scholes."""
    d1, d2 = _bs_d1_d2(S, K, T, sigma, r)
    return S * norm.cdf(d1) - K * np.exp(-r * T) * norm.cdf(d2)


def bs_delta(S, K, T, sigma, r):
    """Call delta, dC/dS = N(d1)."""
    d1, _ = _bs_d1_d2(S, K, T, sigma, r)
    return norm.cdf(d1)


def bs_gamma(S, K, T, sigma, r):
    """Call gamma, d2C/dS2 = n(d1) / (S * sigma * sqrt(T))."""
    d1, _ = _bs_d1_d2(S, K, T, sigma, r)
    return norm.pdf(d1) / (S * sigma * np.sqrt(T))


def bs_vega(S, K, T, sigma, r):
    """Call vega, dC/dsigma = S * n(d1) * sqrt(T) (per unit of volatility)."""
    d1, _ = _bs_d1_d2(S, K, T, sigma, r)
    return S * norm.pdf(d1) * np.sqrt(T)


def bs_theta(S, K, T, sigma, r):
    """Call theta per year.

    This is the derivative with respect to calendar time, i.e. the negative
    of the derivative with respect to time-to-maturity T, so it is negative
    for a call with r >= 0.
    """
    d1, d2 = _bs_d1_d2(S, K, T, sigma, r)
    return (-S * norm.pdf(d1) * sigma / (2 * np.sqrt(T))
            - r * K * np.exp(-r * T) * norm.cdf(d2))

In [None]:
# Generate 500K samples
N = 500_000

# Draw the five Black-Scholes inputs independently and uniformly.
# The draw order is part of the reproducible RNG stream; do not reorder.
S_raw = np.random.uniform(low=50, high=150, size=N)
K_raw = np.random.uniform(low=50, high=150, size=N)
T_raw = np.random.uniform(low=0.05, high=3.0, size=N)
sigma_raw = np.random.uniform(low=0.05, high=0.6, size=N)
r_raw = np.random.uniform(low=0.0, high=0.1, size=N)

# Analytical targets.
C_raw = bs_call(S_raw, K_raw, T_raw, sigma_raw, r_raw)

# The network is trained on log-moneyness and on the strike-normalised price.
log_moneyness = np.log(S_raw / K_raw)
X_all = np.stack([log_moneyness, T_raw, sigma_raw, r_raw], axis=1).astype(np.float32)
y_all = (C_raw / K_raw).astype(np.float32)

# Un-normalised inputs and prices, kept for later use.
raw_data = np.stack([S_raw, K_raw, T_raw, sigma_raw, r_raw, C_raw], axis=1)

# Sequential 80/10/10 split (the samples are i.i.d., so no shuffle is needed).
n_train = int(0.8 * N)
n_val = int(0.1 * N)
val_end = n_train + n_val

X_train, y_train = (torch.tensor(a[:n_train]).to(device) for a in (X_all, y_all))
X_val, y_val = (torch.tensor(a[n_train:val_end]).to(device) for a in (X_all, y_all))
X_test, y_test = (torch.tensor(a[val_end:]).to(device) for a in (X_all, y_all))

print(f"Dataset: {N:,} samples")
print(f"Train: {n_train:,} | Val: {n_val:,} | Test: {N - n_train - n_val:,}")
print(f"\nFeature statistics:")
feat_names = ['log(S/K)', 'T', 'sigma', 'r']
for i, name in enumerate(feat_names):
    print(f"  {name:10s}: mean={X_all[:, i].mean():.4f}, std={X_all[:, i].std():.4f}, "
          f"range=[{X_all[:, i].min():.4f}, {X_all[:, i].max():.4f}]")
print(f"\nTarget (C/K): mean={y_all.mean():.4f}, std={y_all.std():.4f}, range=[{y_all.min():.4f}, {y_all.max():.4f}]")

---
## Part 2: Train NN Pricer, Evaluate Extrapolation (25 pts)

In [None]:
class OptionPricer(nn.Module):
    """Feed-forward pricer: three ReLU hidden layers and a Softplus output.

    The hidden widths are (hidden, hidden, 128). The Softplus keeps the
    predicted price strictly positive. `forward` drops the trailing
    singleton dimension so a (batch, input_dim) input yields a (batch,) output.
    """

    def __init__(self, input_dim=4, hidden=256):
        super().__init__()
        widths = [input_dim, hidden, hidden, 128]
        layers = []
        # Build Linear/ReLU pairs in order so parameter names and the
        # initialisation RNG stream match a hand-written Sequential.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(widths[-1], 1))
        layers.append(nn.Softplus())  # positive output
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        return out.squeeze(-1)

# Instantiate the pricer and move it to the selected device.
model = OptionPricer(hidden=256)
model = model.to(device)

# Adam with a plateau scheduler: halve the LR after 5 epochs without improvement.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    factor=0.5,
    patience=5,
)
loss_fn = nn.MSELoss()

print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")

In [None]:
# Training loop
batch_size = 8192
n_epochs = 80
train_losses, val_losses = [], []
best_val_loss = float('inf')
# Checkpoint the initial weights up front so `best_state` is always bound.
# Without this, a run whose validation loss never improves on +inf (e.g. it
# is NaN every epoch) would fail with NameError when the best model is loaded.
best_state = {k: v.clone() for k, v in model.state_dict().items()}

for epoch in range(n_epochs):
    # ---- one pass over the training set in shuffled mini-batches ----
    model.train()
    perm = torch.randperm(n_train, device=device)
    epoch_loss = 0
    n_batches = 0

    for i in range(0, n_train, batch_size):
        idx = perm[i:i+batch_size]
        pred = model(X_train[idx])
        loss = loss_fn(pred, y_train[idx])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        n_batches += 1

    # Mean of per-batch losses (the final, smaller batch is weighted equally).
    train_losses.append(epoch_loss / n_batches)

    # ---- validation on the full validation split in one forward pass ----
    model.eval()
    with torch.no_grad():
        val_pred = model(X_val)
        val_loss = loss_fn(val_pred, y_val).item()
    val_losses.append(val_loss)
    scheduler.step(val_loss)

    # Keep a copy of the weights with the lowest validation loss so far.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_state = {k: v.clone() for k, v in model.state_dict().items()}

    if (epoch + 1) % 10 == 0:
        lr = optimizer.param_groups[0]['lr']
        print(f"Epoch {epoch+1:3d} | Train: {train_losses[-1]:.2e} | Val: {val_loss:.2e} | LR: {lr:.1e}")

# Load best model
model.load_state_dict(best_state)

plt.plot(train_losses, label='Train')
plt.plot(val_losses, label='Validation')
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.yscale('log')
plt.legend()
plt.show()

In [None]:
# Test set evaluation
# Score the restored model on the held-out test split.
model.eval()
with torch.no_grad():
    test_pred = model(X_test).cpu().numpy()
test_true = y_test.cpu().numpy()

errors = test_pred - test_true
sq_errors = np.square(errors)
mse = sq_errors.mean()
mae = np.abs(errors).mean()
ss_res = sq_errors.sum()
ss_tot = np.square(test_true - test_true.mean()).sum()
r2 = 1 - ss_res / ss_tot

print(f"Test MSE:  {mse:.2e}")
print(f"Test MAE:  {mae:.2e}")
print(f"Test R2:   {r2:.6f}")

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scatter_ax, hist_ax = axes

# Left: predicted vs. true for the first 5000 test points, with the y = x line.
diag_end = test_true.max()
scatter_ax.scatter(test_true[:5000], test_pred[:5000], s=1, alpha=0.3)
scatter_ax.plot([0, diag_end], [0, diag_end], 'r--', linewidth=2)
scatter_ax.set_title(f'Predicted vs True (R2 = {r2:.4f})')
scatter_ax.set_xlabel('True C/K')
scatter_ax.set_ylabel('Predicted C/K')

# Right: distribution of signed errors over the whole test set.
hist_ax.hist(errors, bins=100, edgecolor='black', linewidth=0.3)
hist_ax.set_title(f'Error Distribution (MAE = {mae:.2e})')
hist_ax.set_xlabel('Error')

plt.tight_layout()
plt.show()

In [None]:
# Extrapolation test
# The network never sees S or K directly, only log(S/K). Training draws S and
# K independently from [50, 150], so the covered moneyness is S/K in
# [50/150, 150/50]. For a fixed strike this translates into the spot interval
# below, which is what "training range" means for this model. The spot grid
# is chosen to extend past it on BOTH sides so the test really extrapolates.
K_fix, T_fix, sig_fix, r_fix = 100.0, 1.0, 0.2, 0.05
S_train_lo = K_fix * 50.0 / 150.0   # ~33.3 for K = 100
S_train_hi = K_fix * 150.0 / 50.0   # 300 for K = 100
S_ext = np.linspace(20, 400, 500).astype(np.float32)

# Analytical reference prices.
bs_ext = bs_call(S_ext, K_fix, T_fix, sig_fix, r_fix)

# Same feature layout as the training data: [log(S/K), T, sigma, r].
log_m_ext = np.log(S_ext / K_fix)
X_ext = np.column_stack([
    log_m_ext,
    np.full_like(S_ext, T_fix),
    np.full_like(S_ext, sig_fix),
    np.full_like(S_ext, r_fix),
]).astype(np.float32)

# The model predicts C/K; multiply by K to get back to dollar prices.
with torch.no_grad():
    nn_ext = model(torch.tensor(X_ext).to(device)).cpu().numpy() * K_fix

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].plot(S_ext, bs_ext, label='Black-Scholes', linewidth=2)
axes[0].plot(S_ext, nn_ext, '--', label='Neural Network', linewidth=2)
axes[0].axvspan(S_train_lo, S_train_hi, alpha=0.1, color='green', label='Training range')
axes[0].set_title('Extrapolation Test')
axes[0].set_xlabel('Spot Price S')
axes[0].set_ylabel('Call Price')
axes[0].legend()

axes[1].plot(S_ext, np.abs(bs_ext - nn_ext))
axes[1].axvspan(S_train_lo, S_train_hi, alpha=0.1, color='green', label='Training range')
axes[1].set_title('Absolute Pricing Error')
axes[1].set_xlabel('Spot Price S')
axes[1].set_ylabel('|BS - NN| ($)')
axes[1].legend()

plt.tight_layout()
plt.show()

# Split the error by whether the spot lies inside the trained moneyness range.
in_range = (S_ext >= S_train_lo) & (S_ext <= S_train_hi)
print(f"MAE in training range:  ${np.abs(bs_ext[in_range] - nn_ext[in_range]).mean():.4f}")
print(f"MAE outside range:      ${np.abs(bs_ext[~in_range] - nn_ext[~in_range]).mean():.4f}")

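**Commentary:** The NN pricer achieves very high accuracy within the training range (R² > 0.9999). Extrapolation accuracy degrades outside the training range, particularly for deep in-the-money options where prices grow linearly. This is expected -- neural networks are interpolators, not extrapolators. Note that the network sees S and K only through log(S/K), so the relevant "training range" is the log-moneyness support of the training data (S/K between 1/3 and 3, i.e. S between roughly 33 and 300 when K = 100), not the raw spot interval [50, 150]. In production, the training range should cover all realistic input values with a margin.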

---
## Part 3: Autograd Greeks vs Analytical (20 pts)

In [None]:
def compute_nn_greeks(model, S_vals, K, T, sigma, r):
    """Compute price, Delta, Gamma, Vega and Theta from the NN using autograd.

    Args:
        model: network mapping rows of [log(S/K), T, sigma, r] to the
            strike-normalised price C/K, returning one value per row.
        S_vals: 1-D array-like of spot prices.
        K, T, sigma, r: scalar strike, time to maturity, volatility and rate,
            shared by every spot in `S_vals`.

    Returns:
        dict of NumPy arrays (one entry per spot) with keys 'price', 'delta',
        'gamma', 'vega' and 'theta'. Theta is the calendar-time derivative,
        i.e. minus the derivative with respect to time-to-maturity, so it is
        directly comparable with the analytical Black-Scholes theta.
    """
    # Run on whatever device the model lives on instead of relying on a
    # module-level global; fall back to the CPU for parameter-free models.
    first_param = next(model.parameters(), None)
    dev = first_param.device if first_param is not None else torch.device('cpu')

    # Create the differentiated inputs directly on the target device so they
    # are genuine leaf tensors (a `.to(device)` after `requires_grad=True`
    # would produce non-leaf copies on a GPU).
    S_t = torch.tensor(S_vals, dtype=torch.float32, device=dev, requires_grad=True)
    sigma_t = torch.full_like(S_t, sigma, requires_grad=True)
    T_t = torch.full_like(S_t, T, requires_grad=True)
    r_t = torch.full_like(S_t, r)

    log_m = torch.log(S_t / K)
    x = torch.stack([log_m, T_t, sigma_t, r_t], dim=1)
    price = model(x) * K  # undo the C/K normalisation

    # Rows are independent, so the gradient of the summed price with respect
    # to each input vector gives the per-row first-order sensitivities.
    # create_graph=True keeps the graph alive for the second derivative.
    dprice_dS, dprice_dsigma, dprice_dT = torch.autograd.grad(
        price.sum(), [S_t, sigma_t, T_t], create_graph=True
    )

    # Gamma (second derivative with respect to spot).
    gamma_t = torch.autograd.grad(dprice_dS.sum(), S_t)[0]

    # Theta is dC/dt with t = calendar time; since T = maturity - t,
    # dC/dt = -dC/dT. Returning +dC/dT here would have the wrong sign.
    theta_t = -dprice_dT

    return {
        'price': price.detach().cpu().numpy(),
        'delta': dprice_dS.detach().cpu().numpy(),
        'gamma': gamma_t.detach().cpu().numpy(),
        'vega': dprice_dsigma.detach().cpu().numpy(),
        'theta': theta_t.detach().cpu().numpy(),
    }

In [None]:
# Compute for range of spot prices
# Spot grid and fixed contract terms for the comparison.
S_plot = np.linspace(70, 130, 400).astype(np.float32)
K_v, T_v, sig_v, r_v = 100.0, 1.0, 0.2, 0.05
bs_args = (S_plot, K_v, T_v, sig_v, r_v)

nn_g = compute_nn_greeks(model, *bs_args)

# Closed-form reference values, keyed like the autograd results.
analytical = {
    label: formula(*bs_args)
    for label, formula in [
        ('price', bs_call),
        ('delta', bs_delta),
        ('gamma', bs_gamma),
        ('vega', bs_vega),
        ('theta', bs_theta),
    ]
}

greeks_names = ['delta', 'gamma', 'vega', 'theta']

fig, axes = plt.subplots(2, 4, figsize=(18, 8))

# One column per Greek: values on the top row, signed error underneath.
mae_results = {}
for i, greek in enumerate(greeks_names):
    value_ax, error_ax = axes[0, i], axes[1, i]

    value_ax.plot(S_plot, analytical[greek], label='Analytical', linewidth=2)
    value_ax.plot(S_plot, nn_g[greek], '--', label='NN Autograd', linewidth=2)
    value_ax.set_title(greek.capitalize())
    value_ax.set_xlabel('S')
    value_ax.legend(fontsize=8)

    error = nn_g[greek] - analytical[greek]
    mae_results[greek] = np.abs(error).mean()
    error_ax.plot(S_plot, error, color='red', alpha=0.7)
    error_ax.axhline(0, color='gray', linestyle='--', linewidth=0.5)
    error_ax.set_title(f'{greek.capitalize()} Error (MAE={mae_results[greek]:.2e})')
    error_ax.set_xlabel('S')

plt.suptitle('Neural Greeks vs Analytical Greeks', fontsize=14)
plt.tight_layout()
plt.show()

print("\nGreek MAE Summary:")
for greek, mae in mae_results.items():
    print(f"  {greek:8s}: {mae:.6f}")

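**Commentary:** Delta and Vega are approximated well by the NN. Gamma (second derivative) is the hardest to approximate because differentiating twice amplifies the approximation errors of the network. Note also that with ReLU hidden layers the network is piecewise linear in its inputs, so all curvature in S comes from the final Softplus and the log(S/K) transform; a smooth activation (e.g. SiLU or Softplus) is usually a better choice when Gamma matters. The NN produces smoother Gamma profiles than analytical, which can actually be desirable for hedging (less noisy hedge ratios). Theta is similarly well-approximated, provided the sign convention is matched: the analytical Theta is ∂C/∂t = −∂C/∂T, so the autograd derivative with respect to time-to-maturity T must be negated before the comparison. In practice, if Greeks accuracy is critical, one can add a Greek-matching loss term during training.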

---
## Part 4: Heston Stochastic Volatility Model Pricing (25 pts)

In [None]:
def heston_mc_price(S0, K, T, v0, kappa, theta, xi, rho, r,
                    n_paths=50_000, n_steps=100):
    """Monte Carlo price of a European call under the Heston model.

    Simulates `n_paths` paths over `n_steps` equal time steps with a
    log-Euler update for the spot and an Euler update for the variance,
    flooring the variance at zero wherever it enters drift or diffusion.
    Random numbers come from NumPy's global RNG.

    Returns the discounted mean payoff max(S_T - K, 0).
    """
    dt = T / n_steps
    sqrt_dt = np.sqrt(dt)
    # Weight of the independent shock in the correlated variance driver.
    rho_perp = np.sqrt(1 - rho**2)

    spot = np.full(n_paths, S0, dtype=np.float64)
    var = np.full(n_paths, v0, dtype=np.float64)

    for _ in range(n_steps):
        # Draw order (spot shock first, then the independent shock) is part
        # of the reproducible RNG stream.
        z_spot = np.random.standard_normal(n_paths)
        z_indep = np.random.standard_normal(n_paths)
        z_var = rho * z_spot + rho_perp * z_indep

        var_pos = np.maximum(var, 0)  # full truncation scheme
        vol = np.sqrt(var_pos)

        spot *= np.exp((r - 0.5 * var_pos) * dt + vol * sqrt_dt * z_spot)
        var += kappa * (theta - var_pos) * dt
        var += xi * vol * sqrt_dt * z_var

    discount = np.exp(-r * T)
    payoff = np.maximum(spot - K, 0)
    return discount * payoff.mean()

In [None]:
# Heston parameters
# Fixed Heston parameters: mean-reversion speed, long-run variance,
# vol-of-vol, spot/variance correlation and risk-free rate.
kappa, theta_h, xi, rho, r_h = 2.0, 0.04, 0.3, -0.7, 0.05

# Sanity check: an at-the-money one-year call, against Black-Scholes at the
# volatility implied by the same initial variance (sqrt(0.04) = 0.2).
price_bs = bs_call(100, 100, 1.0, np.sqrt(0.04), r_h)
price_heston = heston_mc_price(100, 100, 1.0, 0.04, kappa, theta_h, xi, rho, r_h)
print(f"Heston price: ${price_heston:.4f}")
print(f"BS price (sigma=0.2): ${price_bs:.4f}")

In [None]:
# Generate training data for Heston NN
# Vary: S, K, T, v0.  Fix: kappa, theta, xi, rho, r
N_heston = 50_000

# Sampled inputs; kappa, theta, xi, rho and r stay at their fixed values.
S_h = np.random.uniform(low=60, high=140, size=N_heston)
K_h = np.random.uniform(low=60, high=140, size=N_heston)
T_h = np.random.uniform(low=0.1, high=2.0, size=N_heston)
v0_h = np.random.uniform(low=0.01, high=0.09, size=N_heston)  # initial variance

print(f"Generating {N_heston:,} Heston prices (this may take a few minutes)...")
C_heston = np.zeros(N_heston)
t_start = time.time()

# One Monte Carlo run per sample, with fewer paths/steps than the defaults
# to keep the total run time manageable.
for i, (spot_i, strike_i, mat_i, var_i) in enumerate(zip(S_h, K_h, T_h, v0_h)):
    C_heston[i] = heston_mc_price(
        spot_i, strike_i, mat_i, var_i,
        kappa, theta_h, xi, rho, r_h,
        n_paths=10_000, n_steps=50,
    )
    if (i + 1) % 10_000 != 0:
        continue
    # Progress report every 10,000 prices.
    elapsed = time.time() - t_start
    print(f"  {i+1:,}/{N_heston:,} ({elapsed:.0f}s)")

mc_time = time.time() - t_start
print(f"\nTotal MC time: {mc_time:.1f}s ({mc_time/N_heston*1000:.1f}ms per price)")

In [None]:
# Prepare Heston training data
# Features: log-moneyness, maturity and initial volatility sqrt(v0).
# Target: Monte Carlo price normalised by the strike.
log_m_h = np.log(S_h / K_h)
sqrt_v0 = np.sqrt(v0_h)
X_heston = np.stack([log_m_h, T_h, sqrt_v0], axis=1).astype(np.float32)
y_heston = (C_heston / K_h).astype(np.float32)

# Sequential 85/15 train / held-out split.
n_train_h = int(0.85 * N_heston)
X_h_tr, X_h_te = (
    torch.tensor(part).to(device)
    for part in (X_heston[:n_train_h], X_heston[n_train_h:])
)
y_h_tr, y_h_te = (
    torch.tensor(part).to(device)
    for part in (y_heston[:n_train_h], y_heston[n_train_h:])
)

In [None]:
# Train Heston NN
# Three-input pricer: 128 -> 128 -> 64 ReLU layers with a Softplus output.
heston_model = nn.Sequential(
    nn.Linear(3, 128), nn.ReLU(),
    nn.Linear(128, 128), nn.ReLU(),
    nn.Linear(128, 64), nn.ReLU(),
    nn.Linear(64, 1), nn.Softplus(),
)
heston_model = heston_model.to(device)

h_optimizer = torch.optim.Adam(heston_model.parameters(), lr=1e-3)
h_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    h_optimizer, factor=0.5, patience=10
)
h_loss_fn = nn.MSELoss()

h_train_losses, h_val_losses = [], []
batch_size = 2048

for epoch in range(100):
    # ---- training pass over shuffled mini-batches ----
    heston_model.train()
    perm = torch.randperm(n_train_h, device=device)
    epoch_loss = 0
    n_b = 0

    for idx in perm.split(batch_size):
        pred = heston_model(X_h_tr[idx]).squeeze(-1)
        loss = h_loss_fn(pred, y_h_tr[idx])
        h_optimizer.zero_grad()
        loss.backward()
        h_optimizer.step()
        epoch_loss += loss.item()
        n_b += 1

    h_train_losses.append(epoch_loss / n_b)

    # ---- the held-out split doubles as the validation set here ----
    heston_model.eval()
    with torch.no_grad():
        held_out_pred = heston_model(X_h_te).squeeze(-1)
        val_loss = h_loss_fn(held_out_pred, y_h_te).item()
    h_val_losses.append(val_loss)
    h_scheduler.step(val_loss)

    if (epoch + 1) % 20 == 0:
        print(f"Epoch {epoch+1:3d} | Train: {h_train_losses[-1]:.2e} | Val: {val_loss:.2e}")

plt.plot(h_train_losses, label='Train')
plt.plot(h_val_losses, label='Val')
plt.title('Heston NN Training')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.yscale('log')
plt.legend()
plt.show()

In [None]:
# Speed comparison
# MC pricing: time 100 prices
# time.perf_counter() is used instead of time.time(): it is monotonic and
# has the highest available resolution, so the very short NN timing cannot
# round down to zero and break the speedup ratio below.
t0 = time.perf_counter()
for _ in range(100):
    heston_mc_price(100, 100, 1.0, 0.04, kappa, theta_h, xi, rho, r_h,
                    n_paths=10_000, n_steps=50)
mc_100 = time.perf_counter() - t0

# NN pricing: time 100 prices (one batched forward pass)
test_x = torch.tensor([[0.0, 1.0, 0.2]], dtype=torch.float32).to(device)
test_x = test_x.repeat(100, 1)
heston_model.eval()
with torch.no_grad():
    # Warm-up pass so one-off initialisation cost is not attributed to the
    # timed call.
    _ = heston_model(test_x)
    # CUDA kernels are launched asynchronously; synchronise before starting
    # and before stopping the clock so the measurement covers the compute,
    # not just the launch.
    if device.type == 'cuda':
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    _ = heston_model(test_x)
    if device.type == 'cuda':
        torch.cuda.synchronize()
    nn_100 = time.perf_counter() - t0

print(f"Monte Carlo: {mc_100:.3f}s for 100 prices ({mc_100/100*1000:.1f}ms per price)")
print(f"Neural Net:  {nn_100:.5f}s for 100 prices ({nn_100/100*1000:.3f}ms per price)")
print(f"Speedup:     {mc_100 / nn_100:.0f}x")

In [None]:
# Show Heston vol smile: compare Heston vs BS implied vol
strikes = np.linspace(70, 130, 30)
v0_test = 0.04  # initial variance, i.e. 20% volatility

# One Monte Carlo price per strike, for a one-year call with spot 100.
heston_prices = np.array([
    heston_mc_price(100, K_i, 1.0, v0_test, kappa, theta_h, xi, rho, r_h,
                    n_paths=50_000, n_steps=100)
    for K_i in strikes
])

# Invert BS to get implied vol
def implied_vol(price, S, K, T, r):
    """Black-Scholes implied volatility of a call price, found by root search.

    Searches for sigma in [0.01, 2.0] such that bs_call matches `price`.
    Returns NaN when brentq raises ValueError (for example when the pricing
    error has the same sign at both ends of the bracket).
    """
    def pricing_gap(sig):
        return bs_call(S, K, T, sig, r) - price

    try:
        root = brentq(pricing_gap, 0.01, 2.0)
    except ValueError:
        root = np.nan
    return root

# Back out the Black-Scholes implied volatility for every strike.
ivs = []
for p, K_i in zip(heston_prices, strikes):
    ivs.append(implied_vol(p, 100, K_i, 1.0, r_h))

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
price_ax, iv_ax = axes

# Left: Heston prices against flat-vol Black-Scholes prices.
price_ax.plot(strikes, heston_prices, 'o-', label='Heston')
bs_prices_compare = bs_call(100, strikes, 1.0, np.sqrt(v0_test), r_h)
price_ax.plot(strikes, bs_prices_compare, 's--', label=f'BS (sigma=0.2)')
price_ax.set_title('Heston vs BS Prices')
price_ax.set_xlabel('Strike K')
price_ax.set_ylabel('Call Price')
price_ax.legend()

# Right: the implied-volatility curve against the flat Black-Scholes level.
iv_ax.plot(strikes, ivs, 'o-', label='Heston IV')
iv_ax.axhline(np.sqrt(v0_test), color='red', linestyle='--', label='BS flat vol = 0.2')
iv_ax.set_title('Implied Volatility Smile (Heston)')
iv_ax.set_xlabel('Strike K')
iv_ax.set_ylabel('Implied Volatility')
iv_ax.legend()

plt.tight_layout()
plt.show()

print("The Heston model produces a volatility smile/skew due to:")
print("  - Negative correlation (rho=-0.7): leverage effect, OTM puts are expensive")
print("  - Stochastic volatility: fat tails in return distribution")

---
## Part 5: Implied Volatility Surface Learning (15 pts)

In [None]:
# Generate a parametric IV surface
# sigma(K, T) = sigma_0 + alpha * (K/S - 1)^2 + beta / sqrt(T)

# Base level, smile curvature and term-structure coefficients.
sigma_0, alpha, beta = 0.20, 0.8, 0.02


def true_iv_surface(moneyness, T):
    """Evaluate the parametric IV surface at moneyness K/S and maturity T.

    The surface is a flat level plus a quadratic smile in (K/S - 1) plus a
    term that decays like 1/sqrt(T).
    """
    smile = alpha * (moneyness - 1)**2
    term_structure = beta / np.sqrt(T)
    return sigma_0 + smile + term_structure

# Generate training data
N_iv = 50_000

# Sample (K/S, T) uniformly; draw order is part of the reproducible stream.
moneyness_data = np.random.uniform(low=0.7, high=1.3, size=N_iv)  # K/S
T_iv_data = np.random.uniform(low=0.1, high=3.0, size=N_iv)

iv_true = true_iv_surface(moneyness_data, T_iv_data)

# Perturb with small Gaussian noise to mimic noisy quotes, then keep the
# result inside [0.05, 1.0].
iv_noisy = iv_true + np.random.normal(loc=0, scale=0.005, size=N_iv)
iv_noisy = iv_noisy.clip(0.05, 1.0)

# Features and targets
X_iv = np.stack([moneyness_data, T_iv_data], axis=1).astype(np.float32)
y_iv = iv_noisy.astype(np.float32)

# Sequential 85/15 train / held-out split.
n_tr_iv = int(0.85 * N_iv)
X_iv_tr, X_iv_te = (
    torch.tensor(part).to(device) for part in (X_iv[:n_tr_iv], X_iv[n_tr_iv:])
)
y_iv_tr, y_iv_te = (
    torch.tensor(part).to(device) for part in (y_iv[:n_tr_iv], y_iv[n_tr_iv:])
)

In [None]:
# Train IV surface NN
# Two-input surface model: 64 -> 64 -> 32 ReLU layers, Softplus output so
# the predicted volatility stays positive.
iv_model = nn.Sequential(
    nn.Linear(2, 64), nn.ReLU(),
    nn.Linear(64, 64), nn.ReLU(),
    nn.Linear(64, 32), nn.ReLU(),
    nn.Linear(32, 1), nn.Softplus(),
)
iv_model = iv_model.to(device)

iv_opt = torch.optim.Adam(iv_model.parameters(), lr=1e-3)
iv_sched = torch.optim.lr_scheduler.ReduceLROnPlateau(
    iv_opt, factor=0.5, patience=10
)
iv_loss_fn = nn.MSELoss()

for epoch in range(80):
    # ---- training pass over shuffled mini-batches of 2048 ----
    iv_model.train()
    perm = torch.randperm(n_tr_iv, device=device)
    epoch_loss = 0
    n_b = 0

    for idx in perm.split(2048):
        pred = iv_model(X_iv_tr[idx]).squeeze(-1)
        loss = iv_loss_fn(pred, y_iv_tr[idx])
        iv_opt.zero_grad()
        loss.backward()
        iv_opt.step()
        epoch_loss += loss.item()
        n_b += 1

    # ---- the held-out split doubles as the validation set here ----
    iv_model.eval()
    with torch.no_grad():
        held_out_pred = iv_model(X_iv_te).squeeze(-1)
        val_loss = iv_loss_fn(held_out_pred, y_iv_te).item()
    iv_sched.step(val_loss)

    if (epoch + 1) % 20 == 0:
        print(f"Epoch {epoch+1:3d} | Train: {epoch_loss/n_b:.2e} | Val: {val_loss:.2e}")

print("IV surface model trained.")

In [None]:
# Visualize true vs learned IV surface
# 60 x 60 evaluation grid over the sampled moneyness / maturity ranges.
m_grid = np.linspace(0.7, 1.3, 60)
t_grid = np.linspace(0.1, 3.0, 60)
M, TT = np.meshgrid(m_grid, t_grid)

# True surface
IV_true = true_iv_surface(M, TT)

# Learned surface: flatten the grid to rows, predict, reshape back.
grid_flat = np.stack([M.ravel(), TT.ravel()], axis=1).astype(np.float32)
iv_model.eval()
with torch.no_grad():
    grid_pred = iv_model(torch.tensor(grid_flat).to(device))
IV_learned = grid_pred.cpu().numpy().reshape(M.shape)

fig, axes = plt.subplots(1, 3, figsize=(18, 5), subplot_kw={'projection': '3d'})

# Same axis labelling on all three panels; only data, colours and titles differ.
abs_err = np.abs(IV_true - IV_learned)
panels = [
    (IV_true, 'viridis', 'True IV Surface', 'IV'),
    (IV_learned, 'viridis', 'Learned IV Surface', 'IV'),
    (abs_err, 'Reds', 'Absolute Error', '|Error|'),
]
for ax, (Z, cmap, title, zlabel) in zip(axes, panels):
    ax.plot_surface(M, TT, Z, cmap=cmap, alpha=0.8)
    ax.set_title(title)
    ax.set_xlabel('Moneyness (K/S)')
    ax.set_ylabel('Maturity T')
    ax.set_zlabel(zlabel)

plt.tight_layout()
plt.show()

print(f"Surface MAE: {np.abs(IV_true - IV_learned).mean():.6f}")
print(f"Surface Max Error: {np.abs(IV_true - IV_learned).max():.6f}")

In [None]:
# Cross-section plots
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
smile_ax, term_ax = axes

# Left panel: smile in moneyness at fixed maturities. The true curve is
# drawn first (dashed), then the network's curve (solid, labelled).
for T_val in [0.25, 0.5, 1.0, 2.0]:
    smile_ax.plot(m_grid, true_iv_surface(m_grid, T_val), '--', alpha=0.5)
    x_slice = np.stack([m_grid, np.full(m_grid.shape, T_val)], axis=1).astype(np.float32)
    with torch.no_grad():
        slice_pred = iv_model(torch.tensor(x_slice).to(device))
    iv_nn = slice_pred.cpu().numpy().ravel()
    smile_ax.plot(m_grid, iv_nn, label=f'T={T_val}')

smile_ax.set_title('Vol Smile at Different Maturities (solid=NN, dashed=true)')
smile_ax.set_xlabel('Moneyness (K/S)')
smile_ax.set_ylabel('Implied Volatility')
smile_ax.legend()

# Right panel: term structure in maturity at fixed moneyness levels.
for m_val in [0.8, 0.9, 1.0, 1.1, 1.2]:
    term_ax.plot(t_grid, true_iv_surface(m_val, t_grid), '--', alpha=0.5)
    x_slice = np.stack([np.full(t_grid.shape, m_val), t_grid], axis=1).astype(np.float32)
    with torch.no_grad():
        slice_pred = iv_model(torch.tensor(x_slice).to(device))
    iv_nn = slice_pred.cpu().numpy().ravel()
    term_ax.plot(t_grid, iv_nn, label=f'K/S={m_val}')

term_ax.set_title('IV Term Structure (solid=NN, dashed=true)')
term_ax.set_xlabel('Maturity T')
term_ax.set_ylabel('Implied Volatility')
term_ax.legend()

plt.tight_layout()
plt.show()

**Commentary:**

The NN learns the IV surface accurately, capturing both the smile (curvature in moneyness) and the term structure (higher vol at short maturities). The model handles the noise in the training data well, producing a smooth surface. In practice, this approach replaces parametric models like SVI with a more flexible learned surface that can adapt to arbitrary market conditions. The main risk is extrapolation -- the model should not be used outside the training range of moneyness and maturity values.