In [41]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
from math import sqrt
from scipy.stats import norm
import itertools
from torch.distributions.normal import Normal

In [42]:

def bachelier_call_price(forward, strike, vol, time_to_expiry):
    """
    forward, strike, vol, time_to_expiry: Tensors of the same shape
    Bachelier formula for European call options:
      C = sigma * sqrt(T) * phi(d) + (F - K)*Phi(d),
      d = (F - K)/(sigma * sqrt(T))
    """
    # Avoid zero or negative vol or time to avoid Inf
    eps = 1e-8
    vol = torch.clamp(vol, min=eps)
    time_to_expiry = torch.clamp(time_to_expiry, min=eps)
    
    d = (forward - strike) / (vol * torch.sqrt(time_to_expiry))
    
    normal = Normal(0.0, 1.0)
    pdf_d = torch.exp(normal.log_prob(d))  # phi(d)
    cdf_d = normal.cdf(d)                  # Phi(d)
    
    call_price = vol * torch.sqrt(time_to_expiry) * pdf_d + (forward - strike) * cdf_d
    return call_price


In [43]:

# ---------------------
# Define the VAE model
# ---------------------
class VAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(VAE, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )
        
        self.mu_layer = nn.Linear(hidden_dim, latent_dim)
        self.logvar_layer = nn.Linear(hidden_dim, latent_dim)
        
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim)
        )
        
    def encode(self, x):
        h = self.encoder(x)
        mu = self.mu_layer(h)
        logvar = self.logvar_layer(h)
        return mu, logvar
    
    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std
    
    def decode(self, z):
        return self.decoder(z)
    
    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        recon = self.decode(z)
        return recon, mu, logvar



In [44]:

def inverse_transform_gpu(x_standard: torch.Tensor, scale_torch, mean_torch) -> torch.Tensor:
    """
    Inverse StandardScaler transform on GPU:
    x_original = x_standard * scale + mean
    """
    return x_standard * scale_torch + mean_torch


In [45]:
def butterfly_arbitrage_penalty(x_std, recon_std, scale_torch, mean_torch):
    """
    x_std, recon_std: [batch_size, 21] in standardized scale.
    We'll compute butterfly penalty on the reconstructed data 
    (i.e., ensure the model doesn't produce negative convexities).
    
    Steps:
      1. Inverse-transform from standardized -> original scale on GPU.
      2. Extract relevant columns: 
         expiry (col 0), forward (col 2), strikes (3..11), vols (12..20)
      3. Compute Bachelier call prices at each strike.
      4. Discrete second derivative of call prices w.r.t. strike.
      5. Accumulate negative parts -> penalty.
    """
    # 1) Inverse transform on GPU
    recon_orig = inverse_transform_gpu(recon_std, scale_torch, mean_torch)
    
    # 2) Extract columns from the *generated* data
    expiry_gen  = recon_orig[:, 0]        # time to expiry
    forward_gen = recon_orig[:, 2]
    strikes_gen = recon_orig[:, 3:12]
    vols_gen    = recon_orig[:, 12:21]
    
    # We'll apply penalty only on the reconstructed surfaces 
    # to encourage no butterfly arbitrage in the generated data.
    
    penalty_sum = 0.0
    batch_size = x_std.size(0)
    
    for i in range(batch_size):
        T     = expiry_gen[i]        # scalar
        F     = forward_gen[i]       # scalar
        Ks    = strikes_gen[i, :]    # shape [9]
        sigs  = vols_gen[i, :]       # shape [9]
        
        # Repeat T, F to match shape [9] if needed
        T_vec = T.unsqueeze(0).expand_as(Ks)  # shape [9]
        F_vec = F.unsqueeze(0).expand_as(Ks)  # shape [9]
        
        # Bachelier call prices for 9 strikes
        prices = bachelier_call_price(F_vec, Ks, sigs, T_vec / 12.0)  # shape [9]
        
        # Discrete approximation of second derivative
        # slopes_j = (price_{j+1} - price_j) / (K_{j+1} - K_j)
        # second_deriv_j = slopes_{j+1} - slopes_j
        # sum negative parts
        slopes = []
        for j in range(8):
            dx = Ks[j+1] - Ks[j]
            if abs(dx) < 1e-12:  
                continue
            slope_j = (prices[j+1] - prices[j]) / dx
            slopes.append(slope_j)
        
        penalty_i = 0.0
        for j in range(len(slopes) - 1):
            second_deriv = slopes[j+1] - slopes[j]
            # Penalize negative second derivative
            if second_deriv < 0:
                penalty_i += -second_deriv
        
        penalty_sum += penalty_i
    
    return penalty_sum / batch_size


In [46]:

# ---------------------
# Loss function
# ---------------------
def loss_function(recon, x, mu, logvar):
    # (a) Standard VAE: MSE + KL
    recon_loss = nn.MSELoss(reduction='sum')(recon, x)
    kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    base_vae_loss = recon_loss + kld
    return base_vae_loss

def total_loss_with_butterfly(recon, x, mu, logvar, lambda_butterfly, scale_torch, mean_torch):
    # Standard VAE loss
    vae_loss = loss_function(recon, x, mu, logvar)
    # Butterfly penalty
    if lambda_butterfly > 0.:
        bfly_pen = butterfly_arbitrage_penalty(x, recon, scale_torch, mean_torch)
    else:
        bfly_pen = 0.
    
    # Weighted sum
    return vae_loss + lambda_butterfly * bfly_pen

In [47]:
def sample_from_vae(model, latent_dim, device, num_samples=1000):
    z = torch.randn(num_samples, latent_dim).to(device)
    with torch.no_grad():
        samples = model.decode(z).cpu().numpy()
    return samples

In [48]:
def check_butterfly_arbitrage(strikes, prices):
    """
    Check that the call price curve is convex in a discrete sense.
    We verify that the slope between consecutive points is non-decreasing:
      slope_i   = (P[i+1] - P[i]) / (K[i+1] - K[i])
      slope_i+1 = (P[i+2] - P[i+1]) / (K[i+2] - K[i+1])
    We need slope_i <= slope_i+1 for i=0..(n_strikes-3).
    """
    n = len(strikes)
    if n < 3:
        return True  # trivial if fewer than 3 points

    slopes = []
    for i in range(n - 1):
        dk = strikes[i+1] - strikes[i]
        dp = prices[i+1] - prices[i]
        if dk <= 0:
            return False  # strikes must be strictly increasing
        if dp > 0:
            return False
        slopes.append(dp / dk)
    
    # Check that slopes are non-decreasing
    for i in range(len(slopes) - 1):
        if slopes[i] > slopes[i+1]:
            return False
    
    return True

In [49]:

def normal_call_price(forward, strike, vol, expiry_years, notional=1.0):
    """
    Bachelier (normal) model call price, ignoring discounting for simplicity.
    
    Formula (for zero rates, notional=1):
        Call = (F - K)*Phi(d) + sigma * sqrt(T) * phi(d),
    where
        d = (F - K) / (sigma * sqrt(T))
        Phi(.) is standard normal CDF, phi(.) is standard normal PDF.
    
    If vol = 0 or expiry ~ 0, fallback to intrinsic value: max(F - K, 0).
    """
    if vol < 1e-8 or expiry_years < 1e-8:
        return notional * max(forward - strike, 0.0)

    sigma_sqrt_t = vol * sqrt(expiry_years)
    d = (forward - strike) / sigma_sqrt_t

    call_price = notional * ((forward - strike) * norm.cdf(d) 
                             + sigma_sqrt_t * norm.pdf(d))
    return call_price

In [50]:
def check_butterfly_arbitrage_vol(strikes, vols, expiry_years, forward):
    prices = [
            normal_call_price(forward, K, vol, expiry_years)
            for K, vol in zip(strikes, vols)
        ]
    return check_butterfly_arbitrage(strikes, prices)

In [51]:
def train_and_evaluate_vae(
    train_loader,
    val_loader,
    data,
    scaler,
    scale_torch,
    mean_torch,
    lr=1e-3,
    hidden_dim=64,
    latent_dim=4,
    epochs=20,
    device='cuda',
    patience=20,
    lambda_butterfly = 1000.0,
):
    
    model = VAE(input_dim=21, hidden_dim=hidden_dim, latent_dim=latent_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=patience, verbose=True)

    best_train_loss = np.inf
    best_val_loss = np.inf
    
    waiting = 0

    for epoch in range(epochs):
        # --- Train ---
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            x_batch = batch[0].to(device)
            optimizer.zero_grad()
            
            recon, mu, logvar = model(x_batch)
            loss = total_loss_with_butterfly(recon, x_batch, mu, logvar, lambda_butterfly, scale_torch, mean_torch)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        train_loss /= len(train_loader.dataset)
        
        # --- Validation ---
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                x_batch = batch[0].to(device)
                recon, mu, logvar = model(x_batch)
                loss = total_loss_with_butterfly(recon, x_batch, mu, logvar, lambda_butterfly, scale_torch, mean_torch)
                val_loss += loss.item()
        val_loss /= len(val_loader.dataset)
        
        scheduler.step(val_loss)
        
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        if train_loss < best_train_loss:
            best_train_loss = train_loss

        if val_loss < best_val_loss:
            best_val_loss = val_loss
        else:
            waiting += 1
            if waiting > patience:
                print("Early stopping triggered.")
                break

    model.eval()

    gen_samples = sample_from_vae(model, latent_dim=latent_dim, device=device, num_samples=len(data))
    gen_samples_orig = scaler.inverse_transform(gen_samples)
    df_gen = pd.DataFrame(gen_samples_orig, columns=data.columns)

    arbitrages_count = 0
    for idx in range(len(df_gen)):
        gen_strikes = [df_gen["K" + str(x)].values[idx] for x in range(1,10)]
        gen_vols = [df_gen["vol" + str(x)].values[idx] for x in range(1,10)]
        gen_forward = df_gen["forward"].values[idx]
        gen_expiry_years = df_gen["expiry_months"].values[idx] / 12.0

        if not check_butterfly_arbitrage_vol(gen_strikes, gen_vols, gen_expiry_years, gen_forward):
            arbitrages_count += 1

    arbitrages_proportion = arbitrages_count / len(df_gen)

    return best_train_loss, best_val_loss, arbitrages_proportion
    


In [52]:
def main():
    param_grid = {
        "lr":          [1e-3, 5e-4],       # learning rates to try
        "hidden_dim":  [64, 128, 256, 512],           # hidden dims to try
        "latent_dim":  [32, 64, 128, 256],             # latent dims to try
        "batch_size":  [64, 128, 256],          # batch sizes
        "epochs":      [50, 100],               # or more
        'patience': [10, 20],
        'lambda_butterfly': [0., 1000.0, 10000.0],
    }


    data = pd.read_csv('data.csv')  # Replace with your actual data file
    assert data.shape[1] == 22, "Data must have 22 features."

    data = data[[x for x in data.columns if x != "duration"]]
    assert data.shape[1] == 21, "Data must have 21 features."

    X = data.values.astype(np.float32)

    # Train/validation split
    X_train, X_val = train_test_split(X, test_size=0.2, random_state=42)

    # Standardize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Convert to torch tensors
    train_data = torch.tensor(X_train, dtype=torch.float32)
    val_data = torch.tensor(X_val, dtype=torch.float32)
    
    train_dataset = TensorDataset(train_data)
    val_dataset = TensorDataset(val_data)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Using device:", device)

    mean_torch = torch.tensor(scaler.mean_, dtype=torch.float32, device=device)
    scale_torch = torch.tensor(scaler.scale_, dtype=torch.float32, device=device)

    best_train_loss   = float('inf')
    best_val_loss = float('inf')
    best_params = None

    results = []

    # We'll do a grid search over all param combos
    keys = list(param_grid.keys())
    for combo in itertools.product(*(param_grid[key] for key in keys)):
        params = dict(zip(keys, combo))   # e.g. {"lr": 1e-3, "hidden_dim": 32, "latent_dim": 4, "batch_size": 64, "epochs": 20}
        print("Testing hyperparams:", params)

        train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
        val_loader   = DataLoader(val_dataset,   batch_size=params["batch_size"], shuffle=False)

        train_loss, val_loss, arbitrages_proportion = train_and_evaluate_vae(
            train_loader  = train_loader,
            val_loader    = val_loader,
            data = data,
            scaler = scaler,
            scale_torch = scale_torch,
            mean_torch = mean_torch,
            lr         = params["lr"],
            hidden_dim = params["hidden_dim"],
            latent_dim = params["latent_dim"],
            epochs     = params["epochs"],
            device     = device,

        )

        run_result = {**params, 'train_loss': train_loss, 'val_loss': val_loss, 'arbitrages_proportion': arbitrages_proportion}
        results.append(run_result)

        print(f" --> Result: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, arbitrages_proportion={arbitrages_proportion:.4f}")

        if val_loss < best_val_loss:
            best_val_loss   = val_loss
            best_params = params
            print("New best validation loss:", best_val_loss, "with", best_params)

    df_results = pd.DataFrame(results)
    df_results.to_csv('hyperparam_results.csv', index=False)
    print("\nAll results saved to 'hyperparam_results.csv'.")

    print("\n=======================================")
    print("Best overall val_loss:", best_val_loss)
    print("Best hyperparams:", best_params)
    print("=======================================")

In [53]:
if __name__ == "__main__":
    main()

Using device: cpu
Testing hyperparams: {'lr': 0.001, 'hidden_dim': 64, 'latent_dim': 32, 'batch_size': 64, 'epochs': 50, 'patience': 10, 'lambda_butterfly': 0.0}
Epoch 1/50, Train Loss: 14.0785, Val Loss: 8.7599
Epoch 2/50, Train Loss: 8.5411, Val Loss: 8.5002
Epoch 3/50, Train Loss: 8.2986, Val Loss: 7.9829
Epoch 4/50, Train Loss: 7.9782, Val Loss: 7.5421
Epoch 5/50, Train Loss: 7.1581, Val Loss: 6.7969
Epoch 6/50, Train Loss: 6.6564, Val Loss: 6.3615
Epoch 7/50, Train Loss: 5.9213, Val Loss: 5.5948
Epoch 8/50, Train Loss: 5.4378, Val Loss: 5.3251
Epoch 9/50, Train Loss: 5.2859, Val Loss: 5.3271
Epoch 10/50, Train Loss: 5.2312, Val Loss: 5.2099
Epoch 11/50, Train Loss: 5.2648, Val Loss: 5.2279
Epoch 12/50, Train Loss: 5.1619, Val Loss: 5.3484
Epoch 13/50, Train Loss: 5.3082, Val Loss: 5.6372
Epoch 14/50, Train Loss: 5.3370, Val Loss: 5.2617
Epoch 15/50, Train Loss: 5.2116, Val Loss: 5.4388
Epoch 16/50, Train Loss: 5.1977, Val Loss: 5.1982
Epoch 17/50, Train Loss: 5.1867, Val Loss: 5.2

RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 65536 bytes.