<a href="https://colab.research.google.com/github/satyabratkumarsingh/AMLS_23-24_SN-23220011/blob/main/14Oct.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch
!pip install comet_ml
!pip install tqdm
!pip install matplotlib




In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
def delete_file_from_drive(full_file_path):
  if os.path.exists(full_file_path):
      try:
          os.remove(full_file_path)
          print(f"File '{full_file_path}' successfully deleted from Google Drive.")
      except Exception as e:
          print(f"Error deleting file '{full_file_path}': {e}")
  else:
      print(f"File '{full_file_path}' not found at '{full_file_path}'.")


In [4]:
import random
import numpy as np
import torch
import itertools
from itertools import product
from torch.utils.data import Dataset, DataLoader
import gc # For garbage collection
import numpy as np
import yfinance as yf

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def calibrate_sigma_from_sp500(period="1y", ticker="SPY"):
    """
    Calibrate volatility (sigma) using S&P500 proxy (SPY ETF).
    Computes realized annualized volatility from historical data.

    Args:
        period (str): Period to download (e.g. "1y", "2y").
        ticker (str): Symbol to use, default SPY (ETF for S&P 500).

    Returns:
        float: Annualized volatility (sigma).
    """
    # Download daily adjusted close prices
    data = yf.download(ticker, period=period, interval="1d", auto_adjust=True)

    closes = data["Close"].dropna()

    # Compute log returns
    log_returns = np.log(closes / closes.shift(1)).dropna()

    # Daily volatility
    sigma_daily = log_returns.std()

    # Annualized volatility
    sigma_annual = sigma_daily * np.sqrt(252)

    # Convert safely to float
    return sigma_annual.item() if hasattr(sigma_annual, "item") else float(sigma_annual)

# Example: update SIGMA
SIGMA = calibrate_sigma_from_sp500()
print(f"Calibrated SP500 1Y Volatility (Sigma): {SIGMA:.4f}")


[*********************100%***********************]  1 of 1 completed

Calibrated SP500 1Y Volatility (Sigma): 0.1950





In [5]:

########### PARAMETERS ###############
MU = 0.02
T = 1.0
NOISE_STD = 0.05
MIN_PRICE_RANGE = 100
MAX_PRICE_RANGE = 500
########################

def generate_hybrid_strikes(S_0, option_types, rng, realistic_ratio=0.6):
    n = len(option_types)
    K_prices = np.zeros(n, dtype=np.float32)

    for i in range(n):
        if rng.random() < realistic_ratio:
            # 60% of data is 'realistic' (Call OTM, Put OTM)
            if option_types[i] == "call":
                # Call OTM: K > S_0
                K_prices[i] = S_0 * rng.uniform(1.00, 1.20)
            else:
                # Put OTM: K < S_0
                K_prices[i] = S_0 * rng.uniform(0.80, 1.00)
        else:
            # 40% of data is 'exploration' (Call ITM, Put ITM)
            if option_types[i] == "call":
                # Call ITM: K < S_0
                K_prices[i] = S_0 * rng.uniform(0.80, 1.00)
            else:
                # Put ITM: K > S_0
                K_prices[i] = S_0 * rng.uniform(1.00, 1.20)

    return K_prices


def _generate_option_types(n, rng, composition_type="mixed", p=(0.5, 0.5)):
    if composition_type == "call_only":
        option_types_string = np.array(["call"] * n)
        option_types_numeric = np.ones(n, dtype=np.float32)
    elif composition_type == "put_only":
        option_types_string = np.array(["put"] * n)
        option_types_numeric = -np.ones(n, dtype=np.float32)
    else:
        option_types_string = rng.choice(["call", "put"], size=n, p=p)
        option_types_numeric = np.where(option_types_string == "call", 1.0, -1.0).astype(np.float32)

    return option_types_string, option_types_numeric


def generate_option_prices_for_idx(seed, n, composition_type="mixed", weights=None, realistic_ratio=0.7):

    rng = np.random.default_rng(seed)

    torch.manual_seed(seed)
    if DEVICE.type == 'cuda':
        torch.cuda.manual_seed_all(seed)

    S_0 = rng.uniform(MIN_PRICE_RANGE, MAX_PRICE_RANGE)

    option_types, option_types_numeric = _generate_option_types(n, rng, composition_type)

    # Generate hybrid strike prices
    K_prices = generate_hybrid_strikes(S_0, option_types, rng, realistic_ratio)

    # Generate or use weights
    if weights is None:
        weight_sets = generate_combinatorial_weights_manageable(n, rng=rng)
        weights_array = weight_sets[0]
    else:
        weights_array = np.array(weights, dtype=np.float32)

    return K_prices, option_types_numeric, S_0, weights_array


def generate_combinatorial_weights_manageable(
    n, base_weights=[-0.75, -0.5, -0.25, 0, 0.25, 0.5, 0.75], rng=None
):
    if rng is None:
        rng = np.random.default_rng()

    weight_sets = []

    if n < 2:
        weights = np.zeros(n, dtype=np.float32)
        if n == 1:
            weights[0] = 1.0
        weight_sets.append(weights)
        return weight_sets

    weights = np.zeros(n, dtype=np.float32)

    # Use rng instead of random
    is_long = rng.choice([True, False])

    if is_long:
        long_idx = rng.integers(0, n)
        weights[long_idx] = 1.0
    else:
        short_idx = rng.integers(0, n)
        weights[short_idx] = -1.0

    remaining_positions = np.where(weights == 0)[0]
    combinatorics = rng.choice(base_weights, size=len(remaining_positions), replace=True)
    weights[remaining_positions] = combinatorics

    weight_sets.append(weights)
    return weight_sets


def compute_cashflow_and_delta(portfolio, S_T_batch):

    B, N, _ = portfolio.shape
    B2, M = S_T_batch.shape
    assert B == B2, f"Batch size mismatch: portfolio {B}, S_T {B2}"

    # Expand tensors for vectorized computation
    portfolio_exp = portfolio.unsqueeze(2).expand(-1, -1, M, -1)  # [B, N, M, 3]
    S_T_exp = S_T_batch.unsqueeze(1).expand(-1, N, -1)  # [B, N, M]

    strikes = portfolio_exp[..., 0]  # [B, N, M]
    types = portfolio_exp[..., 1]    # [B, N, M]
    weights = portfolio_exp[..., 2]  # [B, N, M]

    # Compute payoffs for all scenarios
    call_payoffs = torch.relu(S_T_exp - strikes)  # [B, N, M]
    put_payoffs = torch.relu(strikes - S_T_exp)   # [B, N, M]
    payoffs = torch.where(types == 1, call_payoffs, put_payoffs)

    # Compute deltas for all scenarios
    call_delta = ((types == 1) & (S_T_exp > strikes)).float()
    put_delta = -((types == -1) & (S_T_exp < strikes)).float()
    delta_each = call_delta + put_delta  # [B, N, M]

    # Aggregate across options (sum over N dimension)
    cashflow = (payoffs * weights).sum(dim=1)  # [B, M]
    derivative = (delta_each * weights).sum(dim=1)  # [B, M]

    return cashflow.float(), derivative.float()

# Dataset

In [6]:
import numpy as np
import torch
from torch.utils.data import Dataset

class DatasetStandardized(Dataset):

    def __init__(self, num_samples, num_samples_S_T, K_scaler=None,
                 S_T_scaler=None, cashflow_scaler=None, is_fitting_mode=False,
                 max_portfolio_size=100, min_portfolio_size=1, base_seed=42): # Reduced max_portfolio_size to 100 for stability

        self.num_samples = num_samples
        self.num_samples_S_T = num_samples_S_T
        self.max_portfolio_size = max_portfolio_size
        self.min_portfolio_size = min_portfolio_size
        self.is_fitting_mode = is_fitting_mode

        if not is_fitting_mode and any(s is None for s in (K_scaler, S_T_scaler, cashflow_scaler)):
            raise ValueError("K_scaler, S_T_scaler, and cashflow_scaler must be provided in evaluation mode.")

        self.K_scaler = K_scaler
        self.S_T_scaler = S_T_scaler
        self.cashflow_scaler = cashflow_scaler
        self.base_seed = base_seed
        self.epoch = 0

    def set_epoch(self, epoch):
        self.epoch = epoch

    def __len__(self):
        return self.num_samples

    def _sample_portfolio_len(self):
      if self.min_portfolio_size == self.max_portfolio_size:
          return self.max_portfolio_size

      u = np.random.rand()
      low = self.min_portfolio_size
      high = self.max_portfolio_size

      # --- Special bucket for single-option portfolios (e.g., 10% chance) ---
      if low <= 1 and u < 0.10:
          return 1

      # --- Training mode: stratified distribution with dynamic ranges ---
      # Adjust u after the single-option probability
      u = (u - 0.10) / 0.90  # rescale remaining probability mass

      # Bucket 1: Small Portfolios (up to 10, weighted 35%)
      bucket_1_max = min(high, 10)

      # Bucket 2: Medium Portfolios (11–50, weighted 20%)
      bucket_2_min = max(low, 11)
      bucket_2_max = min(high, 50)

      # Bucket 3: Large Portfolios (51–200, weighted 20%)
      bucket_3_min = max(low, 51)
      bucket_3_max = min(high, 200)

      # Bucket 4: Very Large Portfolios (201–max, weighted 25%)
      bucket_4_min = max(low, 201)
      bucket_4_max = high

      if u < 0.35 and low <= bucket_1_max:
          return np.random.randint(max(low, 2), bucket_1_max + 1)
      elif u < 0.55 and bucket_2_min <= bucket_2_max:
          return np.random.randint(bucket_2_min, bucket_2_max + 1)
      elif u < 0.75 and bucket_3_min <= bucket_3_max:
          return np.random.randint(bucket_3_min, bucket_3_max + 1)
      elif bucket_4_min <= bucket_4_max:
          return np.random.randint(bucket_4_min, bucket_4_max + 1)
      else:
          return np.random.randint(low, high + 1)



    def _simulate_terminal_prices(self, S_0, seed=None):
        g = torch.Generator()
        if seed is not None:
            g.manual_seed(seed)

        Z = torch.clamp(torch.randn(self.num_samples_S_T, generator=g), -3, 3)
        drift = (MU - 0.5 * SIGMA**2) * T
        diffusion = SIGMA * torch.sqrt(torch.tensor(T, dtype=torch.float32))

        S_T = S_0 * torch.exp(drift + diffusion * Z)

        noise = torch.randn(S_T.shape, generator=g, dtype=S_T.dtype, device=S_T.device) * (NOISE_STD * S_T)
        S_T += noise

        return S_T.float()

    def __getitem__(self, idx):


        portfolio_len = self._sample_portfolio_len()
        unique_seed = self.base_seed + self.epoch * 1_000_000 + idx

        np.random.seed(unique_seed)
        torch.manual_seed(unique_seed)

        # Put all types as equal probablities
        composition_choices = ["call_only", "put_only", "mixed"]
        probabilities = [0.35, 0.35, 0.30]
        composition_type = np.random.choice(composition_choices, p=probabilities)


        K, option_types, S_0, weights = generate_option_prices_for_idx(unique_seed, portfolio_len, composition_type)

        # Convert to unpadded PyTorch tensors
        K = torch.tensor(K, dtype=torch.float32)
        option_types = torch.tensor(option_types, dtype=torch.float32)
        weights = torch.tensor(weights, dtype=torch.float32)
        S_0 = torch.tensor(S_0, dtype=torch.float32)


        S_0_Broadcast = S_0.repeat(len(K)) if S_0.ndim == 0 else S_0

        # Unpadded Denormalized Portfolio
        portfolio_denorm_raw = torch.stack([K, option_types, weights, S_0_Broadcast], dim=-1)

        # Simulate terminal prices
        S_T_denorm = self._simulate_terminal_prices(S_0, seed=unique_seed)

        # Cashflows + derivatives (denormalized, unpadded)
        cashflow_denorm, derivative_denorm = compute_cashflow_and_delta(
            portfolio_denorm_raw.unsqueeze(0), S_T_denorm.unsqueeze(0)
        )
        cashflow_denorm = cashflow_denorm.squeeze(0).float()
        derivative_denorm = derivative_denorm.squeeze(0).float()

        # === Normalization and Scaling ===
        if not self.is_fitting_mode:
            # 1. Normalize K using .numpy()
            K_norm = torch.tensor(
                self.K_scaler.transform(K.unsqueeze(1).numpy()), dtype=torch.float32
            ).squeeze()

            # 2. Create the normalized, unpadded portfolio
            portfolio_norm_raw = portfolio_denorm_raw.clone()
            portfolio_norm_raw[:, 0] = K_norm

            # 3. Normalize S_T using .numpy()
            S_T = torch.tensor(
                self.S_T_scaler.transform(S_T_denorm.unsqueeze(1).numpy()),
                dtype=torch.float32
            ).squeeze()

            # 4. Normalize cashflow using .numpy()
            cashflow = torch.tensor(
                self.cashflow_scaler.transform(cashflow_denorm.unsqueeze(1).numpy()),
                dtype=torch.float32
            ).squeeze()

            # 5. Scale derivative
            derivative = derivative_denorm.clone()
            cf_std = cashflow.std().item()
            deriv_std = derivative.std().item()
            if deriv_std > 0:
                derivative *= (cf_std / deriv_std)

        else:
            # In fitting mode, use denormalized values
            portfolio_norm_raw = portfolio_denorm_raw
            S_T = S_T_denorm
            cashflow = cashflow_denorm
            derivative = derivative_denorm

        pad_len = self.max_portfolio_size - portfolio_len

        if pad_len > 0:
            # ✅ Fixed padding dimension (was 3 before, now correctly 4)
            pad_tensor = torch.zeros(pad_len, 4, dtype=torch.float32)

            # Pad the normalized portfolio (final input)
            portfolio = torch.cat([portfolio_norm_raw, pad_tensor], dim=0)

            # Pad the denormalized portfolio (for checks)
            portfolio_denorm = torch.cat([portfolio_denorm_raw, pad_tensor], dim=0)

            # Create the final mask (must be max_portfolio_size long)
            mask = torch.tensor([True]*portfolio_len + [False]*pad_len, dtype=torch.bool)
        else:
            portfolio = portfolio_norm_raw
            portfolio_denorm = portfolio_denorm_raw
            mask = torch.ones(portfolio_len, dtype=torch.bool)

        return {
            # Normalized (for training)
            "portfolio": portfolio.float(),          # [max_portfolio_size, 4]
            "mask": mask,                            # [max_portfolio_size]
            "S_T": S_T.float(),                      # [num_samples_S_T]
            "cashflow": cashflow.float(),            # [num_samples_S_T]
            "derivative": derivative.float(),        # [num_samples_S_T]

            # Denormalized (for evaluation / interpretation)
            "portfolio_denorm": portfolio_denorm.float(),
            "S_T_denorm": S_T_denorm.float(),
            "cashflow_denorm": cashflow_denorm.float(),
            "derivative_denorm": derivative_denorm.float(),
        }

In [12]:
# --- Quick test run ---
dataset = DatasetStandardized(
    num_samples=5,
    num_samples_S_T=2,
    K_scaler=None, S_T_scaler=None, cashflow_scaler=None,
    max_portfolio_size=5,
    min_portfolio_size=1,
    is_fitting_mode=True
)

for i in range(len(dataset)):
    sample = dataset[i]  # dictionary returned
    print(f"\n=== Sample {i} ===")
    print("Portfolio features:\n", sample["portfolio"])
    print("Mask:", sample["mask"])
    print("S_T:", sample["S_T"])
    print("Cashflow:", sample["cashflow"])
    print("Derivative:", sample["derivative"])
    print("Portfolio (denorm):\n", sample["portfolio_denorm"])
    print("S_T (denorm):", sample["S_T_denorm"])
    print("Cashflow (denorm):", sample["cashflow_denorm"])
    print("Derivative (denorm):", sample["derivative_denorm"])


=== Sample 0 ===
Portfolio features:
 tensor([[397.9993,  -1.0000,   0.0000, 409.5824],
        [335.3806,  -1.0000,  -0.7500, 409.5824],
        [471.9323,  -1.0000,  -1.0000, 409.5824],
        [  0.0000,   0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,   0.0000]])
Mask: tensor([ True,  True,  True, False, False])
S_T: tensor([442.9435, 425.2559])
Cashflow: tensor([-28.9888, -46.6764])
Derivative: tensor([1., 1.])
Portfolio (denorm):
 tensor([[397.9993,  -1.0000,   0.0000, 409.5824],
        [335.3806,  -1.0000,  -0.7500, 409.5824],
        [471.9323,  -1.0000,  -1.0000, 409.5824],
        [  0.0000,   0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,   0.0000]])
S_T (denorm): tensor([442.9435, 425.2559])
Cashflow (denorm): tensor([-28.9888, -46.6764])
Derivative (denorm): tensor([1., 1.])

=== Sample 1 ===
Portfolio features:
 tensor([[ 3.6237e+02,  1.0000e+00,  1.0000e+00,  3.6092e+02],
        [ 3.3112e+02,  1.0000e+00, -5.0000e-01,  3.6092e

# Utility Functions

In [13]:

def collate_fn(batch):
    max_len = max(item["mask"].shape[0] for item in batch)
    keys_to_stack = ["S_T", "S_T_denorm", "cashflow", "cashflow_denorm", "derivative", "derivative_denorm"]

    def pad_tensor(tensor, target_len):
        if tensor.shape[0] < target_len:
            pad = torch.zeros(target_len - tensor.shape[0], tensor.shape[-1])
            tensor = torch.cat([tensor, pad], dim=0)
        return tensor

    # Pad portfolios (normalized + denormalized)
    portfolios = torch.stack([pad_tensor(item["portfolio"], max_len) for item in batch])
    portfolios_denorm = torch.stack([pad_tensor(item["portfolio_denorm"], max_len) for item in batch])

    # Pad masks
    masks = torch.stack([
        torch.cat([item["mask"], torch.zeros(max_len - item["mask"].shape[0], dtype=torch.bool)])
        for item in batch
    ])

    # Stack everything else directly (no padding needed)
    collated = {
        "portfolio": portfolios,
        "portfolio_denorm": portfolios_denorm,
        "mask": masks
    }
    for key in keys_to_stack:
        collated[key] = torch.stack([item[key] for item in batch])

    return collated


In [14]:
import os
from sklearn.preprocessing import StandardScaler
import joblib
import numpy as np
from tqdm import tqdm

DRIVE_PATH = "/content/drive/MyDrive/Ucl/"
K_SCALAR_FILE = os.path.join(DRIVE_PATH, 'K_Scalar_Advanced.pkl')
ST_SCALAR_FILE = os.path.join(DRIVE_PATH, 'S_T_Scalar_Advanced.pkl')
CASHFLOW_SCALAR_FILE = os.path.join(DRIVE_PATH, 'Cashflow_Scalar_Advanced.pkl')


def fit_K_ST_scalers(train_loader, save_path_K=K_SCALAR_FILE, save_path_ST=ST_SCALAR_FILE):
    print("Fitting K and S_T scalers from training set...")
    all_K = []
    all_S_T = []

    for batch in tqdm(train_loader, desc="Collecting K and S_T for scalers"):

        portfolio_real = batch["portfolio_denorm"]  # [B, N, 3]
        s_t_real = batch["S_T_denorm"]              # [B, num_S_T]

        K_real = portfolio_real[:, :, 0].cpu().numpy().reshape(-1, 1)
        S_T_real = s_t_real.cpu().numpy().reshape(-1, 1)

        all_K.append(K_real)
        all_S_T.append(S_T_real)

    K_all_np = np.concatenate(all_K, axis=0)
    S_T_all_np = np.concatenate(all_S_T, axis=0)

    K_scalar = StandardScaler().fit(K_all_np)
    S_T_scalar = StandardScaler().fit(S_T_all_np)

    # joblib.dump(K_scalar, save_path_K)
    # joblib.dump(S_T_scalar, save_path_ST)

    print(f"K mean: {K_scalar.mean_[0]:.4f}, std: {K_scalar.scale_[0]:.4f}")
    print(f"S_T mean: {S_T_scalar.mean_[0]:.4f}, std: {S_T_scalar.scale_[0]:.4f}")

    return K_scalar, S_T_scalar



def fit_cashflow_scaler(train_loader, save_path=CASHFLOW_SCALAR_FILE):
    all_cashflows = []

    for batch in tqdm(train_loader, desc="Fitting Cashflow Scaler"):
        cashflow = batch["cashflow_denorm"]  # use denormalized
        cashflow_np = cashflow.detach().cpu().numpy().reshape(-1, 1)
        all_cashflows.append(cashflow_np)

    cashflows_np = np.concatenate(all_cashflows, axis=0)

    scaler = StandardScaler().fit(cashflows_np)

    print(f"Cashflow Mean: {scaler.mean_[0]:.4f}, Std Dev: {scaler.scale_[0]:.4f}")

    return scaler


# The Model

In [16]:
import torch
import torch.nn as nn
import math

DRPO_OUT_PROB = 0.4
NUM_SEEDS = 32
NUM_HEADS = 8

# ===================== Trunk Network =====================
class TrunkNet(nn.Module):
    def __init__(self, input_dim=1, latent_dim=256, hidden_dim=64, num_layers=6):
        super().__init__()
        self.input_proj = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU()
        )
        self.blocks = nn.ModuleList([
            nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim),
                nn.LayerNorm(hidden_dim),
                nn.ReLU()
            ) for _ in range(num_layers)
        ])
        self.output_proj = nn.Linear(hidden_dim, latent_dim)

    def forward(self, S_T):
        if S_T.dim() == 1:
            S_T = S_T.unsqueeze(-1)
        elif S_T.dim() == 2:
            S_T = S_T.unsqueeze(-1)

        x = self.input_proj(S_T)
        for block in self.blocks:
            x = x + block(x)
        return self.output_proj(x)

class PMA(nn.Module):

    def __init__(self, d_model, num_heads, num_seeds=NUM_SEEDS, dropout=DRPO_OUT_PROB):
        super().__init__()
        self.num_seeds = num_seeds
        self.d_model = d_model

        # Initialize seeds with smaller values for stability
        self.seed_vectors = nn.Parameter(torch.randn(num_seeds, d_model) * 0.02)

        self.attention = nn.MultiheadAttention(
            d_model, num_heads, dropout=dropout, batch_first=True
        )

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 3),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model * 3, d_model),
            nn.Dropout(dropout)
        )

        # Scaling factor for attention (prevents saturation with large sets)
        self.scale = nn.Parameter(torch.ones(1))

    def forward(self, x, mask=None):
        B, N, _ = x.shape
        S = self.seed_vectors.unsqueeze(0).expand(B, -1, -1)

        key_padding_mask = None
        if mask is not None:
            key_padding_mask = (~mask.bool())
            N_effective = mask.sum(dim=1, keepdim=True).float().clamp(min=1).unsqueeze(-1)

        # Apply scaled attention
        attn_out, attn_weights = self.attention(
            query=S,
            key=x,
            value=x,
            key_padding_mask=key_padding_mask
        )
        if N_effective is not None:
            attn_out = attn_out / N_effective

        attn_out = attn_out * self.scale

        S = self.norm1(S + attn_out)
        S = self.norm2(S + self.ffn(S))
        return S


# ===================== DeepSets Encoder =====================
class DeepSetEncoder(nn.Module):
    """DeepSets with multiple aggregation strategies for robustness."""
    def __init__(self, feature_dim, hidden_dim, dropout=DRPO_OUT_PROB):
        super().__init__()
        # Deeper encoder for better feature extraction
        self.encoder = nn.Sequential(
            nn.Linear(feature_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU()
        )
        self.hidden_dim = hidden_dim

    def forward(self, x, mask=None):
        H = self.encoder(x)  # [B, N, hidden_dim]

        LARGE_NEG = -1e9
        LARGE_POS = 1e9

        if mask is not None:
            mask_f = mask.float().unsqueeze(-1)

            # Max/Min aggregation
            max_H, _ = (H + LARGE_NEG * (1 - mask_f)).max(dim=1)
            min_H, _ = (H + LARGE_POS * (1 - mask_f)).min(dim=1)

            # Mean aggregation (size-normalized by definition)
            sum_H = (H * mask_f).sum(dim=1)
            count = mask_f.sum(dim=1).clamp(min=1)
            mean_H = sum_H / count

            # Std aggregation (normalized) - handle single element case
            count_for_std = count.clamp(min=2)  # Need at least 2 for std
            var_H = ((H - mean_H.unsqueeze(1))**2 * mask_f).sum(dim=1) / (count_for_std - 1)
            std_H = torch.sqrt(var_H.clamp(min=0) + 1e-8)

            # For single-element sets, std should be zero
            std_H = torch.where(count < 2, torch.zeros_like(std_H), std_H)
        else:
            max_H, _ = H.max(dim=1)
            min_H, _ = H.min(dim=1)
            mean_H = H.mean(dim=1)

            # Handle std for potentially single-element batches
            if H.size(1) == 1:
                std_H = torch.zeros_like(mean_H)
            else:
                std_H = H.std(dim=1, unbiased=True)

        # Combine multiple statistics (all size-invariant)
        h_aggregated = torch.cat([mean_H, std_H, max_H, min_H], dim=-1)
        return h_aggregated  # [B, 4 * hidden_dim]


# ===================== Size-Invariant Hybrid Branch =====================
class BranchNet(nn.Module):
    """Truly size-invariant hybrid encoder using normalized aggregations."""
    def __init__(self, portfolio_feature_dim, hidden_dim=128, latent_dim=256,
                 dropout_prob=DRPO_OUT_PROB, num_heads=NUM_HEADS, num_seeds=NUM_SEEDS,
                 use_pma=True):
        super().__init__()
        self.use_pma = use_pma

        self.deepsets_module = DeepSetEncoder(
            portfolio_feature_dim, hidden_dim, dropout_prob
        )
        self.effective_hidden_dim = hidden_dim * 4
        self.layer_norm_fusion = nn.LayerNorm(self.effective_hidden_dim)

        if use_pma:
            self.pma = PMA(hidden_dim, num_heads, num_seeds, dropout_prob)

            self.pma_output_proj = nn.Sequential(
                nn.Linear(hidden_dim * num_seeds, self.effective_hidden_dim),
                nn.LayerNorm(self.effective_hidden_dim),
                nn.GELU(),
                nn.Dropout(dropout_prob)
            )

            self.alpha = nn.Parameter(torch.full((self.effective_hidden_dim,), 0.5))

        self.output_proj = nn.Sequential(
            nn.LayerNorm(self.effective_hidden_dim),
            nn.Dropout(dropout_prob),
            nn.Linear(self.effective_hidden_dim, hidden_dim * 2),
            nn.GELU(),
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_dim * 2, latent_dim)
        )

    def forward(self, portfolio, mask=None):
        # DeepSets path: size-invariant by design (mean, std, max, min)
        h_deepsets = self.deepsets_module(portfolio, mask)  # [B, 4*H]

        if self.use_pma:

            h_encoded_per_item = self.deepsets_module.encoder(portfolio)  # [B, N, H]
            h_pma_seeds = self.pma(h_encoded_per_item, mask)  # [B, num_seeds, H]

            # Flatten seeds
            h_pma_flat = h_pma_seeds.flatten(1)  # [B, num_seeds * H]
            h_pma_proj = self.pma_output_proj(h_pma_flat)  # [B, 4*H]

            # Simple learnable weighted fusion
            alpha = torch.sigmoid(self.alpha)
            h_fused = alpha * h_deepsets + (1 - alpha) * h_pma_proj
        else:
            h_fused = h_deepsets
        h_fused = self.layer_norm_fusion(h_fused)
        return self.output_proj(h_fused)


# ===================== Final DeepONet =====================
class DeepONet(nn.Module):
    """DeepONet with truly size-invariant set encoding."""
    def __init__(self, portfolio_feature_dim=3, hidden_dim=64, latent_dim=256,
                 dropout_prob=DRPO_OUT_PROB, num_heads=NUM_HEADS, use_pma=True,
                 num_seeds=NUM_SEEDS):
        super().__init__()

        self.branch_net = BranchNet(
            portfolio_feature_dim=portfolio_feature_dim,
            hidden_dim=hidden_dim,
            latent_dim=latent_dim,
            dropout_prob=dropout_prob,
            num_heads=num_heads,
            num_seeds=num_seeds,
            use_pma=use_pma
        )

        self.trunk_net = TrunkNet(
            input_dim=1,
            latent_dim=latent_dim,
            hidden_dim=hidden_dim
        )

        self.bias = nn.Parameter(torch.zeros(1))
        self.branch_scale = nn.Parameter(torch.ones(1))
        self.trunk_scale = nn.Parameter(torch.ones(1))

    def forward(self, portfolio, S_T, mask=None):
        """
        portfolio: [B, N, feature_dim]
        S_T: [B, M]
        mask: [B, N]
        """
        branch_out = self.branch_net(portfolio, mask=mask) * self.branch_scale
        trunk_out = self.trunk_net(S_T.unsqueeze(-1)) * self.trunk_scale

        branch_expanded = branch_out.unsqueeze(1)
        interaction = (branch_expanded * trunk_out).sum(dim=-1)

        return interaction + self.bias

# Gradient Prints

In [17]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.amp import GradScaler

# --- Gradient summary helper ---
def compute_gradient_stats(model):
    gradient_stats = {}
    total_norm = 0.0
    for name, param in model.named_parameters():
        if param.grad is not None:
            grad_norm = param.grad.norm().item()
            total_norm += grad_norm ** 2
            gradient_stats[name] = {
                'norm': grad_norm,
                'shape': tuple(param.grad.shape),
                'numel': param.grad.numel(),
                'mean': param.grad.mean().item(),
                'std': param.grad.std().item()
            }
    total_norm = total_norm ** 0.5
    return total_norm, gradient_stats, None

def print_gradient_summary(gradient_stats, total_norm, epoch, batch_idx=None):
    prefix = f"Epoch {epoch}" + (f", Batch {batch_idx}" if batch_idx is not None else "")
    print(f"\n🔍 === Gradient Analysis - {prefix} ===")
    print(f"Total Gradient Norm: {total_norm:.6f}")

    if total_norm > 30.0:
        print("🚨 CRITICAL: Severe gradient explosion! Consider stopping training.")
    elif total_norm > 20.0:
        print("⚠️  SEVERE: Major gradient explosion detected!")
    elif total_norm > 10.0:
        print("⚠️  WARNING: Moderate gradient explosion detected!")
    elif total_norm < 1e-6:
        print("⚠️  WARNING: Vanishing gradients detected!")
    else:
        print("✅ Gradient norm is healthy")

    sorted_layers = sorted(gradient_stats.items(), key=lambda x: x[1]['norm'], reverse=True)
    print(f"\nTop 5 layers by gradient norm (out of {len(gradient_stats)} total):")
    for i, (layer_name, stats) in enumerate(sorted_layers[:5]):
        status = "🔥" if stats['norm'] > 3.0 else "⚠️" if stats['norm'] > 1.0 else "✅"
        print(f"  {status} {i+1}. {layer_name}: {stats['norm']:.4f}")
        print(f"      Shape: {stats['shape']}, Elements: {stats['numel']}")
        print(f"      Mean: {stats['mean']:.6f}, Std: {stats['std']:.6f}")
    print("=" * 60)


# Early Stopping


In [18]:
import torch
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

class ExtendedEarlyStopping:
    # ... (no changes needed here) ...
    def __init__(self, patience=30, min_delta=0.0005, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.wait = 0
        self.stopped_epoch = 0
        self.best = float('inf')
        self.best_weights = None

    def __call__(self, val_loss, model=None):
        if val_loss < self.best - self.min_delta:
            self.best = val_loss
            self.wait = 0
            if model is not None and self.restore_best_weights:
                self.best_weights = model.state_dict().copy()
        else:
            self.wait += 1

        if self.wait >= self.patience:
            self.stopped_epoch = True
            if model is not None and self.restore_best_weights and self.best_weights is not None:
                model.load_state_dict(self.best_weights)

        return self.stopped_epoch


# The Trainer

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import GradScaler # Assuming GradScaler is used for mixed precision

# Trainer
class OptimizedTrainer:
    def __init__(self, model, device='cuda', monitor_gradients=True,
                 learning_rate=5e-6, lambda_deriv_weight=0.1, weight_decay=1e-4,
                 scheduler_T_0=10, scheduler_T_mult=2, scheduler_eta_min=1e-6, # New scheduler parameters integrated
                 warmup_epochs=5, initial_scale=0.05, final_scale=1.0, grad_log_threshold = 5.0):
        self.model = model.to(device)
        self.device = device
        self.monitor_gradients = monitor_gradients
        self.lambda_deriv_weight = lambda_deriv_weight

        # Optimizer
        self.optimizer = optim.AdamW(
            model.parameters(),
            lr=learning_rate,
            weight_decay=weight_decay,
            betas=(0.9, 0.999),
            eps=1e-8
        )

        # Cosine Annealing Warm Restarts scheduler (epoch-based)
        # T_mult (scheduler_T_mult) is now correctly passed, enabling warm restarts cycle length to increase
        self.scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
            self.optimizer,
            T_0=scheduler_T_0,
            T_mult=scheduler_T_mult,
            eta_min=scheduler_eta_min
        )

        # Gradient scaler for mixed precision
        self.scaler = GradScaler()
        self.huber_loss = nn.SmoothL1Loss(beta=1.0)

        # Branch/trunk scale (warmup is separate from scheduler T_0 now)
        self.warmup_epochs = warmup_epochs
        self.initial_scale = initial_scale
        self.final_scale = final_scale
        if hasattr(self.model, 'branch_scale') and hasattr(self.model, 'trunk_scale'):
            with torch.no_grad():
                self.model.branch_scale.fill_(initial_scale)
                self.model.trunk_scale.fill_(initial_scale)

    def check_model_health(self, epoch, batch_idx):
        for name, param in self.model.named_parameters():
            if torch.isnan(param).any() or torch.isinf(param).any():
                print(f"Bad parameter: {name} at Epoch {epoch}, Batch {batch_idx}")
                return False
        return True

    def compute_loss(self, pred_cashflow, true_cashflow, pred_deriv=None, true_deriv=None, mask=None):

        cashflow_loss = self.huber_loss(pred_cashflow, true_cashflow)

        # --- Derivative loss (no masking needed) ---
        if pred_deriv is not None and true_deriv is not None:
            deriv_loss = self.huber_loss(pred_deriv, true_deriv)
            total_loss = cashflow_loss + self.lambda_deriv_weight * deriv_loss
        else:
            deriv_loss = torch.tensor(0.0, device=pred_cashflow.device)
            total_loss = cashflow_loss

        return total_loss, cashflow_loss, deriv_loss

    def train_step(self, portfolio, S_T, cashflow, true_derivative=None, mask=None, epoch=0, batch_idx=0, log_gradients=False):
      self.optimizer.zero_grad()

      S_T = S_T.clone().detach().requires_grad_(True).to(self.device)
      portfolio = portfolio.to(self.device)
      cashflow = cashflow.to(self.device)
      if mask is not None:
          mask = mask.to(self.device)
      if true_derivative is not None:
          true_derivative = true_derivative.to(self.device)

      pred_cashflow = self.model(portfolio, S_T, mask=mask)

      pred_deriv = None
      if true_derivative is not None:
          # FIXED: Compute gradients per scenario, not summed
          pred_deriv = torch.autograd.grad(
              outputs=pred_cashflow,
              inputs=S_T,
              grad_outputs=torch.ones_like(pred_cashflow),  # ADD THIS LINE
              retain_graph=True,
              create_graph=True,
              allow_unused=True
          )[0]

      total_loss, cashflow_loss, deriv_loss = self.compute_loss(
          pred_cashflow, cashflow, pred_deriv, true_derivative, mask=None
      )

      self.scaler.scale(total_loss).backward()
      self.scaler.unscale_(self.optimizer)
      torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
      self.scaler.step(self.optimizer)
      self.scaler.update()

      return total_loss.item(), cashflow_loss.item(), deriv_loss.item()

    def val_step(self, portfolio, S_T, cashflow, true_derivative=None, mask=None):
      """
      Validation step that can compute derivative loss as well.
      """
      self.model.eval()

      portfolio = portfolio.to(self.device)
      S_T = S_T.clone().detach().requires_grad_(True).to(self.device)  # need grad for derivative
      cashflow = cashflow.to(self.device)
      if mask is not None:
          mask = mask.to(self.device)
      if true_derivative is not None:
          true_derivative = true_derivative.to(self.device)

      # always compute forward with grad enabled (so we can use for both losses)
      with torch.enable_grad():
          pred_cashflow = self.model(portfolio, S_T, mask=mask)

          pred_deriv = None
          if true_derivative is not None:
              pred_deriv = torch.autograd.grad(
                  outputs=pred_cashflow,
                  inputs=S_T,
                  grad_outputs=torch.ones_like(pred_cashflow),
                  retain_graph=False,
                  create_graph=False,
                  allow_unused=True
              )[0]

      # detach before computing loss to avoid holding graph in memory
      total_loss, cashflow_loss, deriv_loss = self.compute_loss(
          pred_cashflow.detach(), cashflow,
          pred_deriv.detach() if pred_deriv is not None else None,
          true_derivative, mask=None
      )

      return total_loss.item(), cashflow_loss.item(), deriv_loss.item()

    def step_scheduler_epoch(self):
        """Step scheduler once per epoch."""
        self.scheduler.step()


# Hyperparameters

In [24]:
import os



def get_stable_hyperparameters():

    return {
        "learning_rate": 3e-4,
        "weight_decay": 1e-5,
        "lambda_deriv": 1,
        "lambda_reg": 1e-4,
        "gradient_clip_norm": 1,
        "batch_size": 128,
        "scheduler_T0": 20,
        "scheduler_T_mult":2,
        "scheduler_eta_min":1e-6,
        "early_stopping_patience": 50,
    }

HIDDEN_DIM = 128  # Increased capacity
LATENT_DIM = 256 # Maintain
BATCH_SIZE = 128
TOTAL_EPOCHS = 500
FEATURE_DIM = 3
PORFOLIO_MAX_LENGTH = 500
PORT_SAMPLE_SIZE = 51200
FEED_ST_LEN_PER_PORT = 100

CAPACITY_EVAL = 1000

# === SAVE SCALERS ===
DRIVE_PATH = "/content/drive/MyDrive/15Oct/"
CHECK_POIN_DIR = DRIVE_PATH + "checkpoints/"
os.makedirs(CHECK_POIN_DIR, exist_ok=True)

# Save Check points function

In [21]:

def save_model_checkpoint(model, save_path, epoch=None, optimizer=None, scheduler=None,
                         train_loss=None, val_loss=None, train_size=None, val_size=None):

    checkpoint_data = {
        "model_state_dict": model.state_dict(),
        "hparams": {
            "hidden_dim": HIDDEN_DIM,
            "latent_dim": LATENT_DIM,
            "portfolio_feature_dim": FEATURE_DIM,
            "use_enhanced_transformer": True
        },
        "training_config": {
            "PORT_LEN": PORFOLIO_MAX_LENGTH,
            "PORT_SAMPLE_SIZE": PORT_SAMPLE_SIZE,
            "FEED_ST_LEN_EACH_PORT": FEED_ST_LEN_PER_PORT,
            "batch_size": BATCH_SIZE,
            "train_size": train_size,
            "val_size": val_size
        },
        "scaler_files": {
            "K_scaler": "K_Scalar_Training.pkl",
            "S_T_scaler": "S_T_Scalar_Training.pkl",
            "cashflow_scaler": "Cashflow_Scalar_Training.pkl"
        }
    }

    # Add optional training state information
    if epoch is not None:
        checkpoint_data["epoch"] = epoch
    if optimizer is not None:
        checkpoint_data["optimizer_state_dict"] = optimizer.state_dict()
    if scheduler is not None:
        checkpoint_data["scheduler_state_dict"] = scheduler.state_dict()
    if train_loss is not None:
        checkpoint_data["train_loss"] = train_loss
    if val_loss is not None:
        checkpoint_data["val_loss"] = val_loss

    torch.save(checkpoint_data, save_path)

In [22]:
def fit_scalars_and_save():
  raw_dataset = DatasetStandardized(
        num_samples=PORT_SAMPLE_SIZE,
        min_portfolio_size=1,
        max_portfolio_size=CAPACITY_EVAL,
        num_samples_S_T=FEED_ST_LEN_PER_PORT,
        is_fitting_mode=True
    )
  raw_loader_fitting = DataLoader(
        raw_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=collate_fn)

  print(f"Fitting scalers on RAW data of size {CAPACITY_EVAL}")
  K_scalar, S_T_scalar = fit_K_ST_scalers(raw_loader_fitting)
  cashflow_scaler = fit_cashflow_scaler(raw_loader_fitting)

  try:
      joblib.dump(K_scalar, DRIVE_PATH + "K_Scalar_Training.pkl")
      joblib.dump(S_T_scalar, DRIVE_PATH + "S_T_Scalar_Training.pkl")
      joblib.dump(cashflow_scaler, DRIVE_PATH + "Cashflow_Scalar_Training.pkl")
      print("Successfully saved all training scalers")

      print(f"Scaler Statistics:")
      print(f"K_scaler - mean: {K_scalar.mean_[0]:.4f}, std: {np.sqrt(K_scalar.var_[0]):.4f}")
      print(f"S_T_scaler - mean: {S_T_scalar.mean_[0]:.4f}, std: {np.sqrt(S_T_scalar.var_[0]):.4f}")
      print(f"Cashflow_scaler - mean: {cashflow_scaler.mean_[0]:.4f}, std: {np.sqrt(cashflow_scaler.var_[0]):.4f}")
      return K_scalar, S_T_scalar, cashflow_scaler
  except Exception as e:
      print(f"Error saving scalers: {e}")


In [None]:
import torch
from torch.utils.data import DataLoader, random_split
import numpy as np
import joblib
from comet_ml import start
from tqdm import tqdm
import os

# === COMET SETUP ===
experiment = start(
    api_key="iatWnXT4JyBtDQhn7OfgISQoF",
    project_name="option-portfolio-encoder-decoder",
    workspace="satyabratkumarsingh"
)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)


def main():
    """
    Training loop with curriculum learning on portfolio sizes.
    """
    hparams = get_stable_hyperparameters()
    experiment.log_parameters(hparams)

    # === Model ===
    model = DeepONet(
        portfolio_feature_dim=FEATURE_DIM,
        hidden_dim=HIDDEN_DIM,
        latent_dim=LATENT_DIM,
        use_pma=True,
        num_seeds=NUM_SEEDS
    ).to(DEVICE)

    K_scalar, S_T_scalar, cashflow_scaler = fit_scalars_and_save()

    # Now get the training and validation dataset
    normalized_dataset = DatasetStandardized(
        num_samples=PORT_SAMPLE_SIZE,
        min_portfolio_size=1,
        max_portfolio_size=PORFOLIO_MAX_LENGTH,
        num_samples_S_T=FEED_ST_LEN_PER_PORT,
        K_scaler=K_scalar,
        S_T_scaler=S_T_scalar,
        cashflow_scaler=cashflow_scaler,
        is_fitting_mode=False
    )
    train_size = int(0.8 * len(normalized_dataset))
    val_size = len(normalized_dataset) - train_size
    val_size = (val_size // BATCH_SIZE) * BATCH_SIZE
    train_size = len(normalized_dataset) - val_size


    train_dataset, val_dataset = random_split(normalized_dataset, [train_size, val_size])
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=collate_fn
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=collate_fn
    )

    # === Trainer ===
    trainer = OptimizedTrainer(
    model,
    device=DEVICE,
    learning_rate=hparams["learning_rate"],
    lambda_deriv_weight=hparams["lambda_deriv"],
    weight_decay=hparams["weight_decay"],
    monitor_gradients=True,
    grad_log_threshold=5.0,
    scheduler_T_0=hparams["scheduler_T0"],
    scheduler_T_mult=hparams["scheduler_T_mult"],
    scheduler_eta_min=hparams["scheduler_eta_min"],
    warmup_epochs=5,
    initial_scale=0.05,
    final_scale=1.0
    )


    early_stopper = ExtendedEarlyStopping(
        patience=50, min_delta=0.001, restore_best_weights=True
    )

    # === Track best model ===
    best_val_loss = float('inf')
    best_epoch = 0

    for epoch in range(TOTAL_EPOCHS):

        train_dataset.dataset.set_epoch(epoch)
        model.train()

        # --- Training Loop ---
        train_total_losses, train_cf_losses, train_deriv_losses = [], [], []

        for batch_idx, batch in enumerate(tqdm(train_loader, desc=f"Training Epoch {epoch}")):

            portfolio = batch["portfolio"].to(DEVICE)
            portfolio = portfolio[:, :, :FEATURE_DIM]

            mask = batch["mask"].to(DEVICE)
            S_T = batch["S_T"].to(DEVICE)
            cashflow = batch["cashflow"].to(DEVICE)
            derivative = batch["derivative"].to(DEVICE)

            # Training step
            total, cf, deriv = trainer.train_step(
                portfolio,
                S_T.clone().detach().requires_grad_(True),
                cashflow,
                derivative,
                mask
            )

            train_total_losses.append(total)
            train_cf_losses.append(cf)
            train_deriv_losses.append(deriv)

            # Batch-level logging
            global_step = epoch * len(train_loader) + batch_idx
            experiment.log_metric("train_total_loss_batch", total, step=global_step)
            experiment.log_metric("train_cashflow_loss_batch", cf, step=global_step)
            experiment.log_metric("train_derivative_loss_batch", deriv, step=global_step)

        # Epoch-level training metrics
        avg_train_total = np.mean(train_total_losses)
        avg_train_cf = np.mean(train_cf_losses)
        avg_train_deriv = np.mean(train_deriv_losses)

        experiment.log_metric("train_total_loss_epoch", avg_train_total, step=epoch)
        experiment.log_metric("train_cashflow_loss_epoch", avg_train_cf, step=epoch)
        experiment.log_metric("train_derivative_loss_epoch", avg_train_deriv, step=epoch)


        model.eval()
        val_total_losses, val_cf_losses, val_deriv_losses = [], [], []

        with torch.no_grad():
            for batch_idx, batch in enumerate(val_loader):
                portfolio = batch["portfolio"].to(DEVICE)
                portfolio = portfolio[:, :, :FEATURE_DIM]

                mask = batch["mask"].to(DEVICE)
                S_T = batch["S_T"].to(DEVICE)
                cashflow = batch["cashflow"].to(DEVICE)
                derivative = batch["derivative"].to(DEVICE)

                # Validation step
                total, cf, deriv = trainer.val_step(
                    portfolio, S_T, cashflow, derivative, mask
                )

                val_total_losses.append(total)
                val_cf_losses.append(cf)
                val_deriv_losses.append(deriv)

                # Batch-level validation logging
                global_step = epoch * len(val_loader) + batch_idx
                experiment.log_metric("val_total_loss_batch", total, step=global_step)
                experiment.log_metric("val_cashflow_loss_batch", cf, step=global_step)
                experiment.log_metric("val_derivative_loss_batch", deriv, step=global_step)

        # Step the scheduler
        trainer.step_scheduler_epoch()

        current_lr = trainer.optimizer.param_groups[0]['lr']

        # Epoch-level validation metrics
        avg_val_total = np.mean(val_total_losses)
        avg_val_cf = np.mean(val_cf_losses)
        avg_val_deriv = np.mean(val_deriv_losses)

        experiment.log_metric("val_total_loss_epoch", avg_val_total, step=epoch)
        experiment.log_metric("val_cashflow_loss_epoch", avg_val_cf, step=epoch)
        experiment.log_metric("val_derivative_loss_epoch", avg_val_deriv, step=epoch)

        print(
            f"Epoch [{epoch}/{TOTAL_EPOCHS}] "
            f"Train → total: {avg_train_total:.6f}, cf: {avg_train_cf:.6f}, deriv: {avg_train_deriv:.6f} | "
            f"Val → total: {avg_val_total:.6f}, cf: {avg_val_cf:.6f}, deriv: {avg_val_deriv:.6f}"
        )

        # === SAVE BEST MODEL ===
        if avg_val_total < best_val_loss:
            best_val_loss = avg_val_total
            best_epoch = epoch + 1

            best_checkpoint_path = CHECK_POIN_DIR + "deeponet_model-BEST.pt"
            save_model_checkpoint(
                model=model,
                save_path=best_checkpoint_path,
                epoch=epoch + 1,
                optimizer=trainer.optimizer,
                scheduler=trainer.scheduler if hasattr(trainer, 'scheduler') else None,
                train_loss=avg_train_total,
                val_loss=avg_val_total
            )
            print(f"NEW BEST MODEL! Epoch {epoch + 1}, Val Loss: {avg_val_total:.6f}")

        # === SAVE MODEL EVERY 50 EPOCHS ===
        if (epoch + 1) % 50 == 0:
            checkpoint_path = CHECK_POIN_DIR + f"deeponet_model-epoch{epoch + 1}.pt"
            save_model_checkpoint(
                model=model,
                save_path=checkpoint_path,
                epoch=epoch + 1,
                optimizer=trainer.optimizer,
                scheduler=trainer.scheduler if hasattr(trainer, 'scheduler') else None,
                train_loss=avg_train_total,
                val_loss=avg_val_total
            )
            print(f"Model checkpoint saved at epoch {epoch + 1}: {checkpoint_path}")

        # --- Early Stopping Check ---
        stop = early_stopper(avg_val_total, model)
        if stop:
            print(f"Early stopping triggered at epoch {epoch}. Best val loss: {early_stopper.best:.6f}")
            break

    # === SAVE FINAL MODEL ===
    save_path = DRIVE_PATH + "final_deeponet_model.pt"
    save_model_checkpoint(model=model, save_path=save_path)

    # === TRAINING SUMMARY ===
    print("\n" + "="*70)
    print("TRAINING COMPLETE")
    print("="*70)
    print(f"Best Model:")
    print(f"  - Epoch: {best_epoch}")
    print(f"  - Validation Loss: {best_val_loss:.6f}")
    print(f"  - Saved at: {CHECK_POIN_DIR}deeponet_model-BEST.pt")
    print(f"\nFinal Model:")
    print(f"  - Saved at: {save_path}")
    print(f"\nCheckpoints:")
    print(f"  - Directory: {CHECK_POIN_DIR}")
    print(f"  - Saved every 50 epochs")
    print(f"\nScalers:")
    print(f"  - Saved at: {DRIVE_PATH}*_Scalar_Training.pkl")
    print("="*70)

    experiment.end()


if __name__ == "__main__":
    main()

[1;38;5;39mCOMET INFO:[0m An experiment with the same configuration options is already running and will be reused.


Fitting scalers on RAW data of size 1000
Fitting K and S_T scalers from training set...


Collecting K and S_T for scalers:  10%|█         | 40/400 [00:13<02:02,  2.94it/s]

In [44]:
experiment.end()

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : okay_rosin_3999
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/satyabratkumarsingh/option-portfolio-encoder-decoder/fa8bd5ba49bb4137981ef05361e45c09
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     train_cashflow_loss_batch [30400]   : (0.01226563099771738, 0.147788405418396)
[1;38;5;39mCOMET INFO:[0m     train_cashflow_loss_epoch [95]      : (0.022968936304096133, 0.08902193123940379)
[1;38;5;39mCOMET INFO:[0m     train_derivative_loss_batch [30400] : (0.022049417719244957, 0.3778

In [None]:
experiment.end()

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : balanced_dowel_6941
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/satyabratkumarsingh/option-portfolio-encoder-decoder/99a57b0cd3da4bb4bd9e4a2180650536
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     train_cashflow_loss_batch [79205]   : (0.006972298491746187, 0.1393452286720276)
[1;38;5;39mCOMET INFO:[0m     train_cashflow_loss_epoch [247]     : (0.01495979430328589, 0.08376031016232446)
[1;38;5;39mCOMET INFO:[0m     train_derivative_loss_batch [79205] : (0.01581203006207943, 0.