# v204: Beignet Public h=256 Upgrade

**目的:** 升級 Beignet Public TCN 從 h=128 到 h=256，大幅降低 Beignet Public MSE

**已知最佳 h=128 配置:**
- h=128, L=3, dropout=0.25, CORAL λ=3.0, Mean=1.0, 9 features -> Public MSE 52,323

**Sweep 計畫:**
- h=256 with different L (3, 4), dropout (0.25, 0.30, 0.35), CORAL λ (3.0, 5.0, 7.0；Mean λ 隨之取 1.0, 2.0, 3.0)，另含 h=128 baseline 作為對照
- 找到最佳配置後訓練 5 seeds

**推論時間預算:** 5 seeds h=256 ~ +86s on Codabench -> total ~356s / 600s = SAFE

In [None]:
# ============================================================================
# Cell 1: Setup
# ============================================================================

from google.colab import drive
# Mount Google Drive so the project folder is reachable from the Colab VM.
drive.mount('/content/drive')

# Project layout on Drive: raw training data and the dev-phase test inputs
# both live under 1_data/raw.
PROJECT_ROOT = '/content/drive/MyDrive/Hackathon_NSF_Neural_Forecasting'
_RAW_DATA_DIR = PROJECT_ROOT + '/1_data/raw'
TRAIN_DIR = _RAW_DATA_DIR + '/train_data_neuro'
TEST_DIR = _RAW_DATA_DIR + '/test_dev_input'

import os, time, torch, numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
_has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if _has_cuda else torch.device('cpu')
print(f'Device: {device}')
if _has_cuda:
    print(f'GPU: {torch.cuda.get_device_name(0)}')

In [None]:
# ============================================================================
# Cell 2: Architecture (CORAL losses + TCN classes + augmentation)
# ============================================================================

# --- CORAL Losses ---

def coral_loss(source, target):
    """CORAL-style loss: squared distance between two feature covariances.

    Both inputs are indexed as (samples, features): dimension 0 is averaged
    over and dimension 1 gives the feature count ``d``. The summed squared
    difference of the two (d, d) covariance matrices is divided by ``4 * d``.
    """
    def _covariance(feats):
        # Centre over dim 0, then form the (d, d) covariance. The 1e-8 keeps
        # the divisor non-zero when only a single sample is given.
        centered = feats - feats.mean(0, keepdim=True)
        return centered.T @ centered / (feats.size(0) - 1 + 1e-8)

    n_feat = source.size(1)
    cov_gap = _covariance(source) - _covariance(target)
    return (cov_gap ** 2).sum() / (4 * n_feat)

def mean_alignment_loss(source, target):
    """Mean squared difference between the dim-0 means of source and target."""
    mean_gap = source.mean(0) - target.mean(0)
    return mean_gap.pow(2).mean()

# --- TCN Architecture ---

class CausalConv1d(nn.Module):
    """1-D convolution whose output at step t only sees inputs at steps <= t.

    The wrapped ``nn.Conv1d`` pads both ends by ``(kernel_size - 1) * dilation``;
    ``forward`` then drops the trailing ``padding`` steps, so the output has the
    same length as the input.
    """

    def __init__(self, in_channels, out_channels, kernel_size, dilation=1):
        super().__init__()
        self.padding = dilation * (kernel_size - 1)
        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size,
            padding=self.padding,
            dilation=dilation,
        )

    def forward(self, x):
        out = self.conv(x)
        if self.padding <= 0:
            # No padding was added, so there is nothing to trim.
            return out
        return out[:, :, :-self.padding]

class TCNBlock(nn.Module):
    """Residual block: two causal convs, each followed by BatchNorm, GELU, dropout.

    The skip path is the identity when ``in_ch == out_ch`` and a 1x1
    convolution otherwise.
    """

    def __init__(self, in_ch, out_ch, kernel_size, dilation, dropout=0.2):
        super().__init__()
        # Sub-modules are created in the same order as before so that seeded
        # initialisation and the state_dict layout are unchanged.
        self.conv1 = CausalConv1d(in_ch, out_ch, kernel_size, dilation)
        self.conv2 = CausalConv1d(out_ch, out_ch, kernel_size, dilation)
        self.norm1 = nn.BatchNorm1d(out_ch)
        self.norm2 = nn.BatchNorm1d(out_ch)
        self.activation = nn.GELU()
        self.dropout = nn.Dropout(dropout)
        if in_ch == out_ch:
            self.residual = nn.Identity()
        else:
            self.residual = nn.Conv1d(in_ch, out_ch, 1)

    def forward(self, x):
        skip = self.residual(x)
        stages = ((self.conv1, self.norm1), (self.conv2, self.norm2))
        for conv, norm in stages:
            x = self.dropout(self.activation(norm(conv(x))))
        return x + skip

class TCNEncoder(nn.Module):
    """Stack of TCNBlocks whose dilation doubles per layer (1, 2, 4, ...).

    ``forward`` swaps the last two axes before the 1x1 input projection and
    swaps them back on return, so input and output share the same axis order.
    """

    def __init__(self, in_size, h_size, n_layers=4, k_size=3, dropout=0.2):
        super().__init__()
        self.input_proj = nn.Conv1d(in_size, h_size, 1)
        blocks = []
        for depth in range(n_layers):
            blocks.append(TCNBlock(h_size, h_size, k_size, 2 ** depth, dropout))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        # Conv1d convolves over the last axis, so move it there first.
        hidden = self.input_proj(x.transpose(1, 2))
        for block in self.layers:
            hidden = block(hidden)
        return hidden.transpose(1, 2)

class TCNForecaster(nn.Module):
    """Per-channel TCN encoder followed by attention across channels.

    Input is a 4-D tensor unpacked as (B, T, C, F). Each channel's sequence is
    concatenated with a learned channel embedding, encoded by the TCN, and
    reduced to its last time step. Self-attention then mixes the C channel
    vectors, and a small MLP emits 10 values per channel.

    ``forward`` returns a (B, 10, C) prediction; with ``return_features=True``
    it also returns the channel-averaged (B, h) feature vector.
    """

    def __init__(self, n_ch, n_feat=1, h=64, n_layers=3, dropout=0.3):
        super().__init__()
        emb_dim = h // 4
        self.channel_embed = nn.Embedding(n_ch, emb_dim)
        self.input_proj = nn.Linear(n_feat + emb_dim, h)
        self.tcn = TCNEncoder(h, h, n_layers, 3, dropout)
        self.cross_attn = nn.MultiheadAttention(h, 4, dropout=dropout, batch_first=True)
        self.attn_norm = nn.LayerNorm(h)
        self.pred_head = nn.Sequential(
            nn.Linear(h, h),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(h, 10),
        )

    def forward(self, x, return_features=False):
        batch, steps, n_ch = x.shape[0], x.shape[1], x.shape[2]

        # Broadcast one embedding per channel over batch and time.
        ch_ids = torch.arange(n_ch, device=x.device)
        ch_emb = self.channel_embed(ch_ids)[None, None].expand(batch, steps, -1, -1)

        # Fold channels into the batch axis so the TCN sees one sequence per channel.
        tokens = torch.cat([x, ch_emb], -1)
        tokens = tokens.permute(0, 2, 1, 3).reshape(batch * n_ch, steps, -1)
        encoded = self.tcn(self.input_proj(tokens))

        # Keep only the final time step, then unfold back to (B, C, h).
        per_channel = encoded[:, -1, :].view(batch, n_ch, -1)
        attended = self.cross_attn(per_channel, per_channel, per_channel)[0]
        per_channel = self.attn_norm(per_channel + attended)

        pred = self.pred_head(per_channel).transpose(1, 2)
        if return_features:
            return pred, per_channel.mean(dim=1)
        return pred

# --- Augmentation ---

def augment_batch(x, y):
    """Randomly perturb a source batch during training.

    x: (B, T, C, F) normalized inputs
    y: (B, T_out, C) normalized targets

    Three independent augmentations are drawn in a fixed order; the inputs
    are never modified in place. Returns the (possibly new) ``(x, y)`` pair.
    """
    n_channels = x.shape[2]
    dev = x.device

    def _fires(prob):
        # One CPU uniform draw per augmentation decides whether it applies.
        return torch.rand(1).item() < prob

    # 1. Per-channel mean shift on feature 0, mirrored onto the targets
    #    (simulates baseline drift between domains).
    if _fires(0.5):
        shift = 0.15 * torch.randn(1, 1, n_channels, 1, device=dev)
        x = x.clone()
        x[..., :1] += shift
        y = y + shift.view(1, n_channels)

    # 2. Per-channel amplitude scaling of all features and of the targets
    #    (simulates gain drift).
    if _fires(0.5):
        scale = 1.0 + 0.08 * torch.randn(1, 1, n_channels, 1, device=dev)
        x = x * scale
        y = y * scale.view(1, n_channels)

    # 3. Additive Gaussian noise on the inputs only (regularization).
    if _fires(0.3):
        x = x + 0.03 * torch.randn_like(x)

    return x, y

# --- Param count check ---
# Instantiate each candidate size once (89 channels, 9 features, dropout 0.25)
# purely to report how many parameters it has.
for width, depth in ((128, 3), (256, 3), (256, 4)):
    probe = TCNForecaster(89, 9, width, depth, 0.25)
    total = 0
    for param in probe.parameters():
        total += param.numel()
    print(f'h={width}, layers={depth}: {total:,} params ({total/1e6:.2f}M)')

print('\nArchitecture, CORAL losses, and augmentation defined.')

In [None]:
# ============================================================================
# Cell 3: Load Data
# ============================================================================

# np.load on a .npz returns a lazy NpzFile that holds the archive open; read
# the array inside a `with` block so the file handle is released right away.
with np.load(f'{TRAIN_DIR}/train_data_beignet.npz') as npz:
    train_data = npz['arr_0']
with np.load(f'{TEST_DIR}/test_data_beignet_masked.npz') as npz:
    test_public = npz['arr_0']

# Inputs: first 10 steps along axis 1, first n_features entries of the last axis.
# Targets: the remaining steps of entry 0 on the last axis.
n_features = 9
X_train = train_data[:, :10, :, :n_features].astype(np.float32)
Y_train = train_data[:, 10:, :, 0].astype(np.float32)
X_target = test_public[:, :10, :, :n_features].astype(np.float32)

# Normalization: mean/std computed from X_train over axes (0,1)
# NOTE(review): these statistics are computed before the validation split
# below, so the last n_val samples contribute to them - confirm that is intended.
mean = X_train.mean(axis=(0,1), keepdims=True)
std = X_train.std(axis=(0,1), keepdims=True) + 1e-8

# Targets are normalized with the statistics of entry 0 on the last axis,
# the same entry Y_train was sliced from.
X_train_n = (X_train - mean) / std
Y_train_n = (Y_train - mean[...,0]) / std[...,0]
X_target_n = (X_target - mean) / std

# Validation split: last 100 samples
n_val = 100
X_tr, X_val = X_train_n[:-n_val], X_train_n[-n_val:]
Y_tr, Y_val = Y_train_n[:-n_val], Y_train_n[-n_val:]

print(f'Train: {len(X_tr)}, Val: {len(X_val)}, Target: {len(X_target_n)}')
print(f'X shape: {X_tr.shape}, Y shape: {Y_tr.shape}')
print(f'Mean shape: {mean.shape}, Std shape: {std.shape}')

In [None]:
# ============================================================================
# Cell 4: Training Function
# ============================================================================

def train_coral_model(h_size, n_layers, dropout, coral_w, mean_w, seed,
                      epochs=250, patience=30, batch_size=32, use_aug=True):
    """Train a single TCN CORAL model with given config and seed.

    Reads the module-level arrays ``X_tr``/``Y_tr`` (source train),
    ``X_val``/``Y_val`` (source validation), ``X_target_n`` (unlabelled target
    inputs) and ``std``, plus the global ``device``.

    Args:
        h_size: hidden width passed to TCNForecaster.
        n_layers: number of TCN layers.
        dropout: dropout probability used throughout the model.
        coral_w: weight of ``coral_loss`` between source and target features.
        mean_w: weight of ``mean_alignment_loss`` between the same features.
        seed: seed applied to torch, NumPy and (if available) CUDA.
        epochs: maximum number of epochs; also the cosine schedule length.
        patience: stop after this many consecutive epochs without improvement.
        batch_size: batch size for all three data loaders.
        use_aug: apply ``augment_batch`` to each source batch when True.

    Returns:
        ``(best_val, best_state)``: the lowest validation score as a plain
        Python float, and a CPU copy of the model ``state_dict`` from that
        epoch (``None`` if the score never improved, e.g. it was NaN).

    Raises:
        ValueError: if ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        # Without at least one epoch there is nothing to report or return.
        raise ValueError(f'epochs must be >= 1, got {epochs}')

    torch.manual_seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    train_ds = TensorDataset(torch.FloatTensor(X_tr), torch.FloatTensor(Y_tr))
    val_ds = TensorDataset(torch.FloatTensor(X_val), torch.FloatTensor(Y_val))
    target_ds = TensorDataset(torch.FloatTensor(X_target_n))
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=False)
    val_dl = DataLoader(val_ds, batch_size=batch_size)
    target_dl = DataLoader(target_ds, batch_size=batch_size, shuffle=True)

    model = TCNForecaster(89, 9, h_size, n_layers, dropout).to(device)
    n_params = sum(p.numel() for p in model.parameters())
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    # Loop-invariant factor that rescales the normalized validation error by
    # the mean target variance.
    var_scale = (std[..., 0] ** 2).mean()

    best_val, best_state, no_improve = float('inf'), None, 0
    t0 = time.time()

    for epoch in range(epochs):
        model.train()
        target_iter = iter(target_dl)
        for xb, yb in train_dl:
            xb, yb = xb.to(device), yb.to(device)
            # Cycle through the target loader: restart it whenever it runs out
            # before the source loader does.
            try:
                (xt,) = next(target_iter)
            except StopIteration:
                target_iter = iter(target_dl)
                (xt,) = next(target_iter)
            xt = xt.to(device)

            # Apply augmentation
            if use_aug:
                xb, yb = augment_batch(xb, yb)

            optimizer.zero_grad()
            pred, feat_src = model(xb, return_features=True)
            _, feat_tgt = model(xt, return_features=True)
            # Supervised MSE on the source batch plus the two alignment terms
            # that pull source and target feature statistics together.
            loss = (
                ((pred - yb) ** 2).mean()
                + coral_w * coral_loss(feat_src, feat_tgt)
                + mean_w * mean_alignment_loss(feat_src, feat_tgt)
            )
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
        scheduler.step()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for xb, yb in val_dl:
                xb, yb = xb.to(device), yb.to(device)
                val_loss += ((model(xb) - yb)**2).sum().item()
        # NOTE(review): val_loss sums the squared error over every output
        # element but is divided by the sample count only, so this is a
        # per-sample sum rather than a per-element mean. The formula is left
        # unchanged here; confirm that this scale is the intended one.
        # The float() cast keeps NumPy scalars out of the returned value (and
        # therefore out of any checkpoint that stores it).
        val_mse = float((val_loss / len(X_val)) * var_scale)

        if val_mse < best_val:
            best_val = val_mse
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            no_improve = 0
        else:
            no_improve += 1
        if no_improve >= patience:
            break

    elapsed = time.time() - t0
    print(f'  seed={seed} h={h_size} L={n_layers} do={dropout} CORAL={coral_w} Mean={mean_w} '
          f'-> Val MSE: {best_val:.0f} ({n_params:,} params, ep {epoch+1}, {elapsed:.0f}s)')
    return best_val, best_state

print('Training function defined')

In [None]:
# ============================================================================
# Cell 5: Architecture Sweep
# ============================================================================

# Candidate configurations, each trained once with a fixed seed and ranked by
# validation score.
configs = [
    # (h, L, dropout, coral_lambda, mean_lambda, name)
    (256, 3, 0.25, 3.0, 1.0, "h256_L3_do025_c3"),     # baseline scaled up
    (256, 3, 0.35, 3.0, 1.0, "h256_L3_do035_c3"),     # more dropout
    (256, 4, 0.30, 3.0, 1.0, "h256_L4_do030_c3"),     # deeper
    (256, 3, 0.25, 5.0, 2.0, "h256_L3_do025_c5"),     # stronger CORAL
    (256, 3, 0.30, 5.0, 2.0, "h256_L3_do030_c5"),     # stronger CORAL + dropout
    (256, 3, 0.25, 7.0, 3.0, "h256_L3_do025_c7"),     # aggressive CORAL
    (128, 3, 0.25, 3.0, 1.0, "h128_baseline"),         # baseline for comparison
]

# Run each with seed=42
results = []
for cfg in configs:
    width, depth, drop, coral_lambda, mean_lambda, tag = cfg
    print(f'\n=== {tag} ===')
    score, weights = train_coral_model(width, depth, drop, coral_lambda, mean_lambda, seed=42)
    results.append((tag, width, depth, drop, coral_lambda, mean_lambda, score, weights))

# Sort in place (lowest validation score first) and print
results.sort(key=lambda row: row[6])
print('\n=== Sweep Results (sorted by val_mse) ===')
for row in results:
    print(f'  {row[0]}: Val MSE = {row[6]:,.0f}')

# The winning row supplies the hyper-parameters for the multi-seed run.
best = results[0]
print(f'\nBest: {best[0]} (Val MSE: {best[6]:,.0f})')
BEST_H, BEST_NL, BEST_DO, BEST_CW, BEST_MW = best[1:6]

In [None]:
# ============================================================================
# Cell 6: Multi-seed Training with Best Config
# ============================================================================

# Retrain the winning configuration once per seed, in list order.
SEEDS = [42, 123, 456, 789, 2024]
print(f'=== Training {len(SEEDS)} seeds with best config: h={BEST_H} L={BEST_NL} do={BEST_DO} CORAL={BEST_CW} Mean={BEST_MW} ===')

# Each entry is (seed, best validation score, best state_dict).
seed_results = [
    (seed, *train_coral_model(BEST_H, BEST_NL, BEST_DO, BEST_CW, BEST_MW, seed=seed))
    for seed in SEEDS
]

print(f'\n=== Multi-seed Results ===')
val_mses = []
for seed, score, _ in seed_results:
    print(f'  seed={seed}: Val MSE = {score:,.0f}')
    val_mses.append(score)
print(f'Mean: {np.mean(val_mses):,.0f}, Std: {np.std(val_mses):,.0f}')

In [None]:
# ============================================================================
# Cell 7: Save Checkpoints
# ============================================================================

# Output folder for this experiment's checkpoints and sweep log.
out_dir = f'{PROJECT_ROOT}/4_models/v204_beignet_pub_h256'
os.makedirs(out_dir, exist_ok=True)

# Hyper-parameters stored alongside every checkpoint so the model can be
# rebuilt without this notebook.
config = {
    'h_size': BEST_H,
    'n_layers': BEST_NL,
    'dropout': BEST_DO,
    'coral_weight': BEST_CW,
    'mean_weight': BEST_MW,
    'n_features': 9,
    'seeds': SEEDS,
}
print(f'Config: {config}')

for s, val_mse, state in seed_results:
    path = f'{out_dir}/model_tcn_seed{s}.pth'
    torch.save({
        'model_state_dict': state,
        # Store a plain Python float: a NumPy scalar here would be pickled as
        # a NumPy global, which torch.load(weights_only=True) refuses to load.
        'val_mse': float(val_mse),
        'config': config,
    }, path)
    fsize = os.path.getsize(path) / 1024
    print(f'Saved seed {s}: Val={val_mse:,.0f}, size={fsize:.0f}KB')

# Also save sweep results (one line per configuration, best first because
# `results` was sorted by validation score).
sweep_path = f'{out_dir}/sweep_results.txt'
with open(sweep_path, 'w', encoding='utf-8') as f:
    for name, h, nl, do, cw, mw, val_mse, _ in results:
        f.write(f'{name}: h={h} L={nl} do={do} CORAL={cw} Mean={mw} Val={val_mse:.0f}\n')
print(f'Sweep results saved to {sweep_path}')

In [None]:
# ============================================================================
# Cell 8: Download
# ============================================================================

from google.colab import files
# Trigger a browser download for every per-seed checkpoint.
ckpt_paths = [f'{out_dir}/model_tcn_seed{seed}.pth' for seed in SEEDS]
for ckpt_path in ckpt_paths:
    files.download(ckpt_path)

# Final summary, including the constants to copy into model.py.
summary_lines = [
    '\n=== Done! ===',
    f'Best config: h={BEST_H}, L={BEST_NL}, do={BEST_DO}, CORAL={BEST_CW}, Mean={BEST_MW}',
    f'Seeds: {SEEDS}',
    f'\nFor model.py, update config:',
    f'  V200_H = {BEST_H}',
    f'  V200_LAYERS = {BEST_NL}',
    f'  V200_DROPOUT = {BEST_DO}',
    f'  V200_SEEDS = {SEEDS}',
]
for line in summary_lines:
    print(line)