# v202b: Beignet Private h=128 (方向 2)

**目的:** Private 模型從 h=64 升級到 h=128

**背景:**
- Beignet Public h=64→128 帶來 -1,978 (v200)
- Private 目前還是 h=64, L=3, 1-feat, dropout=0.3
- Private 數據量小 (82+76=158 samples)，需要更強 regularization

**計劃:**
1. Architecture sweep: h=64 vs h=128 (with 1-feat and 9-feat)
2. Multi-seed training with best config
3. 使用 combined normalization (v200b 驗證過比 per-domain 好)

**預估效果:** Priv1 36,780→~36,200 (-580), Priv2 38,000→~37,500 (-500)

In [None]:
# Mount Google Drive so the project folder is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Locations inside the mounted Drive used by the rest of the notebook.
PROJECT_ROOT = '/content/drive/MyDrive/Hackathon_NSF_Neural_Forecasting'
DATA_DIR = f'{PROJECT_ROOT}/1_data/raw/train_data_neuro'
TEST_DIR = f'{PROJECT_ROOT}/1_data/raw/test_dev_input'

import os
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# Use the GPU when the runtime exposes one, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
if device.type == 'cuda':
    print(f'GPU: {torch.cuda.get_device_name(0)}')

In [None]:
# ============================================================================
# Load Private Training Data + Compute Proper Normalization
# ============================================================================

# Each archive stores a single unnamed array under the default key 'arr_0'.
train_priv1 = np.load(f'{DATA_DIR}/train_data_beignet_2022-06-01_private.npz')['arr_0']
train_priv2 = np.load(f'{DATA_DIR}/train_data_beignet_2022-06-02_private.npz')['arr_0']
print(f'Priv1: {train_priv1.shape}')  # expected (82, 20, 89, 9)
print(f'Priv2: {train_priv2.shape}')  # expected (76, 20, 89, 9)

# Combined private normalization (v200b Step 6 confirmed this is better).
# Statistics use only the first 10 time steps (the model input window) of both
# sessions pooled together, reduced over the sample and time axes so one
# mean/std remains per (channel, feature).
priv1_input = train_priv1[:, :10].astype(np.float32)
priv2_input = train_priv2[:, :10].astype(np.float32)
priv_all_input = np.concatenate([priv1_input, priv2_input], axis=0)
mean_priv = np.mean(priv_all_input, axis=(0, 1), keepdims=True)
# Small epsilon keeps the later division finite for constant entries.
std_priv = np.std(priv_all_input, axis=(0, 1), keepdims=True) + 1e-8

print(f'Combined Private std (feat0) avg: {std_priv[...,0].mean():.1f}')
print(f'Total private samples: {len(priv_all_input)}')

In [None]:
# ============================================================================
# TCN Architecture (same as v200b)
# ============================================================================

class CausalConv1d(nn.Module):
    """1-D convolution whose output at step t depends only on inputs <= t.

    ``nn.Conv1d`` pads both ends by ``(kernel_size - 1) * dilation``; the
    trailing ``padding`` outputs are then dropped, so the time length of the
    (batch, channels, time) input is preserved.
    """

    def __init__(self, in_channels, out_channels, kernel_size, dilation=1):
        super().__init__()
        # Number of past steps the kernel reaches back over.
        self.padding = dilation * (kernel_size - 1)
        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size,
            padding=self.padding,
            dilation=dilation,
        )

    def forward(self, x):
        y = self.conv(x)
        if self.padding <= 0:
            # kernel_size == 1: nothing was padded, nothing to trim.
            return y
        # Drop the outputs that were computed from right-hand (future) padding.
        return y[:, :, :-self.padding]

class TCNBlock(nn.Module):
    """Residual block: two causal conv -> BatchNorm -> GELU -> Dropout stages.

    The skip path is a 1x1 convolution when the channel count changes and an
    identity otherwise. Tensors are (batch, channels, time).
    """

    def __init__(self, in_ch, out_ch, kernel_size, dilation, dropout=0.2):
        super().__init__()
        # Creation order is kept stable so seeded initialisation is reproducible.
        self.conv1 = CausalConv1d(in_ch, out_ch, kernel_size, dilation)
        self.conv2 = CausalConv1d(out_ch, out_ch, kernel_size, dilation)
        self.norm1 = nn.BatchNorm1d(out_ch)
        self.norm2 = nn.BatchNorm1d(out_ch)
        self.activation = nn.GELU()
        self.dropout = nn.Dropout(dropout)
        if in_ch != out_ch:
            self.residual = nn.Conv1d(in_ch, out_ch, 1)
        else:
            self.residual = nn.Identity()

    def forward(self, x):
        skip = self.residual(x)

        out = self.conv1(x)
        out = self.norm1(out)
        out = self.activation(out)
        out = self.dropout(out)

        out = self.conv2(out)
        out = self.norm2(out)
        out = self.activation(out)
        out = self.dropout(out)

        return out + skip

class TCNEncoder(nn.Module):
    """Stack of TCN blocks with dilations 1, 2, 4, ... (``2**i`` for layer i).

    Takes and returns tensors laid out as (batch, time, features); the
    convolutions run on the transposed (batch, features, time) view.
    """

    def __init__(self, in_size, h_size, n_layers=4, k_size=3, dropout=0.2):
        super().__init__()
        # 1x1 convolution lifts the input features to the hidden width.
        self.input_proj = nn.Conv1d(in_size, h_size, 1)
        blocks = []
        for depth in range(n_layers):
            blocks.append(TCNBlock(h_size, h_size, k_size, 2**depth, dropout))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        hidden = self.input_proj(x.transpose(1, 2))
        for block in self.layers:
            hidden = block(hidden)
        return hidden.transpose(1, 2)

class TCNForecaster(nn.Module):
    """Per-channel TCN followed by attention across channels.

    Input is (batch, time, channel, feature). Every channel is encoded
    independently by the TCN (with a learned channel embedding appended to
    its features), the last time step of each channel is mixed with the other
    channels through self-attention, and a small MLP emits 10 values per
    channel. Output is (batch, 10, channel).
    """

    def __init__(self, n_ch, n_feat=1, h=64, n_layers=3, dropout=0.3):
        super().__init__()
        embed_dim = h // 4
        # Creation order is kept stable so seeded initialisation is reproducible.
        self.channel_embed = nn.Embedding(n_ch, embed_dim)
        self.input_proj = nn.Linear(n_feat + embed_dim, h)
        self.tcn = TCNEncoder(h, h, n_layers, 3, dropout)
        self.cross_attn = nn.MultiheadAttention(h, 4, dropout=dropout, batch_first=True)
        self.attn_norm = nn.LayerNorm(h)
        self.pred_head = nn.Sequential(
            nn.Linear(h, h),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(h, 10),
        )

    def forward(self, x):
        batch, steps, n_chan, _ = x.shape

        # One embedding vector per channel, repeated over batch and time.
        chan_ids = torch.arange(n_chan, device=x.device)
        emb = self.channel_embed(chan_ids)
        emb = emb.unsqueeze(0).unsqueeze(0).expand(batch, steps, -1, -1)

        # Fold channels into the batch axis so the TCN sees one sequence per channel.
        seq = torch.cat([x, emb], -1)
        seq = seq.permute(0, 2, 1, 3).reshape(batch * n_chan, steps, -1)
        seq = self.tcn(self.input_proj(seq))

        # Keep only the final time step and restore the channel axis.
        last = seq[:, -1, :].view(batch, n_chan, -1)

        # Self-attention over channels with a residual connection.
        attended = self.cross_attn(last, last, last)[0]
        mixed = self.attn_norm(last + attended)

        return self.pred_head(mixed).transpose(1, 2)

# Count params: report model size for each candidate (hidden, layers, features).
for hidden, depth, feats in ((64, 3, 1), (128, 3, 1), (128, 3, 9), (128, 4, 1)):
    probe = TCNForecaster(89, feats, hidden, depth, 0.3)
    total = sum(weight.numel() for weight in probe.parameters())
    print(f'h={hidden}, L={depth}, feat={feats}: {total:,} params ({total/1e6:.2f}M)')

In [None]:
# ============================================================================
# Training Function (enhanced for small data)
# ============================================================================

def train_private_model(train_data, domain_mean, domain_std,
                        n_feat, h_size, n_layers, dropout,
                        seed=42, epochs=200, patience=25, batch_size=16, lr=5e-4):
    """Train one TCNForecaster on a single session with early stopping.

    The first 10 time steps of every sample (leading ``n_feat`` features) are
    the model input; feature 0 of the remaining steps is the target. Both are
    z-scored with the supplied statistics. A seeded random split holds out
    ``max(8, n // 6)`` samples for validation, and the weights from the epoch
    with the lowest validation score are kept.

    Args:
        train_data: array indexed as (sample, time, channel, feature).
        domain_mean: normalization mean, broadcastable against
            ``train_data[:, :10]`` with the feature axis last.
        domain_std: normalization std, same layout as ``domain_mean``.
        n_feat: number of leading features fed to the model.
        h_size: hidden width passed to ``TCNForecaster``.
        n_layers: number of TCN layers passed to ``TCNForecaster``.
        dropout: dropout rate passed to ``TCNForecaster``.
        seed: seed for torch and NumPy (split, weight init, shuffling).
        epochs: maximum number of epochs; also the cosine schedule period.
        patience: epochs without validation improvement before stopping.
        batch_size: mini-batch size for both loaders.
        lr: initial AdamW learning rate.

    Returns:
        ``(best_val, best_state)``: the lowest validation score and a CPU copy
        of the matching ``state_dict``. ``best_state`` is ``None`` (and
        ``best_val`` is ``inf``) if no epoch ever improved, e.g. ``epochs == 0``
        or a NaN loss; a warning is printed in that case.

    Raises:
        ValueError: if the validation hold-out would leave no training samples.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    X_all = train_data[:, :10, :, :n_feat].astype(np.float32)
    Y_all = train_data[:, 10:, :, 0].astype(np.float32)

    dm = domain_mean[..., :n_feat]
    ds = domain_std[..., :n_feat]
    X_all_n = (X_all - dm) / ds
    # Targets are feature 0 only, so they use the feature-0 statistics.
    Y_all_n = (Y_all - domain_mean[..., 0]) / domain_std[..., 0]

    n = len(X_all_n)
    n_val = max(8, n // 6)
    if n_val >= n:
        # The 8-sample minimum hold-out would otherwise swallow the whole set.
        raise ValueError(
            f'need more than {n_val} samples to keep a non-empty training split, got {n}'
        )
    idx = np.random.permutation(n)
    X_tr, X_val = X_all_n[idx[n_val:]], X_all_n[idx[:n_val]]
    Y_tr, Y_val = Y_all_n[idx[n_val:]], Y_all_n[idx[:n_val]]

    train_ds = TensorDataset(torch.FloatTensor(X_tr), torch.FloatTensor(Y_tr))
    val_ds = TensorDataset(torch.FloatTensor(X_val), torch.FloatTensor(Y_val))
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=False)
    val_dl = DataLoader(val_ds, batch_size=batch_size)

    # Size the channel embedding from the data instead of assuming 89 channels.
    n_ch = train_data.shape[2]
    model = TCNForecaster(n_ch, n_feat, h_size, n_layers, dropout).to(device)
    n_params = sum(p.numel() for p in model.parameters())
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-3)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    # Mean feature-0 variance: converts normalized squared error back towards
    # raw units. Loop-invariant, so computed once.
    var_scale = (domain_std[..., 0] ** 2).mean()

    best_val, best_state, no_improve = float('inf'), None, 0
    epochs_run = 0  # stays 0 when epochs == 0, so the summary print is still valid
    t0 = time.time()

    for epoch in range(epochs):
        epochs_run = epoch + 1

        model.train()
        for xb, yb in train_dl:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            pred = model(xb)
            loss = ((pred - yb)**2).mean()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
        scheduler.step()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_dl:
                xb, yb = xb.to(device), yb.to(device)
                val_loss += ((model(xb) - yb)**2).sum().item()
        # Squared error summed over all predicted steps and channels, averaged
        # per validation *sample* (not per element), then rescaled.
        # NOTE(review): left unchanged on purpose; presumably earlier runs
        # report the same quantity - confirm before switching to a per-element mean.
        val_mse = (val_loss / len(X_val)) * var_scale

        if val_mse < best_val:
            best_val = val_mse
            # Snapshot on CPU so later epochs cannot mutate the saved weights.
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            no_improve = 0
        else:
            no_improve += 1
        if no_improve >= patience:
            break

    elapsed = time.time() - t0
    if best_state is None:
        # Happens with epochs == 0 or when every validation score was NaN.
        print(f'  WARNING: seed={seed} never improved on validation; best_state is None')
    print(f'  seed={seed} h={h_size} L={n_layers} feat={n_feat} do={dropout}'
          f' -> Val MSE: {best_val:.0f} ({n_params:,} params, {n} samples, ep {epochs_run}, {elapsed:.0f}s)')
    return best_val, best_state

# Marker in the cell output showing that the definition above was executed.
print('Training function defined')

In [None]:
# ============================================================================
# Step 1: Architecture Sweep (h=64 vs h=128, 1-feat vs 9-feat)
# Use combined normalization
# ============================================================================

configs = [
    # (n_feat, h_size, n_layers, dropout, label)
    (1,  64,  3, 0.30, '1f h=64 L=3 do=0.3 (v200b baseline)'),
    (1, 128,  3, 0.30, '1f h=128 L=3 do=0.3'),
    (1, 128,  3, 0.40, '1f h=128 L=3 do=0.4'),
    (1, 128,  3, 0.50, '1f h=128 L=3 do=0.5'),
    (9,  64,  3, 0.30, '9f h=64 L=3 do=0.3'),
    (9, 128,  3, 0.40, '9f h=128 L=3 do=0.4'),
]

print('=== Architecture Sweep (seed=42, combined normalization) ===')
sweep_results = []

for cfg in configs:
    nf, h, nl, do, label = cfg
    print(f'\n--- {label} ---')
    # One fixed seed per session; only the validation score is needed here.
    v1 = train_private_model(train_priv1, mean_priv, std_priv, nf, h, nl, do, seed=42)[0]
    v2 = train_private_model(train_priv2, mean_priv, std_priv, nf, h, nl, do, seed=42)[0]
    avg = (v1 + v2) / 2
    sweep_results.append((nf, h, nl, do, label, v1, v2, avg))
    print(f'  Combined: {avg:.0f}')

# Rank configurations by the two-session average (last tuple field), lowest first.
sweep_results = sorted(sweep_results, key=lambda row: row[7])
print('\n=== Results (sorted by combined MSE) ===')
top_avg = sweep_results[0][7]
for row in sweep_results:
    label, v1, v2, avg = row[4:]
    # Every row whose average equals the top one is tagged.
    tag = ' *** BEST' if avg == top_avg else ''
    print(f'{label}: P1={v1:.0f} P2={v2:.0f} Avg={avg:.0f}{tag}')

best = sweep_results[0]
BEST_NF, BEST_H, BEST_NL, BEST_DO = best[:4]
print(f'\nBest config: feat={BEST_NF}, h={BEST_H}, L={BEST_NL}, do={BEST_DO}')

In [None]:
# ============================================================================
# Step 2: Multi-Seed Training with Best Config
# ============================================================================

seeds = [42, 123, 456, 789, 2024]

print(f'=== Multi-Seed (feat={BEST_NF}, h={BEST_H}, L={BEST_NL}, do={BEST_DO}) ===')

# Each entry is (seed, best validation score, best state_dict) for one run.
print('\n--- Priv1 ---')
p1_results = [
    (s, *train_private_model(train_priv1, mean_priv, std_priv,
                             BEST_NF, BEST_H, BEST_NL, BEST_DO, seed=s))
    for s in seeds
]

print('\n--- Priv2 ---')
p2_results = [
    (s, *train_private_model(train_priv2, mean_priv, std_priv,
                             BEST_NF, BEST_H, BEST_NL, BEST_DO, seed=s))
    for s in seeds
]

print('\n=== Results ===')
p1_vals = [score for _seed, score, _state in p1_results]
p2_vals = [score for _seed, score, _state in p2_results]
print(f'Priv1: mean={np.mean(p1_vals):.0f} std={np.std(p1_vals):.0f}')
print(f'Priv2: mean={np.mean(p2_vals):.0f} std={np.std(p2_vals):.0f}')
# Per-seed breakdown, both sessions side by side.
for s, score1, score2 in zip(seeds, p1_vals, p2_vals):
    print(f'  seed={s}: P1={score1:.0f} P2={score2:.0f}')

In [None]:
# ============================================================================
# Step 3: Save All Models
# ============================================================================

out_dir = f'{PROJECT_ROOT}/4_models/v202b_private_h128'
os.makedirs(out_dir, exist_ok=True)

# Plain Python types so the config stored in each checkpoint is portable.
config = dict(
    n_feat=int(BEST_NF),
    h_size=int(BEST_H),
    n_layers=int(BEST_NL),
    dropout=float(BEST_DO),
    seeds=seeds,
)
print(f'Config: {config}')

# One checkpoint per (session, seed): weights, validation score and config.
for session, results in (('Priv1', p1_results), ('Priv2', p2_results)):
    for s, v, state in results:
        path = f'{out_dir}/model_tcn_{session.lower()}_seed{s}.pth'
        checkpoint = {'model_state_dict': state, 'val_mse': v, 'config': config}
        torch.save(checkpoint, path)
        print(f'{session} seed {s}: Val={v:.0f} -> {path}')

# Save normalization (combined)
np.savez(f'{out_dir}/normalization_priv_combined.npz', mean=mean_priv, std=std_priv)
print(f'\nSaved to {out_dir}')

In [None]:
# ============================================================================
# Step 4: Download
# ============================================================================

from google.colab import files

# Same order as the files were written: per seed Priv1 then Priv2, stats last.
artifacts = []
for s in seeds:
    artifacts.append(f'{out_dir}/model_tcn_priv1_seed{s}.pth')
    artifacts.append(f'{out_dir}/model_tcn_priv2_seed{s}.pth')
artifacts.append(f'{out_dir}/normalization_priv_combined.npz')

for artifact in artifacts:
    files.download(artifact)

# Reminder of the constants that must be mirrored in model.py.
summary_lines = [
    '\n=== Downloaded! ===',
    f'Config: feat={BEST_NF}, h={BEST_H}, L={BEST_NL}, do={BEST_DO}',
    '\nFor model.py, update:',
    f'  V200B_PRIV_H = {BEST_H}',
    f'  V200B_PRIV_LAYERS = {BEST_NL}',
    f'  V200B_PRIV_DROPOUT = {BEST_DO}',
    f'  V200B_PRIV_FEAT = {BEST_NF}',
    f'  V200B_PRIV_SEEDS = {seeds}',
    '\nIMPORTANT: If h changed, the .pth files are NOT compatible with old model!',
    'Must replace ALL priv seed files, not just add new ones.',
]
for line in summary_lines:
    print(line)