In [None]:
# Install the Kaggle command-line client; the `!kaggle ...` cell below depends on it.
!pip install kaggle



In [None]:
from google.colab import files

# Choose kaggle.json in the upload dialog.
# The call returns a {filename: file-bytes} dict. It is bound to a name so that
# its repr -- which would contain the Kaggle API key -- is not echoed into the
# saved cell output.
uploaded = files.upload()

In [None]:
# Move the uploaded API token into ~/.kaggle (presumably where the kaggle CLI
# looks for it -- confirm against the Kaggle API docs).
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
# Owner-only read/write on the credentials file.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition archive (intro-ml-2025-nccu-task3.zip) into the working directory.
!kaggle competitions download -c intro-ml-2025-nccu-task3


Downloading intro-ml-2025-nccu-task3.zip to /content
  0% 0.00/588k [00:00<?, ?B/s]
100% 588k/588k [00:00<00:00, 267MB/s]


In [None]:
!unzip intro-ml-2025-nccu-task3.zip -d task3

Archive:  intro-ml-2025-nccu-task3.zip
  inflating: task3/task3_test.csv    
  inflating: task3/task3_test_sample_submissions.csv  
  inflating: task3/task3_train.csv   


In [None]:
import pandas as pd

# Read the three competition files in one pass: the labelled training set,
# the unlabelled test set, and the sample submission that fixes the output format.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "task3/task3_train.csv",
        "task3/task3_test.csv",
        "task3/task3_test_sample_submissions.csv",
    )
)

# Preview the first rows of the training frame.
train.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,value
0,-9.214828,5.817583,7.821907,26.149092,-7.423011,14.325433,0.17346,-12.413642,-1.199872,-8778.806766
1,3.426554,0.046013,1.907013,-25.134674,5.255181,22.32988,-12.759983,9.463432,-2.355829,1322.525297
2,-11.008204,-15.707894,13.018347,7.073724,0.405525,1.088192,-7.861424,-4.826598,-7.114291,1621.848205
3,4.344236,2.377356,-5.9415,-14.460579,0.721295,-5.294927,2.326762,0.218521,16.017789,1317.113312
4,8.116964,2.051131,7.02164,-15.916387,2.977037,-0.648867,4.952249,7.522719,-10.510979,1311.30835


In [None]:
# (rows, columns) of the training frame.
train.shape

(8000, 10)

In [None]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,id,x1,x2,x3,x4,x5,x6,x7,x8,x9
0,1,4.751973,-9.267343,-12.834793,-10.924147,4.150118,-1.0424,-9.844618,4.695375,-8.275308
1,2,16.056474,2.0266,-13.506225,7.070796,-13.218757,4.803172,-4.112018,5.98724,5.517377
2,3,-10.778766,3.881728,14.424371,-3.130155,2.92139,4.536818,2.400036,-9.966949,2.235206
3,4,-26.369458,-7.447451,6.024749,-3.123047,9.940289,-22.331903,-7.349761,4.527465,4.40195
4,5,-2.156053,4.993374,0.465574,-5.442493,-7.585865,-1.914673,-12.945151,-4.426967,-3.215984


In [None]:
# Preview the sample submission to see the expected output columns.
sample.head()

Unnamed: 0,id,value
0,1,234511
1,2,289393
2,3,238290
3,4,18472
4,5,1633


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# ================================
# 1. No-Softmax Self-Attention Layer
# ================================
class NoSoftmaxAttention(nn.Module):
    """Single-head self-attention that uses raw, scaled dot-product scores.

    The token-mixing weights are ``scale * (Q @ K^T)`` with no softmax
    normalisation applied to them.

    Args:
        dim: Width of the last dimension of the input. Queries, keys and
            values are each projected (without bias) to this same width.
    """

    def __init__(self, dim):
        super().__init__()
        # Bias-free projections for queries, keys and values.
        self.q_proj = nn.Linear(dim, dim, bias=False)
        self.k_proj = nn.Linear(dim, dim, bias=False)
        self.v_proj = nn.Linear(dim, dim, bias=False)

        # Learnable scalar, initialised to 0.1, that damps the unnormalised
        # scores to keep activations from blowing up.
        self.scale = nn.Parameter(torch.tensor(0.1))

    def forward(self, x):
        """Mix the tokens of ``x``.

        Args:
            x: Tensor of shape (batch, tokens, dim).

        Returns:
            Tensor of shape (batch, tokens, dim).
        """
        queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x)

        # (B, T, D) @ (B, D, T) -> (B, T, T); scaled instead of softmax-normalised.
        weights = (queries @ keys.transpose(-1, -2)) * self.scale

        # (B, T, T) @ (B, T, D) -> (B, T, D)
        return weights @ values


# ================================
# 2. Tabular Transformer Block
# ================================
class TabularBlock(nn.Module):
    """Pre-norm transformer block: no-softmax attention, then a ReLU MLP.

    Each sub-layer normalises its input with LayerNorm and adds its output
    back onto the un-normalised input (residual connection).

    Args:
        dim: Token embedding width; input and output share this last dimension.
    """

    def __init__(self, dim):
        super().__init__()
        hidden = dim * 4  # the MLP expands to 4x the token width

        self.ln1 = nn.LayerNorm(dim)
        self.attn = NoSoftmaxAttention(dim)
        self.ln2 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, dim),
        )

    def forward(self, x):
        """Apply both residual sub-layers to ``x`` and return a tensor of the same shape."""
        # Sub-layer 1: Attn(LN(x)) + x
        attended = self.attn(self.ln1(x)) + x
        # Sub-layer 2: MLP(LN(h)) + h
        return self.mlp(self.ln2(attended)) + attended


# ================================
# 3. Full Model
# ================================
class NoSoftmaxTabularModel(nn.Module):
    """Tabular regressor that treats every scalar feature as one token.

    Pipeline: per-feature token embedding -> two ``TabularBlock`` layers ->
    mean over tokens -> linear head producing one value per row.

    Each token is ``embed(x_f) + feature_embed[f]``. The per-feature term gives
    a token its column identity. Without it every column goes through the same
    ``Linear(1, dim)``, and since the blocks carry no positional information
    and the pooling is a mean, the prediction would be unchanged by any
    reordering of the input columns.

    Args:
        num_features: Number of input columns (tokens) per row.
    """

    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        self.dim = 32  # small for stability

        # Value embedding shared by all features: scalar -> dim.
        self.embed = nn.Linear(1, self.dim)

        self.block1 = TabularBlock(self.dim)
        self.block2 = TabularBlock(self.dim)

        # Not used by forward() (which calls .mean directly); kept so the
        # module's attributes stay backward-compatible. It has no parameters.
        self.pool = nn.AdaptiveAvgPool1d(1)

        # Final regression head.
        self.head = nn.Linear(self.dim, 1)

        # Learnable per-feature identity embedding. Zero-initialised, so a
        # freshly built model starts from the plain shared-embedding tokens
        # and learns to tell the columns apart during training.
        self.feature_embed = nn.Parameter(torch.zeros(num_features, self.dim))

    def forward(self, x):
        """Predict one value per row.

        Args:
            x: Tensor of shape (batch, num_features).

        Returns:
            Tensor of shape (batch,).

        Raises:
            ValueError: If ``x`` is not 2-D or its column count differs from
                ``num_features``.
        """
        if x.dim() != 2 or x.shape[1] != self.num_features:
            raise ValueError(
                f"expected input of shape (batch, {self.num_features}), "
                f"got {tuple(x.shape)}"
            )

        # (B, F) -> (B, F, 1) -> (B, F, dim); feature_embed broadcasts over the batch.
        tokens = self.embed(x.unsqueeze(-1)) + self.feature_embed

        # Two transformer blocks.
        tokens = self.block1(tokens)
        tokens = self.block2(tokens)

        # Mean-pool across tokens: (B, F, dim) -> (B, dim).
        pooled = tokens.mean(dim=1)

        return self.head(pooled).squeeze(-1)


# In short: a simplified FT-Transformer with no-softmax attention and a learnable score scale.

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, random_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import plotly.graph_objects as go
import copy

# ========== 0. Device and reproducibility ==========
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Seed the RNGs behind the split, weight init and batch shuffling so that
# Restart & Run All reproduces the same curves and the same best checkpoint.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# ========== 1. Prepare data ==========
X = train.drop(columns=['value']).values
y = train['value'].values

# --- 90% train / 10% validation split, done on row indices ---
n_total = len(X)
n_val = int(n_total * 0.1)
n_train = n_total - n_val

perm = torch.randperm(n_total, generator=torch.Generator().manual_seed(SEED)).tolist()
train_idx, val_idx = perm[:n_train], perm[n_train:]

# Standardise features. The scaler is fitted on the training rows only, so no
# validation statistics leak into preprocessing; it is then applied to all rows.
scaler = StandardScaler()
scaler.fit(X[train_idx])
X_norm = scaler.transform(X)

X_tensor = torch.tensor(X_norm, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

dataset = TensorDataset(X_tensor, y_tensor)
train_ds = torch.utils.data.Subset(dataset, train_idx)
val_ds = torch.utils.data.Subset(dataset, val_idx)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=128, shuffle=False)

# ========== 2. Model, optimiser, loss ==========
# NOTE(review): the target is trained in its raw units, which is why the MSE
# values are very large. Standardising y would also require inverse-transforming
# the predictions in the inference cell.
model = NoSoftmaxTabularModel(num_features=X_tensor.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
loss_fn = nn.MSELoss()

# Per-epoch loss history
train_losses = []
val_losses = []

# Best checkpoint seen so far (by validation loss)
best_val_loss = float('inf')
best_model_state = None

# ========== 3. Training Loop ==========
epochs = 200

for epoch in range(epochs):
    # ---- Train ----
    model.train()
    total_train_loss = 0.0

    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)

        pred = model(xb)  # the model already squeezes the last dim
        loss = loss_fn(pred, yb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # loss is a batch mean; weight it by the batch size so the smaller
        # final batch does not count as much as a full one.
        total_train_loss += loss.item() * xb.size(0)

    avg_train_loss = total_train_loss / len(train_ds)
    train_losses.append(avg_train_loss)

    # ---- Validation ----
    model.eval()
    total_val_loss = 0.0

    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device)
            yb = yb.to(device)

            pred = model(xb)
            loss = loss_fn(pred, yb)
            total_val_loss += loss.item() * xb.size(0)

    # True per-sample MSE over the whole validation set.
    avg_val_loss = total_val_loss / len(val_ds)
    val_losses.append(avg_val_loss)

    # Keep a copy of the weights whenever validation loss improves.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_model_state = copy.deepcopy(model.state_dict())
        improved_flag = "  <-- best so far"
    else:
        improved_flag = ""

    # ---- Log ----
    print(
        f"Epoch {epoch+1:03d} | "
        f"Train Loss: {avg_train_loss:.4f} | "
        f"Val Loss: {avg_val_loss:.4f}{improved_flag}"
    )

# ========== 4. Restore the best checkpoint ==========
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f"\nLoaded best model with Val Loss = {best_val_loss:.4f}")
else:
    print("\n[Warning] best_model_state is None，可能訓練 loop 沒有執行。")

# ========== 5. Plot the loss curves ==========
fig = go.Figure()
fig.add_trace(go.Scatter(y=train_losses, mode='lines', name='Train Loss'))
fig.add_trace(go.Scatter(y=val_losses, mode='lines', name='Val Loss'))

fig.update_layout(
    title="Training vs Validation Loss",
    xaxis_title="Epoch",
    yaxis_title="Loss",
    template="plotly_white"
)

fig.show()





Using device: cuda
Epoch 001 | Train Loss: 983629900.1593 | Val Loss: 969848610.2857  <-- best so far
Epoch 002 | Train Loss: 622024993.0265 | Val Loss: 627828534.8571  <-- best so far
Epoch 003 | Train Loss: 476618957.0265 | Val Loss: 549936996.5714  <-- best so far
Epoch 004 | Train Loss: 422199181.5929 | Val Loss: 480263460.5714  <-- best so far
Epoch 005 | Train Loss: 404985112.9204 | Val Loss: 520477474.2857
Epoch 006 | Train Loss: 392292234.7965 | Val Loss: 530107478.8571
Epoch 007 | Train Loss: 349730263.4336 | Val Loss: 471861197.7143  <-- best so far
Epoch 008 | Train Loss: 353309536.7257 | Val Loss: 475186678.8571
Epoch 009 | Train Loss: 350866346.5841 | Val Loss: 448430747.4286  <-- best so far
Epoch 010 | Train Loss: 323973206.9204 | Val Loss: 433600553.1429  <-- best so far
Epoch 011 | Train Loss: 315436433.4336 | Val Loss: 502024004.5714
Epoch 012 | Train Loss: 340641683.4690 | Val Loss: 685217232.0000
Epoch 013 | Train Loss: 325366978.2478 | Val Loss: 403099200.0000  <--

In [None]:
import torch
import numpy as np

# 0. Select the device (same rule as in training)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 1. Restore the best checkpoint and move the model to the device
if best_model_state is None:
    raise RuntimeError(
        "best_model_state is None; run the training cell before inference."
    )
model.load_state_dict(best_model_state)
model.to(device)
model.eval()
print("Using best model with Val Loss =", best_val_loss)

# 2. Test features (everything except the id column)
X_test_raw = test.drop(columns=['id']).values

# 3. Standardise with the scaler fitted during training (it runs on CPU/numpy)
X_test_norm = scaler.transform(X_test_raw)

# 4. Run the model
X_test_tensor = torch.tensor(X_test_norm, dtype=torch.float32).to(device)

with torch.no_grad():
    # reshape(-1) always yields a 1-D vector, even for a single test row
    # (a bare .squeeze() would collapse that case to a 0-d scalar).
    # Move to CPU before converting to numpy.
    pred = model(X_test_tensor).reshape(-1).cpu().numpy()

# 5. Build the submission, matching predictions to rows by id rather than by
#    position, so a different row order in the sample file cannot misalign them.
pred_by_id = pd.Series(pred, index=test['id'].values)
submission = sample.copy()
submission['value'] = submission['id'].map(pred_by_id)
if submission['value'].isna().any():
    raise ValueError(
        "Some ids in the sample submission have no prediction from the test set."
    )
submission.to_csv("submission_no_softmax_attention_task3_best.csv", index=False)

files.download("submission_no_softmax_attention_task3_best.csv")


Using device: cuda
Using best model with Val Loss = 268942733.71428573


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>