In [1]:
# 🔬 활성화 함수 비교 실험 (Jupyter Notebook)
# ---------------------------------------------------------------
# 목표: ReLU / Sigmoid / Tanh / GELU / SiLU(Swish) / LeakyReLU 활성화 함수를
# 간단한 NLP 분류 문제에서 비교해본다.
#
# 아이디어:
# - GPT 내부 활성화 함수는 바꿀 수 없으므로, 우리가 직접 만든 작은 분류 모델에서 실험.
# - 같은 데이터셋, 같은 모델 구조, 같은 학습 설정에서 활성화만 바꿔 성능 차이를 본다.
# - Text → Sentence-BERT embeddings → small MLP → compare by classification accuracy, ROC AUC, etc.


In [2]:
#setting

In [32]:
import math
import time
import random
from dataclasses import dataclass
from typing import Dict, List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

In [39]:
#Set up device

In [53]:
# Pick the compute device, preferring CUDA, then Apple MPS, then falling back to CPU.
DEVICE = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    else "cpu"
)

# String form of the device ("cuda", "mps" or "cpu").
DEVICE_STR = str(DEVICE)
print("Using device:", DEVICE_STR)

# Best-effort precision hint; silently skipped if this torch build rejects the call.
try:
    torch.set_float32_matmul_precision("high")
except Exception:
    pass

Using device: mps


In [54]:
#Load dataset

In [55]:
# List every newsgroup name available in the training subset.
print("All categories:", fetch_20newsgroups(subset="train").target_names, sep="\n")

All categories:
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [56]:
# Binary task: medicine vs. space newsgroups.
CATEGORIES = ["sci.med", "sci.space"]

# Load both subsets together, with headers, footers and quoted replies removed.
raw = fetch_20newsgroups(
    subset="all",
    categories=CATEGORIES,
    remove=("headers", "footers", "quotes"),
)

In [57]:
# Raw documents and their class labels (labels cast to int64).
X_text, y = raw.data, raw.target.astype(np.int64)

print(f"{len(X_text)} documents")
print(raw.target_names)  # class names; indexed by label elsewhere in this notebook

1977 documents
['sci.med', 'sci.space']


In [58]:
# Split into each category
# Select documents per class by mapping each label to its class name and masking.
med_docs = [X_text[i] for i in np.flatnonzero(np.asarray(raw.target_names)[y] == "sci.med")]
space_docs = [X_text[i] for i in np.flatnonzero(np.asarray(raw.target_names)[y] == "sci.space")]

In [59]:
# Number of documents in the sci.med class.
print(f"{len(med_docs)} sci.med docs")

990 sci.med docs


In [60]:
# Number of documents in the sci.space class.
print(f"{len(space_docs)} sci.space docs")

987 sci.space docs


In [61]:
# Vectorize text using Sentence-BERT embeddings

In [62]:
# Encode every document into a fixed-size embedding with a pretrained Sentence-BERT model.
model_name = "all-MiniLM-L6-v2"
sbert_model = SentenceTransformer(model_name, device=DEVICE_STR)

# batch_size=32 is a conservative choice; results are returned as a NumPy array.
X = sbert_model.encode(X_text, batch_size=32, show_progress_bar=True, convert_to_numpy=True)

Batches:   0%|          | 0/62 [00:00<?, ?it/s]

In [82]:
#train_test_split + NewsDataset + DataLoader
# - 분할은 scikit-learn(train_test_split)로: stratify/재현성 용이
# - 배치는 PyTorch(Dataset/DataLoader)로: 미니배치/셔플/GPU 학습
#
# 이 코드는 상단에서 X, y가 이미 준비되어 있다고 가정합니다
# (예: Sentence-BERT 임베딩 완료 후 X: (N, D), y: (N,))

In [73]:
# 1) 학습/검증/테스트 분할 (sklearn)
# - train 80%, val 10%, test 10% (hold out 20% first, then split that hold-out in half into val and test)
# Stage 1: stratified 80/20 split into train and a temporary hold-out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Stage 2: halve the hold-out (again stratified) into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print(f"Split sizes → train: {len(y_train)}, val: {len(y_val)}, test: {len(y_test)}")

Split sizes → train: 1581, val: 198, test: 198


In [64]:
# Create PyTorch Dataset
    #  We use PyTorch (instead of sklearn) because we need seamless integration
    #   with neural networks (GPU acceleration, mini-batching, autograd).
    #   sklearn handles splitting and evaluation well, but PyTorch handles training loops.

In [74]:
class NewsDataset(Dataset):
    """Wrap precomputed feature vectors and labels for the PyTorch training pipeline.

    Args:
        X: Array-like of features whose first axis indexes samples; stored as a
            ``float32`` tensor.
        y: Array-like of integer class labels, one per sample; stored as a
            ``long`` tensor.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
        # Fail fast on misaligned inputs: __len__ is driven by y, so a mismatch
        # would otherwise only show up later as an IndexError or ignored rows.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(feature, label)`` pair at ``idx`` so a DataLoader can batch it."""
        return self.X[idx], self.y[idx]

In [75]:
# Wrap each split as a NewsDataset, in train / val / test order.
train_ds, val_ds, test_ds = (
    NewsDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [76]:
# Sanity check: confirm each dataset reports the expected number of samples.
print(f"Train dataset size: {len(train_ds)}")
print(f"Val dataset size: {len(val_ds)}")
print(f"Test dataset size: {len(test_ds)}")


Train dataset size: 1581
Val dataset size: 198
Test dataset size: 198


In [78]:
#print("Example from train_ds:", train_ds[0])

In [81]:
# Build the PyTorch DataLoaders; only the training loader shuffles.
BATCH_SIZE = 64
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

# 4) (선택) 간단한 MLP 예시 — 활성화 함수만 바꿔가며 실험 가능
class MLP(nn.Module):
    """Single-hidden-layer perceptron: Linear -> activation -> Linear.

    Args:
        input_dim: Number of input features.
        hidden_dim: Number of hidden units.
        activation_fn: Activation placed between the two linear layers. Either an
            ``nn.Module`` instance (e.g. ``nn.ReLU()``) or an ``nn.Module``
            subclass (e.g. ``nn.ReLU``), which is instantiated with no arguments.
        num_classes: Number of output logits. Defaults to 2, the previously
            hard-coded value.
    """

    def __init__(self, input_dim, hidden_dim, activation_fn, num_classes=2):
        super().__init__()
        # Accept a module class as well as an instance, so callers can hand over
        # a fresh activation per model instead of a shared object.
        if isinstance(activation_fn, type) and issubclass(activation_fn, nn.Module):
            activation_fn = activation_fn()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            activation_fn,
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, x):
        """Return the raw (pre-softmax) logits for ``x``."""
        return self.net(x)
# Confirmation message printed after the loaders and the MLP class in this cell are defined.
print("✅ Ready: sklearn split + PyTorch Dataset/DataLoader pipeline")

✅ Ready: sklearn split + PyTorch Dataset/DataLoader pipeline


In [83]:
print("DataLoader 준비 완료 → train/val/test 배치 사이즈 확인:")
for name, loader in (("Train", train_loader), ("Val", val_loader), ("Test", test_loader)):
    # Pull only the first batch of each loader to inspect its tensor shapes.
    batch_X, batch_y = next(iter(loader))
    print(f"{name}: X batch shape = {batch_X.shape}, y batch shape = {batch_y.shape}")

DataLoader 준비 완료 → train/val/test 배치 사이즈 확인:
Train: X batch shape = torch.Size([64, 384]), y batch shape = torch.Size([64])
Val: X batch shape = torch.Size([64, 384]), y batch shape = torch.Size([64])
Test: X batch shape = torch.Size([64, 384]), y batch shape = torch.Size([64])


In [148]:
#MLP= Multi-Layer Perceptron
"""
    Build and return a simple MLP (Multi-Layer Perceptron) model.

    Args:
        act_name (str): Name of the activation function (e.g., 'relu', 'tanh').
        input_dim (int): Number of input features.
        hidden_dim (int, optional): Number of hidden units. Default = 128.

    Returns:
        MLP: A PyTorch model with the specified activation and dimensions.
 """

"\n    Build and return a simple MLP (Multi-Layer Perceptron) model.\n\n    Args:\n        act_name (str): Name of the activation function (e.g., 'relu', 'tanh').\n        input_dim (int): Number of input features.\n        hidden_dim (int, optional): Number of hidden units. Default = 128.\n\n    Returns:\n        MLP: A PyTorch model with the specified activation and dimensions.\n "

In [149]:
# 1) 모델/학습/평가 함수 정의
# - build_model(...)
# - train_model(...)   # 이미 만든 간결 버전 사용
# - evaluate(...)      # 한 가지 버전만 남기기

In [151]:
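# (Stray keystroke removed; this cell has no code. build_model, used by the next cell, is defined further down in this notebook.)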


In [152]:
# Sanity check: build a ReLU model for 384-dimensional inputs and print its layers.
# NOTE(review): build_model is defined in a later cell of this notebook, so this
# cell fails on a fresh kernel unless that definition has already been run.
model = build_model("relu", input_dim=384)
print(model)

MLP(
  (net): Sequential(
    (0): Linear(in_features=384, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=2, bias=True)
  )
)


In [154]:
# -------------------------------
# Train Model Utility (Simplified)
# -------------------------------
def train_model(act_name: str = "relu", hidden_dim: int = 128, epochs: int = 8,
                lr: float = 1e-3, seed: "int | None" = None):
    """Train an MLP with the chosen activation and evaluate it on the test split.

    Relies on notebook-level state: ``X_train`` (for the input width),
    ``train_loader`` / ``val_loader`` / ``test_loader``, ``build_model`` and
    ``evaluate``.

    Args:
        act_name: Activation name forwarded to ``build_model``.
        hidden_dim: Hidden-layer width forwarded to ``build_model``.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate for the Adam optimizer.
        seed: If given, seeds torch's global RNG before the model is built so
            that weight initialisation (and anything else drawing from the global
            RNG) is repeatable across runs. ``None`` leaves the RNG untouched.

    Returns:
        tuple: ``(model, test_metrics)`` where ``test_metrics`` is whatever
        ``evaluate`` returns for ``test_loader``.
    """
    if seed is not None:
        # Without a fixed seed, runs with different activations also differ in
        # their random initialisation, which confounds the comparison.
        torch.manual_seed(seed)

    input_dim = X_train.shape[1]

    # Prefer the notebook-level DEVICE; otherwise probe CUDA, then MPS, then CPU.
    if "DEVICE" in globals():
        device = DEVICE
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    print("[Device]", device)

    model = build_model(act_name, input_dim=input_dim, hidden_dim=hidden_dim).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for ep in range(1, epochs + 1):
        # evaluate() switches the model to eval mode, so re-enable train mode each epoch.
        model.train()
        total_loss, total_n = 0.0, 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()
            # Weight the batch loss by batch size so the epoch figure is a per-sample mean.
            total_loss += loss.item() * xb.size(0)
            total_n += xb.size(0)
        train_loss = total_loss / max(total_n, 1)

        val_metrics = evaluate(model, val_loader, device)
        print(f"[Epoch {ep}] loss={train_loss:.4f} | val_acc={val_metrics['acc']:.4f}")

    test_metrics = evaluate(model, test_loader, device)
    return model, test_metrics

# Train with the default settings (ReLU, 128 hidden units, 8 epochs, lr=1e-3)
# and print the resulting test-set metrics dict.
model, test_metrics = train_model()
print("[Test Metrics]", test_metrics)


[Device] mps
[Evaluate] loss=0.4828 acc=0.9343 f1=0.9366 auc=0.9753
[Epoch 1] loss=0.6057 | val_acc=0.9343
[Evaluate] loss=0.2554 acc=0.9242 f1=0.9268 auc=0.9771
[Epoch 2] loss=0.3381 | val_acc=0.9242
[Evaluate] loss=0.1915 acc=0.9192 f1=0.9200 auc=0.9791
[Epoch 3] loss=0.1839 | val_acc=0.9192
[Evaluate] loss=0.1786 acc=0.9242 f1=0.9268 auc=0.9793
[Epoch 4] loss=0.1387 | val_acc=0.9242
[Evaluate] loss=0.1743 acc=0.9242 f1=0.9261 auc=0.9796
[Epoch 5] loss=0.1197 | val_acc=0.9242
[Evaluate] loss=0.1745 acc=0.9242 f1=0.9261 auc=0.9793
[Epoch 6] loss=0.1087 | val_acc=0.9242
[Evaluate] loss=0.1751 acc=0.9242 f1=0.9239 auc=0.9795
[Epoch 7] loss=0.1007 | val_acc=0.9242
[Evaluate] loss=0.1780 acc=0.9242 f1=0.9239 auc=0.9798
[Epoch 8] loss=0.0946 | val_acc=0.9242
[Evaluate] loss=0.1572 acc=0.9040 f1=0.9005 auc=0.9863
[Test Metrics] {'loss': 0.15724752632656483, 'acc': 0.9040404040404041, 'f1': 0.9005235602094241, 'auc': 0.9863279257218651}


In [163]:
# -------------------------------
# Evaluation Utility (Jupyter-friendly)
# -------------------------------
@torch.no_grad()
def evaluate(model: nn.Module, loader, device: torch.device, verbose: bool = True):
    """Run ``model`` over ``loader`` and compute loss, accuracy, F1 and ROC AUC.

    Assumes a two-class problem: column 1 of the softmax output is used as the
    positive-class score and F1 is computed with scikit-learn's default settings.

    Args:
        model (nn.Module): Model to evaluate; it is switched to eval mode.
        loader (DataLoader): Yields ``(features, labels)`` batches.
        device (torch.device): Device each batch is moved to before the forward pass.
        verbose (bool): If True, print a one-line metric summary.

    Returns:
        dict: ``{"loss", "acc", "f1", "auc"}``. ``auc`` is NaN when
        ``roc_auc_score`` rejects the inputs with a ``ValueError``.
    """
    model.eval()  # evaluation mode (disables dropout, etc.)
    criterion = nn.CrossEntropyLoss()
    all_probs, all_preds, all_labels = [], [], []
    total_loss, total_n = 0.0, 0

    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        loss = criterion(logits, yb)

        # Positive-class probability and hard prediction for every sample in the batch.
        all_probs.extend(torch.softmax(logits, dim=1)[:, 1].cpu().numpy())
        all_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        all_labels.extend(yb.cpu().numpy())

        # Weight the batch loss by batch size so the total is a per-sample mean.
        total_loss += loss.item() * xb.size(0)
        total_n += xb.size(0)

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    try:
        auc = roc_auc_score(all_labels, all_probs)
    except ValueError:
        # Only the "metric cannot be computed for these inputs" case is tolerated;
        # any other exception is a real bug and is allowed to propagate.
        auc = float("nan")

    metrics = {
        "loss": total_loss / max(total_n, 1),
        "acc": acc,
        "f1": f1,
        "auc": auc,
    }

    if verbose:
        print(
            f"[Evaluate] loss={metrics['loss']:.4f} acc={metrics['acc']:.4f} "
            f"f1={metrics['f1']:.4f} auc={metrics['auc']:.4f}"
        )

    return metrics

In [91]:
#MLP + model training loop

In [107]:
# Registry mapping activation names to module instances, so activations can be swapped by name.
ACTIVATIONS = dict(
    relu=nn.ReLU(),
    leaky_relu=nn.LeakyReLU(negative_slope=0.01),
    gelu=nn.GELU(),
    silu=nn.SiLU(),  # a.k.a. Swish
    mish=nn.Mish(),
    elu=nn.ELU(),
    selu=nn.SELU(),
    tanh=nn.Tanh(),
    sigmoid=nn.Sigmoid(),
    softplus=nn.Softplus(),
)


In [123]:
    """
    Build a simple MLP model with the specified activation function.

    Args:
        act_name (str): Name of the activation function to use (must be in ACTIVATIONS).
        input_dim (int): Dimensionality of the input features.
        hidden_dim (int, optional): Number of hidden units. Defaults to 128.

    Returns:
        MLP: A multi-layer perceptron model with the chosen activation.
    """

'\nBuild a simple MLP model with the specified activation function.\n\nArgs:\n    act_name (str): Name of the activation function to use (must be in ACTIVATIONS).\n    input_dim (int): Dimensionality of the input features.\n    hidden_dim (int, optional): Number of hidden units. Defaults to 128.\n\nReturns:\n    MLP: A multi-layer perceptron model with the chosen activation.\n'

In [124]:
def build_model(act_name: str, input_dim: int, hidden_dim: int = 128) -> MLP:
    """Create an MLP whose hidden activation is looked up by name in ACTIVATIONS.

    Args:
        act_name (str): Activation name; matched case-insensitively against the
            keys of ACTIVATIONS.
        input_dim (int): Number of input features.
        hidden_dim (int, optional): Number of hidden units. Defaults to 128.

    Returns:
        MLP: Model built with the registered activation module.

    Raises:
        ValueError: If the lower-cased name is not a key of ACTIVATIONS.
    """
    key = act_name.lower()
    activation = ACTIVATIONS.get(key)
    if activation is None:
        raise ValueError(f"Unknown activation: {key}. Available: {list(ACTIVATIONS.keys())}")
    return MLP(input_dim=input_dim, hidden_dim=hidden_dim, activation_fn=activation)

In [125]:
# Sanity check: build a ReLU model for 384-dimensional inputs and print its layers.
model = build_model("relu", input_dim=384)
print(model)

MLP(
  (net): Sequential(
    (0): Linear(in_features=384, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=2, bias=True)
  )
)


In [126]:
@torch.no_grad()
def evaluate(model: nn.Module, loader, device: torch.device, verbose: bool = False):
    """Run ``model`` over ``loader`` and compute loss, accuracy, F1 and ROC AUC.

    Assumes a two-class problem: column 1 of the softmax output is used as the
    positive-class score and F1 is computed with scikit-learn's default settings.

    Args:
        model (nn.Module): Model to evaluate; it is switched to eval mode.
        loader (DataLoader): Yields ``(features, labels)`` batches.
        device (torch.device): Device each batch is moved to before the forward pass.
        verbose (bool): If True, print a one-line metric summary. Defaults to
            False, so existing three-argument calls stay silent while callers
            that pass ``verbose=...`` no longer fail with a TypeError.

    Returns:
        dict: ``{"loss", "acc", "f1", "auc"}``. ``auc`` is NaN when
        ``roc_auc_score`` rejects the inputs with a ``ValueError``.
    """
    model.eval()  # evaluation mode (disables dropout, etc.)
    criterion = nn.CrossEntropyLoss()
    all_probs, all_preds, all_labels = [], [], []
    total_loss, total_n = 0.0, 0

    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        loss = criterion(logits, yb)

        # Positive-class probability and hard prediction (0 or 1) per sample.
        all_probs.extend(torch.softmax(logits, dim=1)[:, 1].cpu().numpy())
        all_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        all_labels.extend(yb.cpu().numpy())

        # Weight the batch loss by batch size so the total is a per-sample mean.
        total_loss += loss.item() * xb.size(0)
        total_n += xb.size(0)

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    try:
        auc = roc_auc_score(all_labels, all_probs)
    except ValueError:
        # Only the "metric cannot be computed for these inputs" case is tolerated;
        # any other exception is a real bug and is allowed to propagate.
        auc = float("nan")

    metrics = {
        "loss": total_loss / max(total_n, 1),
        "acc": acc,
        "f1": f1,
        "auc": auc,
    }

    if verbose:
        print(
            f"[Evaluate] loss={metrics['loss']:.4f} acc={metrics['acc']:.4f} "
            f"f1={metrics['f1']:.4f} auc={metrics['auc']:.4f}"
        )

    return metrics

In [131]:
# Evaluate the current `model` on the validation split.
# `device` exists only as a local variable inside train_model, so the notebook-level
# DEVICE is used here; the model is moved there first so that its parameters live on
# the same device as the batches evaluate() transfers.
metrics = evaluate(model.to(DEVICE), val_loader, DEVICE)
print(metrics)

[Evaluate] loss=0.6927 acc=0.5000 f1=0.0000 auc=0.5940 n=198
{'loss': 0.6927446313578673, 'acc': 0.5, 'f1': 0.0, 'auc': 0.5939700030609122, 'n_samples': 198}
