In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# start from the same state on every "Restart Kernel -> Run All".
# NOTE(review): some CUDA kernels are non-deterministic, so GPU runs may still
# differ slightly between executions.
SEED = 42
torch.manual_seed(SEED)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

### TODO
- Conv(1→16) → ReLU → Pool → Conv(16→32) → ReLU → Pool → Flatten → Linear → ReLU → Linear
- Conv는 kernel=3, stride=1, padding=1
- Pool은 MaxPool2d(2)

- 입력: (B, 1, 28, 28)
- Conv1 → (B, 16, 28, 28)
- Pool1 → (B, 16, 14, 14)
- Conv2 → (B, 32, 14, 14)
- Pool2 → (B, 32, 7, 7)
- Flatten → (B, 32 × 7 × 7)
- Linear1 input dim = 32 × 7 × 7 = 1568

---
## Fashion-MNIST 실험

### 실험 목표
- SmallCNN vs BigCNN으로 **과적합 유도**
- Dropout/Weight Decay로 **일반화 성능 변화 체감**


In [None]:
# Fashion-MNIST datasets and DataLoaders.
# Train and test use the same preprocessing: convert to a tensor, then
# normalise with mean 0.5 / std 0.5, i.e. x_norm = (x - 0.5) / 0.5.
train_transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)
test_transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)

# Both splits are downloaded into ./data on first use.
train_dataset = datasets.FashionMNIST(
    root="./data",
    train=True,
    download=True,
    transform=train_transform,
)
test_dataset = datasets.FashionMNIST(
    root="./data",
    train=False,
    download=True,
    transform=test_transform,
)

# Shuffle only the training split; pinned memory is enabled only when CUDA is available.
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=2,
    pin_memory=torch.cuda.is_available(),
)
test_loader = DataLoader(
    test_dataset,
    batch_size=1000,
    shuffle=False,
    num_workers=2,
    pin_memory=torch.cuda.is_available(),
)

# Class names, indexed by integer label.
fashion_classes = train_dataset.classes
print("classes:", fashion_classes[:5], "... total:", len(fashion_classes))


### 샘플 16장 시각화


In [None]:
# Show the first 16 images of one training batch in a 4x4 grid.
images, labels = next(iter(train_loader))
images, labels = images[:16], labels[:16]

fig, axes = plt.subplots(4, 4, figsize=(6, 6))
for i, ax in enumerate(axes.flat):
    # Undo Normalize: x_norm = (x - 0.5) / 0.5  ->  x = x_norm * 0.5 + 0.5
    img = images[i].squeeze().numpy() * 0.5 + 0.5
    ax.imshow(img, cmap="gray")
    ax.set_title(fashion_classes[labels[i].item()], fontsize=8)
    ax.axis("off")
plt.tight_layout()
plt.show()


### 모델 정의: SmallCNN / BigCNN


In [None]:
def count_params(model):
    """Return the number of trainable scalar parameters in ``model``.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [None]:
class SmallCNN(nn.Module):
    """Small two-convolution CNN producing 10 logits from 1x28x28 inputs.

    ``features`` stacks Conv(1->16) and Conv(16->32), each followed by ReLU and
    2x2 max pooling. ``classifier`` flattens the result and applies
    Linear(32*7*7 -> 128), ReLU, and Linear(128 -> 10).
    """

    def __init__(self):
        super().__init__()
        # The 3x3 convolutions use padding=1, so only the pooling layers
        # shrink the spatial size.
        conv_stack = [
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 28 -> 14
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 14 -> 7
        ]
        head = [
            nn.Flatten(),
            nn.Linear(in_features=32 * 7 * 7, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=10),
        ]
        self.features = nn.Sequential(*conv_stack)
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Map a (B, 1, 28, 28) batch to (B, 10) class logits."""
        return self.classifier(self.features(x))

class BigCNN(nn.Module):
    """Wider two-convolution CNN producing 10 logits from 1x28x28 inputs.

    Same layout as the small model but with more capacity: Conv(1->32) and
    Conv(32->64) in ``features``, then Linear(64*7*7 -> 256), ReLU, and
    Linear(256 -> 10) in ``classifier``.
    """

    def __init__(self):
        super().__init__()
        # The 3x3 convolutions use padding=1, so only the pooling layers
        # shrink the spatial size.
        conv_stack = [
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 28 -> 14
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 14 -> 7
        ]
        head = [
            nn.Flatten(),
            nn.Linear(in_features=64 * 7 * 7, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=10),
        ]
        self.features = nn.Sequential(*conv_stack)
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Map a (B, 1, 28, 28) batch to (B, 10) class logits."""
        return self.classifier(self.features(x))

# Instantiate one model of each size and report its trainable-parameter count.
small, big = SmallCNN(), BigCNN()

for label, net in (("SmallCNN params:", small), ("BigCNN   params:", big)):
    print(label, count_params(net))


### 빠른 학습: Small vs Big (2~3 epoch)

In [None]:
def accuracy_from_logits(logits: torch.Tensor, y: torch.Tensor) -> float:
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        logits: class scores of shape (B, C).
        y: integer class labels of shape (B,).

    Returns:
        Mean accuracy over the batch as a Python float.
    """
    predicted = torch.argmax(logits, dim=1)
    hits = predicted.eq(y)
    return hits.to(torch.float32).mean().item()

def train_one_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader`` and return epoch metrics.

    Loss and accuracy are averaged per *sample*, not per batch, so a smaller
    final batch does not get the same weight as a full one.

    Args:
        model: network to train; it is switched to training mode.
        loader: iterable yielding ``(x, y)`` batches.
        optimizer: optimiser stepped once per batch.
        criterion: loss callable taking ``(logits, y)``.
            NOTE(review): the sample weighting assumes it returns the batch
            *mean* (the default reduction of ``nn.CrossEntropyLoss``).
        device: device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen, or ``(0.0, 0.0)``
        when ``loader`` yields nothing.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_samples = 0

    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad(set_to_none=True)

        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by the batch size so the epoch figure
        # is a true per-sample mean.
        batch_size = y.size(0)
        loss_sum += loss.item() * batch_size
        n_correct += (logits.argmax(dim=1) == y).sum().item()
        n_samples += batch_size

    # Guard against an empty loader.
    denom = max(n_samples, 1)
    return loss_sum / denom, n_correct / denom

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Loss and accuracy are averaged per *sample*, not per batch, so batches of
    unequal size are weighted correctly.

    Args:
        model: network to evaluate; it is switched to eval mode.
        loader: iterable yielding ``(x, y)`` batches.
        criterion: loss callable taking ``(logits, y)``.
            NOTE(review): the sample weighting assumes it returns the batch
            *mean* (the default reduction of ``nn.CrossEntropyLoss``).
        device: device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen, or ``(0.0, 0.0)``
        when ``loader`` yields nothing.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_samples = 0

    for x, y in loader:
        x, y = x.to(device), y.to(device)
        logits = model(x)
        loss = criterion(logits, y)

        # Weight the batch-mean loss by the batch size so the result is a
        # true per-sample mean.
        batch_size = y.size(0)
        loss_sum += loss.item() * batch_size
        n_correct += (logits.argmax(dim=1) == y).sum().item()
        n_samples += batch_size

    # Guard against an empty loader.
    denom = max(n_samples, 1)
    return loss_sum / denom, n_correct / denom


def run_training(model, train_loader, test_loader, epochs=2, lr=1e-3, weight_decay=0.0):
    """Train ``model`` with Adam and cross-entropy, evaluating after every epoch.

    The model is moved to the module-level ``device`` before training.

    Args:
        model: network to train.
        train_loader: batches used for optimisation.
        test_loader: batches used for the per-epoch evaluation.
        epochs: number of passes over ``train_loader``.
        lr: learning rate passed to ``optim.Adam``.
        weight_decay: weight-decay coefficient passed to ``optim.Adam``.

    Returns:
        A list with one ``(train_loss, train_acc, test_loss, test_acc)`` tuple
        per epoch, in order.
    """
    model = model.to(device)
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    history = []
    for epoch in range(1, epochs + 1):
        tr_loss, tr_acc = train_one_epoch(model, train_loader, adam, loss_fn, device)
        te_loss, te_acc = evaluate(model, test_loader, loss_fn, device)
        print(f"Epoch {epoch:02d}/{epochs} | train loss {tr_loss:.4f} acc {tr_acc:.4f} | test loss {te_loss:.4f} acc {te_acc:.4f}")
        history.append((tr_loss, tr_acc, te_loss, te_acc))
    return history

In [None]:
# Shared hyper-parameters for the Fashion-MNIST runs.
EPOCHS_FASHION = 2
lr = 1e-3

print("\n[Train] SmallCNN")
hist_small = run_training(
    SmallCNN(),
    train_loader,
    test_loader,
    epochs=EPOCHS_FASHION,
    lr=lr,
)

print("\n[Train] BigCNN")
hist_big = run_training(
    BigCNN(),
    train_loader,
    test_loader,
    epochs=EPOCHS_FASHION,
    lr=lr,
)

# Final-epoch summary. Each history record is
# (train loss, train acc, test loss, test acc).
small_tr_acc, small_te_acc = hist_small[-1][1], hist_small[-1][3]
big_tr_acc, big_te_acc = hist_big[-1][1], hist_big[-1][3]

print("\nSummary (last epoch)")
print(f"Small: train acc={small_tr_acc:.4f} test acc={small_te_acc:.4f}")
print(f"Big  : train acc={big_tr_acc:.4f} test acc={big_te_acc:.4f}")


### BigCNN + Dropout


In [None]:
class BigCNN_Dropout(nn.Module):
    """The wider CNN layout with a Dropout layer before the output Linear.

    ``features`` holds Conv(1->32) and Conv(32->64), each followed by ReLU and
    2x2 max pooling. ``classifier`` applies Flatten, Linear(64*7*7 -> 256),
    ReLU, Dropout(p), and Linear(256 -> 10).

    Args:
        p: dropout probability passed to ``nn.Dropout``.
    """

    def __init__(self, p=0.5):
        super().__init__()
        # The 3x3 convolutions use padding=1, so only the pooling layers
        # shrink the spatial size.
        conv_stack = [
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 28 -> 14
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 14 -> 7
        ]
        head = [
            nn.Flatten(),
            nn.Linear(in_features=64 * 7 * 7, out_features=256),
            nn.ReLU(),
            nn.Dropout(p=p),
            nn.Linear(in_features=256, out_features=10),
        ]
        self.features = nn.Sequential(*conv_stack)
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Map a (B, 1, 28, 28) batch to (B, 10) class logits."""
        return self.classifier(self.features(x))

# Train the dropout variant with the same epochs and learning rate as the plain runs.
print("\n[Train] BigCNN + Dropout")
hist_big_do = run_training(
    BigCNN_Dropout(p=0.5),
    train_loader,
    test_loader,
    epochs=EPOCHS_FASHION,
    lr=lr,
)

# Final-epoch record layout: (train loss, train acc, test loss, test acc).
big_do_tr_acc, big_do_te_acc = hist_big_do[-1][1], hist_big_do[-1][3]

print("\nSummary (last epoch)")
print(f"Big+DO: train acc={big_do_tr_acc:.4f} test acc={big_do_te_acc:.4f}")


### Weight Decay (Big + Dropout + WD)

Weight Decay는 파라미터 값이 지나치게 커지지 않도록 벌점(penalty)을 주어 과적합을 완화하는 정규화 기법입니다.

참고: <https://eair.tistory.com/80>


In [None]:
# Weight-decay coefficient forwarded to the optimiser through run_training.
WD = 1e-4

print("\n[Train] BigCNN + Dropout + Weight Decay")
hist_big_do_wd = run_training(
    BigCNN_Dropout(p=0.5),
    train_loader,
    test_loader,
    epochs=EPOCHS_FASHION,
    lr=lr,
    weight_decay=WD,
)

# Final-epoch record layout: (train loss, train acc, test loss, test acc).
big_do_wd_tr_acc, big_do_wd_te_acc = hist_big_do_wd[-1][1], hist_big_do_wd[-1][3]

print("\nSummary (last epoch)")
print(f"Big+DO+WD: train acc={big_do_wd_tr_acc:.4f} test acc={big_do_wd_te_acc:.4f}")


### 결과 표 정리


In [None]:
import pandas as pd

# One row per experiment: (model label, final train accuracy, final test accuracy).
fashion_rows = [
    ("Small", small_tr_acc, small_te_acc),
    ("Big", big_tr_acc, big_te_acc),
    ("Big + Dropout", big_do_tr_acc, big_do_te_acc),
    ("Big + Dropout + WD", big_do_wd_tr_acc, big_do_wd_te_acc),
]
results_fashion = pd.DataFrame(
    fashion_rows,
    columns=["Model", "Train Acc (last)", "Test Acc (last)"],
)
results_fashion


---
## CIFAR-10 프로젝트 맛보기

- 입력이 (B, 3, 32, 32)로 바뀜.


### CIFAR-10 DataLoader 세팅 + 16장 시각화


In [None]:
# Per-channel mean / std used to normalise CIFAR-10 images.
CIFAR_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR_STD = (0.2470, 0.2435, 0.2616)

# Training pipeline: random flip and padded random crop, then tensor conversion and normalisation.
train_transform_cifar = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, padding=4),
        transforms.ToTensor(),
        transforms.Normalize(mean=CIFAR_MEAN, std=CIFAR_STD),
    ]
)

# Test pipeline: no augmentation, same normalisation.
test_transform_cifar = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=CIFAR_MEAN, std=CIFAR_STD),
    ]
)

# Both splits are downloaded into ./data on first use.
train_cifar = datasets.CIFAR10(
    root="./data",
    train=True,
    download=True,
    transform=train_transform_cifar,
)
test_cifar = datasets.CIFAR10(
    root="./data",
    train=False,
    download=True,
    transform=test_transform_cifar,
)

# Shuffle only the training split; pinned memory is enabled only when CUDA is available.
train_loader_cifar = DataLoader(
    train_cifar,
    batch_size=64,
    shuffle=True,
    num_workers=2,
    pin_memory=torch.cuda.is_available(),
)
test_loader_cifar = DataLoader(
    test_cifar,
    batch_size=1000,
    shuffle=False,
    num_workers=2,
    pin_memory=torch.cuda.is_available(),
)

# Class names, indexed by integer label.
cifar_classes = train_cifar.classes
print("cifar classes:", cifar_classes)


In [None]:
# Show 16 CIFAR training images in a 4x4 grid, undoing Normalize before display.
images, labels = next(iter(train_loader_cifar))
images, labels = images[:16], labels[:16]

# Shaped (3, 1, 1) so they broadcast over the height and width of a (3, H, W) image.
mean = torch.tensor([0.4914, 0.4822, 0.4465]).view(3, 1, 1)
std = torch.tensor([0.2470, 0.2435, 0.2616]).view(3, 1, 1)

fig, axes = plt.subplots(4, 4, figsize=(7, 7))
for i, ax in enumerate(axes.flat):
    # Invert the normalisation, move channels last for imshow, and clamp to [0, 1].
    img = images[i].cpu() * std + mean
    img = img.permute(1, 2, 0).clamp(0, 1).numpy()
    ax.imshow(img)
    ax.set_title(cifar_classes[labels[i].item()], fontsize=8)
    ax.axis("off")
plt.tight_layout()
plt.show()


### CIFAR용 CNN 모델

32×32 입력에서 Pool(2) 두 번이면 32→16→8이 되어 Flatten은 (채널 * 8 * 8)이 됩니다.


In [None]:
class CIFARCNN(nn.Module):
    """Two-convolution CNN producing 10 logits from 3x32x32 inputs.

    ``features`` holds Conv(3->32) and Conv(32->64), each followed by ReLU and
    2x2 max pooling. ``classifier`` applies Flatten, Linear(64*8*8 -> 256),
    ReLU, Dropout(dropout_p), and Linear(256 -> 10).

    Args:
        dropout_p: dropout probability passed to ``nn.Dropout``.
    """

    def __init__(self, dropout_p=0.5):
        super().__init__()
        # The 3x3 convolutions use padding=1, so only the pooling layers
        # shrink the spatial size.
        conv_stack = [
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 32 -> 16
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 16 -> 8
        ]
        head = [
            nn.Flatten(),
            nn.Linear(in_features=64 * 8 * 8, out_features=256),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),
            nn.Linear(in_features=256, out_features=10),
        ]
        self.features = nn.Sequential(*conv_stack)
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Map a (B, 3, 32, 32) batch to (B, 10) class logits."""
        return self.classifier(self.features(x))

# Build a throwaway CIFARCNN with the default dropout just to report its trainable-parameter count.
print("CIFARCNN params:", count_params(CIFARCNN()))


### 1~2 epoch 학습


In [None]:
# Short baseline run on CIFAR-10; the learning rate `lr` is the one defined for the Fashion-MNIST runs.
EPOCHS_CIFAR = 2

print("\n[Train] CIFARCNN baseline")
hist_cifar = run_training(
    CIFARCNN(dropout_p=0.5),
    train_loader_cifar,
    test_loader_cifar,
    epochs=EPOCHS_CIFAR,
    lr=lr,
)

# Final-epoch record layout: (train loss, train acc, test loss, test acc).
cifar_tr_acc, cifar_te_acc = hist_cifar[-1][1], hist_cifar[-1][3]
print(f"Baseline CIFAR: train acc={cifar_tr_acc:.4f} test acc={cifar_te_acc:.4f}")


In [None]:
print("\nCIFAR-10 baseline:")
# Single-row summary; the bare DataFrame expression is the cell's displayed output.
cifar_rows = [("CIFARCNN baseline", cifar_tr_acc, cifar_te_acc)]
pd.DataFrame(cifar_rows, columns=["Model", "Train Acc (last)", "Test Acc (last)"])
