---
## 0) 환경 설정 & 공통 유틸


In [None]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import pandas as pd

# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device:", device)


In [None]:
def accuracy_from_logits(logits, y):
    """Return the fraction of rows whose arg-max over dim 1 equals ``y``.

    The result is a plain Python float (the mean of the 0/1 match indicators).
    """
    predicted = logits.argmax(dim=1)
    hits = predicted.eq(y)
    return hits.float().mean().item()

def train_one_epoch(model, loader, optimizer, criterion, device):
    """Run one optimization pass over ``loader`` and return ``(loss, accuracy)``.

    Both metrics are averaged per sample rather than per batch, so a smaller
    final batch does not receive the same weight as a full-sized one.

    Args:
        model: Module called as ``model(x)`` to produce logits; it is put in
            train mode before the loop.
        loader: Iterable yielding ``(x, y)`` batches.
        optimizer: Optimizer whose gradients are cleared and which is stepped
            once per batch.
        criterion: Callable ``criterion(logits, y)`` returning a scalar loss.
            The per-sample weighting assumes it returns the batch mean
            (e.g. ``nn.CrossEntropyLoss()`` with its default reduction).
        device: Device every batch is moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)`` as Python floats; ``(0.0, 0.0)`` when the
        loader yields no batches.
    """
    model.train()
    total_loss, total_correct, n_samples = 0.0, 0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        batch_size = y.size(0)
        # Undo the batch-mean so every sample contributes equally to the total.
        total_loss += loss.item() * batch_size
        total_correct += (logits.argmax(dim=1) == y).sum().item()
        n_samples += batch_size
    denom = max(n_samples, 1)  # guards the empty-loader case
    return total_loss / denom, total_correct / denom

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    """Evaluate ``model`` over ``loader`` without gradients; return ``(loss, accuracy)``.

    Both metrics are averaged per sample rather than per batch, so batches of
    unequal size are weighted by how many samples they contain.

    Args:
        model: Module called as ``model(x)`` to produce logits; it is put in
            eval mode before the loop.
        loader: Iterable yielding ``(x, y)`` batches.
        criterion: Callable ``criterion(logits, y)`` returning a scalar loss.
            The per-sample weighting assumes it returns the batch mean
            (e.g. ``nn.CrossEntropyLoss()`` with its default reduction).
        device: Device every batch is moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)`` as Python floats; ``(0.0, 0.0)`` when the
        loader yields no batches.
    """
    model.eval()
    total_loss, total_correct, n_samples = 0.0, 0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        logits = model(x)
        loss = criterion(logits, y)
        batch_size = y.size(0)
        # Undo the batch-mean so every sample contributes equally to the total.
        total_loss += loss.item() * batch_size
        total_correct += (logits.argmax(dim=1) == y).sum().item()
        n_samples += batch_size
    denom = max(n_samples, 1)  # guards the empty-loader case
    return total_loss / denom, total_correct / denom

def count_params(model):
    """Return the total element count of the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

def run_training(model, train_loader, test_loader, epochs, lr, weight_decay=0.0):
    """Train ``model`` with Adam and cross-entropy, evaluating after each epoch.

    The model is moved to the module-level ``device``. One progress line with
    the train and test accuracy is printed per epoch.

    Returns:
        A list holding one ``(train_loss, train_acc, test_loss, test_acc)``
        tuple per epoch, in epoch order.
    """
    model = model.to(device)
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    history = []
    for epoch in range(1, epochs + 1):
        train_metrics = train_one_epoch(model, train_loader, adam, loss_fn, device)
        test_metrics = evaluate(model, test_loader, loss_fn, device)
        history.append((*train_metrics, *test_metrics))
        print(f"Epoch {epoch:02d}/{epochs} | Train acc {train_metrics[1]:.4f} | Test acc {test_metrics[1]:.4f}")
    return history


---
## 1) Fashion-MNIST: 데이터 로더


In [None]:
# Both splits use the same preprocessing: tensor conversion followed by
# single-channel normalization with mean 0.5 and std 0.5.
train_transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)
test_transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)

train_dataset = datasets.FashionMNIST(
    root="./data",
    train=True,
    download=True,
    transform=train_transform,
)
test_dataset = datasets.FashionMNIST(
    root="./data",
    train=False,
    download=True,
    transform=test_transform,
)

# Loader options common to both splits; memory is pinned only when CUDA is available.
_fashion_loader_opts = {"num_workers": 2, "pin_memory": torch.cuda.is_available()}
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, **_fashion_loader_opts)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False, **_fashion_loader_opts)

fashion_classes = train_dataset.classes
print("Fashion classes:", fashion_classes)


---
## 2) Fashion-MNIST: 모델 정의


In [None]:
class SmallCNN(nn.Module):
    """Small two-stage conv net producing 10 logits.

    Two conv/ReLU/max-pool stages (16 then 32 channels) feed a two-layer
    classifier. The first linear layer expects 32 * 7 * 7 features, which
    matches 28x28 single-channel inputs halved twice by pooling.
    """

    def __init__(self):
        super().__init__()
        conv_stages = [
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 28 -> 14
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 14 -> 7
        ]
        self.features = nn.Sequential(*conv_stages)

        head = [
            nn.Flatten(),
            nn.Linear(in_features=32 * 7 * 7, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=10),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return class logits for the batch ``x``."""
        return self.classifier(self.features(x))

class BigCNN(nn.Module):
    """Wider two-stage conv net producing 10 logits.

    Two conv/ReLU/max-pool stages (32 then 64 channels) feed a classifier
    with a 256-unit hidden layer. The first linear layer expects
    64 * 7 * 7 features, which matches 28x28 single-channel inputs halved
    twice by pooling.
    """

    def __init__(self):
        super().__init__()
        conv_stages = [
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 28 -> 14
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 14 -> 7
        ]
        self.features = nn.Sequential(*conv_stages)

        head = [
            nn.Flatten(),
            nn.Linear(in_features=64 * 7 * 7, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=10),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return class logits for the batch ``x``."""
        return self.classifier(self.features(x))

class BigCNN_Dropout(nn.Module):
    """Wide two-stage conv net with dropout before the output layer.

    The layers match a 32/64-channel conv stack followed by a 256-unit
    hidden layer; a ``nn.Dropout(p)`` sits between the hidden ReLU and the
    final 10-way linear layer.

    Args:
        p: Probability passed to ``nn.Dropout``.
    """

    def __init__(self, p=0.5):
        super().__init__()
        conv_stages = [
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 28 -> 14
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 14 -> 7
        ]
        self.features = nn.Sequential(*conv_stages)

        head = [
            nn.Flatten(),
            nn.Linear(in_features=64 * 7 * 7, out_features=256),
            nn.ReLU(),
            nn.Dropout(p=p),
            nn.Linear(in_features=256, out_features=10),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return class logits for the batch ``x``."""
        return self.classifier(self.features(x))

# Report the trainable-parameter totals of the small and big Fashion-MNIST models.
small_param_total = count_params(SmallCNN())
print("Small params:", small_param_total)
big_param_total = count_params(BigCNN())
print("Big   params:", big_param_total)


---
## 3) Fashion-MNIST: 본 실험 (epoch=10)


In [None]:
EPOCHS_FASHION = 10
lr = 1e-3
wd = 1e-4

# Settings shared by every Fashion-MNIST run in this cell; only the model
# (and, for the last run, the weight decay) varies.
fashion_run_args = dict(
    train_loader=train_loader,
    test_loader=test_loader,
    epochs=EPOCHS_FASHION,
    lr=lr,
)

print("\n[Train] SmallCNN")
hist_small = run_training(SmallCNN(), **fashion_run_args)

print("\n[Train] BigCNN")
hist_big = run_training(BigCNN(), **fashion_run_args)

print("\n[Train] BigCNN + Dropout(p=0.5)")
hist_big_do = run_training(BigCNN_Dropout(p=0.5), **fashion_run_args)

print("\n[Train] BigCNN + Dropout(p=0.5) + WD(1e-4)")
hist_big_do_wd = run_training(BigCNN_Dropout(p=0.5), weight_decay=wd, **fashion_run_args)


In [None]:
# Extra run: the dropout variant of the big network with a higher rate (p=0.7).
print("\n[Train] Extra: BigCNN + Dropout(p=0.7)")
hist_extra = run_training(
    model=BigCNN_Dropout(p=0.7),
    train_loader=train_loader,
    test_loader=test_loader,
    epochs=EPOCHS_FASHION,
    lr=lr,
)


In [None]:
def last_acc(hist):
    """Return ``(entry[1], entry[3])`` of the final history entry.

    With histories produced by ``run_training`` these are the last epoch's
    train and test accuracy.
    """
    final_entry = hist[-1]
    train_acc = final_entry[1]
    test_acc = final_entry[3]
    return train_acc, test_acc

# (model label, description of the change, training history) for each run.
fashion_runs = [
    ("SmallCNN", "-", hist_small),
    ("BigCNN", "channels/linear↑", hist_big),
    ("BigCNN+DO", "dropout p=0.5", hist_big_do),
    ("BigCNN+DO+WD", "dropout p=0.5, wd=1e-4", hist_big_do_wd),
    ("Extra", "dropout p=0.7", hist_extra),
]
# Each row ends with the final-epoch train and test accuracy of its run.
rows = [
    (model_name, change, *last_acc(run_history))
    for model_name, change, run_history in fashion_runs
]
results_fashion = pd.DataFrame(
    rows,
    columns=["Model", "Change", "TrainAcc_last", "TestAcc_last"],
)
results_fashion


In [None]:
# Per-epoch accuracies: index 1 of a history entry is train, index 3 is test.
big_train = [entry[1] for entry in hist_big]
big_test = [entry[3] for entry in hist_big]
do_train = [entry[1] for entry in hist_big_do]
do_test = [entry[3] for entry in hist_big_do]
epochs = list(range(1, EPOCHS_FASHION + 1))

accuracy_curves = [
    ("Big train", big_train),
    ("Big test", big_test),
    ("Big+DO train", do_train),
    ("Big+DO test", do_test),
]

plt.figure(figsize=(7, 4))
for curve_label, curve_values in accuracy_curves:
    plt.plot(epochs, curve_values, label=curve_label)
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.legend()
plt.title("Fashion-MNIST: Big vs Big+Dropout")
plt.show()


---
## 4) CIFAR-10: 데이터 로더


In [None]:
# Per-channel normalization statistics used by both CIFAR-10 pipelines.
_CIFAR_MEAN = (0.4914, 0.4822, 0.4465)
_CIFAR_STD = (0.2470, 0.2435, 0.2616)

# Training pipeline: random flip and padded random crop, then tensor + normalize.
train_transform_cifar = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, padding=4),
        transforms.ToTensor(),
        transforms.Normalize(mean=_CIFAR_MEAN, std=_CIFAR_STD),
    ]
)

# Test pipeline: no random transforms, only tensor + normalize.
test_transform_cifar = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=_CIFAR_MEAN, std=_CIFAR_STD),
    ]
)

train_cifar = datasets.CIFAR10(
    root="./data",
    train=True,
    download=True,
    transform=train_transform_cifar,
)
test_cifar = datasets.CIFAR10(
    root="./data",
    train=False,
    download=True,
    transform=test_transform_cifar,
)

# Loader options common to both splits; memory is pinned only when CUDA is available.
_cifar_loader_opts = {"num_workers": 2, "pin_memory": torch.cuda.is_available()}
train_loader_cifar = DataLoader(train_cifar, batch_size=64, shuffle=True, **_cifar_loader_opts)
test_loader_cifar = DataLoader(test_cifar, batch_size=1000, shuffle=False, **_cifar_loader_opts)

cifar_classes = train_cifar.classes
print("cifar classes:", cifar_classes)


---
## 5) CIFAR-10: baseline + 개선 모델


In [None]:
class CIFARCNN(nn.Module):
    """Two-stage conv net for 3-channel images producing 10 logits.

    Two conv/ReLU/max-pool stages (32 then 64 channels) feed a classifier
    with a 256-unit hidden layer and dropout before the output layer. The
    first linear layer expects 64 * 8 * 8 features, which matches 32x32
    inputs halved twice by pooling.

    Args:
        dropout_p: Probability passed to ``nn.Dropout``.
    """

    def __init__(self, dropout_p=0.5):
        super().__init__()
        conv_stages = [
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 32 -> 16
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 16 -> 8
        ]
        self.features = nn.Sequential(*conv_stages)

        head = [
            nn.Flatten(),
            nn.Linear(in_features=64 * 8 * 8, out_features=256),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),
            nn.Linear(in_features=256, out_features=10),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return class logits for the batch ``x``."""
        return self.classifier(self.features(x))

# Trainable-parameter total of a default-configured CIFARCNN.
cifar_param_total = count_params(CIFARCNN())
print("CIFARCNN params:", cifar_param_total)


In [None]:
EPOCHS_CIFAR = 10
lr_cifar = 1e-3

# Baseline CIFAR-10 run: dropout 0.5 and no weight decay (run_training's default).
print("\n[Train] CIFAR baseline")
hist_cifar_base = run_training(
    model=CIFARCNN(dropout_p=0.5),
    train_loader=train_loader_cifar,
    test_loader=test_loader_cifar,
    epochs=EPOCHS_CIFAR,
    lr=lr_cifar,
)


In [None]:
# This run builds the same CIFARCNN(dropout_p=0.5) as the baseline; the only
# change is Adam weight decay of 5e-4. The label states exactly that, since
# no channel widths are altered here.
print("\n[Train] CIFAR improved (wd=5e-4)")
hist_cifar_improved = run_training(CIFARCNN(dropout_p=0.5),
                                   train_loader_cifar, test_loader_cifar,
                                   epochs=EPOCHS_CIFAR, lr=lr_cifar, weight_decay=5e-4)

In [None]:
# Final-epoch test accuracy (index 3 of a history entry) for both CIFAR runs.
base_te = hist_cifar_base[-1][3]
impr_te = hist_cifar_improved[-1][3]
# Every row must supply one value per column: three columns need three-element
# rows, otherwise pandas raises ValueError. "Change" describes how the run
# differs from the baseline.
results_cifar = pd.DataFrame([
    ("Baseline", "-", base_te),
    ("Improved", "wd=5e-4", impr_te),
], columns=["Model", "Change", "TestAcc_last"])
results_cifar


---
## 6) 개념 요약 (각 3줄 이내, 과제 결과 예시 포함)


In [None]:
# Short concept summaries (Korean report text, emitted verbatim at runtime).

# Overfitting: train accuracy keeps rising while test accuracy stalls or drops.
overfitting = """
BigCNN에서 epoch를 늘리면 train acc는 계속 오르는데 test acc가 정체/하락하는 현상이 나타날 수 있다.
이때 train/test 격차가 커지는 것이 과적합의 대표적인 신호다.
(예: BigCNN의 epoch 6~10 구간에서 train↑, test↔/↓)
"""

# Dropout: randomly zeroes activations during training only.
dropout = """
학습 중 일부 뉴런 출력을 확률적으로 0으로 만들어 특정 경로 의존을 줄이는 정규화다.
BigCNN+Dropout은 보통 train acc를 낮추지만 test acc를 유지/개선해 일반화에 도움을 줄 수 있다.
train()에서만 적용되고 eval()에서는 꺼진다.
"""

# Weight decay: L2 penalty that discourages large weights.
weight_decay = """
loss에 L2 패널티를 더해 weight가 과도하게 커지는 것을 억제하는 정규화다.
보통 train 성능을 약간 희생해 test 성능을 안정화시키는 방향으로 작동한다.
(예: BigCNN+Dropout+WD가 BigCNN보다 test 추세가 덜 흔들림)
"""

# Augmentation: random flips/crops on training data only.
augmentation = """
훈련 데이터에 랜덤 변형(Flip/Crop 등)을 줘서 데이터 다양성을 늘리는 기법이다.
CIFAR-10처럼 실제 이미지에서 위치/방향 변화에 덜 민감하게 만들어 일반화를 돕는다.
테스트에는 랜덤성을 넣지 않는다.
"""

# Emit the four summaries in order, each followed by a newline.
print(overfitting, dropout, weight_decay, augmentation, sep="\n")
