# 22장. 비정형 데이터를 위한 신경망

<table align="left"><tr><td>
<a href="https://colab.research.google.com/github/rickiepark/ml-with-python-cookbook-2nd/blob/main/ch22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="코랩에서 실행하기"/></a>
</td></tr></table>

In [1]:
# 코랩을 사용하는 경우 다음 코드를 실행하세요.
!pip install datasets evaluate accelerate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7

In [2]:
import numpy as np
import sklearn
import torch
import torchvision
import transformers
import datasets
import evaluate

print('numpy', np.__version__)
print('sklearn', sklearn.__version__)
print('torch', torch.__version__)
print('torchvision', torchvision.__version__)
print('transformers', transformers.__version__)
print('datasets', datasets.__version__)
print('evaluate', evaluate.__version__)

numpy 1.26.4
sklearn 1.6.0
torch 2.5.1+cu121
torchvision 0.20.1+cu121
transformers 4.47.1
datasets 3.2.0
evaluate 0.4.3


## 22.1 이미지 분류 신경망 훈련하기

In [3]:
# triton을 설치하고 런타임을 재시작해 주세요.
!pip install triton

Collecting triton
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.5/209.5 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: triton
Successfully installed triton-3.1.0


In [1]:
# 런타임을 재시작한 다음 환경 변수를 설정합니다.
%env TRITON_PTXAS_PATH=/usr/local/cuda-12.2/bin/ptxas

env: TRITON_PTXAS_PATH=/usr/local/cuda-12.2/bin/ptxas


In [2]:
# 라이브러리를 임포트합니다.
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms

# 합성곱 신경망을 정의합니다.
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(64 * 14 * 14, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = nn.functional.relu(self.conv1(x))
        x = nn.functional.relu(self.conv2(x))
        x = nn.functional.max_pool2d(self.dropout1(x), 2)
        x = torch.flatten(x, 1)
        x = nn.functional.relu(self.fc1(self.dropout2(x)))
        x = self.fc2(x)
        return nn.functional.log_softmax(x, dim=1)

# 실행 장치를 설정합니다.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 데이터 전처리 단계를 정의합니다.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# MNIST 데이터셋을 로드합니다.
train_dataset = datasets.MNIST('./data', train=True, download=True,
    transform=transform)
test_dataset = datasets.MNIST('./data', train=False, transform=transform)

# 데이터 로더를 정의합니다.
batch_size = 64
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
    shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size,
    shuffle=True)

# 모델과 옵티마이저를 초기화합니다.
model = Net().to(device)
optimizer = optim.Adam(model.parameters())

# 모델을 컴파일합니다.
model = torch.compile(model)

# 훈련 루프를 정의합니다.
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()
    output = model(data)
    loss = nn.functional.nll_loss(output, target)
    loss.backward()
    optimizer.step()

# 테스트 루프를 정의합니다.
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
    for data, target in test_loader:
        data, target = data.to(device), target.to(device)
        output = model(data)

        # 가장 큰 로그 확률의 인덱스를 추출합니다.
        test_loss += nn.functional.nll_loss(
            output, target, reduction='sum'
        ).item()  # 배치 손실을 더합니다.
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)



## 22.2 텍스트 분류 신경망 훈련하기

In [3]:
# 라이브러리를 임포트합니다.
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 20 뉴스그룹 데이터셋을 로드합니다.
cats = ['alt.atheism', 'sci.space']
newsgroups_data = fetch_20newsgroups(subset='all', shuffle=True,
    random_state=42, categories=cats)

# 훈련 세트와 테스트 세트를 만듭니다.
X_train, X_test, y_train, y_test = train_test_split(newsgroups_data.data,
    newsgroups_data.target, test_size=0.2, random_state=42)

# bag-of-words 방식을 사용해 텍스트 데이터를 벡터화합니다.
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

# 데이터를 파이토치 텐서로 변환합니다.
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

# 모델을 정의합니다.
class TextClassifier(nn.Module):
    def __init__(self, num_classes):
        super(TextClassifier, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return nn.functional.log_softmax(x, dim=1)

# 모델을 초기화하고, 손실 함수와 옵티마이저를 정의합니다.
model = TextClassifier(num_classes=len(cats))
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# 모델을 컴파일합니다.
model = torch.compile(model)

# 모델을 훈련합니다.
num_epochs = 1
batch_size = 10
num_batches = len(X_train) // batch_size
for epoch in range(num_epochs):
    total_loss = 0.0
    for i in range(num_batches):
        # 현재 배치를 위한 입력과 타깃 데이터를 준비합니다.
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size
        inputs = X_train[start_idx:end_idx]
        targets = y_train[start_idx:end_idx]

        # 옵티마이저의 그레이디언트를 0으로 초기화합니다.
        optimizer.zero_grad()

        # 모델의 정방향 계산을 수행하고 손실을 계산합니다.
        outputs = model(inputs)
        loss = loss_function(outputs, targets)

        # 모델의 역전파를 수행하고 모델 파라미터를 업데이트합니다.
        loss.backward()
        optimizer.step()

        # 해당 에포크의 총 손실을 업데이트합니다.
        total_loss += loss.item()

    # 해당 에포크에 대한 테스트 세트의 정확도를 계산합니다.
    test_outputs = model(X_test)
    test_predictions = torch.argmax(test_outputs, dim=1)
    test_accuracy = accuracy_score(y_test, test_predictions)

    # 에포크 횟수, 평균 손실, 테스트 세트 정확도를 출력합니다.
    print(f"에포크: {epoch+1}, 손실: {total_loss/num_batches}, "
          f"테스트 세트 정확도: {test_accuracy}")

에포크: 1, 손실: 0.1567723306957692, 테스트 세트 정확도: 0.9888268156424581


In [4]:
X_train.shape[1]

25150

## 22.3 이미지 분류를 위해 사전 훈련된 모델 미세 튜닝하기

In [5]:
# 라이브러리를 임포트합니다.
import torch
from torchvision.transforms import(
    RandomResizedCrop, Compose, Normalize, ToTensor
    )
from transformers import Trainer, TrainingArguments, DefaultDataCollator
from transformers import ViTImageProcessor, ViTForImageClassification
from datasets import load_dataset, Image
import evaluate

# 이미지를 RGB로 변환하기 위한 헬퍼 함수를 정의합니다.
def transforms(examples):
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in
        examples["image"]]
    del examples["image"]
    return examples

# 측정 지표를 계산하기 위한 헬퍼 함수를 정의합니다.
def compute_metrics(p):
    return metric.compute(predictions=np.argmax(p.predictions, axis=1),
        references=p.label_ids)

# 패션 mnist 데이터셋을 로드합니다.
dataset = load_dataset("fashion_mnist")

# VIT 모델에서 전처리기를 로드합니다.
image_processor = ViTImageProcessor.from_pretrained(
    "google/vit-base-patch16-224-in21k"
)

# 데이터셋에서 레이블을 추출합니다.
labels = dataset['train'].features['label'].names

# 사전 훈련된 모델을 로드합니다.
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)}
)

# 콜레이터, 정규화를 정의하고 변환합니다.
collate_fn = DefaultDataCollator()
normalize = Normalize(mean=image_processor.image_mean,
    std=image_processor.image_std)
size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

# 변환할 데이터셋을 로드합니다.
dataset = dataset.with_transform(transforms)

# 정확도를 측정 지표로 사용합니다.
metric = evaluate.load("accuracy")

# 훈련 매개변수를 설정합니다.
training_args = TrainingArguments(
    output_dir="fashion_mnist_model",
    remove_unused_columns=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=0.01,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
    report_to="none"
)

# trainer를 초기화합니다.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=image_processor,
)

# 모델을 기록하고 지표를 기록, 저장합니다.
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

README.md:   0%|          | 0.00/9.02k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/30.9M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/5.18M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/60000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy
0,5.9257,1.525648,0.4233


***** train metrics *****
  epoch                    =       0.9995
  total_flos               = 4328203231GF
  train_loss               =       7.3806
  train_runtime            =   0:09:18.08
  train_samples_per_second =       107.51
  train_steps_per_second   =        1.679


## 22.4 텍스트 분류를 위해 사전 훈련된 모델 미세 튜닝하기

In [6]:
# 라이브러리를 임포트합니다.
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import (
    AutoModelForSequenceClassification, TrainingArguments, Trainer
    )
import evaluate
import numpy as np

# imdb 데이터셋을 로드합니다.
imdb = load_dataset("imdb")

# 토크나이저와 콜레이터를 만듭니다.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# imdb 데이터셋을 토큰화합니다.
tokenized_imdb = imdb.map(
    lambda example: tokenizer(
        example["text"], padding="max_length", truncation=True
    ),
    batched=True,
)

# 정확도 지표를 사용합니다.
accuracy = evaluate.load("accuracy")

# 지표를 계산하는 헬퍼 함수를 정의합니다.
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

# 인덱스와 레이블을 서로 매핑하는 딕셔너리를 만듭니다.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

# 사전 훈련된 모델을 로드합니다.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label,
        label2id=label2id
)

# 훈련 매개변수를 설정합니다.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
                            eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
                            report_to="none"
)

# trainer를 초기화합니다.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
                            processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 모델을 훈련합니다.
trainer.train()

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2206,0.200131,0.92332
2,0.1431,0.234567,0.93144


TrainOutput(global_step=3126, training_loss=0.20458446461194918, metrics={'train_runtime': 705.9781, 'train_samples_per_second': 70.824, 'train_steps_per_second': 4.428, 'total_flos': 6623369932800000.0, 'train_loss': 0.20458446461194918, 'epoch': 2.0})