## Trainer APIを使ったファインチューニング

Hugging Faceが提供しているTrainer APIは、Transformerモデルのために設計されており、幅広い訓練オプションとさまざまな組み込み機能を備えているため、訓練ループを自分で書かずに済む。

```
export PYENV_ROOT="$HOME/.pyenv"
export PATH="$PYENV_ROOT/bin:$PATH"
eval "$(pyenv init -)"
eval "$(pyenv virtualenv-init -)"

source ~/.zshrc

pyenv activate torchenv310

pip freeze > requirements_pyenv.txt
```

In [1]:
import gzip
import shutil
import time

import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext

import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

In [2]:
# Reproducibility settings: fixed seed and the cuDNN deterministic flag.
RANDOM_SEED = 123
NUM_EPOCHS = 3
torch.backends.cudnn.deterministic = True
torch.manual_seed(RANDOM_SEED)

# Pick the compute device in priority order: Apple MPS, then CUDA, then CPU.
DEVICE = torch.device(
    "mps" if torch.backends.mps.is_available() and torch.backends.mps.is_built()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
print(f"Using device: {DEVICE}")

Using device: mps


In [3]:
# Directory (relative to this notebook) that holds the movie review CSV.
path = '../第8章_機械学習の適用_感情分析'
df = pd.read_csv(path + '/movie_data.csv')
# Show the first rows as the cell output.
df.head()

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0


In [4]:
# Extract both columns once, then cut them at the same row boundaries:
# rows [0, 35000) -> train, [35000, 40000) -> validation, [40000, end) -> test.
reviews = df['review'].values
sentiments = df['sentiment'].values

train_texts, train_labels = reviews[:35000], sentiments[:35000]
valid_texts, valid_labels = reviews[35000:40000], sentiments[35000:40000]
test_texts, test_labels = reviews[40000:], sentiments[40000:]

### データセットのトークン化

In [5]:
# NOTE: the variable name `tokennizer` (sic) is kept as-is so any other cell
# that refers to it keeps working.
tokennizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize the three splits in order (train, valid, test) with identical
# settings: truncation and padding both enabled.
train_encodings, valid_encodings, test_encodings = (
    tokennizer(texts.tolist(), truncation=True, padding=True)
    for texts in (train_texts, valid_texts, test_texts)
)

In [6]:
class IMDbDataset(torch.utils.data.Dataset):
    """Dataset that pairs a mapping of per-example encodings with labels.

    ``encodings`` must expose ``.items()`` yielding ``(key, values)`` pairs
    where ``values[idx]`` is the data for example ``idx``; ``labels`` must
    support ``len()`` and integer indexing.
    """

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the labels."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding key plus the example's label
        under the key ``'labels'``.
        """
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [7]:
# Wrap each split's encodings and labels in a dataset object.
train_dataset, valid_dataset, test_dataset = (
    IMDbDataset(encodings, labels)
    for encodings, labels in (
        (train_encodings, train_labels),
        (valid_encodings, valid_labels),
        (test_encodings, test_labels),
    )
)

# Batches of 16 everywhere; only the training loader shuffles.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=16, shuffle=True
)
valid_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset, batch_size=16, shuffle=False
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=16, shuffle=False
)

### モデルの読み込みとファインチューニング

In [None]:
# Load the pretrained 'distilbert-base-uncased' checkpoint into the
# sequence-classification model class, then move it to the selected device.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model = model.to(DEVICE)
# Switch to training mode (left as the cell's final expression).
model.train()

In [None]:
from transformers import Trainer, TrainingArguments

# AdamW over all model parameters with learning rate 5e-5.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Many more options can be set here, e.g. reporting evaluation results
# after every epoch.
training_args = TrainingArguments(
    output_dir='./results',
    # Reuse the notebook-wide NUM_EPOCHS constant instead of repeating the
    # literal 3, so the Trainer run and the manual loop stay in sync.
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
)

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    The original cell imported ``load_metrics`` from ``datasets``, which is
    not a function that package provides, so the cell failed on import.
    Accuracy is computed directly with NumPy instead.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The logits arrive as NumPy
            arrays, not PyTorch tensors; the class is taken as the argmax
            over the last axis.

    Returns:
        dict: ``{'accuracy': value}`` where ``value`` is the fraction of
        predictions that equal the reference labels, as a Python float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': float(np.mean(predictions == np.asarray(labels)))}

In [None]:
# ハイパーパラメータを変更または最適化しながらファインチューニングステップを複数回繰り返す場合は、valid_datasetを指定する。
# When fine-tuning is repeated several times while changing or optimizing
# hyperparameters, pass valid_dataset as eval_dataset instead of test_dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    # The keyword is `optimizers` (plural) and takes an
    # (optimizer, lr_scheduler) tuple; `optimizer=` is not an accepted argument.
    # None is passed for the scheduler slot.
    optimizers=(optim, None),
)

In [None]:
# Time the whole Trainer run.
start_time = time.time()
trainer.train()

# Elapsed seconds are divided by 60, so the value is in minutes (the label
# previously said "seconds").
print(f"Total Training time: {(time.time() - start_time)/60:.2f} min")

In [None]:
# Run the trainer's evaluation; as the cell's final expression, its return
# value is displayed.
trainer.evaluate()

In [9]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of ``model`` over ``data_loader``.

    Gradients are disabled for the whole pass. The model's train/eval mode
    is not changed here; the caller sets it.

    Args:
        model: Callable invoked as ``model(input_ids, attention_mask=...)``
            whose result can be indexed with ``'logits'``.
        data_loader: Iterable of batches; each batch is a mapping with the
            keys ``'input_ids'``, ``'attention_mask'`` and ``'labels'``
            holding tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        torch.Tensor: A scalar float tensor with the accuracy in percent.

    Raises:
        ValueError: If ``data_loader`` yields no examples. Previously this
            case failed with an unrelated ``AttributeError``.
    """
    with torch.no_grad():
        correct_pred, num_examples = 0, 0
        for batch in data_loader:
            # Move the batch to the target device.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs['logits']
            # Predicted class = index of the largest logit along dim 1.
            predicted_labels = torch.argmax(logits, 1)
            num_examples += labels.size(0)
            correct_pred += (predicted_labels == labels).sum()

    if num_examples == 0:
        raise ValueError('compute_accuracy: data_loader yielded no examples')
    return correct_pred.float() / num_examples * 100

In [None]:
# Put the model in evaluation mode on the selected device, then report the
# accuracy on the test loader.
model.eval().to(DEVICE)
test_accuracy = compute_accuracy(model, test_loader, DEVICE)
print(f'Test accuracy: {test_accuracy:.2f}%')

In [None]:
# Hand-written fine-tuning loop: NUM_EPOCHS passes over train_loader, with
# train/validation accuracy reported after every epoch.
start_time = time.time()
num_batches = len(train_loader)

for epoch in range(NUM_EPOCHS):
    model.train()
    for step, batch in enumerate(train_loader):
        # Move the batch tensors to the target device.
        labels = batch['labels'].to(DEVICE)
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)

        # Forward pass; passing labels makes the output carry a loss.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']
        logits = outputs['logits']

        # Backward pass and parameter update.
        optim.zero_grad()
        loss.backward()
        optim.step()

        # Log the loss every 250 batches.
        if step % 250 == 0:
            print(f'Epoch {epoch + 1:04d}/{NUM_EPOCHS:04d}, Batch {step:04d}/{num_batches:04d}, Loss: {loss:.4f}')

    # End-of-epoch evaluation with gradients disabled.
    model.eval()
    with torch.no_grad():
        train_acc = compute_accuracy(model, train_loader, DEVICE)
        valid_acc = compute_accuracy(model, valid_loader, DEVICE)
        print(f'Training accuracy: {train_acc:.2f}%\nValid accuracy: {valid_acc:.2f}%')

    elapsed_min = (time.time() - start_time) / 60
    print(f'Time elapsed: {elapsed_min:.2f} min')

total_min = (time.time() - start_time) / 60
print(f'Total training time: {total_min:.2f} min')
final_test_acc = compute_accuracy(model, test_loader, DEVICE)
print(f'Test accuracy: {final_test_acc:.2f}%')


Epoch 0001/0003, Batch 0000/2188, Loss: 0.6993
