In [1]:
import random
import logging
from IPython.display import display, HTML
import os

import numpy as np
import pandas as pd
import datasets
from datasets import load_dataset, load_metric, ClassLabel, Sequence
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Report how many CUDA devices PyTorch can see; nothing is printed on CPU-only machines.
if torch.cuda.is_available():
    n_gpus = torch.cuda.device_count()
    print("사용가능한 gpu : ", n_gpus)

사용가능한 gpu :  1


Reproduction을 위한 Seed 고정  
출처 : https://dacon.io/codeshare/2363?dtype=vote&s_id=0

In [2]:
RANDOM_SEED = 42


def seed_everything(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Seed value applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running kernel is already fixed.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    # Also seed every other visible GPU, not just the current device.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # The cuDNN autotuner may select different kernels from run to run, which
    # breaks reproducibility, so it has to be switched off here.
    torch.backends.cudnn.benchmark = False  # type: ignore


seed_everything(RANDOM_SEED)

In [3]:
# NOTE(review): `task` is not referenced by any other visible cell, and "nli" does not
# match the topic-classification workflow used in the rest of the notebook — confirm or remove.
task = "nli"
# Checkpoint name passed to both AutoTokenizer and AutoModelForSequenceClassification.
model_checkpoint = "klue/bert-base"
# Used as both the per-device train and per-device eval batch size in TrainingArguments.
batch_size = 32
# Directory handed to TrainingArguments; the training log shows checkpoints being written under it.
save_checkpoint_path = "./checkpoints"

Hugging Face에서 tokenizer를 불러옵니다.

In [4]:
# Load the tokenizer that belongs to `model_checkpoint`, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

dataset을 가져옵니다.

In [5]:
# Labelled training data; later cells read its "title" and "topic_idx" columns.
# NOTE(review): the preview below shows an "Unnamed: 0" column, so the CSV appears to have
# been written with its index included — harmless here because that column is never used.
dataset = pd.read_csv("data/train_data.csv")
# Unlabelled test data; only its "title" column is used for prediction.
test = pd.read_csv("data/test_data.csv")

In [6]:
# Preview the first rows of the training frame to check its columns.
dataset.head()

Unnamed: 0,index,title,topic_idx
0,0,인천→핀란드 항공기 결항…휴가철 여행객 분통,4
1,1,실리콘밸리 넘어서겠다…구글 15조원 들여 美전역 거점화,4
2,2,이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것,4
3,3,NYT 클린턴 측근韓기업 특수관계 조명…공과 사 맞물려종합,4
4,4,시진핑 트럼프에 중미 무역협상 조속 타결 희망,4


`train_test_split`을 사용하여 train data와 validation data를 나눕니다.

In [7]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable.
dataset_train, dataset_val = train_test_split(
    dataset,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

학습에 사용될 Dataset을 정의합니다.

In [8]:
class TrainDataset(Dataset):
    """Map-style dataset of tokenized sentences paired with integer labels.

    Every sentence is tokenized once, at construction time. Each item is the
    tokenizer output for one sentence with its label stored under the
    ``"label"`` key.

    Args:
        dataset: Mapping-like container (e.g. a DataFrame) indexable by column name.
        sent_key: Key of the column holding the input sentences.
        label_key: Key of the column holding the class labels.
        bert_tokenizer: Callable applied to each sentence with
            ``truncation=True`` and ``return_token_type_ids=False``.
    """

    def __init__(self, dataset, sent_key, label_key, bert_tokenizer):
        self.sentences = []
        for text in dataset[sent_key]:
            encoded = bert_tokenizer(text, truncation=True, return_token_type_ids=False)
            self.sentences.append(encoded)
        self.labels = list(map(np.int64, dataset[label_key]))

    def __getitem__(self, i):
        # The label is written into the stored encoding itself, so the object
        # returned here is the same one kept in `self.sentences`.
        item = self.sentences[i]
        item["label"] = self.labels[i]
        return item

    def __len__(self):
        return len(self.sentences)
    
class TestDataset(Dataset):
    """Map-style dataset of tokenized sentences without labels.

    Every sentence is tokenized once, at construction time; each item is the
    tokenizer output for one sentence.

    Args:
        dataset: Mapping-like container (e.g. a DataFrame) indexable by column name.
        sent_key: Key of the column holding the input sentences.
        bert_tokenizer: Callable applied to each sentence with
            ``truncation=True`` and ``return_token_type_ids=False``.
    """

    def __init__(self, dataset, sent_key, bert_tokenizer):
        self.sentences = []
        for text in dataset[sent_key]:
            encoded = bert_tokenizer(text, truncation=True, return_token_type_ids=False)
            self.sentences.append(encoded)

    def __getitem__(self, i):
        return self.sentences[i]

    def __len__(self):
        return len(self.sentences)


In [9]:
# Tokenize each split up front: "title" is the input text, "topic_idx" the class label.
data_train = TrainDataset(dataset_train, "title", "topic_idx", tokenizer)
data_validation = TrainDataset(dataset_val, "title", "topic_idx", tokenizer)
# The test split carries no labels, so only the titles are tokenized.
data_test = TestDataset(test, "title", tokenizer)

학습에 사용할 모델을 Hugging Face에서 불러옵니다.

In [10]:
# Number of output classes requested for the classification head.
# NOTE(review): presumably the number of distinct `topic_idx` values in the training data — confirm.
num_labels = 7
# Load the pretrained checkpoint with a sequence-classification head. The warning printed
# below reports that the checkpoint's pretraining-head weights are unused and that some
# BertForSequenceClassification weights are not initialized from the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

모델의 성능을 측정하기 위한 metric(지표)를 불러옵니다.  
수행할 과제는 Text Classification이므로, 같은 분류 과제이며 정확도(accuracy)를 계산하는 glue-qnli의 metric을 가져옵니다.

In [11]:
# Reuse the GLUE/QNLI metric object; compute_metrics() calls `metric.compute` on it and the
# training table reports its result under the "Accuracy" column.
# NOTE(review): newer `datasets` releases deprecate `load_metric` in favour of the separate
# `evaluate` package — confirm against the installed version.
metric = load_metric("glue", "qnli")

In [12]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one row
            of per-class scores per example and is reduced with an argmax
            over axis 1.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids and
        the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [13]:
# Metric used to pick the best checkpoint when training finishes.
metric_name = "accuracy"

args = TrainingArguments(
    save_checkpoint_path,
    evaluation_strategy="epoch",
    # load_best_model_at_end needs a checkpoint for every evaluation. Recent
    # transformers releases raise a ValueError unless the save strategy matches
    # the evaluation strategy, so it is set explicitly here.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # Trainer reseeds with this value, so tie it to the notebook-wide seed.
    seed=RANDOM_SEED,
)

In [14]:
# Wire the model, hyper-parameters, datasets and metric function into a Trainer.
# The tokenizer is passed as well; the training log shows its files being saved with each checkpoint.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=data_train,
    eval_dataset=data_validation,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the settings in `args`; the log below shows an evaluation and a
# checkpoint save at the end of each epoch.
trainer.train()

***** Running training *****
  Num examples = 36523
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3426


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3489,0.317258,0.893659
2,0.2609,0.311066,0.894864


***** Running Evaluation *****
  Num examples = 9131
  Batch size = 32
Saving model checkpoint to ./checkpoints\checkpoint-1142
Configuration saved in ./checkpoints\checkpoint-1142\config.json
Model weights saved in ./checkpoints\checkpoint-1142\pytorch_model.bin
tokenizer config file saved in ./checkpoints\checkpoint-1142\tokenizer_config.json
Special tokens file saved in ./checkpoints\checkpoint-1142\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 9131
  Batch size = 32
Saving model checkpoint to ./checkpoints\checkpoint-2284
Configuration saved in ./checkpoints\checkpoint-2284\config.json
Model weights saved in ./checkpoints\checkpoint-2284\pytorch_model.bin
tokenizer config file saved in ./checkpoints\checkpoint-2284\tokenizer_config.json
Special tokens file saved in ./checkpoints\checkpoint-2284\special_tokens_map.json


In [None]:
# Evaluate with the trainer's configured eval dataset (`data_validation`) and display the result.
trainer.evaluate()

In [None]:
# Run inference on the unlabelled test set. The raw output is kept under its own
# name so that `pred` only ever refers to the array of predicted class ids.
test_output = trainer.predict(data_test)
# `predictions` holds the per-class scores; the argmax over the class axis gives the class id.
pred = np.argmax(test_output.predictions, axis=1)

In [None]:
# Sanity check: show the first ten predicted class ids.
pred[:10]

In [None]:
# Fill the submission template with the predicted class ids and write it to disk.
submission = pd.read_csv('data/sample_submission.csv')
submission['topic_idx'] = pred
# to_csv does not create missing parent directories, so make sure the folder exists first.
os.makedirs("results", exist_ok=True)
submission.to_csv("results/klue-bert-base-fine-tuning-with-trainer.csv",index=False)