## 말뭉치 프로젝트
#### 1. train.py 변환
#### 1) 라이브러리 및 패키지 가져오기

In [1]:
# !pip install transformers

In [2]:
# !pip install datasets

In [3]:
# !pip install accelerate

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
import sys
import os
import json
import logging

import torch
import numpy as np

In [6]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    EvalPrediction
)

from datasets import Dataset
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

#### 2) 변수 및 파라미터 설정

In [7]:
# Directory that receives the Trainer output and the saved label mapping.
output_dir = "./output"
# Hugging Face identifiers passed to the model / tokenizer from_pretrained() calls.
model_path = "klue/roberta-base"
tokenizer_path = "klue/roberta-base"
# Token length every example is padded or truncated to during preprocessing.
max_seq_len = 512
# Per-device batch sizes for training and for evaluation.
batch_size = 32
valid_batch_size = 64
# NOTE(review): name suggests the number of gradient accumulation steps - confirm.
accumulate_grad_batches = 1
# Number of training epochs.
epochs = 10
# Optimizer hyperparameters handed to TrainingArguments.
learning_rate = 2e-4
weight_decay = 0.01
# NOTE(review): presumably a GPU count or device index - confirm the intended meaning.
gpus = 0
# Random seed for reproducibility.
seed = 42

#### 3) logging 설정 및 데이터 로드

In [8]:
# Dedicated "train" logger that writes timestamped messages to stdout.
logger = logging.getLogger("train")
logger.setLevel(logging.DEBUG)
# Do not forward records to ancestor loggers.
logger.propagate = False
# Attach the stdout handler only once, so re-running this cell does not
# duplicate every log line.
if len(logger.handlers) == 0:
    handler = logging.StreamHandler(stream=sys.stdout)
    handler.setFormatter(logging.Formatter(fmt="[%(asctime)s] %(message)s"))
    logger.addHandler(handler)

In [9]:
# Create the output directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)
logger.info(f'[+] Save output to "{output_dir}"')

[2023-10-15 15:01:49,551] [+] Save output to "./output"


In [10]:
logger.info(f'[+] Load Dataset')
# Load the train and dev splits from local JSON Lines files.
train_ds = Dataset.from_json("./data/nikluge-ea-2023-train.jsonl")
valid_ds = Dataset.from_json("./data/nikluge-ea-2023-dev.jsonl")

[2023-10-15 15:01:49,561] [+] Load Dataset
Downloading and preparing dataset json/default to /aiffel/.cache/huggingface/datasets/json/default-85e56bdadbc1dc2c/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /aiffel/.cache/huggingface/datasets/json/default-85e56bdadbc1dc2c/0.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset json/default to /aiffel/.cache/huggingface/datasets/json/default-a9b3af15d07fad8a/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /aiffel/.cache/huggingface/datasets/json/default-a9b3af15d07fad8a/0.0.0. Subsequent calls will reuse this data.


#### 4) 데이터 전처리

In [11]:
# Load the tokenizer identified by tokenizer_path.
# The log message previously ended with a stray, unbalanced double quote.
logger.info('[+] Load Tokenizer')
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

[2023-10-15 15:01:50,575] [+] Load Tokenizer"


Downloading (…)okenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [12]:
# Label names are the keys of the first training example's "output" dict.
# Their position in this list is the index used in the multi-hot label vector.
labels = list(train_ds["output"][0])
id2label = dict(enumerate(labels))
label2id = {name: index for index, name in id2label.items()}

In [13]:
def preprocess_data(examples):
    """Tokenize one example and attach its multi-hot label vector.

    This is applied through ``Dataset.map`` without ``batched=True``, so
    ``examples`` is a single record rather than a batch.

    Args:
        examples: One dataset record. Reads ``examples["input"]["form"]``,
            ``examples["input"]["target"]["form"]`` and
            ``examples["output"][name]`` for every name in ``label2id``.

    Returns:
        The tokenizer encoding with an added ``"labels"`` entry: a list of
        ``len(labels)`` floats, 1.0 where the label is set and 0.0 elsewhere.
    """
    sentence = examples["input"]["form"]
    target = examples["input"]["target"]["form"]
    # Encode the sentence / target pair, padded and truncated to a fixed length.
    encoding = tokenizer(sentence, target, padding="max_length", truncation=True, max_length=max_seq_len)

    outputs = examples["output"]
    label_vector = [0.0] * len(labels)
    for name, index in label2id.items():
        value = outputs[name]
        # Accept the string "True" as well as a real boolean, so labels are not
        # silently zeroed if the loader parses the flags as booleans.
        if value is True or value == 'True':
            label_vector[index] = 1.0
    encoding["labels"] = label_vector

    return encoding

In [14]:
# Tokenize and label every example. The original columns are removed, so only
# the fields returned by preprocess_data remain.
encoded_tds = train_ds.map(preprocess_data, remove_columns=train_ds.column_names)
encoded_vds = valid_ds.map(preprocess_data, remove_columns=valid_ds.column_names)

Map:   0%|          | 0/37932 [00:00<?, ? examples/s]

Map:   0%|          | 0/4751 [00:00<?, ? examples/s]

#### 5) 모델 및 트레이너 설정

In [15]:
# Log which checkpoint identifier is about to be loaded.
logger.info(f'[+] Load Model from "{model_path}"')

[2023-10-15 15:02:16,898] [+] Load Model from "klue/roberta-base"


In [16]:
# Load the pretrained checkpoint with a sequence-classification head that has
# one output per label. problem_type marks the task as multi-label, matching
# the float multi-hot "labels" built during preprocessing.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'class

In [17]:
# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best validation "f1" when training finishes.
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=valid_batch_size,
    # Previously declared in the config cell but never applied; passing them
    # here makes edits to those settings take effect.
    gradient_accumulation_steps=accumulate_grad_batches,
    seed=seed,
    num_train_epochs=epochs,
    weight_decay=weight_decay,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [18]:
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, ROC AUC and accuracy for multi-label logits.

    Args:
        predictions: Raw logits, array-like of shape (n_samples, n_labels).
        labels: Ground-truth multi-hot matrix with the same shape.
        threshold: Probability cut-off at or above which a label counts as
            predicted. Only affects the F1 and accuracy values.

    Returns:
        A dict with the keys ``"f1"``, ``"roc_auc"`` and ``"accuracy"``.
    """
    # Sigmoid turns each logit into an independent per-label probability.
    logits = torch.as_tensor(np.asarray(predictions), dtype=torch.float32)
    probs = torch.sigmoid(logits).numpy()
    # Hard 0/1 decisions for the metrics that need them.
    y_pred = (probs >= threshold).astype(np.float64)
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric, so it must be scored on the probabilities.
    # Feeding it thresholded predictions reduces the curve to a single point.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)
    return {
        'f1': f1_micro_average,
        'roc_auc': roc_auc,
        'accuracy': accuracy,
    }

In [19]:
def compute_metrics(p: EvalPrediction):
    """Adapt an ``EvalPrediction`` to ``multi_label_metrics``.

    When ``p.predictions`` is a tuple, only its first element is used as the
    predictions. The result of ``multi_label_metrics`` is returned unchanged.
    """
    raw_predictions = p.predictions
    if isinstance(raw_predictions, tuple):
        raw_predictions = raw_predictions[0]
    return multi_label_metrics(predictions=raw_predictions, labels=p.label_ids)

In [20]:
# Assemble the Trainer from the model, the training configuration, both encoded
# splits, the tokenizer and the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_tds,
    eval_dataset=encoded_vds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [21]:
# free -mh

#### 6) 학습 실행

In [22]:
# Run training with the Trainer configured above.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Save the label -> index mapping as JSON in the output directory.
label2id_path = os.path.join(output_dir, "label2id.json")
with open(label2id_path, "w") as f:
    f.write(json.dumps(label2id))

In [None]:
# rm -rv output checkpoint-1