## 말뭉치 프로젝트
#### 1.train.py 변환
#### 1) 라이브러리 및 패키지 가져오기

In [None]:
# !pip install transformers
# !pip install datasets
# !pip install accelerate
!pip install tensorflow_addons

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (612 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m612.1/612.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow_addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.21.0 typeguard-2.13.3


In [None]:
# Mount Google Drive (Colab only) so files under /content/drive, including the
# dataset files loaded later in this notebook, are accessible.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys
import os
import json
import logging

import torch
import numpy as np

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    EvalPrediction
)

from datasets import Dataset
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

#### 2) 변수 및 파라미터 설정

In [None]:
# Run configuration for the fine-tuning experiment; edit values here.
output_dir = "/content/output"  # created below; checkpoints and label2id.json are written here
model_path = "klue/roberta-base"  # checkpoint passed to AutoModelForSequenceClassification.from_pretrained
tokenizer_path = "klue/roberta-base"  # checkpoint passed to AutoTokenizer.from_pretrained
max_seq_len = 512  # tokenizer max_length; inputs are padded/truncated to this length
batch_size = 32  # per-device training batch size
valid_batch_size = 64  # per-device evaluation batch size
accumulate_grad_batches = 1  # presumably the gradient-accumulation factor — confirm where it is consumed
epochs = 10  # number of training epochs
learning_rate = 2e-4  # learning rate handed to TrainingArguments
weight_decay = 0.01  # weight decay handed to TrainingArguments
gpus = 0  # NOTE(review): not referenced anywhere in the visible cells
seed = 42  # presumably the random seed — confirm where it is consumed

#### 3) logging 설정 및 데이터 로드

In [None]:
# Dedicated "train" logger that writes timestamped messages to stdout.
# Propagation is disabled so records are not also emitted by ancestor loggers.
logger = logging.getLogger("train")
logger.setLevel(logging.DEBUG)
logger.propagate = False

# Attach the stdout handler only once, so re-running this cell does not
# produce duplicated log lines.
if not logger.handlers:
    handler = logging.StreamHandler(stream=sys.stdout)
    handler.setFormatter(logging.Formatter(fmt="[%(asctime)s] %(message)s"))
    logger.addHandler(handler)

In [None]:
# Ensure the output directory exists before anything is written to it;
# exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)
logger.info(f'[+] Save output to "{output_dir}"')

[2023-10-15 07:15:04,984] [+] Save output to "/content/output"


In [None]:
# Load the train and dev splits from JSON Lines files on the mounted Drive.
# Later cells read each record's "input" and "output" fields.
logger.info(f'[+] Load Dataset')
train_ds = Dataset.from_json("/content/drive/MyDrive/malmungchi/data/nikluge-ea-2023-train.jsonl")
valid_ds = Dataset.from_json("/content/drive/MyDrive/malmungchi/data/nikluge-ea-2023-dev.jsonl")

[2023-10-15 07:15:05,002] [+] Load Dataset


#### 4) 데이터 전처리

In [None]:
# Load the tokenizer for the configured checkpoint.
# The log message previously ended with a stray double quote and omitted the
# path; it now mirrors the format of the model-loading message.
logger.info(f'[+] Load Tokenizer from "{tokenizer_path}"')
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

[2023-10-15 07:15:05,141] [+] Load Tokenizer"


In [None]:
# Label names are the keys of the first training example's "output" mapping;
# their order defines the class indices used by the model.
labels = list(train_ds["output"][0])
# Index -> label name.
id2label = dict(enumerate(labels))
# Label name -> index (inverse of id2label).
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
def preprocess_data(examples):
    """Tokenize one example and attach its multi-hot label vector.

    The sentence (``input.form``) and the target span text
    (``input.target.form``) are encoded together as a text pair, padded or
    truncated to ``max_seq_len``. Despite the parameter name, this function
    indexes ``examples`` as a single record (it is mapped without
    ``batched=True``).

    Returns the tokenizer encoding with an added ``"labels"`` entry: a list
    of floats, one per entry in ``labels``, set to 1.0 where the example's
    ``output`` value for that label is the string ``'True'``.
    """
    source = examples["input"]
    sentence = source["form"]
    target_text = source["target"]["form"]

    encoding = tokenizer(
        sentence,
        target_text,
        padding="max_length",
        truncation=True,
        max_length=max_seq_len,
    )

    # Start from all-negative and switch on each label marked 'True'.
    gold = examples["output"]
    multi_hot = [0.0] * len(labels)
    for name, position in label2id.items():
        if gold[name] == 'True':
            multi_hot[position] = 1.0
    encoding["labels"] = multi_hot

    return encoding

In [None]:
# Apply preprocess_data to every example (one example per call, since
# batched=True is not passed). The original columns are removed so only the
# fields returned by preprocess_data remain.
encoded_tds = train_ds.map(preprocess_data, remove_columns=train_ds.column_names)
encoded_vds = valid_ds.map(preprocess_data, remove_columns=valid_ds.column_names)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

#### 5) 모델 및 트레이너 설정

In [None]:
# Announce which checkpoint is about to be loaded; the load itself happens
# in the following cell.
logger.info(f'[+] Load Model from "{model_path}"')

[2023-10-15 07:15:05,923] [+] Load Model from "klue/roberta-base"


In [None]:
import tensorflow
from tensorflow.keras import optimizers
import tensorflow_addons as tfa

# Load the pretrained encoder with a freshly initialised classification head
# configured for multi-label classification, one output per label.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

# Rectified Adam configuration.
# Fixes applied here:
#   * the learning rate was written as `5.03-5`, a subtraction that evaluates
#     to ~0.03; 5.03e-5 is presumably what was intended — confirm;
#   * the variable was misspelled `optimzer` while being read as `optimizer`;
#   * the deprecated `lr` keyword is replaced by `learning_rate`.
optimizer = tfa.optimizers.RectifiedAdam(
    learning_rate=5.03e-5,
    total_steps=2344 * 4,
    warmup_proportion=0.1,
    min_lr=1e-5,
    epsilon=1e-08,
)

# NOTE(review): the former `model.compile(optimizer=optimizer)` call was
# removed. `model` is a PyTorch module (RobertaForSequenceClassification), and
# compile(optimizer=...) is a Keras API, so the call raised AttributeError.
# As a consequence this Keras optimizer is not consumed by the PyTorch
# Trainer used below; decide whether to drop it or switch to a PyTorch
# optimizer passed to the Trainer.

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(name, **kwargs)


AttributeError: ignored

In [None]:
# Trainer configuration. Evaluation and checkpointing both run once per epoch
# (they must match for load_best_model_at_end), and the checkpoint with the
# best "f1" from compute_metrics is restored when training finishes.
#
# `accumulate_grad_batches` and `seed` from the configuration cell are now
# passed through; previously they were declared but silently ignored. Their
# current values (1 and 42) equal the library defaults, so results are
# unchanged until someone edits them.
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=valid_batch_size,
    gradient_accumulation_steps=accumulate_grad_batches,
    num_train_epochs=epochs,
    weight_decay=weight_decay,
    seed=seed,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [None]:
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, micro ROC AUC and accuracy for multi-label logits.

    Args:
        predictions: Raw model logits; expected shape (batch_size, num_labels).
        labels: Multi-hot ground-truth array with the same shape.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        dict with keys ``'f1'``, ``'roc_auc'`` and ``'accuracy'``. For
        multi-label input, scikit-learn's ``accuracy_score`` is subset
        accuracy (every label of an example must match).
    """
    # Sigmoid turns each logit into an independent per-label probability.
    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()
    # Hard 0/1 decisions are needed for F1 and accuracy.
    y_pred = (probs >= threshold).astype(float)
    y_true = labels

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric, so it is scored on the probabilities.
    # It was previously computed on the thresholded 0/1 predictions, which
    # collapses the curve to a single operating point.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    return {
        'f1': f1_micro_average,
        'roc_auc': roc_auc,
        'accuracy': accuracy,
    }

In [None]:
def compute_metrics(p: EvalPrediction):
    """Metric callback for the Trainer.

    Takes the first element when ``p.predictions`` is a tuple, otherwise uses
    it as-is, and scores it against ``p.label_ids`` with
    ``multi_label_metrics``. Returns that function's metrics dict.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

In [None]:
# Assemble the Trainer from the model, the training configuration, the
# tokenized train/validation sets and the metric callback.
trainer = Trainer(
    model,
    training_args,
    train_dataset=encoded_tds,
    eval_dataset=encoded_vds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# The trailing `model.compile(optimzer)` call was removed: it raised at
# runtime (misspelled name, and `model` is a PyTorch module rather than a
# Keras model). When no optimizer is passed, the Trainer builds its own from
# `training_args` (learning rate and weight decay).

NameError: ignored

#### 6) 학습 실행

In [None]:
# Run fine-tuning. Per training_args, evaluation and checkpoint saving happen
# once per epoch.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.46149,0.666667,0.809524,0.666667
2,No log,0.367923,0.666667,0.809524,0.666667
3,No log,0.311924,0.666667,0.809524,0.666667
4,No log,0.276261,0.666667,0.809524,0.666667
5,No log,0.251027,0.666667,0.809524,0.666667
6,No log,0.230054,0.666667,0.809524,0.666667
7,No log,0.213233,1.0,1.0,1.0
8,No log,0.198542,1.0,1.0,1.0
9,No log,0.188557,1.0,1.0,1.0
10,No log,0.183136,1.0,1.0,1.0


TrainOutput(global_step=10, training_loss=0.33321681022644045, metrics={'train_runtime': 320.2819, 'train_samples_per_second': 0.094, 'train_steps_per_second': 0.031, 'total_flos': 7893756887040.0, 'train_loss': 0.33321681022644045, 'epoch': 10.0})

In [None]:
# Persist the label-name -> index mapping next to the checkpoints so it is
# available alongside the saved model outputs.
label_map_path = os.path.join(output_dir, "label2id.json")
with open(label_map_path, "w") as f:
    f.write(json.dumps(label2id))

In [None]:
# rm -rv output checkpoint-1