In [1]:
# HuggingFace_SA_1.ipynb - 3.에서 사용한 라이브러리 그대로 가져옴 

import math
import numpy as np
import pandas as pd
import re
import torch

from datasets import load_dataset
from kobert_tokenizer import KoBERTTokenizer
from sklearn.metrics import (
    confusion_matrix, 
    accuracy_score, 
    roc_auc_score, 
    precision_score, 
    recall_score, 
    f1_score)
from transformers import (
    AdamW, 
    AutoModelForSequenceClassification, 
    DataCollatorWithPadding, 
    Trainer, 
    TrainingArguments
)

# 1. Class weight

## 1) Data Import

In [2]:
# Base name of the pre-split CSV files: ../data_split/{data_name}_train.csv and ..._test.csv
data_name = "sent_merge"
# Checkpoint identifier used for both the tokenizer and the classification model
checkpoint = "skt/kobert-base-v1"
# Seed used for the train/eval shuffle and passed to TrainingArguments
seed = 7353
# Fraction of the shuffled "train" split used for training; the remainder becomes the eval set
train_proportion = 0.7

In [3]:
# Load the tokenizer for the configured checkpoint.
# NOTE(review): AutoTokenizer is not used in the cells visible here; the import
# is kept in case later cells depend on it.
from transformers import AutoTokenizer
tokenizer = KoBERTTokenizer.from_pretrained(checkpoint)

# Load the pre-split CSV files; each key becomes a split of the resulting DatasetDict.
csv_paths = {
    'train': f'../data_split/{data_name}_train.csv',
    'test': f'../data_split/{data_name}_test.csv',
}
dataset = load_dataset('csv', data_files=csv_paths)

Using custom data configuration default-8e80763ffbac1bee
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-8e80763ffbac1bee/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


In [4]:
# Display the loaded splits (feature names and row counts) as a sanity check.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 43853
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 10964
    })
})

## 2) Data Preprocessing

In [5]:
# 1) Drop rows that have a missing value in any required column.
# All columns are checked in a single filter pass instead of scanning every
# split once per column; the surviving rows are the same.
required_columns = ("id", "text", "label")
dataset = dataset.filter(
    lambda row: all(pd.notnull(row[column]) for column in required_columns)
)
print("Drop rows with NA - Done")

# 2) Remove special characters
def remove_sp(example):
    """Strip every character that is not a digit, Hangul, or a space from the text.

    The value under ``example["text"]`` is converted with ``str()`` first, so
    non-string values are handled as their string form.

    Args:
        example: Mapping with a ``"text"`` entry; it is updated in place.

    Returns:
        The same ``example`` object with the cleaned ``"text"`` value.
    """
    # Inside a character class "|" is a literal pipe, not alternation, so it
    # must not appear here; otherwise pipe characters would survive the cleanup.
    example["text"] = re.sub(r'[^0-9ㄱ-ㅎㅏ-ㅣ가-힣 ]+', '', str(example["text"]))
    return example

# Apply remove_sp to the examples of the dataset, one example at a time.
dataset = dataset.map(remove_sp)
print("Remove SP - Done")

# 3) Tokenization
# Re-create the tokenizer (replacing the instance loaded earlier), this time
# passing truncation_side='right' explicitly.
tokenizer = KoBERTTokenizer.from_pretrained(checkpoint, truncation_side = 'right')

def tokenize_fn(dataset):
    """Tokenize one batch of examples with the module-level ``tokenizer``.

    Args:
        dataset: A batch of examples as passed by ``map(..., batched=True)``;
            only its ``"text"`` entry is read. Note that inside this function
            the name shadows the module-level ``dataset``.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        and left unpadded.
    """
    # Padding is intentionally not applied here. Padding inside a batched map
    # stretches every example to the longest text of its (large) map batch and
    # stores those pad tokens in the dataset. The DataCollatorWithPadding used
    # for training pads each training batch to its own longest sequence instead.
    return tokenizer(
        dataset["text"],
        padding=False,
        truncation=True,
        max_length=512,
    )

# Encode in batches, i.e. several texts are tokenized per call to tokenize_fn.
dataset = dataset.map(tokenize_fn, batched=True)
print("Tokenize - Done")

# 4) train / eval / test split
# Shuffle the "train" split once with the fixed seed, then cut it at
# train_proportion: the head becomes the training set, the tail the eval set.
shuffled_train = dataset["train"].shuffle(seed=seed)
n_total = len(shuffled_train)
n_train = math.floor(n_total * train_proportion)
train_dataset = shuffled_train.select(range(n_train))
eval_dataset = shuffled_train.select(range(n_train, n_total))
test_dataset = dataset["test"]
print("Data Split - Done")

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-8e80763ffbac1bee/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-7d46d2613c1bc41a.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-8e80763ffbac1bee/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-05259dc511dece38.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-8e80763ffbac1bee/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-6b41264358b44366.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-8e80763ffbac1bee/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-9830456025cb629c.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-8e80763ffbac1bee/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-0711df7d795adcf4.arrow
Loadi

Drop rows with NA - Done
Remove SP - Done


  0%|          | 0/44 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/csv/default-8e80763ffbac1bee/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-09f689d63cb09e97.arrow


Tokenize - Done
Data Split - Done


## 3) 모델링

- [TrainingArguments](https://huggingface.co/docs/transformers/v4.19.2/en/main_classes/trainer#transformers.TrainingArguments)

In [6]:
# GPU / CPU
# Pick the compute device and report the visible CUDA devices.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    device_count = torch.cuda.device_count()
    print("device_count: {}".format(device_count))
    for device_num in range(device_count):
        print("device {} capability {}".format(
            device_num,
            torch.cuda.get_device_capability(device_num)))
        print("device {} name {}".format(
            device_num,
            torch.cuda.get_device_name(device_num)))
else:
    device = torch.device("cpu")
    # Keep device_count defined on CPU-only hosts as well.
    device_count = 0
    print("no cuda device")

# num_gpus drives `no_cuda=num_gpus <= 0` in the training arguments, so it must
# be 0 when no CUDA device exists instead of being hard-coded to 1.
num_gpus = 1 if device.type == "cuda" else 0

device_count: 1
device 0 capability (8, 6)
device 0 name NVIDIA GeForce RTX 3080


In [7]:
from sklearn.utils.class_weight import compute_class_weight

# Compute per-class loss weights from the training labels using
# scikit-learn's 'balanced' mode, then store them as a float tensor.
train_labels = np.array(train_dataset["label"])
label_ids = np.unique(train_labels)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=label_ids,
    y=train_labels,
)
weights = torch.tensor(class_weights, dtype=torch.float32)
weights

tensor([  0.8477,   0.4418,   0.5646,   0.8285, 128.9790, 182.7202,   1.7690])

In [8]:
# Training configuration for the Trainer.
args = TrainingArguments(
    output_dir="./output",
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    learning_rate=2e-5, # config
    weight_decay=0.1, # config
    adam_beta1=0.9, # config
    adam_beta2=0.9, # config
    adam_epsilon=1.5e-06, # config
    num_train_epochs=10,
    max_steps=-1,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,  # config
    warmup_steps=0,
    logging_dir="./logs",
    save_strategy="steps",
    # Save a checkpoint at every evaluation step. The best model is picked
    # among saved checkpoints, so with the default save interval most of the
    # evaluations (every 50 steps) could never be selected.
    save_steps=50,
    # Bound disk usage now that checkpoints are written more often.
    save_total_limit=2,
    no_cuda=num_gpus <= 0,
    seed=seed,
    # Mixed precision requires a CUDA device; fall back to full precision on CPU.
    fp16=num_gpus > 0 and torch.cuda.is_available(),
    eval_steps = 50,
    load_best_model_at_end=True,
    greater_is_better=True,
    metric_for_best_model="objective", # f1 + acc
    report_to="none",
    skip_memory_metrics=True,
    )

In [9]:
# Model factory, written so the model can be re-initialized every time train()
# is called (so stopping a run midway and running again does not raise an error).
# NOTE(review): a Trainer only calls this on each train() when it is supplied
# as `model_init=`; confirm how the Trainer below receives it.
def _model_init():
    """Build a fresh 7-label sequence classifier from the pretrained ``checkpoint``.

    Returns:
        The model returned by ``AutoModelForSequenceClassification.from_pretrained``,
        configured not to output attentions or hidden states.
    """
    model_kwargs = dict(
        num_labels=7,
        output_attentions=False,
        output_hidden_states=False,
    )
    return AutoModelForSequenceClassification.from_pretrained(checkpoint, **model_kwargs)

In [10]:
# Evaluation metrics
def _compute_metrics(eval_pred):
    """Compute accuracy, weighted F1 and their sum for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with ``"acc_weighted"`` (plain ``accuracy_score``; no weighting is
        applied despite the key name), ``"f1_weighted"`` (``f1_score`` with
        ``average='weighted'``) and ``"objective"`` (the sum of the two).
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predicted_ids)
    weighted_f1 = f1_score(labels, predicted_ids, average='weighted')
    return {
        "acc_weighted": accuracy,
        "f1_weighted": weighted_f1,
        "objective": accuracy + weighted_f1,
    }

In [11]:
# Define the data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # lets padding be performed per batch

In [12]:
# Display the training split (feature names and row count) after preprocessing.
train_dataset

Dataset({
    features: ['attention_mask', 'id', 'input_ids', 'label', 'text', 'token_type_ids'],
    num_rows: 30697
})

In [None]:
# Subclass Trainer as CustomTrainer and override compute_loss with a class-weighted loss
class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted ``torch.nn.MultiMarginLoss``.

    The per-class weights are read from the module-level ``weights`` tensor.
    """

    def compute_loss(self, model, inputs, return_outputs=False):
        """Run a forward pass and compute the weighted multi-class margin loss.

        Args:
            model: The model to run; called as ``model(**inputs)``.
            inputs: Batch mapping; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.

        Returns:
            The loss, or a ``(loss, outputs)`` tuple when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Place the weights on the device the logits are on, rather than on the
        # module-level `device` (always "cuda:0" when CUDA exists), so the loss
        # also works when the Trainer runs the model on a different device.
        class_weight = weights.to(logits.device)
        loss_fct = torch.nn.MultiMarginLoss(weight=class_weight) # multi-class loss function
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Trainer
# The factory is passed as `model_init` instead of a pre-built `model`. The
# Trainer then builds the model itself on every train() call, after applying
# the seed from `args`, so the randomly initialised classification head is
# reproducible and each train() call starts from fresh weights.
trainer = CustomTrainer(
    model_init=_model_init,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=_compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()