사용한 모델: KR-BERT_character_sub-character  
모델 repo: https://github.com/snunlp/KR-BERT

# 0. 환경설정

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.nn.utils import clip_grad_norm_
from sklearn.model_selection import train_test_split
from torch.optim import AdamW, NAdam
import pickle
import numpy as np
import pandas as pd
import unicodedata
import re
from pathlib import Path
from typing import Union
from transformers import BertModel, BertForPreTraining, BertTokenizer, BertPreTrainedModel, BertConfig
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from scipy.stats import pearsonr
from transformers import get_linear_schedule_with_warmup, get_constant_schedule, get_cosine_with_hard_restarts_schedule_with_warmup
from copy import deepcopy

In [None]:
# Select the compute device once; later cells move the model and batches onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Report what hardware was found so runs can be compared afterwards.
    print(f"# available GPUs : {torch.cuda.device_count()}")
    print(f"GPU name : {torch.cuda.get_device_name()}")
print(device)

# available GPUs : 1
GPU name : GeForce RTX 3070
cuda


# 1. 데이터

In [None]:
class Config:
    """Attribute-style container for settings read from a dict or a JSON file.

    Every top-level key of the source mapping becomes an instance attribute.
    """

    def __init__(self, json_path_or_dict: Union[str, Path, dict]) -> None:
        """Create a Config from a dictionary or from a JSON file.

        Args:
            json_path_or_dict: a dict of attributes, or the path of a JSON file
                whose top-level object holds the attributes.
        """
        # Construction and updating share one loading path so they cannot drift apart.
        self.update(json_path_or_dict)

    def save(self, json_path: Union[str, Path]) -> None:
        """Write all attributes to ``json_path`` as indented JSON.

        Args:
            json_path: destination file path; an existing file is overwritten.
        """
        with open(json_path, mode="w", encoding="utf-8") as io:
            json.dump(self.__dict__, io, indent=4)

    def update(self, json_path_or_dict: Union[str, Path, dict]) -> None:
        """Merge attributes from a dictionary or a JSON file into this instance.

        Existing attributes with the same name are overwritten.

        Args:
            json_path_or_dict: a dict of attributes, or the path of a JSON file
                whose top-level object holds the attributes.
        """
        if isinstance(json_path_or_dict, dict):
            params = json_path_or_dict
        else:
            # JSON is read as UTF-8 explicitly so the result does not depend on
            # the platform's default encoding.
            with open(json_path_or_dict, mode="r", encoding="utf-8") as io:
                params = json.load(io)
        self.__dict__.update(params)

    @property
    def dict(self) -> dict:
        """Return the live attribute dictionary (not a copy)."""
        return self.__dict__


# Patterns are compiled once because data_preproc is applied to every sentence.
# Innermost parenthesised group only; applied repeatedly to peel nested groups.
_PAREN_RE = re.compile(r"\([^()]*\)")
_EMAIL_RE = re.compile(r"[a-zA-Z0-9+\-_.]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+")
_URL_RE = re.compile(
    r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*"
)
# Anything outside this whitelist is replaced by a space.
_DISALLOWED_RE = re.compile(r"[^ .,·?!:'”%/()A-Za-z0-9가-힣+]")


def data_preproc(paragrahp:str):
    """Clean one sentence before tokenization.

    Steps, in order:
    1. Remove parenthesised text together with its parentheses. Each balanced
       group is removed on its own, so text between two groups is kept.
    2. Remove e-mail addresses.
    3. Remove web addresses.
    4. Replace every character outside the whitelist with a space and collapse
       runs of whitespace.
    5. Apply Unicode NFKD normalization (precomposed Hangul syllables become
       decomposed jamo).

    E-mail and URL removal run before the whitelist filter on purpose: the
    filter replaces characters such as ``@``, ``-``, ``_``, ``=`` and ``#``
    with spaces, after which addresses can no longer be recognised in full.

    Args:
        paragrahp: raw sentence text.

    Returns:
        The cleaned, NFKD-normalized sentence.
    """
    # Peel parenthesised groups from the inside out until none is left.
    previous = None
    while previous != paragrahp:
        previous = paragrahp
        paragrahp = _PAREN_RE.sub("", paragrahp)

    paragrahp = _EMAIL_RE.sub("", paragrahp)
    paragrahp = _URL_RE.sub("", paragrahp)

    paragrahp = _DISALLOWED_RE.sub(" ", paragrahp)
    paragrahp = " ".join(paragrahp.split())
    # Normalize last: the whitelist above is written for precomposed Hangul.
    return unicodedata.normalize("NFKD", paragrahp)

# Data augmentation

아래의 마크다운으로 처리된 코드들은 데이터 증강을 위한 코드입니다. 총 3가지 방법으로 데이터를 증강하였습니다. 각각의 방법으로 증강한 데이터를 원본데이터와 함께 사용하여 학습을 시도하였습니다. back translation 방법을 사용하여 데이터를 증강한 방법이 3가지 방법 중 가장 좋은 성능을 보였습니다. 하지만, 원본데이터만 사용한 것과 비교하여 일관되게 성능이 하락하는 양상을 보였고, 최종적으로 원본데이터만 사용하는 것으로 결정하였습니다.

```python
import random
from googletrans import Translator

# KorEDA
# https://github.com/catSirup/KorEDA/tree/master
# 기반 논문: EDA: Easy Data Augmentation Techniques for Boosting Performance on Text Classification Tasks

def random_deletion(words, p):
    """Drop each word independently with probability ``p``.

    Args:
        words: list of tokens.
        p: deletion probability per token.

    Returns:
        The surviving tokens in their original order. Lists with fewer than
        two tokens are returned unchanged (an empty list previously raised
        from ``random.randint(0, -1)``). If every token would be deleted, one
        randomly chosen token is kept so the result is never empty.
    """
    if len(words) <= 1:
        return words

    # One uniform draw per word, in order; a word survives when its draw exceeds p.
    new_words = [word for word in words if random.uniform(0, 1) > p]

    if not new_words:
        rand_int = random.randint(0, len(words)-1)
        return [words[rand_int]]

    return new_words

def random_swap(words, n):
    """Return a copy of ``words`` after ``n`` rounds of ``swap_word``.

    The input list itself is never modified.
    """
    shuffled = words.copy()
    for _round in range(n):
        shuffled = swap_word(shuffled)
    return shuffled

def swap_word(new_words):
    """Swap two distinct, randomly chosen positions of ``new_words`` in place.

    Args:
        new_words: list of tokens; it is mutated and also returned.

    Returns:
        The same list object. Lists with fewer than two tokens are returned
        untouched (an empty list previously raised from
        ``random.randint(0, -1)``). The list is also returned unchanged when
        no second distinct index is drawn within four attempts.
    """
    if len(new_words) < 2:
        # Nothing to swap with.
        return new_words

    random_idx_1 = random.randint(0, len(new_words)-1)
    random_idx_2 = random_idx_1
    counter = 0

    # Redraw until the two indices differ, giving up after a few attempts.
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(new_words)-1)
        counter += 1
        if counter > 3:
            return new_words

    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words

def EDA(sentence, alpha_rs=0.1, p_rd=0.1, num_aug=1):
    """Produce one random-swap and one random-deletion variant of ``sentence``.

    Args:
        sentence: text whose tokens are separated by single spaces.
        alpha_rs: fraction of the token count used as the number of swaps
            (at least one swap is always attempted).
        p_rd: per-token deletion probability.
        num_aug: controls how many times each technique is drawn
            (``int(num_aug / 4) + 1``, at least once). Only the last draw of
            each technique is returned.

    Returns:
        ``(random_swap_sentence, random_deletion_sentence)``. A sentence with
        no tokens yields ``("", "")``.
    """
    # Truthiness, not `is not ""`: identity comparison with a literal is not
    # guaranteed to work and emits a SyntaxWarning on recent Python versions.
    words = [word for word in sentence.split(' ') if word]
    if not words:
        # Nothing to augment; the helpers cannot sample from an empty list.
        return "", ""
    num_words = len(words)

    # At least one draw, so both results are always bound.
    num_new_per_technique = max(1, int(num_aug/4) + 1)

    n_rs = max(1, int(alpha_rs*num_words))

    # rs: random swap
    # NOTE(review): each pass overwrites the previous result, so extra passes
    # only consume random numbers; kept to preserve seeded reproducibility.
    for _ in range(num_new_per_technique):
        rs_result = " ".join(random_swap(words, n_rs))

    # rd: random deletion
    for _ in range(num_new_per_technique):
        rd_result = " ".join(random_deletion(words, p_rd))

    return rs_result, rd_result
```

```python
# Back translation
# 참고 논문: Data expansion using back translation and paraphrasing for hate speech detection

def back_translattion_using_google(text):
    """Paraphrase Korean ``text`` by translating it to English and back to Korean.

    A new ``Translator`` is created on every call.
    """
    client = Translator()
    english = client.translate(text, src="ko", dest="en")
    korean = client.translate(english.text, src="en", dest="ko")
    return korean.text
```

```python
def data_augument(df):
    """Build three augmented copies of a sentence-pair DataFrame.

    Args:
        df: DataFrame with ``sentence1``, ``sentence2`` and ``label`` columns.

    Returns:
        ``(random_swap_df, random_delete_df, back_translate_df)``, each with
        the columns ``sentence1``, ``sentence2`` and ``label``. The first two
        hold one row per input row. ``back_translate_df`` omits pairs whose
        back translation failed.
    """
    columns = ['sentence1', 'sentence2', 'label']

    random_swap_result = []
    random_delete_result = []
    translate_result = []

    rows = zip(df.sentence1.to_list(), df.sentence2.to_list(), df.label.to_list())
    for sent1, sent2, label in rows:
        s1_rs, s1_rd = EDA(sent1)
        s2_rs, s2_rd = EDA(sent2)

        random_swap_result.append([s1_rs, s2_rs, label])
        random_delete_result.append([s1_rd, s2_rd, label])

        try:
            translated_pair = [
                back_translattion_using_google(sent1),
                back_translattion_using_google(sent2),
                label,
            ]
        except Exception as err:
            # Skip only the failing pair. The previous bare `except` reset the
            # whole list, silently discarding every translation gathered so far.
            print(f"back translation failed, pair skipped: {err!r}")
            continue
        translate_result.append(translated_pair)

    random_swap_df = pd.DataFrame(random_swap_result, columns=columns)
    random_delete_df = pd.DataFrame(random_delete_result, columns=columns)
    back_translate_df = pd.DataFrame(translate_result, columns=columns)
    return random_swap_df, random_delete_df, back_translate_df

# NOTE(review): `temp` is not defined in the visible part of this notebook; it is
# presumably the DataFrame to augment (sentence1 / sentence2 / label) — confirm before running.
rs, rd, bt = data_augument(temp)
```

---

# 2. 모델링

In [None]:
# Load the training JSON file; later cells read `sentence1`, `sentence2` and
# `labels['real-label']` from each record of `data`.
with open("./klue-sts-data/klue-sts-v1.1_train.json", "rt", encoding='utf8') as f:
    data = json.load(f)

In [None]:
def custom_collate_fn(batch):
    """Collate ``(input, target)`` samples into tokenized inputs and a float label tensor.

    Tokenization uses the notebook-level ``tokenizer_krbert_sub``; inputs are
    padded to the longest item of the batch and truncated to 128 tokens.

    Returns:
        ``(tensorized_input, tensorized_label)``: the tokenizer output and a
        ``torch.float`` tensor of the targets.
    """
    texts = [sample_input for sample_input, _ in batch]
    scores = [sample_target for _, sample_target in batch]

    encoded = tokenizer_krbert_sub(
        texts,
        add_special_tokens=True,
        padding="longest",
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )

    return encoded, torch.tensor(scores, dtype=torch.float)


class CustomDataset(Dataset):
    """Index-aligned pairing of inputs and targets.

    - input_data: list of model inputs
    - target_data: list of targets, one per input
    """
    def __init__(self, input_data:list, target_data:list) -> None:
        self.X, self.Y = input_data, target_data

    def __len__(self):
        # The dataset size is defined by the inputs.
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index], self.Y[index]

In [None]:
class FCLayer(torch.nn.Module):
    """Dropout, then an optional activation, then a linear projection.

    Note the order: the activation is applied to the input *before* the
    linear layer, not to its output.
    """
    def __init__(self, input_dim, output_dim, dropout_rate=0.0, use_activation=None):
        super().__init__()
        # Attribute names are kept as-is because they determine state_dict keys.
        self.use_activation = use_activation
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.linear = torch.nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        hidden = self.dropout(x)
        if not self.use_activation:
            return self.linear(hidden)
        return self.linear(self.use_activation(hidden))


class BertSts(BertPreTrainedModel):
    """BERT encoder followed by a small head that outputs one score per input.

    The head is ``Dense`` (a GELU-activated ``FCLayer`` of width
    ``hidden_size``) followed by ``output_layer`` (an ``FCLayer`` projecting
    to one value). ``self.layer`` and ``self.Dense[0]`` are the same module
    object.
    """
    def __init__(self, config) -> None:
        super().__init__(config)
        width = config.hidden_size
        self.bert = BertModel(config)
        self.layer = FCLayer(width, width, config.hidden_dropout_prob, torch.nn.GELU())
        self.Dense = torch.nn.Sequential(self.layer)
        self.output_layer = FCLayer(width, 1)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None):
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # The head consumes BERT's pooled output.
        pooled = encoded['pooler_output']
        return self.output_layer(self.Dense(pooled))

---

### 데이터 전처리 및 pandas dataframe으로 변경

In [None]:
shape = np.full([len(data), 3], np.nan)
df = pd.DataFrame(shape, columns=['sentence1', 'sentence2', 'label'])

In [None]:
# Build the frame in one shot. Filling a NaN-initialised float frame row by row
# forces a dtype upcast on assignment (deprecated in recent pandas) and is slow.
df = pd.DataFrame(
    [(el['sentence1'], el['sentence2'], el['labels']['real-label']) for el in data],
    columns=['sentence1', 'sentence2', 'label'],
)

In [None]:
df.loc[7:7]

Unnamed: 0,sentence1,sentence2,label
7,사례집은 국립환경과학원 누리집(ecolibrary.me.go.kr)에서 12일부터 ...,주말을 제외한 평일 오후 12시 30분부터 문예회관 공식 페이스북과 유튜브에서는 지...,0.0


In [None]:
# Clean both sentence columns. Series.map is used per column because
# DataFrame.applymap is deprecated in recent pandas releases.
for column in ('sentence1', 'sentence2'):
    df[column] = df[column].map(data_preproc)

In [None]:
df.loc[7:7]

Unnamed: 0,sentence1,sentence2,label
7,사례집은 국립환경과학원 누리집에서 12...,주말을 제외한 평일 오후 12시 30분부터 무...,0.0


In [None]:
train, valid = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
del data

### read config files

In [None]:
conf_info = Config(f"./config_files/config_subchar12367_bert.json")

In [None]:
with open("./config_files/config_subchar12367_bert.json", "rt", encoding="utf8") as f:
    conf_subchar = json.load(f)

In [None]:
conf_subchar

{'config': 'config_files/bert_config_subchar12367.json',
 'bert': 'torch_model/pytorch_model_subchar12367_bert.bin',
 'tokenizer': 'config_files/vocab_snu_subchar12367.txt',
 'vocab': 'config_files/vocab_snu_subchar12367.pkl'}

In [None]:
tokenizer_krbert_sub = BertTokenizer.from_pretrained(f"{conf_info.tokenizer}")



In [None]:
conf_info.bert

'torch_model/pytorch_model_subchar12367_bert.bin'

In [None]:
# BertConfig's first positional argument is vocab_size, so passing the checkpoint
# path built a default configuration with a string vocab_size. Read the model
# hyper-parameters from the JSON file named by the `config` entry instead.
config = BertConfig.from_json_file(conf_info.config)

In [None]:
config.vocab_size = 12367

In [None]:
model = BertSts(config = config)

In [None]:
# map_location="cpu" lets the checkpoint load on machines that lack the device
# it was saved from; the model is moved to `device` later, when training starts.
weights = torch.load(conf_info.bert, map_location="cpu")

In [None]:
# Names of all parameters of the model, in registration order.
param_names = [name for name, _ in model.named_parameters()]

In [None]:
weight_dict = deepcopy(model.state_dict())

In [None]:
# Copy only checkpoint tensors whose names exist in the model, and fail loudly
# when none match. Otherwise a naming mismatch would leave the encoder randomly
# initialised while the load still succeeds, since `weight_dict` starts as a
# copy of the model's own state.
matched_names = [name for name in weights if name in param_names]
if not matched_names:
    raise RuntimeError("no pretrained tensor name matches the model parameters")
for name in matched_names:
    weight_dict[name] = weights[name]
print(f"Pretrained tensors copied: {len(matched_names)}/{len(param_names)}")

In [None]:
model.load_state_dict(weight_dict)

<All keys matched successfully>

In [None]:
batch_size = 32

### 데이터셋 로더 생성

In [None]:
train_dataset = CustomDataset(train.iloc[:, :2].values.tolist(), train['label'].tolist())
valid_dataset = CustomDataset(valid.iloc[:, :2].values.tolist(), valid['label'].tolist())

In [None]:
# Training batches are drawn in random order.
train_dataloader = DataLoader(train_dataset,
                              batch_size = batch_size,
                              sampler = RandomSampler(train_dataset),
                              collate_fn = custom_collate_fn)

# Evaluation needs no shuffling: the default sequential order keeps runs
# reproducible and keeps predictions aligned with the rows of `valid`.
valid_dataloader = DataLoader(valid_dataset,
                              batch_size = batch_size,
                              collate_fn = custom_collate_fn)

---

# 4. 모델 학습

In [None]:
loss_fct = torch.nn.MSELoss()

def train(model, train_dataloader, valid_dataloader=None, epochs=2):
    """Fine-tune ``model``, optionally validating and early-stopping after each epoch.

    Relies on the notebook-level ``optimizer``, ``scheduler``, ``loss_fct`` and
    ``device``, and on ``validate`` and ``EarlyStopping`` defined in other cells.

    Args:
        model: module called as ``model(**batch_input)``.
        train_dataloader: yields ``(batch_input, batch_label)`` pairs.
        valid_dataloader: optional loader. When given, validation loss is
            computed after every epoch and drives early stopping (patience 5).
            When ``None``, all ``epochs`` are run without validation.
        epochs: maximum number of epochs.
    """
    # NOTE(review): an earlier cell binds the name `train` to the training
    # DataFrame; defining this function rebinds it, so cells that need the
    # DataFrame must run before this one.
    digit = len(str(len(train_dataloader)))
    early_stopping = EarlyStopping(patience = 5)

    model.to(device)

    for epoch in range(epochs):
        print(f"*****Epoch {epoch} Train Start*****")

        total_loss, batch_loss, batch_count = 0, 0, 0
        steps_done = 0

        # Set train mode every epoch, since validation may have switched it off.
        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_count += 1
            steps_done += 1

            batch_input, batch_label = (item.to(device) for item in batch)

            model.zero_grad()

            logits = model(**batch_input)

            loss = loss_fct(logits.view(-1), batch_label.view(-1))

            # Read the scalar once; each .item() call synchronises with the device.
            loss_value = loss.item()
            batch_loss += loss_value
            total_loss += loss_value

            loss.backward()

            clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            if (step % 10 == 0 and step != 0):
                learning_rate = optimizer.param_groups[0]['lr']
                print(f"Epoch: {epoch}, Step: {step:{digit}d}, LR: {learning_rate:.2e}, Avg Loss: {batch_loss / batch_count:.4f}")

                batch_loss, batch_count = 0, 0

        # max(..., 1) keeps the report well-defined for an empty loader.
        print(f"Epoch {epoch} Total Mean Loss : {total_loss/max(steps_done, 1):.4f}")
        print(f"*****Epoch {epoch} Train Finish*****\n")

        if valid_dataloader is not None:
            print(f"*****Epoch {epoch} Valid Start*****")
            valid_loss = validate(model, valid_dataloader)
            print(f"Epoch {epoch} Valid Loss : {valid_loss:.4f}")
            print(f"*****Epoch {epoch} Valid Finish*****\n")

            # Early stopping needs a validation loss. It used to run
            # unconditionally and raised NameError without a validation loader.
            early_stopping(valid_loss)

            if early_stopping.early_stop:
                print('terminating because of early stopping.')
                break

    print("Train Completed. End Program.")

In [None]:
def validate(model, valid_dataloader):
    """Return the mean of the per-batch losses of ``model`` over ``valid_dataloader``.

    Uses the notebook-level ``loss_fct`` and ``device``. The model is left in
    evaluation mode.

    Args:
        model: module called as ``model(**batch_input)``.
        valid_dataloader: yields ``(batch_input, batch_label)`` pairs.

    Returns:
        Sum of batch losses divided by the number of batches (every batch has
        equal weight, whatever its size).

    Raises:
        ValueError: if ``valid_dataloader`` yields no batches.
    """
    model.eval()
    model.to(device)

    total_loss = 0.0
    num_batches = 0

    # No gradients are needed anywhere in evaluation.
    with torch.no_grad():
        for batch in valid_dataloader:
            batch_input, batch_label = (item.to(device) for item in batch)

            logits = model(**batch_input)

            loss = loss_fct(logits.view(-1), batch_label.view(-1))
            total_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        # Previously this case surfaced as an unbound-variable error.
        raise ValueError("valid_dataloader yielded no batches")

    return total_loss / num_batches

In [None]:
class EarlyStopping:
    """Signal that training should stop once validation loss stops improving.

    Call the instance with each epoch's validation loss. ``early_stop``
    becomes True after ``patience`` consecutive calls without an improvement
    of at least ``delta``, and is never reset afterwards.
    """
    def __init__(self, patience=5, verbose=False, delta=0.0001):
        """
        Args:
            patience (int): number of consecutive non-improving calls tolerated
                            before ``early_stop`` is set.
                            Default: 5
            verbose (bool): if True, print a message whenever the validation
                            loss reaches a new minimum.
                            Default: False
            delta (float): minimum decrease of the loss that counts as an
                            improvement.
                            Default: 0.0001
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf
        self.delta = delta


    def __call__(self, val_loss):
        """Record one validation loss and update ``counter`` / ``early_stop``."""

        # Scores are negated losses, so a higher score is better.
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
        elif score < self.best_score + self.delta:
            # Not enough improvement; this message is printed regardless of `verbose`.
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.counter = 0

        if self.verbose and self.val_loss_min > val_loss:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).')
            self.val_loss_min = val_loss

In [None]:
def initializer(train_dataloader, epochs=2):
    """
    Create the optimizer and learning-rate scheduler for the notebook-level ``model``.

    The schedule length is ``len(train_dataloader) * epochs`` steps, with no warm-up.

    Returns:
        ``(optimizer, scheduler)``
    """
    nadam = NAdam(model.parameters(), lr=2e-5, eps=1e-8)

    step_count = len(train_dataloader) * epochs
    print(f"Total train steps with {epochs} epochs: {step_count}")

    cosine_schedule = get_cosine_with_hard_restarts_schedule_with_warmup(
        nadam,
        num_warmup_steps=0,
        num_training_steps=step_count,
    )
    return nadam, cosine_schedule

In [None]:
epochs = 30
optimizer, scheduler = initializer(train_dataloader, epochs = epochs)

train(model, train_dataloader, valid_dataloader, epochs = epochs)

Total train steps with 30 epochs: 8760
*****Epoch 0 Train Start*****
Epoch: 0, Step:  10, LR: 2.00e-05, Avg Loss: 5.3951
Epoch: 0, Step:  20, LR: 2.00e-05, Avg Loss: 2.4244
Epoch: 0, Step:  30, LR: 2.00e-05, Avg Loss: 0.9192
Epoch: 0, Step:  40, LR: 2.00e-05, Avg Loss: 0.5671
Epoch: 0, Step:  50, LR: 2.00e-05, Avg Loss: 0.5098
Epoch: 0, Step:  60, LR: 2.00e-05, Avg Loss: 0.3683
Epoch: 0, Step:  70, LR: 2.00e-05, Avg Loss: 0.4303
Epoch: 0, Step:  80, LR: 2.00e-05, Avg Loss: 0.3909
Epoch: 0, Step:  90, LR: 2.00e-05, Avg Loss: 0.3958
Epoch: 0, Step: 100, LR: 2.00e-05, Avg Loss: 0.3448
Epoch: 0, Step: 110, LR: 2.00e-05, Avg Loss: 0.3736
Epoch: 0, Step: 120, LR: 2.00e-05, Avg Loss: 0.3891
Epoch: 0, Step: 130, LR: 2.00e-05, Avg Loss: 0.2574
Epoch: 0, Step: 140, LR: 2.00e-05, Avg Loss: 0.2906
Epoch: 0, Step: 150, LR: 2.00e-05, Avg Loss: 0.3446
Epoch: 0, Step: 160, LR: 2.00e-05, Avg Loss: 0.2786
Epoch: 0, Step: 170, LR: 2.00e-05, Avg Loss: 0.2194
Epoch: 0, Step: 180, LR: 2.00e-05, Avg Loss: 0.

Epoch: 4, Step: 270, LR: 1.87e-05, Avg Loss: 0.0725
Epoch: 4, Step: 280, LR: 1.87e-05, Avg Loss: 0.0681
Epoch: 4, Step: 290, LR: 1.87e-05, Avg Loss: 0.0776
Epoch 4 Total Mean Loss : 0.0696
*****Epoch 4 Train Finish*****

*****Epoch 4 Valid Start*****
Epoch 4 Valid Loss : 0.2231
*****Epoch 4 Valid Finish*****

EarlyStopping counter: 2 out of 5
*****Epoch 5 Train Start*****
Epoch: 5, Step:  10, LR: 1.86e-05, Avg Loss: 0.0573
Epoch: 5, Step:  20, LR: 1.86e-05, Avg Loss: 0.0544
Epoch: 5, Step:  30, LR: 1.86e-05, Avg Loss: 0.0593
Epoch: 5, Step:  40, LR: 1.86e-05, Avg Loss: 0.0523
Epoch: 5, Step:  50, LR: 1.86e-05, Avg Loss: 0.0590
Epoch: 5, Step:  60, LR: 1.85e-05, Avg Loss: 0.0513
Epoch: 5, Step:  70, LR: 1.85e-05, Avg Loss: 0.0592
Epoch: 5, Step:  80, LR: 1.85e-05, Avg Loss: 0.0505
Epoch: 5, Step:  90, LR: 1.85e-05, Avg Loss: 0.0670
Epoch: 5, Step: 100, LR: 1.85e-05, Avg Loss: 0.0465
Epoch: 5, Step: 110, LR: 1.85e-05, Avg Loss: 0.0516
Epoch: 5, Step: 120, LR: 1.84e-05, Avg Loss: 0.0571
E

# make output

In [None]:
def predict(model, test_dataloader):
    """
    Run ``model`` over ``test_dataloader`` and return its predicted scores with the labels.

    Uses the notebook-level ``device``. The model is left in evaluation mode.

    Args:
        model: module called as ``model(**batch_input)``.
        test_dataloader: yields ``(batch_input, batch_label)`` pairs.

    Returns:
        ``(probs, all_labels)`` as numpy arrays: the model outputs of all
        batches concatenated along the first axis, and the flattened labels in
        the same order. Despite the name, ``probs`` holds raw model outputs,
        not probabilities.
    """

    model.eval()
    model.to(device)

    all_logits = []
    all_labels = []

    with torch.no_grad():
        for step, batch in enumerate(test_dataloader):
            print(f"{step+1}/{len(test_dataloader)}\r", end = "")

            batch_input, batch_label = batch

            batch_input = batch_input.to(device)

            # Move each result to the CPU right away so outputs do not pile up
            # in device memory.
            all_logits.append(model(**batch_input).cpu())
            all_labels.append(batch_label.reshape(-1).cpu())

    # Convert the concatenated tensors directly. Re-wrapping a tensor with
    # torch.tensor(...) copies it and emits a UserWarning.
    probs = torch.cat(all_logits, dim=0).numpy()
    all_labels = torch.cat(all_labels, dim=0).numpy()

    return probs, all_labels

# validationset score

In [None]:
val_probs, val_labels = predict(model, valid_dataloader)

73/73

  probs = torch.tensor(all_logits).cpu().numpy()


In [None]:
print("Pearson r: {:.2f} \nP-value: {:.2e}".format(*pearsonr(val_probs.flatten(), val_labels)))

Pearson r: 0.97 
P-value: 0.00e+00


In [None]:
# f1_score expects (y_true, y_pred): thresholded gold labels first, predictions second.
print("F1 score:", f1_score(np.where(val_labels >= 3, 1, 0), np.where(val_probs.flatten() >= 3, 1, 0)))

F1 score: 0.9551227773073666


# devset score

In [None]:
with open("./klue-sts-data/klue-sts-v1.1_dev.json", "rt", encoding='utf8') as f:
    dev_data = json.load(f)

In [None]:
# Build the dev frame in one shot. Filling a NaN-initialised float frame row by
# row forces a dtype upcast on assignment (deprecated in recent pandas).
dev_df = pd.DataFrame(
    [(el['sentence1'], el['sentence2'], el['labels']['real-label']) for el in dev_data],
    columns=['sentence1', 'sentence2', 'label'],
)

# Same cleaning as the training data. Series.map per column replaces the
# deprecated DataFrame.applymap.
for column in ('sentence1', 'sentence2'):
    dev_df[column] = dev_df[column].map(data_preproc)

In [None]:
dev_dataset = CustomDataset(dev_df.iloc[:, :2].values.tolist(), dev_df['label'].tolist())

# Evaluation needs no shuffling: the default sequential order keeps runs
# reproducible and keeps predictions aligned with the rows of `dev_df`.
dev_dataloader = DataLoader(dev_dataset,
                            batch_size = batch_size,
                            collate_fn = custom_collate_fn)

In [None]:
probs, labels = predict(model, dev_dataloader)

17/17

  probs = torch.tensor(all_logits).cpu().numpy()


In [None]:
print("Pearson r: {:.2f} \nP-value: {:.2e}".format(*pearsonr(probs.flatten(), labels)))

Pearson r: 0.86 
P-value: 3.22e-154


In [None]:
# f1_score expects (y_true, y_pred): thresholded gold labels first, predictions second.
print("F1 score:", f1_score(np.where(labels >= 3, 1, 0), np.where(probs.flatten() >= 3, 1, 0)))

F1 score: 0.7903225806451614
