사용한 모델: KR-BERT_character_sub-character  
모델 repo: https://github.com/snunlp/KR-BERT

In [2]:
import json
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.nn.utils import clip_grad_norm_
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
import pickle
import numpy as np
import pandas as pd
import re
from pathlib import Path
from typing import Union
from transformers import BertModel, BertForPreTraining, BertTokenizer, BertPreTrainedModel, BertConfig

In [13]:
import sklearn
import transformers
print("사용한 라이브러리 버전")
print("-"*22)
print("pytorch:", torch.__version__)
print("transformers:", transformers.__version__)
print("scikit-learn:", sklearn.__version__)
print("numpy:", np.__version__)
print("pandas:", pd.__version__)
print("-"*22)

사용한 라이브러리 버전
----------------------
pytorch: 1.10.2
transformers: 4.16.2
scikit-learn: 1.0.2
numpy: 1.21.5
pandas: 1.4.1
----------------------


In [2]:
class Config:
    """Config class"""

    def __init__(self, json_path_or_dict: Union[str, dict]) -> None:
        """Instantiating Config class
        Args:
            json_path_or_dict (Union[str, dict]): filepath of config or dictionary which has attributes
        """
        if isinstance(json_path_or_dict, dict):
            self.__dict__.update(json_path_or_dict)
        else:
            with open(json_path_or_dict, mode="r") as io:
                params = json.loads(io.read())
            self.__dict__.update(params)

    def save(self, json_path: Union[str, Path]) -> None:
        """Saving config to json_path
        Args:
            json_path (Union[str, Path]): filepath of config
        """
        with open(json_path, mode="w") as io:
            json.dump(self.__dict__, io, indent=4)

    def update(self, json_path_or_dict) -> None:
        """Updating Config instance
        Args:
            json_path_or_dict (Union[str, dict]): filepath of config or dictionary which has attributes
        """
        if isinstance(json_path_or_dict, dict):
            self.__dict__.update(json_path_or_dict)
        else:
            with open(json_path_or_dict, mode="r") as io:
                params = json.loads(io.read())
            self.__dict__.update(params)

    @property
    def dict(self) -> dict:
        return self.__dict__
    
def data_qc(paragrahp:str):
    paragrahp = re.sub(r"(\(.*?\))", "", paragrahp)
    paragrahp = re.sub("((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*", "", paragrahp)
    paragrahp = re.sub("'^[a-zA-Z0-9+-_.]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'", "", paragrahp)
    return paragrahp

### read data

In [3]:
with open("./klue-sts-v1.1/klue-sts-v1.1_train.json", "rt", encoding='utf8') as f:
    data = json.load(f)

In [4]:
data[0]

{'guid': 'klue-sts-v1_train_00000',
 'source': 'airbnb-rtt',
 'sentence1': '숙소 위치는 찾기 쉽고 일반적인 한국의 반지하 숙소입니다.',
 'sentence2': '숙박시설의 위치는 쉽게 찾을 수 있고 한국의 대표적인 반지하 숙박시설입니다.',
 'labels': {'label': 3.7, 'real-label': 3.714285714285714, 'binary-label': 1},
 'annotations': {'agreement': '0:0:0:2:5:0',
  'annotators': ['07', '13', '15', '10', '12', '02', '19'],
  'annotations': [3, 4, 4, 4, 3, 4, 4]}}

In [5]:
shape = np.full([len(data), 3], np.nan)
df = pd.DataFrame(shape, columns=['sentence1', 'sentence2', 'label'])

In [6]:
for idx, el in enumerate(data):
    df.loc[idx] = [el['sentence1'], el['sentence2'], el['labels']['real-label']]

In [7]:
df.loc[7:7]

Unnamed: 0,sentence1,sentence2,label
7,사례집은 국립환경과학원 누리집(ecolibrary.me.go.kr)에서 12일부터 ...,주말을 제외한 평일 오후 12시 30분부터 문예회관 공식 페이스북과 유튜브에서는 지...,0.0


In [8]:
df[['sentence1', 'sentence2']] = df[['sentence1', 'sentence2']].applymap(data_qc)

In [9]:
df.loc[7:7]

Unnamed: 0,sentence1,sentence2,label
7,사례집은 국립환경과학원 누리집에서 12일부터 볼 수 있다.,주말을 제외한 평일 오후 12시 30분부터 문예회관 공식 페이스북과 유튜브에서는 지...,0.0


In [10]:
train, valid = train_test_split(df, test_size=0.2, random_state=42)

In [11]:
del data

### read config files

In [12]:
ptr_dir = "KR-BERT/krbert_pytorch/"

In [13]:
conf_info = Config(f"{ptr_dir}pretrained/config_subchar12367_bert.json")

In [14]:
conf_info.dict

{'config': 'pretrained/bert_config_subchar12367.json',
 'bert': 'pretrained/pytorch_model_subchar12367_bert.bin',
 'tokenizer': 'pretrained/vocab_snu_subchar12367.txt',
 'vocab': 'pretrained/vocab_snu_subchar12367.pkl'}

In [15]:
data_config = Config(f"{ptr_dir}data/config.json")
model_config = Config(f"{ptr_dir}finetuning_config.json")

In [16]:
with open(f"{ptr_dir}{conf_info.vocab}", "rb") as vc:
    vocab = pickle.load(vc)

In [17]:
tokenizer_krbert_sub = BertTokenizer.from_pretrained(f"{ptr_dir}{conf_info.tokenizer}")



In [18]:
tokenizer_krbert_sub.tokenize(df['sentence1'].loc[0])

['숙소',
 '위치',
 '##는',
 '찾기',
 '쉬',
 '##ᆸ',
 '##고',
 '일반적인',
 '한국의',
 '반',
 '##지',
 '##하',
 '숙소',
 '##입니다',
 '.']

In [19]:
def custom_collate_fn(batch):
    input_list, target_list = [], []
    
    for _input, _target in batch:
        input_list.append(_input)
        target_list.append(_target)
    
    tensorized_input = tokenizer_krbert_sub(
        input_list,
        add_special_tokens=True,
        padding="longest",
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )
    
    tensorized_label = torch.tensor(target_list, dtype = torch.float)
    
    return tensorized_input, tensorized_label


class CustomDataset(Dataset):
    """
    - input_data: list of string
    - target_data: list of int
    """

    def __init__(self, input_data:list, target_data:list) -> None:
        self.X = input_data
        self.Y = target_data

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        X = self.X[index]
        Y = self.Y[index]
        return X, Y

In [20]:
class BertSts(BertPreTrainedModel):
    def __init__(self, config) -> None:
        super(BertSts, self).__init__(config)
        self.bert = BertModel(config)
        self.regressor = torch.nn.Sequential(torch.nn.Dropout(config.hidden_dropout_prob),
                                             torch.nn.Linear(config.hidden_size, 1))


    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None):
        # pooled_output is not same hidden vector corresponds to first token from last encoded layers
        
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        
        logits = self.regressor(outputs['pooler_output'])
        return logits

In [21]:
config = BertConfig(conf_info.bert)

In [22]:
config.vocab_size = 12367

In [23]:
model = BertSts(config = config)

In [49]:
model

BertSts(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(12367, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [24]:
weights = torch.load("./torch_model/pytorch_model_subchar12367_bert.bin")

In [25]:
df.loc[7]['sentence1'], df.loc[7]['sentence2']

('사례집은 국립환경과학원 누리집에서 12일부터 볼 수 있다.',
 '주말을 제외한 평일 오후 12시 30분부터 문예회관 공식 페이스북과 유튜브에서는 지역 예술인들이 중심이 된 서양음악, 국악, 댄스 등의 공연을 실시간으로 감상할 수 있다.')

In [26]:
tokenizer_krbert_sub(df.loc[7]['sentence1'], df.loc[7]['sentence2'])

{'input_ids': [2, 2238, 518, 17, 2591, 3641, 3552, 80, 4891, 518, 23, 181, 828, 594, 33, 30, 5, 3, 3141, 6, 3634, 967, 22, 244, 181, 50, 281, 171, 230, 268, 615, 129, 187, 991, 2561, 24, 5641, 282, 463, 2329, 4809, 1818, 12, 698, 7052, 5401, 8, 644, 1329, 8, 5822, 646, 1078, 6, 4998, 28, 1509, 160, 49, 33, 30, 5, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [27]:
param_names = []

for name, param in model.named_parameters():
    param_names.append(name)

In [28]:
from copy import deepcopy

In [29]:
weight_dict = deepcopy(model.state_dict())

In [30]:
for name, weight in weights.items():
    if name in param_names:
        weight_dict[name] = weight

In [31]:
model.load_state_dict(weight_dict)

<All keys matched successfully>

In [32]:
batch_size = 32

In [33]:
train_dataset = CustomDataset(train.iloc[:, :2].values.tolist(), train['label'].tolist())
valid_dataset = CustomDataset(valid.iloc[:, :2].values.tolist(), valid['label'].tolist())

In [34]:
train_dataloader = DataLoader(train_dataset,
                              batch_size = batch_size,
                              sampler = RandomSampler(train_dataset),
                              collate_fn = custom_collate_fn)

valid_dataloader = DataLoader(valid_dataset,
                              batch_size = batch_size,
                              sampler = RandomSampler(valid_dataset),
                              collate_fn = custom_collate_fn)

In [35]:
loss_fct = torch.nn.MSELoss()

def train(model, train_dataloader, valid_dataloader=None, epochs=2):
        global loss_fct, scheduler
        
        for epoch in range(epochs):
            print(f"*****Epoch {epoch} Train Start*****")
            
            total_loss, batch_loss, batch_count = 0,0,0
        
            model.train()
            model.to(device)
            
            for step, batch in enumerate(train_dataloader):
                batch_count+=1
                
                batch = tuple(item.to(device) for item in batch)
            
                batch_input, batch_label = batch
                
                model.zero_grad()
            
                logits = model(**batch_input)

                loss = loss_fct(logits.view(-1), batch_label.view(-1))
                
                batch_loss += loss.item()
                total_loss += loss.item()
            
                loss.backward()
                
                clip_grad_norm_(model.parameters(), 1.0)
                
                optimizer.step()
                scheduler.step()
                
                if (step % 10 == 0 and step != 0):
                    learning_rate = optimizer.param_groups[0]['lr']
                    print(f"Epoch: {epoch}, Step : {step}, LR : {learning_rate}, Avg Loss : {batch_loss / batch_count:.4f}")

                    batch_loss, batch_count = 0,0

            print(f"Epoch {epoch} Total Mean Loss : {total_loss/(step+1):.4f}")
            print(f"*****Epoch {epoch} Train Finish*****\n")
            
            if valid_dataloader is not None:
                print(f"*****Epoch {epoch} Valid Start*****")
                valid_loss, valid_acc = validate(model, valid_dataloader)
                print(f"Epoch {epoch} Valid Loss : {valid_loss:.4f} Valid Acc : {valid_acc:.2f}")
                print(f"*****Epoch {epoch} Valid Finish*****\n")
            
#             save_checkpoint(".", model, optimizer, scheduler, epoch, total_loss/(step+1))
                
        print("Train Completed. End Program.")

In [36]:
def initializer(train_dataloader, epochs=2):
    """
    모델, 옵티마이저, 스케쥴러 초기화
    """

    optimizer = AdamW(
        model.parameters(),
        lr=2e-5,
        eps=1e-8
    )
    
    total_steps = len(train_dataloader) * epochs
    print(f"Total train steps with {epochs} epochs: {total_steps}")

    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps = 0,
        num_training_steps = total_steps
    )

    return optimizer, scheduler

In [37]:
# device type
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"# available GPUs : {torch.cuda.device_count()}")
    print(f"GPU name : {torch.cuda.get_device_name()}")
else:
    device = torch.device("cpu")
print(device)

# available GPUs : 1
GPU name : GeForce RTX 3070
cuda


In [38]:
from transformers import get_linear_schedule_with_warmup, get_constant_schedule

In [39]:
epochs = 2
optimizer, scheduler = initializer(train_dataloader, epochs=epochs)

train(model, train_dataloader, epochs = epochs)

Total train steps with 2 epochs: 584
*****Epoch 0 Train Start*****
Epoch: 0, Step : 10, LR : 1.962328767123288e-05, Avg Loss : 5.4514
Epoch: 0, Step : 20, LR : 1.9280821917808223e-05, Avg Loss : 2.3209
Epoch: 0, Step : 30, LR : 1.8938356164383563e-05, Avg Loss : 1.6471
Epoch: 0, Step : 40, LR : 1.8595890410958907e-05, Avg Loss : 0.6542
Epoch: 0, Step : 50, LR : 1.8253424657534248e-05, Avg Loss : 0.4968
Epoch: 0, Step : 60, LR : 1.791095890410959e-05, Avg Loss : 0.4939
Epoch: 0, Step : 70, LR : 1.7568493150684932e-05, Avg Loss : 0.4392
Epoch: 0, Step : 80, LR : 1.7226027397260273e-05, Avg Loss : 0.4556
Epoch: 0, Step : 90, LR : 1.6883561643835617e-05, Avg Loss : 0.4811
Epoch: 0, Step : 100, LR : 1.654109589041096e-05, Avg Loss : 0.4093
Epoch: 0, Step : 110, LR : 1.61986301369863e-05, Avg Loss : 0.4320
Epoch: 0, Step : 120, LR : 1.5856164383561646e-05, Avg Loss : 0.3005
Epoch: 0, Step : 130, LR : 1.551369863013699e-05, Avg Loss : 0.3374
Epoch: 0, Step : 140, LR : 1.517123287671233e-05, A

In [40]:
def predict(model, test_dataloader):
    """
    test_dataloader의 label별 확률값과 실제 label 값을 반환
    """

    model.eval()
    model.to(device)

    all_logits = []
    all_labels = []

    for step, batch in enumerate(test_dataloader):
        print(f"{step+1}/{len(test_dataloader)}\r", end = "")
        
        batch_input, batch_label = batch
        
        batch_input = batch_input.to(device)
        
        with torch.no_grad():
            logits = model(**batch_input)
            all_logits.append(logits)
        all_labels.extend(batch_label)

    all_logits = torch.cat(all_logits, dim=0)
    probs = torch.tensor(all_logits).cpu().numpy()
    all_labels = np.array(all_labels)

    return probs, all_labels

In [41]:
probs, labels = predict(model, valid_dataloader)

73/73

  probs = torch.tensor(all_logits).cpu().numpy()


In [42]:
for i in range(len(probs)):
    print(f"probs: {probs[i][0]:5.2f},  label: {labels[i]:5.2f},  diff: {(probs[i] - labels[i])[0]:5.2f}")

probs:  0.98,  label:  0.20,  diff:  0.78
probs:  4.00,  label:  3.83,  diff:  0.16
probs:  0.02,  label:  0.00,  diff:  0.02
probs:  4.26,  label:  4.00,  diff:  0.26
probs:  3.42,  label:  2.71,  diff:  0.71
probs: -0.02,  label:  0.00,  diff: -0.02
probs:  0.06,  label:  0.00,  diff:  0.06
probs:  0.08,  label:  0.00,  diff:  0.08
probs:  0.16,  label:  0.33,  diff: -0.18
probs:  0.16,  label:  0.00,  diff:  0.16
probs:  0.17,  label:  0.50,  diff: -0.33
probs:  3.90,  label:  4.14,  diff: -0.24
probs:  0.05,  label:  0.00,  diff:  0.05
probs:  0.53,  label:  0.00,  diff:  0.53
probs:  1.26,  label:  0.83,  diff:  0.43
probs:  0.26,  label:  1.33,  diff: -1.07
probs:  0.68,  label:  0.00,  diff:  0.68
probs:  3.58,  label:  2.86,  diff:  0.72
probs:  0.35,  label:  0.00,  diff:  0.35
probs:  1.62,  label:  2.00,  diff: -0.38
probs:  0.01,  label:  0.14,  diff: -0.13
probs:  1.44,  label:  0.83,  diff:  0.61
probs:  1.88,  label:  1.17,  diff:  0.72
probs:  1.30,  label:  1.60,  diff

probs:  4.59,  label:  4.50,  diff:  0.09
probs:  4.49,  label:  4.20,  diff:  0.29
probs:  0.10,  label:  0.17,  diff: -0.06
probs:  0.61,  label:  0.17,  diff:  0.44
probs:  3.73,  label:  3.67,  diff:  0.06
probs:  4.44,  label:  3.83,  diff:  0.61
probs:  3.69,  label:  3.67,  diff:  0.03
probs:  0.19,  label:  0.86,  diff: -0.67
probs:  4.04,  label:  4.43,  diff: -0.38
probs:  3.18,  label:  3.00,  diff:  0.18
probs:  4.01,  label:  3.43,  diff:  0.58
probs:  4.24,  label:  4.67,  diff: -0.43
probs:  0.09,  label:  0.00,  diff:  0.09
probs:  4.19,  label:  4.00,  diff:  0.19
probs:  4.28,  label:  3.67,  diff:  0.61
probs:  0.16,  label:  0.00,  diff:  0.16
probs:  2.59,  label:  2.83,  diff: -0.24
probs:  2.71,  label:  2.67,  diff:  0.05
probs:  0.30,  label:  0.00,  diff:  0.30
probs:  2.88,  label:  3.67,  diff: -0.78
probs:  4.00,  label:  4.33,  diff: -0.33
probs:  2.22,  label:  2.17,  diff:  0.05
probs:  3.65,  label:  2.60,  diff:  1.05
probs:  3.82,  label:  1.14,  diff

In [43]:
from sklearn.metrics import mean_squared_error

In [44]:
mean_squared_error(probs.flatten(), labels)

0.25823173

In [45]:
s1 = tokenizer_krbert_sub([['코딩 못하는 개발자도 취직 가능할까요?ㅠㅠ', '포도빛 향기에 취해만 가는데']],
                            truncation = True,
                            padding = "longest",
                            max_length=128,
                            return_tensors = "pt")

s2 = tokenizer_krbert_sub([['가스비가 너무 많이 나왔어요.', '가스비가 왜 이렇게 많이 나왔어!']],
                            truncation = True,
                            padding = "longest",
                            max_length=128,
                            return_tensors = "pt")

In [46]:
s1.to(device)
s2.to(device)

{'input_ids': tensor([[   2, 5453, 3213, 1000,  549, 1623, 1844,    5,    3, 5453, 3213, 1464,
         1559,  549, 1623, 2875, 1218,    3]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}

In [47]:
with torch.no_grad():
    print(model(**s1))

tensor([[0.2473]], device='cuda:0')


In [48]:
with torch.no_grad():
    print(model(**s2))

tensor([[4.0776]], device='cuda:0')
