# CoLA (문법성 판단)

# Data

## imports

In [1]:
import pickle
import pandas as pd
import torch
from pathlib import Path
import multiprocessing
import time
import numpy as np
from tqdm import tqdm
from typing import List
import random
from functools import partial
from itertools import repeat
import re

from Korpora import Korpora

## fetch additional data
추가 학습을 위해 KorSTS와 KorNLI 데이터셋에서 텍스트만 가져온다.

In [2]:
# 추가적으로 사용할 추가 데이터 가져오기
def create_additional_text(name):
    path = Path('.')
    root_dir = path / 'dataset'
    Korpora.fetch(name, root_dir = root_dir)
    corpus = Korpora.load(name)
    additional_text = corpus.get_all_texts()

    temp = []
    for i, text in enumerate(tqdm(additional_text)):
        temp.append(text+'\n')

    additional_text = set(temp)
    del temp
    additional_text = list(additional_text)
    
    new_file = root_dir / 'additional.txt'
    with open(new_file, 'a+', encoding='utf-8') as writer:
        writer.writelines(additional_text)

In [3]:
#create_additional_text("korsts")
#create_additional_text("kornli")

## Preprocessing
1. 추가 데이터셋에서 일부러 조사가 틀린 텍스트를 만들어서 문법이 틀린 데이터도 증강

In [4]:
from pyjosa import josa, jonsung

In [5]:
class SabotageSentence(object):
    def __init__(self, sentence: str):
        self.sentence = sentence
        self.josa_dict = {
            'for_jongsung':['을','은','이','과'], 
            'no_jongsung':['를','는','가','와','나','로','야','랑','며']
        }


    @property
    def get_all_josa(self):
        return self.josa_dict


    def jongsung_wrong_josa(self) -> str:
        new_sent = ''
        for _, word in enumerate(self.sentence.split()):
            tmp_word = word[:-1]
            if word[-1] in self.josa_dict['for_jongsung']:
                tmp_word+=random.choice(self.josa_dict['no_jongsung'])
                new_sent+=tmp_word
                new_sent+=' '
            elif word[-1] in self.josa_dict['no_jongsung']:
                tmp_word+=random.choice(self.josa_dict['for_jongsung'])
                new_sent+=tmp_word
                new_sent+=' '
            else:
                new_sent+=word
                new_sent+=' '

        return new_sent
    


2. 기존 데이터셋을 pandas DataFrame으로 변환

In [6]:
def load_data(filename):
    data_dir = Path("./dataset/cola/") / filename
    dataset = pd.read_csv(
        data_dir, 
        sep="\t", 
        header=0, 
        encoding='utf-8', 
        names=['source', 'acceptability_label', 'source_annotation', 'sentence']
    )
    dataset['label'] = dataset['acceptability_label'].astype(int)

    return dataset


def augment_data_orig(new_data: List[str]):
    tmp_data_holder = {'source':[], 'label':[], 'source_annotation':[], 'sentence':[]}
    for i, row in enumerate(new_data):
        if (re.match('[a-zA-Z]', row) is not None) or (len(row) >= 70) or (len(row) == 0) or (row[-2:]!='.\n'):
            continue
        else:
            tmp_data_holder['source'].append('T'+str(10001+i))
            tmp_data_holder['label'].append(1)
            tmp_data_holder['source_annotation'].append('*')
            assert type(row) == str
            tmp_data_holder['sentence'].append(row.replace('\n',''))

    dataset = pd.DataFrame(tmp_data_holder)
    return dataset


def augment_data(data):
    tmp_data_holder = {'source':[], 'label':[], 'source_annotation':[], 'sentence':[]}
    for _, row in data.iterrows():
        tmp_data_holder['source'].append(row['source'])
        tmp_data_holder['label'].append(0)
        tmp_data_holder['source_annotation'].append(np.NaN)
        
        text = SabotageSentence(row['sentence']).jongsung_wrong_josa()
        tmp_data_holder['sentence'].append(text)

    dataset = pd.DataFrame(tmp_data_holder)
    return dataset

    
def read_txt(path='./dataset/additional.txt') -> List[str]:
    with open(path, 'r+', encoding='utf-8') as reader:
        new_data = reader.readlines()
    
    tmp_list = []
    for text in new_data:
        text.rstrip('\n')
        text.replace('\n','')
        tmp_list.append(text)
    new_data = tmp_list

    return new_data

def multiprocess_aug(orig_dataset, func_name):
    num_process = multiprocessing.cpu_count()
    
    chunk_size = int(orig_dataset.shape[0] / num_process)
    chunks = [orig_dataset.iloc[orig_dataset.index[i:i+chunk_size]] for i in range(0, orig_dataset.shape[0], chunk_size)]
    assert len(chunks) != 0

    with multiprocessing.Pool(processes=num_process) as pool:
        results = pool.map(func_name, chunks)
        
        new_dataset = pd.concat(results)
        dataset = pd.concat([orig_dataset, new_dataset])

    return dataset


def tokenize_datasets(dataset, tokenizer, arch="encoder"):
    sentence = dataset['sentence'].tolist()
    tokenize_sent = tokenizer(
        sentence,
        return_tensors="pt",
        padding = True,
        truncation = True,
        max_length = 200,
        add_special_tokens=True,
        return_token_type_ids = True
    )

    return tokenize_sent

## Custom Dataset Class

In [7]:
class ColaDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_dataset, labels= None, test=False):
        self.tokenized_dataset = tokenized_dataset
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.tokenized_dataset.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

# Model

In [8]:
import torch.nn as nn
from transformers import ElectraModel, ElectraPreTrainedModel

In [9]:
class Electra(ElectraPreTrainedModel):
    def __init__(self, config):
        super(Electra, self).__init__(config)
        self.electra = ElectraModel(config)
        self.num_labels = config.num_labels
        self.linear = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(p=0.1)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
        
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.electra(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        output = outputs[0][:, 0, :]
        output = self.linear(self.dropout(output))
        output = torch.tanh(output)
        logits = self.classifier(output)
        outputs = (logits,) + outputs[2:]

        return outputs


# Loss

In [10]:
class CrossEntropy(nn.Module):
    def __init__(self):
        super(CrossEntropy, self).__init__()
        self.CE = nn.CrossEntropyLoss()
        

    def forward(self, inputs, target):
        """
        :param inputs: predictions
        :param target: target labels
        :return: loss
        """
        loss = self.CE(inputs, target)
        return loss

_criterion_entrypoints = {
    'cross_entropy': CrossEntropy,
}

def criterion_entrypoint(criterion_name):
    return _criterion_entrypoints[criterion_name]

def is_criterion(criterion_name):
    return criterion_name in _criterion_entrypoints

def create_criterion(criterion_name, **kwargs):
    if is_criterion(criterion_name):
        create_fn = criterion_entrypoint(criterion_name)
        criterion = create_fn(**kwargs)
    else:
        raise RuntimeError('Unknown loss (%s)' % criterion_name)
    return criterion

# Utility function(s)

In [11]:
def check_arch(model_type):
  archs = {
    "encoder" : ["Bert", "Electra", "XLMRoberta", "Electra_BoolQ", "Roberta"],
    "encoder-decoder" : ["T5", "Bart", "Bart_BoolQ"]
  }
  for arch in archs:
    if model_type in archs[arch]:
      return arch
  raise ValueError(f"Model [{model_type}] no defined archtecture")

# Training setup

In [12]:
import os
import argparse
from importlib import import_module
import glob
import re
from collections import defaultdict
import time
from time import sleep

from sklearn.metrics import accuracy_score, classification_report

import transformers
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader


## Set seed

In [13]:
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    

## Training utilities

In [14]:
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc}

def output_dir(output_path, exist_ok = False):
    path = Path(output_path)
    if (path.exists() and exist_ok) or (not path.exists()):
        return str(path)
    else:
        dirs = glob.glob(f"{path}*")
        matches = [re.search(rf"%s(\d+)" %path.stem, d) for d in dirs]
        i = [int(m.groups()[0]) for m in matches if m]
        n = max(i) + 1 if i else 2
        return f"{path}{n}"

# Training

In [15]:
def train(args):
    model_dir = args.model_dir
    set_seed(args.seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # tokenizer
    MODEL_NAME = args.pretrained_model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    datasets_ = load_data("./NIKL_CoLA_train.tsv")

    # 아래 코드는 데이터를 증강하는 코드지만 MCC에 도움이 되지 않는 관계로 주석처리함
    # new_data = read_txt()
    # new_data = augment_data_orig(new_data)
    # new_data_corrupt = multiprocess_aug(new_data, augment_data)
    # datasets_ = pd.concat([datasets_, new_data, new_data_corrupt], ignore_index=True)

    # make validation sets from training set
    labels_ = datasets_["label"]
    length = len(labels_)
    kf = args.kfold
    class_indexs = defaultdict(list)
    for i, label_ in enumerate(labels_):
        class_indexs[np.argmax(label_)].append(i)
    val_indices = set()
    for index in class_indexs: 
        val_indices = (val_indices | set(class_indexs[index][int(
            len(class_indexs[index])*(kf-1)/9): int(len(class_indexs[index])*kf/9)]))
    train_indices = set(range(length)) - val_indices

    train_dataset = datasets_.loc[np.array(list(train_indices))]
    val_dataset = datasets_.loc[np.array(list(val_indices))]

    train_label = train_dataset['label'].values
    val_label = val_dataset['label'].values

    tokenized_train = tokenize_datasets(
        train_dataset, tokenizer, check_arch(args.model_type))
    tokenized_val = tokenize_datasets(
        val_dataset, tokenizer, check_arch(args.model_type))

    train_dataset = ColaDataset(tokenized_train, train_label)
    val_dataset = ColaDataset(tokenized_val, val_label)

    train_loader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True,
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=args.valid_batch_size,
        shuffle=False,
        drop_last=False,
    )

    config_module = getattr(import_module(
        "transformers"), args.model_type + "Config")

    model_config = config_module.from_pretrained(MODEL_NAME)
    model_config.num_labels = 2

    model = Electra.from_pretrained(MODEL_NAME, config=model_config)
    model = nn.DataParallel(model)

    model.parameters
    model.to(device)

    save_dir = output_dir(os.path.join(model_dir, args.name, str(args.kfold)))

    for name, param in model.named_parameters():
        if ('cls_fc_layer' not in name) and ('label_classifier' not in name):  # classifier layer
            param.requires_grad = False

    criterion = create_criterion(args.criterion)  # default: cross_entropy
    opt_module = getattr(import_module("transformers"), args.optimizer)
    optimizer = opt_module(
        model.parameters(),
        lr=args.lr,
        weight_decay=args.weight_decay,
        eps=1e-8
    )
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=len(train_loader) * args.epochs,
        last_epoch=- 1
    )

    # logging
    best_val_mcc = -1
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        pbar = tqdm(train_loader, dynamic_ncols=True)
        if epoch == args.freeze_epoch:
            for name, param in model.named_parameters():
                param.requires_grad = True

        model.train()

        loss_value = 0
        matches = 0
        for idx, items in enumerate(pbar):
            item = {key: val.to(device) for key, val in items.items()}

            optimizer.zero_grad()
            outs = model(**item)
            loss = criterion(outs[0], item['labels'])

            preds = torch.argmax(outs[0], dim=-1)

            loss.backward()
            optimizer.step()
            scheduler.step()

            loss_value += loss.item()
            matches += (preds == item['labels']).sum().item()
            if (idx + 1) % args.log_interval == 0:
                train_loss = loss_value / args.log_interval
                train_acc = matches / args.batch_size / args.log_interval
                current_lr = get_lr(optimizer)
                pbar.set_description(
                    f"Epoch: [{epoch}/{args.epochs}]({idx + 1}/{len(train_loader)}) || loss: {train_loss:4.4} || acc: {train_acc:4.2%} || lr {current_lr:4.4}")

                loss_value = 0
                matches = 0

    # validation
    with torch.no_grad():
        pbar = tqdm(val_loader, dynamic_ncols=True)
        print("Calculating validation results...")
        model.eval()
        val_loss_items = []
        val_acc_items = []
        acc_okay = 0
        count_all = 0
        TP = 0
        FP = 0
        TN = 0
        FN = 0
        eps = 1e-9
        for idx, items in enumerate(pbar):
            sleep(0.01)
            item = {key: val.to(device) for key, val in items.items()}

            outs = model(**item)

            preds = torch.argmax(outs[0], dim=-1)
            loss = criterion(outs[0], item['labels']).item()

            acc_item = (item['labels'] == preds).sum().item()

            TRUE = (item['labels'] == preds)
            FALSE = (item['labels'] != preds)

            TP += (TRUE * preds).sum().item()
            TN += (TRUE * (preds == 0)).sum().item()
            FP += (FALSE * preds).sum().item()
            FN += (FALSE * (preds == 0)).sum().item()

            val_loss_items.append(loss)
            val_acc_items.append(acc_item)
            acc_okay += acc_item
            count_all += len(preds)

            # Calculate MCC
            MCC = ((TP*TN) - (FP*FN)) / \
                (((TP+FP+eps)*(TP+FN+eps)*(TN+FP+eps)*(TN+FN+eps))**0.5)

            pbar.set_description(
                f"Epoch: [{epoch}/{args.epochs}]({idx + 1}/{len(val_loader)}) || val_loss: {loss:4.4} || acc: {acc_okay/count_all:4.2%} || MCC: {MCC:4.2%}")

        val_loss = np.sum(val_loss_items) / len(val_loss_items)
        val_acc = acc_okay / count_all

        if MCC > best_val_mcc:
            print(
                f"New best model for val mcc : {MCC:4.2%}! saving the best model..")
            model_to_save = model.module if hasattr(model, "module") else model
            model_to_save.save_pretrained(f"{save_dir}/best")
            torch.save(args, os.path.join(
                f"{save_dir}/best", "training_args.bin"))
            best_val_mcc = MCC

        if val_loss < best_val_loss:
            best_val_loss = val_loss
        print(
            f"[Val] acc : {val_acc:4.2%}, loss: {val_loss:4.4}|| "
            f"best mcc : {best_val_mcc:4.2%}, best loss: {best_val_loss:4.4}|| "
            f"MCC : {MCC:4.2%}|| "
            f"TP:{TP} / TN:{TN} / FP:{FP} / FN:{FN}"
        )

    time.sleep(5)
    torch.cuda.empty_cache()


## Training arguments

In [16]:
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]= "0,1,2,3"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
args = argparse.Namespace(
    seed = 42,
    epochs = 30,
    freeze_epoch=0,
    optimizer = 'AdamW',
    weight_decay = 0.01,
    warmup_steps = 500,
    log_interval = 20,
    kfold = 9,

    criterion = 'cross_entropy',
    dropout_rate = 0.1,
    model_type = "Electra",
    pretrained_model = "tunib/electra-ko-base",
    lr = 4e-6,
    batch_size = 32,
    valid_batch_size = 128,

    val_ratio=0.2,
    name = 'exp',
    model_dir = os.environ.get('SM_MODEL_DIR', './results'),
    custompretrain = ""
)

args.name = f'{args.model_type}_{args.lr}_{args.kfold}'

## Training Results

In [17]:
print('='*40)
print(f"k-fold num : {args.kfold}")
print('='*40)

train(args)

k-fold num : 9


Some weights of the model checkpoint at tunib/electra-ko-base were not used when initializing Electra: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing Electra from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Electra from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Electra were not initialized from the model checkpoint at tunib/electra-ko-base and are newly initialized: ['classifier.bias', 'linear.bias', 'classifier.weight', 'linear.weight']
You should probably TRAIN this model on a down-stream task to be 

Calculating validation results...


Epoch: [29/30](14/14) || val_loss: 0.7995 || acc: 80.67% || MCC: 61.83%: 100%|██████████| 14/14 [00:02<00:00,  6.45it/s]


New best model for val mcc : 61.83%! saving the best model..
[Val] acc : 80.67%, loss: 0.8838|| best mcc : 61.83%, best loss: 0.8838|| MCC : 61.83%|| TP:779 / TN:644 / FP:229 / FN:112


# Evaluation

In [31]:
def evaluate(args):
    tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)

    file = 'NIKL_CoLA_dev.tsv'
    dataset = load_data(file)
    tokenized_test = tokenize_datasets(dataset, tokenizer)
    test_label = dataset['label'].values
    test_dataset = ColaDataset(tokenized_test, test_label)
    
    test_loader = DataLoader(
        test_dataset,
        batch_size=args.test_batch_size,
        shuffle=False
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    model = Electra.from_pretrained(args.model_dir) 
    model.parameters
    model.to(device)
    model.eval()

    pbar = tqdm(test_loader)
    print("Calculating validation results...")
    test_acc_items = []
    acc_okay = 0
    count_all = 0
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    eps = 1e-9

    for idx, items in enumerate(pbar):
        sleep(0.01)

        item = {key: val.to(device) for key, val in items.items()}
        with torch.no_grad():
            outs = model(**item)

        preds = torch.argmax(outs[0], dim=-1)
        labels = item['labels']

        acc_item = (labels == preds).sum().item()

        TRUE = (labels == preds)
        FALSE = (labels != preds)

        TP += (TRUE * preds).sum().item()
        TN += (TRUE * (preds==0)).sum().item()
        FP += (FALSE * preds).sum().item()
        FN += (FALSE * (preds==0)).sum().item()

        MCC = ((TP*TN) - (FP*FN)) / (((TP+FP+eps)*(TP+FN+eps)*(TN+FP+eps)*(TN+FN+eps))**0.5)

        test_acc_items.append(acc_item)
        acc_okay += acc_item
        count_all += len(preds)

        pbar.set_description(f"({idx + 1}/{len(test_loader)}) || acc: {acc_okay/count_all:4.2%} || MCC: {MCC:4.2%}")

    test_acc = acc_okay / count_all

    print(
        f"[Test] acc : {test_acc:4.2%}|| "
        f"MCC : {MCC:4.2%}|| "
        f"TP:{TP} / TN:{TN} / FP:{FP} / FN:{FN}\n"
        f"======================================\n"
        f"Test MCC: {MCC:4.2%}"
    )
    time.sleep(5)
    torch.cuda.empty_cache()

## Evaluation arguments

In [32]:
#eval args
args = argparse.Namespace(
    model_type = "Electra",
    pretrained_model = "tunib/electra-ko-base",

    model_dir = './results/Electra_4e-06_9/97/best',
    criterion = 'cross_entropy',
    num_labels=2,

    test_batch_size=8
)

# Inference

In [33]:
evaluate(args)

(9/254) || acc: 76.39% || MCC: 53.29%:   2%|▏         | 5/254 [00:00<00:05, 49.35it/s]

Calculating validation results...


(254/254) || acc: 75.84% || MCC: 51.64%: 100%|██████████| 254/254 [00:05<00:00, 48.49it/s]


[Test] acc : 75.84%|| MCC : 51.64%|| TP:892 / TN:649 / FP:312 / FN:179
Test MCC: 51.64%
