# Data

In [1]:
import pickle
import pandas as pd
import torch
from g2pk import G2p
from pathlib import Path
import multiprocessing
import time

g2p = G2p()

## Preprocessing

In [2]:
def load_data(filename):
    data_dir = Path("./dataset/cola/") / filename
    dataset = pd.read_csv(
        data_dir, 
        sep="\t", 
        header=0, 
        encoding='utf-8', 
        names=['source', 'acceptability_label', 'source_annotation', 'sentence']
    )
    dataset['label'] = dataset['acceptability_label'].astype(int)

    return dataset


def augment_data(dataset):
    tmp_data_holder = {'source':[], 'label':[], 'source_annotation':[], 'sentence':[]}
    for _, row in dataset[dataset['label'] == 1].iterrows():
        tmp_data_holder['source'].append(row['source'])
        tmp_data_holder['label'].append(0)
        tmp_data_holder['source_annotation'].append('')
        tmp_data_holder['sentence'].append(g2p(row['sentence']))

    dataset = pd.DataFrame(tmp_data_holder)
    return dataset
    

def multiprocess_aug(dataset):
    num_process = multiprocessing.cpu_count()
    chunk_size = int(dataset.shape[0] / num_process)
    chunks = [dataset.iloc[dataset.index[i:i+chunk_size]] for i in range(0, dataset.shape[0], chunk_size)]
    assert len(chunks) != 0

    pool = multiprocessing.Pool(processes=num_process)
    results = pool.map(augment_data, chunks)
    
    new_dataset = pd.concat(results)
    dataset = pd.concat([dataset, new_dataset])

    pool.close()
    pool.join()

    return dataset


def tokenize_datasets(dataset, tokenizer, arch="encoder"):
    sentence = dataset['sentence'].tolist()
    tokenize_sent = tokenizer(
        sentence,
        return_tensors="pt",
        padding = True,
        truncation = True,
        max_length = 150,
        add_special_tokens=True,
        return_token_type_ids = True
    )

    return tokenize_sent

In [5]:
train_dataset = load_data("./NIKL_CoLA_train.tsv")
train_dataset = multiprocess_aug(train_dataset)
train_dataset

Unnamed: 0,source,acceptability_label,source_annotation,sentence,label
0,T00001,1.0,,높은 달이 떴다.,1
1,T00001,0.0,*,달이 뜸이 높았다.,0
2,T00002,1.0,,실없는 사람이 까불까불한다.,1
3,T00003,1.0,,나는 철수에게 공을 던졌다.,1
4,T00004,1.0,,내가 순이와 둘이서 다툰다.,1
...,...,...,...,...,...
118,T09994,,,밤새 그 수를 다 머건는 게다.,0
119,T09996,,,수호는 지베 아 노지 아낟따.,0
120,T09998,,,철수느 녕이가 아주 어려운 논무느 렬심히 일걷따고 생가캗따.,0
0,T09999,,,선생니미 순히에게 채그 릴께 하시나 순히는 채그 릭찌 안는다.,0


## Custom Dataset Class

In [3]:
class ColaDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_dataset, labels= None, test=False):
        self.tokenized_dataset = tokenized_dataset
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.tokenized_dataset.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

# Model

In [4]:
import torch.nn as nn
from transformers import ElectraModel, ElectraPreTrainedModel

In [20]:
class Electra(ElectraPreTrainedModel):
    def __init__(self, config):
        super(Electra, self).__init__(config)
        self.electra = ElectraModel(config)
        self.num_labels = config.num_labels
        self.linear = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(p=0.1)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
        
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.electra(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        output = outputs[0][:, 0, :]
        output = self.linear(self.dropout(output))
        output = torch.tanh(output)
        logits = self.classifier(output)
        outputs = (logits,) + outputs[2:]

        return outputs


# Loss

In [6]:
class CrossEntropy(nn.Module):
    def __init__(self):
        super(CrossEntropy, self).__init__()
        self.CE = nn.CrossEntropyLoss()
        

    def forward(self, inputs, target):
        """
        :param inputs: predictions
        :param target: target labels
        :return: loss
        """
        # target = torch.argmax(target, dim=-1)
        loss = self.CE(inputs, target)
        return loss

_criterion_entrypoints = {
    'cross_entropy': CrossEntropy,
}

def criterion_entrypoint(criterion_name):
    return _criterion_entrypoints[criterion_name]

def is_criterion(criterion_name):
    return criterion_name in _criterion_entrypoints

def create_criterion(criterion_name, **kwargs):
    if is_criterion(criterion_name):
        create_fn = criterion_entrypoint(criterion_name)
        criterion = create_fn(**kwargs)
    else:
        raise RuntimeError('Unknown loss (%s)' % criterion_name)
    return criterion

# Utility function(s)

In [7]:
def check_arch(model_type):
  archs = {
    "encoder" : ["Bert", "Electra", "XLMRoberta", "Electra_BoolQ", "Roberta"],
    "encoder-decoder" : ["T5", "Bart", "Bart_BoolQ"]
  }
  for arch in archs:
    if model_type in archs[arch]:
      return arch
  raise ValueError(f"Model [{model_type}] no defined archtecture")

# Training setup

In [8]:
import json
import numpy as np
import os
import random
import argparse
from importlib import import_module
import glob
import re
from collections import defaultdict
from tqdm import tqdm
import time
from time import sleep

from sklearn.metrics import accuracy_score, classification_report

import transformers
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader


## Set seed

In [9]:
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    

## Training utilities

In [10]:
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc}

def output_dir(output_path, exist_ok = False):
    path = Path(output_path)
    if (path.exists() and exist_ok) or (not path.exists()):
        return str(path)
    else:
        dirs = glob.glob(f"{path}*")
        matches = [re.search(rf"%s(\d+)" %path.stem, d) for d in dirs]
        i = [int(m.groups()[0]) for m in matches if m]
        n = max(i) + 1 if i else 2
        return f"{path}{n}"

# Training

In [14]:
def train(args):
    model_dir = args.model_dir
    set_seed()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    num_classes = 2

    # tokenizer
    MODEL_NAME=args.pretrained_model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    datasets_ = load_data("./NIKL_CoLA_train.tsv")
    datasets_ = multiprocess_aug(datasets_) # augment training data
    labels_ = datasets_["label"]
    

    length = len(labels_)
    kf = args.kfold
    class_indexs = defaultdict(list)
    for i, label_ in enumerate(labels_):
        class_indexs[np.argmax(label_)].append(i) #  class index [0] = [2,3,5,6], class index[1]=[나머지]
    val_indices = set()
    for index in class_indexs: # stratified: key : 0, 1 classindex[0][0/5:1/5]
        val_indices = (val_indices | set(class_indexs[index][int(len(class_indexs[index])*(kf-1)/9) : int(len(class_indexs[index])*kf/9)]))
    train_indices = set(range(length)) - val_indices

    train_dataset = datasets_.loc[np.array(list(train_indices))]
    val_dataset = datasets_.loc[np.array(list(val_indices))]

    train_label = train_dataset['label'].values
    val_label = val_dataset['label'].values

    tokenized_train = tokenize_datasets(train_dataset, tokenizer, check_arch(args.model_type))
    tokenized_val = tokenize_datasets(val_dataset, tokenizer, check_arch(args.model_type))

    train_dataset = ColaDataset(tokenized_train, train_label)
    val_dataset = ColaDataset(tokenized_val, val_label)

    train_loader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True,
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=args.valid_batch_size,
        shuffle=False,
        drop_last=False,
    )

    config_module = getattr(import_module("transformers"), args.model_type + "Config")
    
    model_config = config_module.from_pretrained(MODEL_NAME)
    model_config.num_labels = 2

    model = Electra.from_pretrained(MODEL_NAME, config=model_config)

    model.parameters
    model.to(device)

    save_dir = output_dir(os.path.join(model_dir, args.name, str(args.kfold)))

    for name, param in model.named_parameters():
        if ('cls_fc_layer' not in name) and ('label_classifier' not in name): # classifier layer
            param.requires_grad = False

    criterion = create_criterion(args.criterion)  # default: cross_entropy
    opt_module = getattr(import_module("transformers"), args.optimizer)
    optimizer = opt_module(
        model.parameters(),
        lr=args.lr,
        weight_decay=args.weight_decay,
        eps = 1e-8
    )
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=args.warmup_steps, 
        num_training_steps=len(train_loader) * args.epochs, 
        last_epoch=- 1
    )   

    ## logging
    start_time = time.time()

    best_val_mcc = -1
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        pbar = tqdm(train_loader, dynamic_ncols=True)
        if epoch == args.freeze_epoch:
            for name, param in model.named_parameters():
                param.requires_grad = True

        model.train()
        
        loss_value = 0
        matches = 0
        for idx, items in enumerate(pbar):
            item = {key: val.to(device) for key, val in items.items()}

            optimizer.zero_grad()
            outs = model(**item)
            loss = criterion(outs[0], item['labels'])

            preds = torch.argmax(outs[0], dim=-1)

            loss.backward()
            optimizer.step()
            scheduler.step()

            loss_value += loss.item()
            matches += (preds == item['labels']).sum().item()
            if (idx + 1) % args.log_interval == 0:
                train_loss = loss_value / args.log_interval
                train_acc = matches / args.batch_size / args.log_interval
                current_lr = get_lr(optimizer)
                pbar.set_description(f"Epoch: [{epoch}/{args.epochs}]({idx + 1}/{len(train_loader)}) || loss: {train_loss:4.4} || acc: {train_acc:4.2%} || lr {current_lr:4.4}")

                loss_value = 0
                matches = 0     

    ## validation
    with torch.no_grad():
        pbar = tqdm(val_loader, dynamic_ncols=True)
        print("Calculating validation results...")
        model.eval()
        val_loss_items = []
        val_acc_items = []
        acc_okay = 0
        count_all = 0
        TP = 0
        FP = 0
        TN = 0
        FN = 0
        eps = 1e-9
        for idx, items in enumerate(pbar):
            sleep(0.01)
            item = {key: val.to(device) for key, val in items.items()}

            outs = model(**item)

            preds = torch.argmax(outs[0], dim=-1)
            loss = criterion(outs[0], item['labels']).item()

            acc_item = (item['labels'] == preds).sum().item()

            TRUE = (item['labels'] == preds)
            FALSE = (item['labels'] != preds)

            TP += (TRUE * preds).sum().item()
            TN += (TRUE * (preds==0)).sum().item()
            FP += (FALSE * preds).sum().item()
            FN += (FALSE * (preds==0)).sum().item()

            val_loss_items.append(loss)
            val_acc_items.append(acc_item)
            acc_okay += acc_item
            count_all += len(preds)

            MCC = ((TP*TN) - (FP*FN)) / (((TP+FP+eps)*(TP+FN+eps)*(TN+FP+eps)*(TN+FN+eps))**0.5)

            pbar.set_description(f"Epoch: [{epoch}/{args.epochs}]({idx + 1}/{len(val_loader)}) || val_loss: {loss:4.4} || acc: {acc_okay/count_all:4.2%} || MCC: {MCC:4.2%}")

        val_loss = np.sum(val_loss_items) / len(val_loss_items)
        val_acc = acc_okay / count_all

        if MCC > best_val_mcc:
            print(f"New best model for val mcc : {MCC:4.2%}! saving the best model..")
            model_to_save = model.module if hasattr(model, "module") else model
            model_to_save.save_pretrained(f"{save_dir}/best")
            torch.save(args, os.path.join(f"{save_dir}/best", "training_args.bin"))
            best_val_mcc = MCC

        if val_loss < best_val_loss:
            best_val_loss = val_loss
        print(
            f"[Val] acc : {val_acc:4.2%}, loss: {val_loss:4.4}|| "
            f"best mcc : {best_val_mcc:4.2%}, best loss: {best_val_loss:4.4}|| "
            f"MCC : {MCC:4.2%}|| "
            f"TP:{TP} / TN:{TN} / FP:{FP} / FN:{FN}"
        )
    
    time.sleep(5)
    torch.cuda.empty_cache()

In [15]:
#training args
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]= "0,1,2,3"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
args = argparse.Namespace(
    seed = 42,
    epochs = 30,
    freeze_epoch=0,
    optimizer = 'AdamW',
    weight_decay = 0.01,
    warmup_steps = 500,
    log_interval = 20,
    kfold = 9,

    criterion = 'cross_entropy',
    dropout_rate = 0.2,
    model_type = "Electra",
    pretrained_model = "monologg/koelectra-base-v3-discriminator",
    lr = 4e-6,
    batch_size = 32,
    valid_batch_size = 128,

    val_ratio=0.2,
    name = 'exp',
    model_dir = os.environ.get('SM_MODEL_DIR', './results'),
    custompretrain = ""
)

args.name = f'{args.model_type}V3_{args.lr}_9k{args.kfold}'

In [None]:
print('='*40)
print(f"k-fold num : {args.kfold}")
print('='*40)

train(args)

# Evaluation

In [17]:
#eval args
args = argparse.Namespace(
    model_type = "Electra",
    pretrained_model = "monologg/koelectra-base-v3-discriminator",
    dropout_rate = 0,
    model_dir = './results/ElectraV3_4e-06_9k9/95/best',
    criterion = 'cross_entropy',
    num_labels=2,

    test_batch_size=8
)

In [21]:
def evaluate(args):
    tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)

    file = 'NIKL_CoLA_dev.tsv'
    dataset = load_data(file)
    tokenized_test = tokenize_datasets(dataset, tokenizer)
    test_label = dataset['label'].values
    test_dataset = ColaDataset(tokenized_test, test_label)
    
    test_loader = DataLoader(
        test_dataset,
        batch_size=args.test_batch_size,
        shuffle=False
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    model = Electra.from_pretrained(args.model_dir) 
    model.parameters
    model.to(device)
    model.eval()

    pbar = tqdm(test_loader)
    print("Calculating validation results...")
    test_loss_items = []
    test_acc_items = []
    acc_okay = 0
    count_all = 0
    # results = []
    # preds = []
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    eps = 1e-9

    for idx, items in enumerate(pbar):
        sleep(0.01)
    #     item = {key: val.to(device) for key, val in items.items()}
    #     with torch.no_grad():
    #         outputs = model(**item)
    #     logits = outputs[0]
    #     m = nn.Softmax(dim=1)
    #     logits = m(logits)
    #     logits = logits.detach().cpu().numpy()   # (Batch_size, 5)  5개의 클래스 확률형태
    #     pred = logits[:,1]
    #     result = np.argmax(logits, axis=-1)
    #     results += result.tolist()
    #     preds += pred.tolist()
    
    # pred_answer = np.array(results).flatten()
    # preds = np.array(preds).flatten()

        item = {key: val.to(device) for key, val in items.items()}
        with torch.no_grad():
            outs = model(**item)

        preds = torch.argmax(outs[0], dim=-1)
        criterion = create_criterion(args.criterion)
        labels = item['labels']
        loss = criterion(outs[0], labels).item()

        acc_item = (labels == preds).sum().item()

        TRUE = (labels == preds)
        FALSE = (labels != preds)

        TP += (TRUE * preds).sum().item()
        TN += (TRUE * (preds==0)).sum().item()
        FP += (FALSE * preds).sum().item()
        FN += (FALSE * (preds==0)).sum().item()

        MCC = ((TP*TN) - (FP*FN)) / (((TP+FP+eps)*(TP+FN+eps)*(TN+FP+eps)*(TN+FN+eps))**0.5)

        test_loss_items.append(loss)
        test_acc_items.append(acc_item)
        acc_okay += acc_item
        count_all += len(preds)

        pbar.set_description(f"({idx + 1}/{len(test_loader)}) || test_loss: {loss:4.4} || acc: {acc_okay/count_all:4.2%} || MCC: {MCC:4.2%}")

        test_loss = np.sum(test_loss_items) / len(test_loss_items)
        test_acc = acc_okay / count_all

        print(
            f"[Val] acc : {test_acc:4.2%}, loss: {test_loss:4.4}|| "
            f"MCC : {MCC:4.2%}|| "
            f"TP:{TP} / TN:{TN} / FP:{FP} / FN:{FN}"
        )

In [22]:
evaluate(args)

Some weights of the model checkpoint at ./results/ElectraV3_4e-06_9k9/95/best were not used when initializing Electra: ['pooling.dense.bias', 'pooling.dense.weight']
- This IS expected if you are initializing Electra from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Electra from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Electra were not initialized from the model checkpoint at ./results/ElectraV3_4e-06_9k9/95/best and are newly initialized: ['linear.bias', 'linear.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/254 [00:00<?, ?it/s]

Calculating validation results...


  0%|          | 0/254 [00:00<?, ?it/s]


AttributeError: 'Namespace' object has no attribute 'criterion'