In [1]:
# ! pip install BackTranslation

## 모듈 임포트

In [2]:
import copy
import glob
import os
import random
import json
import time
import re
from time import sleep
from importlib import import_module
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score
from easydict import EasyDict

import torch
import torch.nn as nn
import transformers
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import (
    BertModel,
    BertPreTrainedModel,
    ElectraModel,
    ElectraPreTrainedModel,
    XLMRobertaModel,
    BartModel,
    BartPretrainedModel,
    T5Model,
    RobertaModel,
)
from transformers import MBartModel, MBartConfig
from transformers import BertTokenizer, BertModel
from BackTranslation import BackTranslation

## Data Augmentation by Backtranslation
---
- Google Translation 사용
- 아래 모듈 설치 필요
- ```bash
pip install BackTranslation
```
- 매우 오래 걸리므로 전처리된 파일 사용

In [3]:
# Locations of the COPA splits, relative to the notebook's working directory:
# the original training TSV, the augmented training TSV written further down,
# and the validation TSV.
original_train_data, augmented_train_data, valid_data = (
    "dataset/copa/SKT_COPA_Train.tsv",
    "dataset/copa/SKT_COPA_Train_aug.tsv",
    "dataset/copa/SKT_COPA_Dev.tsv",
)

In [4]:
# Read the original training split. `header=0` consumes the file's own header row,
# which is then replaced by the explicit column names given here.
dataset = pd.read_csv(
    original_train_data,
    sep="\t",
    header=0,
    names=["ID", "sentence", "question", "1", "2", "answer"],
)

- 아래와 같은 코드를 사용하여 backtranslate하였음

In [5]:
# File holding the cached back-translated sentences (the commented-out cell below
# shows the `torch.save` call that targets this path).
saved_backtranslated = "dataset/copa/en_new_sentences.pth"

In [6]:
# trans = BackTranslation(url=['translate.google.co.kr',])
# def augment_sentence(trans, s, tmp='en'):
#     return trans.translate(s, src='ko', tmp=tmp).result_text
# tmps = ['en']
# new_datasets = dict()
# new_sentences = dict()

# for tmp in tmps:
#     new_dataset = copy.deepcopy(dataset)
#     sentences = new_dataset['sentence'].tolist()
#     new_sentences[tmp] = list()
#     for sent in tqdm(sentences):
#         new_sentences[tmp].append(augment_sentence(trans, sent, tmp=tmp))

# torch.save(new_sentences, saved_backtranslated)

In [7]:
# Load the cached back-translations and keep only the English-pivot list.
# NOTE: `torch.load` unpickles the file, so only load caches from a trusted source.
sent_en = torch.load(saved_backtranslated)
sent = {'en': sent_en['en']}

## 원본 데이터셋

In [8]:
# Preview the first rows of the original training data.
dataset.head()

Unnamed: 0,ID,sentence,question,1,2,answer
0,1,이퀄라이저로 저음 음역대 소리 크기를 키웠다.,결과,베이스 소리가 잘 들리게 되었다.,베이스 소리가 들리지 않게 되었다.,1
1,2,음료에 초콜렛 시럽을 넣었다.,결과,음료수가 더 달아졌다.,음료수가 차가워졌다.,1
2,3,남자는 휴대폰을 호수에 빠뜨렸다.,결과,휴대폰이 업그레이드 되었다.,휴대폰이 고장났다.,2
3,4,옆 집 사람이 이사를 나갔다.,원인,옆 집 사람은 계약이 완료되었다.,옆 집 사람은 계약을 연장했다.,1
4,5,문을 밀었다.,결과,문이 잠겼다.,문이 열렸다.,2


In [9]:
# Build the augmented copy: same questions/choices/answers as the original rows,
# IDs shifted past the original range, and the premise replaced by its back-translation.
new_dataset = dataset.copy(deep=True)
new_dataset['ID'] = new_dataset['ID'] + len(dataset)
new_dataset['sentence'] = sent['en']

## BackTranslate로 augment한 데이터셋

In [10]:
# Preview the first rows of the back-translated copy.
new_dataset.head()

Unnamed: 0,ID,sentence,question,1,2,answer
0,3081,이퀄라이저는베이스 스캔의 사운드를 올렸습니다.,결과,베이스 소리가 잘 들리게 되었다.,베이스 소리가 들리지 않게 되었다.,1
1,3082,나는 음료에 초콜릿 시럽을 넣었다.,결과,음료수가 더 달아졌다.,음료수가 차가워졌다.,1
2,3083,그 남자는 호수에 휴대 전화를 넣었습니다.,결과,휴대폰이 업그레이드 되었다.,휴대폰이 고장났다.,2
3,3084,집 옆에있는 사람이 나갔다.,원인,옆 집 사람은 계약이 완료되었다.,옆 집 사람은 계약을 연장했다.,1
4,3085,나는 문을 밀었다.,결과,문이 잠겼다.,문이 열렸다.,2


## 데이터 합병

In [11]:
# Stack the original rows on top of the back-translated rows.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` produces the same result:
# original rows first, each frame keeping its own index labels (so labels repeat).
# NOTE: this rebinds `new_dataset`; re-running this cell without first re-running the
# cell that builds `new_dataset` would stack the data a second time.
new_dataset = pd.concat([dataset, new_dataset])
new_dataset

Unnamed: 0,ID,sentence,question,1,2,answer
0,1,이퀄라이저로 저음 음역대 소리 크기를 키웠다.,결과,베이스 소리가 잘 들리게 되었다.,베이스 소리가 들리지 않게 되었다.,1
1,2,음료에 초콜렛 시럽을 넣었다.,결과,음료수가 더 달아졌다.,음료수가 차가워졌다.,1
2,3,남자는 휴대폰을 호수에 빠뜨렸다.,결과,휴대폰이 업그레이드 되었다.,휴대폰이 고장났다.,2
3,4,옆 집 사람이 이사를 나갔다.,원인,옆 집 사람은 계약이 완료되었다.,옆 집 사람은 계약을 연장했다.,1
4,5,문을 밀었다.,결과,문이 잠겼다.,문이 열렸다.,2
...,...,...,...,...,...,...
3075,6156,계약자로 일한 남자들은 떠났다.,원인,계약을 연장했다.,계약이 종료되었다.,2
3076,6157,목 마른.,원인,물을 마시지 못했다.,텀블러를 샀다.,1
3077,6158,나는 그 노래를 오랫동안 전화 했어.,결과,목이 아프다.,노래방이 폐업했다.,1
3078,6159,사람들은 한 번 함께 일하고 있습니다.,원인,우리나라 축구팀이 골을 넣었다.,우리나라 축구팀이 경기에서 패배했다.,2


In [12]:
# Write the merged frame as a tab-separated file. No `index=` argument is given, so
# pandas' default applies and the DataFrame index is written as the first column.
new_dataset.to_csv(augmented_train_data, sep='\t')

# COPA 학습 & Inference to json 코드
---

## Transformers의 Wrapper Class와 일부 테스트 모델 선언 및 구현부

In [13]:
class FCLayer(nn.Module):
    """Dropout, then an optional tanh, then a linear projection.

    Args:
        input_dim: feature size expected by the linear layer.
        output_dim: feature size produced by the linear layer.
        dropout_rate: probability passed to ``nn.Dropout``.
        use_activation: when True, tanh is applied before the linear layer.
    """

    def __init__(self, input_dim, output_dim, dropout_rate=0.0, use_activation=True):
        super().__init__()
        self.use_activation = use_activation
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(input_dim, output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Project ``x``; note the activation comes before the linear layer."""
        dropped = self.dropout(x)
        activated = self.tanh(dropped) if self.use_activation else dropped
        return self.linear(activated)


class PoolingHead(nn.Module):
    """Dropout -> linear -> tanh head applied to a pooled hidden state.

    Args:
        input_dim: feature size of the incoming hidden state.
        inner_dim: feature size of the returned representation.
        pooler_dropout: dropout probability applied before the linear layer.
    """

    def __init__(
        self, input_dim: int, inner_dim: int, pooler_dropout: float,
    ):
        super().__init__()
        self.dense = nn.Linear(input_dim, inner_dim)
        self.dropout = nn.Dropout(p=pooler_dropout)

    def forward(self, hidden_states: torch.Tensor):
        """Return ``tanh(dense(dropout(hidden_states)))``."""
        return torch.tanh(self.dense(self.dropout(hidden_states)))


class Bert(BertPreTrainedModel):
    """BERT encoder that scores two candidate sequences with one shared head.

    Each candidate is encoded separately; the hidden state of its first token goes
    through ``pooling`` and ``qa_classifier`` to produce one score, and the two scores
    are concatenated into the returned logits.

    Args:
        config: model configuration; ``hidden_size`` and ``num_labels`` are read.
        args: run configuration. Accepted for a uniform constructor signature; not
            read by this class.
    """

    def __init__(self, config, args):
        super(Bert, self).__init__(config)
        # Builds the encoder from `config` only. This constructor does not load a
        # checkpoint itself; pretrained weights have to come from `from_pretrained`.
        self.bert = BertModel(config=config)

        self.num_labels = config.num_labels

        self.pooling = PoolingHead(
            input_dim=config.hidden_size,
            inner_dim=config.hidden_size,
            pooler_dropout=0.1,
        )
        # One score per candidate; forward() concatenates the two scores.
        self.qa_classifier = nn.Linear(config.hidden_size, self.num_labels - 1)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        input_ids2=None,
        attention_mask2=None,
        token_type_ids2=None,
        labels=None,
    ):
        """Score both candidates.

        ``token_type_ids``, ``token_type_ids2`` and ``labels`` are accepted so a
        tokenizer batch can be passed with ``**batch``, but they are not used here.

        Returns:
            Tuple whose first element is the logits (the two candidate scores
            concatenated along dim 1), followed by ``outputs[2:]`` of the first
            encoder call.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        outputs2 = self.bert(input_ids2, attention_mask=attention_mask2)

        # Hidden state of the first token ([CLS]) of each candidate.
        cls_state = outputs[0][:, 0, :]
        cls_state2 = outputs2[0][:, 0, :]

        logits1 = self.qa_classifier(self.pooling(cls_state))
        logits2 = self.qa_classifier(self.pooling(cls_state2))
        logits = torch.cat([logits1, logits2], dim=1)

        # add hidden states and attention if they are here
        return (logits,) + outputs[2:]


class XLMRoberta(XLMRobertaModel):
    """XLM-RoBERTa encoder that scores two candidate sequences with one shared head.

    Args:
        config: model configuration; ``hidden_size`` and ``num_labels`` are read.
        args: run configuration. If it has a truthy ``pretrained_model`` attribute,
            that checkpoint is loaded; otherwise ``"xlm-roberta-large"`` is used, as
            before.
    """

    # NOTE(review): this class subclasses XLMRobertaModel *and* holds a second
    # XLMRobertaModel in `self.xlmroberta`. The base-class constructor presumably
    # builds its own encoder as well, which forward() never uses — confirm before
    # changing the base class, since that would alter the checkpoint key layout.

    def __init__(self, config, args):
        super(XLMRoberta, self).__init__(config)
        # Prefer the checkpoint named in the run configuration so the encoder
        # weights match the tokenizer/config chosen by the caller.
        checkpoint = getattr(args, "pretrained_model", None) or "xlm-roberta-large"
        self.xlmroberta = XLMRobertaModel.from_pretrained(checkpoint, config=config)

        self.num_labels = config.num_labels

        self.pooling = PoolingHead(
            input_dim=config.hidden_size,
            inner_dim=config.hidden_size,
            pooler_dropout=0.1,
        )
        # One score per candidate; forward() concatenates the two scores.
        self.qa_classifier = nn.Linear(config.hidden_size, self.num_labels - 1)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        input_ids2=None,
        attention_mask2=None,
        token_type_ids2=None,
        labels=None,
    ):
        """Score both candidates.

        ``token_type_ids``, ``token_type_ids2`` and ``labels`` are accepted so a
        tokenizer batch can be passed with ``**batch``, but they are not forwarded
        to the encoder.

        Returns:
            Tuple whose first element is the logits (the two candidate scores
            concatenated along dim 1), followed by ``outputs[2:]`` of the first
            encoder call.
        """
        outputs = self.xlmroberta(input_ids, attention_mask=attention_mask)
        outputs2 = self.xlmroberta(input_ids2, attention_mask=attention_mask2)

        # Hidden state of the first token of each candidate.
        first_token_state = outputs[0][:, 0, :]
        first_token_state2 = outputs2[0][:, 0, :]

        logits1 = self.qa_classifier(self.pooling(first_token_state))
        logits2 = self.qa_classifier(self.pooling(first_token_state2))
        logits = torch.cat([logits1, logits2], dim=1)

        # add hidden states and attention if they are here
        return (logits,) + outputs[2:]


class Electra_BoolQ(ElectraPreTrainedModel):
    """ELECTRA encoder that scores two candidate sequences with one shared head.

    Args:
        config: model configuration; ``hidden_size`` and ``num_labels`` are read.
        args: run configuration. If it has a truthy ``pretrained_model`` attribute,
            that checkpoint is loaded; otherwise
            ``"monologg/koelectra-base-v3-discriminator"`` is used, as before.
    """

    def __init__(self, config, args):
        super(Electra_BoolQ, self).__init__(config)

        self.num_labels = config.num_labels
        # Prefer the checkpoint named in the run configuration so the encoder
        # weights match the tokenizer/config chosen by the caller.
        checkpoint = (
            getattr(args, "pretrained_model", None)
            or "monologg/koelectra-base-v3-discriminator"
        )
        self.model = ElectraModel.from_pretrained(checkpoint, config=config)
        self.pooling = PoolingHead(
            input_dim=config.hidden_size,
            inner_dim=config.hidden_size,
            pooler_dropout=0.1,
        )
        # One score per candidate; forward() concatenates the two scores.
        self.qa_classifier = nn.Linear(config.hidden_size, self.num_labels - 1)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        input_ids2=None,
        attention_mask2=None,
        token_type_ids2=None,
        labels=None,
    ):
        """Score both candidates; unlike the sibling wrappers, segment ids are used.

        ``labels`` is accepted so a batch can be passed with ``**batch`` but is not
        used here.

        Returns:
            Tuple whose first element is the logits (the two candidate scores
            concatenated along dim 1), followed by ``outputs[2:]`` of the first
            encoder call.
        """
        outputs = self.model(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )
        outputs2 = self.model(
            input_ids2, attention_mask=attention_mask2, token_type_ids=token_type_ids2
        )

        # Hidden state of the first token ([CLS]) of each candidate.
        cls_state = outputs[0][:, 0, :]
        cls_state2 = outputs2[0][:, 0, :]

        logits1 = self.qa_classifier(self.pooling(cls_state))
        logits2 = self.qa_classifier(self.pooling(cls_state2))
        logits = torch.cat([logits1, logits2], dim=1)

        # add hidden states and attention if they are here
        # NOTE(review): the slice starts at index 2 as in the BERT wrapper; if
        # ElectraModel returns no pooled output this may skip its first extra
        # element — confirm before relying on these extras.
        return (logits,) + outputs[2:]


class Roberta(RobertaModel):
    """RoBERTa encoder that scores two candidate sequences with one shared head.

    Args:
        config: model configuration; ``hidden_size`` and ``num_labels`` are read.
        args: run configuration. If it has a truthy ``pretrained_model`` attribute,
            that checkpoint is loaded; otherwise ``"klue/roberta-large"`` is used, as
            before.
    """

    # NOTE(review): this class subclasses RobertaModel *and* holds a second
    # RobertaModel in `self.roberta`. The base-class constructor presumably builds
    # its own encoder as well, which forward() never uses — confirm before changing
    # the base class, since that would alter the checkpoint key layout.

    def __init__(self, config, args):
        super(Roberta, self).__init__(config)
        # Prefer the checkpoint named in the run configuration so the encoder
        # weights match the tokenizer/config chosen by the caller.
        checkpoint = getattr(args, "pretrained_model", None) or "klue/roberta-large"
        self.roberta = RobertaModel.from_pretrained(checkpoint, config=config)

        self.num_labels = config.num_labels

        self.pooling = PoolingHead(
            input_dim=config.hidden_size,
            inner_dim=config.hidden_size,
            pooler_dropout=0.1,
        )
        # One score per candidate; forward() concatenates the two scores.
        self.qa_classifier = nn.Linear(config.hidden_size, self.num_labels - 1)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        input_ids2=None,
        attention_mask2=None,
        token_type_ids2=None,
        labels=None,
    ):
        """Score both candidates.

        ``token_type_ids``, ``token_type_ids2`` and ``labels`` are accepted so a
        tokenizer batch can be passed with ``**batch``, but they are not forwarded
        to the encoder.

        Returns:
            Tuple whose first element is the logits (the two candidate scores
            concatenated along dim 1), followed by ``outputs[2:]`` of the first
            encoder call.
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        outputs2 = self.roberta(input_ids2, attention_mask=attention_mask2)

        # Hidden state of the first token of each candidate.
        first_token_state = outputs[0][:, 0, :]
        first_token_state2 = outputs2[0][:, 0, :]

        logits1 = self.qa_classifier(self.pooling(first_token_state))
        logits2 = self.qa_classifier(self.pooling(first_token_state2))
        logits = torch.cat([logits1, logits2], dim=1)

        # add hidden states and attention if they are here
        return (logits,) + outputs[2:]


## 데이터 전처리부
---

In [14]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset that pairs pre-tokenized fields with their labels.

    Args:
        tokenized_dataset: mapping from field name to an indexable batch of values.
        labels: indexable sequence of labels; its length defines the dataset size.
    """

    def __init__(self, tokenized_dataset, labels):
        self.tokenized_dataset = tokenized_dataset
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every tokenized field at ``idx`` plus a ``"labels"`` tensor."""
        sample = {}
        for field, column in self.tokenized_dataset.items():
            sample[field] = column[idx]
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample
    
def load_data(dataset_dir):
    """Read a COPA TSV file and build the two candidate sentence pairs per row.

    For a row whose ``question`` is "결과" the premise (prefixed with "[결과]") is the
    first sentence of both pairs and each choice is the second sentence. For every
    other row each choice (prefixed with "[원인]") is the first sentence and the
    premise is the second.

    Args:
        dataset_dir: path of the tab-separated file. Its header row is discarded and
            replaced by the column names ID, sentence, question, 1, 2, answer.

    Returns:
        DataFrame with the file's columns plus ``label`` (``answer`` - 1) and
        ``new_sentence1_1``, ``new_sentence1_2``, ``new_sentence2_1``,
        ``new_sentence2_2``.
    """
    dataset = pd.read_csv(
        dataset_dir,
        delimiter="\t",
        names=["ID", "sentence", "question", "1", "2", "answer"],
        header=0,
    )
    dataset["label"] = dataset["answer"].astype(int) - 1

    new_sentence1_1 = []
    new_sentence1_2 = []
    new_sentence2_1 = []
    new_sentence2_2 = []
    # Walk the columns in lockstep instead of building a row Series several times
    # per row with `iloc`.
    for s, q, s1, s2 in zip(
        dataset["sentence"], dataset["question"], dataset["1"], dataset["2"]
    ):
        if q == "결과":
            new_sentence1_1.append("[결과]" + s)
            new_sentence1_2.append(s1)
            new_sentence2_1.append("[결과]" + s)
            new_sentence2_2.append(s2)
        else:
            new_sentence1_1.append("[원인]" + s1)
            new_sentence1_2.append(s)
            new_sentence2_1.append("[원인]" + s2)
            new_sentence2_2.append(s)

    dataset["new_sentence1_1"] = new_sentence1_1
    dataset["new_sentence1_2"] = new_sentence1_2
    dataset["new_sentence2_1"] = new_sentence2_1
    dataset["new_sentence2_2"] = new_sentence2_2

    return dataset


def tokenized_dataset(dataset, tokenizer, arch="encoder"):
    """Tokenize both candidate sentence pairs of every row.

    Args:
        dataset: frame with the columns ``new_sentence1_1``, ``new_sentence1_2``,
            ``new_sentence2_1`` and ``new_sentence2_2``.
        tokenizer: callable invoked once per candidate with two lists of sentences
            and the keyword options below.
        arch: accepted for interface compatibility; not used by this function.

    Returns:
        The first candidate's encoding, extended with the second candidate's fields
        under the same keys suffixed with "2".
    """
    first_a = dataset["new_sentence1_1"].tolist()
    first_b = dataset["new_sentence1_2"].tolist()
    second_a = dataset["new_sentence2_1"].tolist()
    second_b = dataset["new_sentence2_2"].tolist()

    # Both candidates are encoded with exactly the same options.
    encode_options = dict(
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=150,
        add_special_tokens=True,
        return_token_type_ids=True,
    )
    encoded = tokenizer(first_a, first_b, **encode_options)
    encoded_second = tokenizer(second_a, second_b, **encode_options)

    for key, value in encoded_second.items():
        encoded[key + "2"] = value

    return encoded


## 트레이닝

In [15]:
def check_arch(model_type):
    """Return the architecture family ("encoder" or "encoder-decoder") of a model type.

    Raises:
        ValueError: if ``model_type`` is not listed under any family.
    """
    families = (
        ("encoder", ("Bert", "Electra", "XLMRoberta", "Electra_BoolQ", "Roberta")),
        ("encoder-decoder", ("T5", "Bart", "Bart_BoolQ")),
    )
    family = next(
        (label for label, members in families if model_type in members), None
    )
    if family is None:
        raise ValueError(f"Model [{model_type}] no defined archtecture")
    return family


def seed_everything(seed):
    """Seed torch (CPU and CUDA), numpy and ``random``, and make cuDNN deterministic."""
    for seeder in (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # if use multi-GPU
    ):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)

def get_lr(optimizer):
    """Return the ``'lr'`` of the optimizer's first parameter group (None if it has none)."""
    missing = object()
    first_group = next(iter(optimizer.param_groups), missing)
    if first_group is missing:
        return None
    return first_group['lr']

def compute_metrics(pred):
    """Compute accuracy from an object exposing ``label_ids`` and ``predictions``.

    The predicted class is the argmax over the last axis of ``pred.predictions``.

    Returns:
        Dict with the single key ``'accuracy'``.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    # accuracy is delegated to sklearn's accuracy_score
    return {'accuracy': accuracy_score(gold, predicted)}

def increment_output_dir(output_path, exist_ok=False):
    """Return an output path that does not collide with an existing one.

    If ``output_path`` does not exist, or ``exist_ok`` is true, it is returned
    unchanged. Otherwise a number is appended: one more than the highest number
    already used by a sibling named ``<name><digits>``, or 2 when there is none.

    Args:
        output_path: desired output path.
        exist_ok: when true, reuse ``output_path`` even if it already exists.

    Returns:
        The chosen path as a string. Nothing is created on disk.
    """
    path = Path(output_path)
    if exist_ok or not path.exists():
        return str(path)

    # Compare only the final path component, and escape it: the previous pattern
    # used the un-escaped stem and searched the whole path, so names containing a
    # dot (e.g. "run_0.5") never matched and digits in parent folders could match.
    numbered = re.compile(re.escape(path.name) + r"(\d+)")
    used = []
    for candidate in glob.glob(f"{glob.escape(str(path))}*"):
        found = numbered.fullmatch(Path(candidate).name)
        if found:
            used.append(int(found.group(1)))
    n = max(used) + 1 if used else 2
    return f"{path}{n}"

def _save_best_checkpoint(model, save_dir, args):
    """Write the model and the run arguments to ``<save_dir>/best``."""
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(f"{save_dir}/best")
    torch.save(args, os.path.join(f"{save_dir}/best", "training_args.bin"))


def train(model_dir, args):
    """Fine-tune the model selected by ``args.model_type`` on the augmented COPA data.

    Reads the module-level paths ``augmented_train_data`` and ``valid_data``, trains
    with cross-entropy loss, validates after every epoch, and writes TensorBoard
    logs, ``config.json`` and the best checkpoint (by validation accuracy) into an
    auto-incremented directory below ``model_dir``.

    Args:
        model_dir: root directory for run outputs.
        args: attribute-style run configuration. Fields read here: seed,
            pretrained_model, model_type, batch_size, valid_batch_size, name, kfold,
            optimizer, lr, weight_decay, warmup_steps, epochs, freeze_epoch,
            log_interval.

    Returns:
        The model as it is after the last executed epoch (not necessarily the
        weights stored in the ``best`` checkpoint).

    Raises:
        ValueError: if ``args.model_type`` is unknown to ``check_arch`` or does not
            name a class defined in this module.
    """
    seed_everything(args.seed)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"device(GPU) : {torch.cuda.is_available()}")
    num_classes = 2

    # load tokenizer
    MODEL_NAME = args.pretrained_model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    train_frame = load_data(augmented_train_data)
    val_frame = load_data(valid_data)

    train_label = train_frame["label"].values
    val_label = val_frame["label"].values

    # tokenizing dataset
    arch = check_arch(args.model_type)
    tokenized_train = tokenized_dataset(train_frame, tokenizer, arch)
    tokenized_val = tokenized_dataset(val_frame, tokenizer, arch)

    # make dataset for pytorch.
    train_dataset = CustomDataset(tokenized_train, train_label)
    val_dataset = CustomDataset(tokenized_val, val_label)
    # -- data_loader
    train_loader = DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True,
    )
    val_loader = DataLoader(
        val_dataset, batch_size=args.valid_batch_size, shuffle=False, drop_last=False,
    )

    # setting model hyperparameter
    if args.model_type == "Electra_BoolQ":
        # Resolved through the `transformers` module: the bare name `ElectraConfig`
        # is not imported by this notebook and raised NameError.
        config_module = transformers.ElectraConfig
    else:
        config_module = getattr(transformers, args.model_type + "Config")

    model_config = config_module.from_pretrained(MODEL_NAME)
    model_config.num_labels = num_classes

    # Look the wrapper class up by name instead of eval()-ing a config string.
    model_module = globals().get(args.model_type)
    if not isinstance(model_module, type):
        raise ValueError(f"Model class [{args.model_type}] is not defined")

    # Wrappers that only build their encoder from the config must be created through
    # from_pretrained to receive pretrained weights. "Bert" is the actual class name;
    # the former check for "BERT" could never match it.
    if args.model_type in ("Bert", "BERT", "Electra"):
        model = model_module.from_pretrained(
            MODEL_NAME, config=model_config, args=args
        )
    else:
        model = model_module(config=model_config, args=args)

    model.to(device)
    save_dir = increment_output_dir(os.path.join(model_dir, args.name, str(args.kfold)))
    os.makedirs(save_dir, exist_ok=True)

    # Freeze everything except the classification head until `args.freeze_epoch`.
    # "pooling" / "qa_classifier" are the head modules of the wrappers defined in this
    # notebook; the two legacy markers are kept for other models. Without a matching
    # marker every parameter was frozen, so any freeze_epoch > 0 failed in backward().
    head_markers = ("cls_fc_layer", "label_classifier", "pooling", "qa_classifier")
    for name, param in model.named_parameters():
        if not any(marker in name for marker in head_markers):
            param.requires_grad = False

    # -- loss & metric
    criterion = nn.CrossEntropyLoss()

    # Newer transformers releases no longer ship their own AdamW; fall back to the
    # optimizer of the same name in torch.optim.
    opt_module = getattr(transformers, args.optimizer, None)
    if opt_module is None:
        opt_module = getattr(torch.optim, args.optimizer)
    optimizer = opt_module(
        model.parameters(), lr=args.lr, weight_decay=args.weight_decay, eps=1e-8
    )
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=len(train_loader) * args.epochs,
        last_epoch=-1,
    )

    # -- logging
    start_time = time.time()
    logger = SummaryWriter(log_dir=save_dir)
    try:
        with open(os.path.join(save_dir, "config.json"), "w", encoding="utf-8") as f:
            json.dump(vars(args), f, ensure_ascii=False, indent=4)

        best_val_acc = 0
        best_val_loss = np.inf
        # NOTE(review): hard stop after epoch index 24 inherited from the original
        # code. When it triggers, the *current* model overwrites the "best"
        # checkpoint — confirm this is intended.
        max_epoch_index = 24
        for epoch in range(args.epochs):
            # train loop
            # unFreeze parameters
            if epoch == args.freeze_epoch:
                for param in model.parameters():
                    param.requires_grad = True
            model.train()
            loss_value = 0
            matches = 0
            for idx, items in enumerate(train_loader):
                item = {key: val.to(device) for key, val in items.items()}

                optimizer.zero_grad()
                outs = model(**item)
                loss = criterion(outs[0], item["labels"])

                preds = torch.argmax(outs[0], dim=-1)

                loss.backward()
                optimizer.step()
                scheduler.step()

                loss_value += loss.item()
                matches += (preds == item["labels"]).sum().item()
                if (idx + 1) % args.log_interval == 0:
                    # Averages over the last `log_interval` batches; every batch is
                    # full-sized because the loader uses drop_last=True.
                    train_loss = loss_value / args.log_interval
                    train_acc = matches / args.batch_size / args.log_interval
                    current_lr = get_lr(optimizer)
                    print(
                        f"Epoch[{epoch}/{args.epochs}]({idx + 1}/{len(train_loader)}) || "
                        f"training loss {train_loss:4.4} || training accuracy {train_acc:4.2%} || lr {current_lr}"
                    )

                    global_step = epoch * len(train_loader) + idx
                    logger.add_scalar("Train/loss", train_loss, global_step)
                    logger.add_scalar("Train/accuracy", train_acc, global_step)
                    logger.add_scalar("LR", current_lr, global_step)

                    loss_value = 0
                    matches = 0

            # val loop
            with torch.no_grad():
                print("Calculating validation results...")
                model.eval()
                val_loss_items = []
                acc_okay = 0
                count_all = 0
                for idx, items in enumerate(tqdm(val_loader)):
                    sleep(0.01)
                    item = {key: val.to(device) for key, val in items.items()}

                    outs = model(**item)

                    preds = torch.argmax(outs[0], dim=-1)
                    loss = criterion(outs[0], item["labels"]).item()

                    acc_item = (item["labels"] == preds).sum().item()

                    val_loss_items.append(loss)
                    acc_okay += acc_item
                    count_all += len(preds)

                # Loss is the mean of per-batch means; accuracy is exact over samples.
                val_loss = np.sum(val_loss_items) / len(val_loss_items)
                val_acc = acc_okay / count_all

                if val_acc > best_val_acc:
                    print(
                        f"New best model for val acc : {val_acc:4.2%}! saving the best model.."
                    )
                    _save_best_checkpoint(model, save_dir, args)
                    best_val_acc = val_acc

                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                print(
                    f"[Val] acc : {val_acc:4.2%}, loss: {val_loss:4.4}|| "
                    f"best acc : {best_val_acc:4.2%}, best loss: {best_val_loss:4.4}"
                )

                logger.add_scalar("Val/loss", val_loss, epoch)
                logger.add_scalar("Val/accuracy", val_acc, epoch)
                s = f"Time elapsed: {(time.time() - start_time)/60: .2f} min"
                print(s)
                print()
                if epoch > max_epoch_index:
                    _save_best_checkpoint(model, save_dir, args)
                    break
    finally:
        # Flush and release the TensorBoard event file even if training fails.
        logger.close()
    return model

## Training Configuration
---
1. Roberta-large pretrained model 사용하여 fine-tune
2. 10 epoch 내외로 수렴하는 것을 확인하여 15 epoch만 학습함

In [16]:
# Set the tokenizers parallelism flag before any tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Run configuration consumed by `train`.
args = EasyDict({
    "epochs": 15,
    "model_type": "Roberta",
    "pretrained_model": "klue/roberta-large",
    "lr": 8e-6,
    "batch_size": 32,
    "freeze_epoch": 0,
    "valid_batch_size": 128,
    "val_ratio": 0.2,
    "dropout_rate": 0.1,
    "criterion": 'cross_entropy',
    "optimizer": 'AdamW',
    "weight_decay": 0.01,
    "warmup_steps": 500,
    "seed": 42,
    "log_interval": 20,
    "kfold": 1,
    "model_dir": "./copa_data_results/results",
})

# Run name, derived from the model type and learning rate.
args.name = f'TrainAll_{args.model_type}_{args.lr}'

## Training

In [17]:
# Run fine-tuning; outputs are written below `args.model_dir` and the trained model is returned.
model = train(args.model_dir, args)

device(GPU) : True


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

Epoch[0/15](20/192) || training loss 0.6938 || training accuracy 50.62% || lr 3.2e-07
Epoch[0/15](40/192) || training loss 0.6936 || training accuracy 48.44% || lr 6.4e-07
Epoch[0/15](60/192) || training loss 0.6956 || training accuracy 47.34% || lr 9.6e-07
Epoch[0/15](80/192) || training loss 0.6906 || training accuracy 52.81% || lr 1.28e-06
Epoch[0/15](100/192) || training loss 0.6913 || training accuracy 54.22% || lr 1.6e-06
Epoch[0/15](120/192) || training loss 0.6918 || training accuracy 53.59% || lr 1.92e-06
Epoch[0/15](140/192) || training loss 0.6883 || training accuracy 56.41% || lr 2.24e-06
Epoch[0/15](160/192) || training loss 0.6875 || training accuracy 54.53% || lr 2.56e-06
Epoch[0/15](180/192) || training loss 0.6704 || training accuracy 61.09% || lr 2.88e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

New best model for val acc : 78.80%! saving the best model..
[Val] acc : 78.80%, loss: 0.4907|| best acc : 78.80%, best loss: 0.4907
Time elapsed:  1.08 min

Epoch[1/15](20/192) || training loss 0.4782 || training accuracy 77.50% || lr 3.392e-06
Epoch[1/15](40/192) || training loss 0.381 || training accuracy 84.06% || lr 3.712e-06
Epoch[1/15](60/192) || training loss 0.4154 || training accuracy 82.19% || lr 4.032e-06
Epoch[1/15](80/192) || training loss 0.3368 || training accuracy 85.62% || lr 4.352e-06
Epoch[1/15](100/192) || training loss 0.3439 || training accuracy 85.47% || lr 4.6719999999999995e-06
Epoch[1/15](120/192) || training loss 0.3751 || training accuracy 82.97% || lr 4.992e-06
Epoch[1/15](140/192) || training loss 0.2885 || training accuracy 88.28% || lr 5.312e-06
Epoch[1/15](160/192) || training loss 0.2783 || training accuracy 86.72% || lr 5.632e-06
Epoch[1/15](180/192) || training loss 0.3064 || training accuracy 87.81% || lr 5.952e-06
Calculating validation results...

  0%|          | 0/4 [00:00<?, ?it/s]

New best model for val acc : 91.20%! saving the best model..
[Val] acc : 91.20%, loss: 0.2792|| best acc : 91.20%, best loss: 0.2792
Time elapsed:  2.26 min

Epoch[2/15](20/192) || training loss 0.1712 || training accuracy 93.28% || lr 6.464e-06
Epoch[2/15](40/192) || training loss 0.1538 || training accuracy 92.97% || lr 6.784e-06
Epoch[2/15](60/192) || training loss 0.1435 || training accuracy 94.38% || lr 7.104e-06
Epoch[2/15](80/192) || training loss 0.1839 || training accuracy 92.50% || lr 7.424e-06
Epoch[2/15](100/192) || training loss 0.1547 || training accuracy 94.38% || lr 7.743999999999999e-06
Epoch[2/15](120/192) || training loss 0.1975 || training accuracy 92.81% || lr 7.986554621848738e-06
Epoch[2/15](140/192) || training loss 0.1413 || training accuracy 95.00% || lr 7.919327731092437e-06
Epoch[2/15](160/192) || training loss 0.141 || training accuracy 95.16% || lr 7.852100840336134e-06
Epoch[2/15](180/192) || training loss 0.1112 || training accuracy 95.47% || lr 7.784873

  0%|          | 0/4 [00:00<?, ?it/s]

New best model for val acc : 91.60%! saving the best model..
[Val] acc : 91.60%, loss: 0.3272|| best acc : 91.60%, best loss: 0.2792
Time elapsed:  3.43 min

Epoch[3/15](20/192) || training loss 0.06428 || training accuracy 97.34% || lr 7.677310924369748e-06
Epoch[3/15](40/192) || training loss 0.066 || training accuracy 97.97% || lr 7.610084033613444e-06
Epoch[3/15](60/192) || training loss 0.06924 || training accuracy 97.97% || lr 7.542857142857142e-06
Epoch[3/15](80/192) || training loss 0.05448 || training accuracy 98.28% || lr 7.47563025210084e-06
Epoch[3/15](100/192) || training loss 0.05794 || training accuracy 97.97% || lr 7.408403361344538e-06
Epoch[3/15](120/192) || training loss 0.0687 || training accuracy 97.66% || lr 7.341176470588234e-06
Epoch[3/15](140/192) || training loss 0.04584 || training accuracy 99.06% || lr 7.273949579831932e-06
Epoch[3/15](160/192) || training loss 0.03607 || training accuracy 99.06% || lr 7.20672268907563e-06
Epoch[3/15](180/192) || training lo

  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 90.60%, loss: 0.4094|| best acc : 91.60%, best loss: 0.2792
Time elapsed:  4.46 min

Epoch[4/15](20/192) || training loss 0.04117 || training accuracy 98.59% || lr 7.031932773109243e-06
Epoch[4/15](40/192) || training loss 0.03569 || training accuracy 98.75% || lr 6.964705882352941e-06
Epoch[4/15](60/192) || training loss 0.02685 || training accuracy 99.06% || lr 6.897478991596638e-06
Epoch[4/15](80/192) || training loss 0.03934 || training accuracy 98.12% || lr 6.830252100840335e-06
Epoch[4/15](100/192) || training loss 0.02512 || training accuracy 99.22% || lr 6.763025210084033e-06
Epoch[4/15](120/192) || training loss 0.02224 || training accuracy 99.38% || lr 6.695798319327731e-06
Epoch[4/15](140/192) || training loss 0.02148 || training accuracy 99.69% || lr 6.628571428571428e-06
Epoch[4/15](160/192) || training loss 0.02032 || training accuracy 99.06% || lr 6.5613445378151255e-06
Epoch[4/15](180/192) || training loss 0.02397 || training accuracy 99.38% || lr 6.49411764

  0%|          | 0/4 [00:00<?, ?it/s]

New best model for val acc : 92.00%! saving the best model..
[Val] acc : 92.00%, loss: 0.3837|| best acc : 92.00%, best loss: 0.2792
Time elapsed:  5.64 min

Epoch[5/15](20/192) || training loss 0.02074 || training accuracy 99.06% || lr 6.386554621848739e-06
Epoch[5/15](40/192) || training loss 0.01713 || training accuracy 99.53% || lr 6.319327731092436e-06
Epoch[5/15](60/192) || training loss 0.02199 || training accuracy 99.22% || lr 6.252100840336134e-06
Epoch[5/15](80/192) || training loss 0.02969 || training accuracy 98.75% || lr 6.184873949579832e-06
Epoch[5/15](100/192) || training loss 0.01875 || training accuracy 99.69% || lr 6.1176470588235285e-06
Epoch[5/15](120/192) || training loss 0.0256 || training accuracy 98.91% || lr 6.0504201680672265e-06
Epoch[5/15](140/192) || training loss 0.02173 || training accuracy 99.69% || lr 5.9831932773109244e-06
Epoch[5/15](160/192) || training loss 0.01657 || training accuracy 99.53% || lr 5.9159663865546215e-06
Epoch[5/15](180/192) || tra

  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 91.20%, loss: 0.4203|| best acc : 92.00%, best loss: 0.2792
Time elapsed:  6.66 min

Epoch[6/15](20/192) || training loss 0.01329 || training accuracy 99.69% || lr 5.741176470588235e-06
Epoch[6/15](40/192) || training loss 0.02177 || training accuracy 99.69% || lr 5.6739495798319324e-06
Epoch[6/15](60/192) || training loss 0.01304 || training accuracy 99.69% || lr 5.6067226890756295e-06
Epoch[6/15](80/192) || training loss 0.009519 || training accuracy 99.84% || lr 5.5394957983193275e-06
Epoch[6/15](100/192) || training loss 0.01774 || training accuracy 99.53% || lr 5.472268907563025e-06
Epoch[6/15](120/192) || training loss 0.01284 || training accuracy 99.69% || lr 5.4050420168067225e-06
Epoch[6/15](140/192) || training loss 0.008861 || training accuracy 99.69% || lr 5.33781512605042e-06
Epoch[6/15](160/192) || training loss 0.009139 || training accuracy 99.84% || lr 5.2705882352941176e-06
Epoch[6/15](180/192) || training loss 0.01358 || training accuracy 99.53% || lr 5.20

  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 91.80%, loss: 0.4458|| best acc : 92.00%, best loss: 0.2792
Time elapsed:  7.69 min

Epoch[7/15](20/192) || training loss 0.01839 || training accuracy 99.22% || lr 5.0957983193277305e-06
Epoch[7/15](40/192) || training loss 0.01006 || training accuracy 99.69% || lr 5.0285714285714285e-06
Epoch[7/15](60/192) || training loss 0.009921 || training accuracy 99.84% || lr 4.9613445378151256e-06
Epoch[7/15](80/192) || training loss 0.006946 || training accuracy 99.84% || lr 4.8941176470588235e-06
Epoch[7/15](100/192) || training loss 0.01625 || training accuracy 99.53% || lr 4.826890756302521e-06
Epoch[7/15](120/192) || training loss 0.009226 || training accuracy 99.69% || lr 4.7596638655462185e-06
Epoch[7/15](140/192) || training loss 0.01174 || training accuracy 99.69% || lr 4.692436974789916e-06
Epoch[7/15](160/192) || training loss 0.005835 || training accuracy 100.00% || lr 4.625210084033614e-06
Epoch[7/15](180/192) || training loss 0.007438 || training accuracy 99.84% || lr 

  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 91.80%, loss: 0.4472|| best acc : 92.00%, best loss: 0.2792
Time elapsed:  8.72 min

Epoch[8/15](20/192) || training loss 0.006374 || training accuracy 100.00% || lr 4.4504201680672266e-06
Epoch[8/15](40/192) || training loss 0.006707 || training accuracy 99.84% || lr 4.3831932773109245e-06
Epoch[8/15](60/192) || training loss 0.006601 || training accuracy 99.84% || lr 4.315966386554622e-06
Epoch[8/15](80/192) || training loss 0.007672 || training accuracy 99.84% || lr 4.2487394957983195e-06
Epoch[8/15](100/192) || training loss 0.005372 || training accuracy 99.84% || lr 4.181512605042017e-06
Epoch[8/15](120/192) || training loss 0.008952 || training accuracy 99.69% || lr 4.114285714285714e-06
Epoch[8/15](140/192) || training loss 0.006693 || training accuracy 99.69% || lr 4.047058823529412e-06
Epoch[8/15](160/192) || training loss 0.004281 || training accuracy 100.00% || lr 3.979831932773109e-06
Epoch[8/15](180/192) || training loss 0.003682 || training accuracy 100.00% ||

  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 92.00%, loss: 0.483|| best acc : 92.00%, best loss: 0.2792
Time elapsed:  9.74 min

Epoch[9/15](20/192) || training loss 0.002338 || training accuracy 100.00% || lr 3.805042016806722e-06
Epoch[9/15](40/192) || training loss 0.003144 || training accuracy 100.00% || lr 3.73781512605042e-06
Epoch[9/15](60/192) || training loss 0.001843 || training accuracy 100.00% || lr 3.670588235294117e-06
Epoch[9/15](80/192) || training loss 0.009822 || training accuracy 99.53% || lr 3.603361344537815e-06
Epoch[9/15](100/192) || training loss 0.007631 || training accuracy 99.69% || lr 3.5361344537815122e-06
Epoch[9/15](120/192) || training loss 0.00453 || training accuracy 100.00% || lr 3.46890756302521e-06
Epoch[9/15](140/192) || training loss 0.002018 || training accuracy 100.00% || lr 3.4016806722689073e-06
Epoch[9/15](160/192) || training loss 0.005038 || training accuracy 100.00% || lr 3.3344537815126052e-06
Epoch[9/15](180/192) || training loss 0.01228 || training accuracy 99.69% || l

  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 90.40%, loss: 0.4537|| best acc : 92.00%, best loss: 0.2792
Time elapsed:  10.77 min

Epoch[10/15](20/192) || training loss 0.001722 || training accuracy 100.00% || lr 3.159663865546218e-06
Epoch[10/15](40/192) || training loss 0.004434 || training accuracy 100.00% || lr 3.092436974789916e-06
Epoch[10/15](60/192) || training loss 0.004272 || training accuracy 99.84% || lr 3.0252100840336132e-06
Epoch[10/15](80/192) || training loss 0.002358 || training accuracy 100.00% || lr 2.9579831932773108e-06
Epoch[10/15](100/192) || training loss 0.001988 || training accuracy 100.00% || lr 2.8907563025210083e-06
Epoch[10/15](120/192) || training loss 0.005524 || training accuracy 99.84% || lr 2.823529411764706e-06
Epoch[10/15](140/192) || training loss 0.001401 || training accuracy 100.00% || lr 2.7563025210084033e-06
Epoch[10/15](160/192) || training loss 0.005311 || training accuracy 99.84% || lr 2.689075630252101e-06
Epoch[10/15](180/192) || training loss 0.002814 || training accur

  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 90.80%, loss: 0.4928|| best acc : 92.00%, best loss: 0.2792
Time elapsed:  11.79 min

Epoch[11/15](20/192) || training loss 0.006685 || training accuracy 99.53% || lr 2.5142857142857142e-06
Epoch[11/15](40/192) || training loss 0.001531 || training accuracy 100.00% || lr 2.4470588235294118e-06
Epoch[11/15](60/192) || training loss 0.002856 || training accuracy 99.84% || lr 2.3798319327731093e-06
Epoch[11/15](80/192) || training loss 0.004037 || training accuracy 99.84% || lr 2.312605042016807e-06
Epoch[11/15](100/192) || training loss 0.003251 || training accuracy 100.00% || lr 2.2453781512605043e-06
Epoch[11/15](120/192) || training loss 0.007113 || training accuracy 99.84% || lr 2.1781512605042014e-06
Epoch[11/15](140/192) || training loss 0.001861 || training accuracy 100.00% || lr 2.110924369747899e-06
Epoch[11/15](160/192) || training loss 0.001811 || training accuracy 100.00% || lr 2.0436974789915965e-06
Epoch[11/15](180/192) || training loss 0.001926 || training accu

  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 90.80%, loss: 0.4789|| best acc : 92.00%, best loss: 0.2792
Time elapsed:  12.82 min

Epoch[12/15](20/192) || training loss 0.002779 || training accuracy 99.84% || lr 1.86890756302521e-06
Epoch[12/15](40/192) || training loss 0.00307 || training accuracy 100.00% || lr 1.8016806722689076e-06
Epoch[12/15](60/192) || training loss 0.003593 || training accuracy 99.84% || lr 1.734453781512605e-06
Epoch[12/15](80/192) || training loss 0.003506 || training accuracy 100.00% || lr 1.6672268907563026e-06
Epoch[12/15](100/192) || training loss 0.00425 || training accuracy 100.00% || lr 1.6e-06
Epoch[12/15](120/192) || training loss 0.001967 || training accuracy 100.00% || lr 1.5327731092436974e-06
Epoch[12/15](140/192) || training loss 0.004414 || training accuracy 99.84% || lr 1.4655462184873948e-06
Epoch[12/15](160/192) || training loss 0.002093 || training accuracy 100.00% || lr 1.3983193277310923e-06
Epoch[12/15](180/192) || training loss 0.005751 || training accuracy 99.84% || lr

  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 91.40%, loss: 0.479|| best acc : 92.00%, best loss: 0.2792
Time elapsed:  13.85 min

Epoch[13/15](20/192) || training loss 0.001776 || training accuracy 100.00% || lr 1.2235294117647059e-06
Epoch[13/15](40/192) || training loss 0.001028 || training accuracy 100.00% || lr 1.1563025210084034e-06
Epoch[13/15](60/192) || training loss 0.002657 || training accuracy 100.00% || lr 1.0890756302521007e-06
Epoch[13/15](80/192) || training loss 0.0009199 || training accuracy 100.00% || lr 1.0218487394957982e-06
Epoch[13/15](100/192) || training loss 0.004904 || training accuracy 99.84% || lr 9.546218487394957e-07
Epoch[13/15](120/192) || training loss 0.001452 || training accuracy 100.00% || lr 8.873949579831932e-07
Epoch[13/15](140/192) || training loss 0.002073 || training accuracy 100.00% || lr 8.201680672268907e-07
Epoch[13/15](160/192) || training loss 0.002645 || training accuracy 100.00% || lr 7.529411764705882e-07
Epoch[13/15](180/192) || training loss 0.00177 || training accu

  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 91.00%, loss: 0.4957|| best acc : 92.00%, best loss: 0.2792
Time elapsed:  14.88 min

Epoch[14/15](20/192) || training loss 0.0006829 || training accuracy 100.00% || lr 5.781512605042017e-07
Epoch[14/15](40/192) || training loss 0.005888 || training accuracy 99.84% || lr 5.109243697478991e-07
Epoch[14/15](60/192) || training loss 0.004824 || training accuracy 99.69% || lr 4.436974789915966e-07
Epoch[14/15](80/192) || training loss 0.001067 || training accuracy 100.00% || lr 3.764705882352941e-07
Epoch[14/15](100/192) || training loss 0.003102 || training accuracy 100.00% || lr 3.0924369747899157e-07
Epoch[14/15](120/192) || training loss 0.001379 || training accuracy 100.00% || lr 2.4201680672268904e-07
Epoch[14/15](140/192) || training loss 0.001097 || training accuracy 100.00% || lr 1.7478991596638653e-07
Epoch[14/15](160/192) || training loss 0.002399 || training accuracy 100.00% || lr 1.0756302521008403e-07
Epoch[14/15](180/192) || training loss 0.0005987 || training ac

  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 91.00%, loss: 0.4968|| best acc : 92.00%, best loss: 0.2792
Time elapsed:  15.91 min



## Inference
---
- `target_dir`(Best Val Accuracy model의 경로)를 상황에 맞게 수정해야 함.

In [18]:
# Restore the fine-tuned checkpoint for inference; use the first GPU when one is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Checkpoint directory to evaluate (the best-validation-accuracy model of a run).
# Edit this path to match the run you want to evaluate.
target_dir = "copa_data_results/results/TrainAll_Roberta_8e-06/1/best"
# NOTE(review): eval() turns the class name stored in args.model_type into the class
# object. It executes whatever that string contains, so this is only acceptable while
# args is authored inside this notebook and never comes from untrusted input.
model_module = eval(args.model_type)
model = model_module.from_pretrained(target_dir, args=args)
model.to(device)
model.eval()  # switch the module to evaluation mode
""  # last expression of the cell: displays '' instead of the full module repr

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

''

In [19]:
# Build the evaluation set from the dev split: raw frame -> label array ->
# tokenized inputs -> dataset object that is later handed to `inference`.
dataset = load_data(valid_data)
test_label = dataset["label"].values

# The tokenizer is loaded from the pretrained model name configured in `args`.
tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)
tokenized_test = tokenized_dataset(
    dataset,
    tokenizer,
    check_arch(args.model_type),  # presumably selects arch-specific tokenization; verify in check_arch
)
test_dataset = CustomDataset(tokenized_test, test_label)

In [20]:
def inference(model, tokenized_sent, device, batch_size=8):
    """Run the model over a dataset and collect per-example predictions.

    Args:
        model: Callable module; it is called as ``model(**batch)`` and the first
            element of its output is treated as the class logits with the class
            axis at dim 1.
        tokenized_sent: Dataset whose items are dicts of tensors; every tensor in
            a batch is moved to ``device`` and passed to the model as a keyword
            argument.
        device: Device the batches are moved to before the forward pass.
        batch_size: Number of examples per forward pass (default 8, the value
            that was previously hard-coded).

    Returns:
        A pair of 1-D numpy arrays in dataset order (no shuffling):
        the argmax class index of each example, and the softmax probability of
        class index 1 of each example.
    """
    dataloader = DataLoader(tokenized_sent, batch_size=batch_size, shuffle=False)
    model.eval()
    pred_labels = []
    class1_probs = []

    for batch in tqdm(dataloader):
        batch = {key: val.to(device) for key, val in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            # Normalise logits into probabilities over the class axis.
            probs = torch.softmax(outputs[0], dim=1)
        probs = probs.cpu().numpy()  # (batch, num_classes); presumably 2 classes for COPA - confirm
        pred_labels.extend(np.argmax(probs, axis=-1).tolist())
        class1_probs.extend(probs[:, 1].tolist())

    return np.array(pred_labels).flatten(), np.array(class1_probs).flatten()

In [21]:
# Predicted class index and class-1 probability for every example of the dev set.
pred_answer, preds = inference(
    model=model,
    tokenized_sent=test_dataset,
    device=device,
)

  0%|          | 0/63 [00:00<?, ?it/s]

In [22]:
# Build the submission payload: one record per example, with the 0-based
# predicted class index shifted by one to form the submitted label.
submission_json = {
    "copa": [
        {"idx": idx, "label": int(label + 1)}
        for idx, label in enumerate(pred_answer.tolist())
    ]
}
with open("submission.json", "w") as fp:
    fp.write(json.dumps(submission_json))

In [23]:
# Attach the model's predicted class and class-1 probability to the dev frame,
# then export it for error analysis. "utf-8-sig" writes a BOM at the start of the file.
dataset = dataset.assign(model_answer=pred_answer, model_pred=preds)
dataset.to_csv("copa_result.csv", index=False, encoding="utf-8-sig")