# COPA 학습 & Inference to json 코드
---

## 모듈 임포트

In [1]:
import glob
import os
import random
import json
import time
import re
from time import sleep
from importlib import import_module
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score
from easydict import EasyDict

import torch
import torch.nn as nn
import transformers
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import (
    BertModel,
    BertPreTrainedModel,
    ElectraModel,
    ElectraPreTrainedModel,
    XLMRobertaModel,
    BartModel,
    BartPretrainedModel,
    T5Model,
    RobertaModel,
)
from transformers import MBartModel, MBartConfig
from transformers import BertTokenizer, BertModel


## Transformers의 Wrapper Class와 일부 테스트 모델 선언 및 구현부

In [2]:
class FCLayer(nn.Module):
    """Fully connected head: dropout, optional tanh, then a linear projection.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        output_dim: Size of the last dimension of the output tensor.
        dropout_rate: Probability handed to ``nn.Dropout``.
        use_activation: When true, ``tanh`` is applied after dropout and
            before the linear layer.
    """

    def __init__(self, input_dim, output_dim, dropout_rate=0.0, use_activation=True):
        super().__init__()
        self.use_activation = use_activation
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(input_dim, output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Project ``x``; the activation step is skipped when disabled."""
        dropped = self.dropout(x)
        activated = self.tanh(dropped) if self.use_activation else dropped
        return self.linear(activated)


class PoolingHead(nn.Module):
    """Pooling head: dropout, a dense projection, then ``tanh``.

    Args:
        input_dim: Size of the last dimension of the incoming hidden states.
        inner_dim: Output size of the dense projection.
        pooler_dropout: Dropout probability applied before the projection.
    """

    def __init__(self, input_dim: int, inner_dim: int, pooler_dropout: float):
        super().__init__()
        self.dense = nn.Linear(input_dim, inner_dim)
        self.dropout = nn.Dropout(p=pooler_dropout)

    def forward(self, hidden_states: torch.Tensor):
        """Return ``tanh(dense(dropout(hidden_states)))``."""
        projected = self.dense(self.dropout(hidden_states))
        return torch.tanh(projected)


class Bert(BertPreTrainedModel):
    """BERT encoder that scores two candidate sentence pairs against each other.

    Both candidate pairs go through the same encoder. The first-token
    ([CLS]) vector of each is fed to a shared pooling head and a shared
    linear scorer, and the per-candidate scores are concatenated along
    dim 1 (two columns when ``config.num_labels == 2``).

    Args:
        config: Model configuration; ``hidden_size`` and ``num_labels`` are read.
        args: Training arguments. Accepted for interface parity with the
            sibling wrappers; not read by this class.
    """

    def __init__(self, config, args):
        super().__init__(config)
        # The encoder is built from the config only; pretrained weights are
        # expected to be supplied by calling ``Bert.from_pretrained(...)``.
        self.bert = BertModel(config=config)

        self.num_labels = config.num_labels

        self.pooling = PoolingHead(
            input_dim=config.hidden_size,
            inner_dim=config.hidden_size,
            pooler_dropout=0.1,
        )
        # One score per candidate; two candidates give ``num_labels`` logits.
        self.qa_classifier = nn.Linear(config.hidden_size, self.num_labels - 1)

    def _score(self, encoder_outputs):
        """Score one candidate from the first-token vector of its encoding."""
        cls_vector = encoder_outputs[0][:, 0, :]  # [CLS]
        return self.qa_classifier(self.pooling(cls_vector))

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        input_ids2=None,
        attention_mask2=None,
        token_type_ids2=None,
        labels=None,
    ):
        """Encode both candidates and return their concatenated scores.

        Args:
            input_ids, attention_mask, token_type_ids: Encoding of candidate 1.
            input_ids2, attention_mask2, token_type_ids2: Encoding of candidate 2.
            labels: Accepted so a whole batch dict can be passed as keyword
                arguments; the loss is computed by the caller.

        Returns:
            Tuple ``(logits, ...)`` where the trailing items are whatever the
            encoder returned after its first two outputs for candidate 1.
        """
        # Segment ids are forwarded so the encoder can tell the two sentences
        # of each pair apart; passing ``None`` keeps the encoder's default.
        outputs = self.bert(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )  # sequence_output, pooled_output, (hidden_states), (attentions)
        outputs2 = self.bert(
            input_ids2, attention_mask=attention_mask2, token_type_ids=token_type_ids2
        )

        logits = torch.cat([self._score(outputs), self._score(outputs2)], dim=1)

        # add hidden states and attention if they are here
        return (logits,) + outputs[2:]  # logits, (hidden_states), (attentions)


class XLMRoberta(XLMRobertaModel):
    """XLM-RoBERTa encoder that scores two candidate sentence pairs.

    Both candidate pairs go through the same encoder. The first-token
    vector of each is fed to a shared pooling head and a shared linear
    scorer, and the per-candidate scores are concatenated along dim 1
    (two columns when ``config.num_labels == 2``).

    Args:
        config: Model configuration; ``hidden_size`` and ``num_labels`` are read.
        args: Training arguments. ``args.pretrained_model`` (when present and
            non-empty) names the checkpoint loaded into the encoder;
            otherwise ``"xlm-roberta-large"`` is used.
    """

    def __init__(self, config, args):
        super().__init__(config)
        # NOTE(review): the parent constructor presumably builds its own
        # encoder, which ``forward`` never uses; a PreTrainedModel base class
        # would avoid the duplicate. Confirm checkpoint compatibility first.
        #
        # Load the same checkpoint the config/tokenizer were taken from, and
        # fall back to the previous hard-coded name when none is configured.
        pretrained_name = (
            getattr(args, "pretrained_model", None) or "xlm-roberta-large"
        )
        self.xlmroberta = XLMRobertaModel.from_pretrained(
            pretrained_name, config=config
        )

        self.num_labels = config.num_labels

        self.pooling = PoolingHead(
            input_dim=config.hidden_size,
            inner_dim=config.hidden_size,
            pooler_dropout=0.1,
        )
        # One score per candidate; two candidates give ``num_labels`` logits.
        self.qa_classifier = nn.Linear(config.hidden_size, self.num_labels - 1)

    def _score(self, encoder_outputs):
        """Score one candidate from the first-token vector of its encoding."""
        first_token = encoder_outputs[0][:, 0, :]
        return self.qa_classifier(self.pooling(first_token))

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        input_ids2=None,
        attention_mask2=None,
        token_type_ids2=None,
        labels=None,
    ):
        """Encode both candidates and return their concatenated scores.

        ``token_type_ids*`` and ``labels`` are accepted so a whole batch dict
        can be passed as keyword arguments; they are not forwarded to the
        encoder, and the loss is computed by the caller.

        Returns:
            Tuple ``(logits, ...)`` where the trailing items are whatever the
            encoder returned after its first two outputs for candidate 1.
        """
        outputs = self.xlmroberta(
            input_ids, attention_mask=attention_mask
        )  # sequence_output, pooled_output, (hidden_states), (attentions)
        outputs2 = self.xlmroberta(input_ids2, attention_mask=attention_mask2)

        logits = torch.cat([self._score(outputs), self._score(outputs2)], dim=1)

        # add hidden states and attention if they are here
        return (logits,) + outputs[2:]  # logits, (hidden_states), (attentions)


class Electra_BoolQ(ElectraPreTrainedModel):
    """ELECTRA encoder that scores two candidate sentence pairs.

    Both candidate pairs go through the same encoder. The first-token
    vector of each is fed to a shared pooling head and a shared linear
    scorer, and the per-candidate scores are concatenated along dim 1
    (two columns when ``config.num_labels == 2``).

    Args:
        config: Model configuration; ``hidden_size`` and ``num_labels`` are read.
        args: Training arguments. ``args.pretrained_model`` (when present and
            non-empty) names the checkpoint loaded into the encoder;
            otherwise ``"monologg/koelectra-base-v3-discriminator"`` is used.
    """

    def __init__(self, config, args):
        super().__init__(config)

        self.num_labels = config.num_labels
        # Load the same checkpoint the config/tokenizer were taken from, and
        # fall back to the previous hard-coded name when none is configured.
        pretrained_name = (
            getattr(args, "pretrained_model", None)
            or "monologg/koelectra-base-v3-discriminator"
        )
        self.model = ElectraModel.from_pretrained(pretrained_name, config=config)
        self.pooling = PoolingHead(
            input_dim=config.hidden_size,
            inner_dim=config.hidden_size,
            pooler_dropout=0.1,
        )
        # One score per candidate; two candidates give ``num_labels`` logits.
        self.qa_classifier = nn.Linear(config.hidden_size, self.num_labels - 1)

    def _score(self, encoder_outputs):
        """Score one candidate from the first-token vector of its encoding."""
        first_token = encoder_outputs[0][:, 0, :]  # [CLS]
        return self.qa_classifier(self.pooling(first_token))

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        input_ids2=None,
        attention_mask2=None,
        token_type_ids2=None,
        labels=None,
    ):
        """Encode both candidates and return their concatenated scores.

        Args:
            input_ids, attention_mask, token_type_ids: Encoding of candidate 1.
            input_ids2, attention_mask2, token_type_ids2: Encoding of candidate 2.
            labels: Accepted so a whole batch dict can be passed as keyword
                arguments; the loss is computed by the caller.

        Returns:
            Tuple ``(logits, (hidden_states), (attentions))`` with the
            optional items taken from the encoding of candidate 1.
        """
        outputs = self.model(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )  # sequence_output, (hidden_states), (attentions)
        outputs2 = self.model(
            input_ids2, attention_mask=attention_mask2, token_type_ids=token_type_ids2
        )

        logits = torch.cat([self._score(outputs), self._score(outputs2)], dim=1)

        # ElectraModel has no pooled output, so the optional extras start at
        # index 1; slicing from 2 would silently drop the hidden states.
        return (logits,) + outputs[1:]  # logits, (hidden_states), (attentions)


class Roberta(RobertaModel):
    """RoBERTa encoder that scores two candidate sentence pairs.

    Both candidate pairs go through the same encoder. The first-token
    vector of each is fed to a shared pooling head and a shared linear
    scorer, and the per-candidate scores are concatenated along dim 1
    (two columns when ``config.num_labels == 2``).

    Args:
        config: Model configuration; ``hidden_size`` and ``num_labels`` are read.
        args: Training arguments. ``args.pretrained_model`` (when present and
            non-empty) names the checkpoint loaded into the encoder;
            otherwise ``"klue/roberta-large"`` is used.
    """

    def __init__(self, config, args):
        super().__init__(config)
        # NOTE(review): the parent constructor presumably builds its own
        # encoder, which ``forward`` never uses; a PreTrainedModel base class
        # would avoid the duplicate. Confirm checkpoint compatibility first.
        #
        # Load the same checkpoint the config/tokenizer were taken from, and
        # fall back to the previous hard-coded name when none is configured.
        pretrained_name = (
            getattr(args, "pretrained_model", None) or "klue/roberta-large"
        )
        self.roberta = RobertaModel.from_pretrained(pretrained_name, config=config)

        self.num_labels = config.num_labels

        self.pooling = PoolingHead(
            input_dim=config.hidden_size,
            inner_dim=config.hidden_size,
            pooler_dropout=0.1,
        )
        # One score per candidate; two candidates give ``num_labels`` logits.
        self.qa_classifier = nn.Linear(config.hidden_size, self.num_labels - 1)

    def _score(self, encoder_outputs):
        """Score one candidate from the first-token vector of its encoding."""
        first_token = encoder_outputs[0][:, 0, :]
        return self.qa_classifier(self.pooling(first_token))

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        input_ids2=None,
        attention_mask2=None,
        token_type_ids2=None,
        labels=None,
    ):
        """Encode both candidates and return their concatenated scores.

        ``token_type_ids*`` and ``labels`` are accepted so a whole batch dict
        can be passed as keyword arguments; they are not forwarded to the
        encoder, and the loss is computed by the caller.

        Returns:
            Tuple ``(logits, ...)`` where the trailing items are whatever the
            encoder returned after its first two outputs for candidate 1.
        """
        outputs = self.roberta(
            input_ids, attention_mask=attention_mask
        )  # sequence_output, pooled_output, (hidden_states), (attentions)
        outputs2 = self.roberta(input_ids2, attention_mask=attention_mask2)

        logits = torch.cat([self._score(outputs), self._score(outputs2)], dim=1)

        # add hidden states and attention if they are here
        return (logits,) + outputs[2:]  # logits, (hidden_states), (attentions)


## 데이터 전처리부
---

In [3]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with per-example labels.

    Args:
        tokenized_dataset: Mapping from field name to an indexable container
            holding one entry per example.
        labels: Indexable sequence of labels; its length defines the dataset size.
    """

    def __init__(self, tokenized_dataset, labels):
        self.tokenized_dataset = tokenized_dataset
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every field at ``idx`` plus the label as a tensor under ``"labels"``."""
        sample = {}
        for field, values in self.tokenized_dataset.items():
            sample[field] = values[idx]
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample
    
def load_data(dataset_dir):
    """Read a COPA TSV file and build the two candidate sentence pairs per row.

    The file's own header row is discarded and the columns are renamed to
    ``ID, sentence, question, 1, 2, answer``. Added columns:

    * ``label``: ``answer`` converted to int and shifted to start at 0.
    * ``new_sentence1_1`` / ``new_sentence1_2``: pair built from choice ``1``.
    * ``new_sentence2_1`` / ``new_sentence2_2``: pair built from choice ``2``.

    When ``question`` equals "결과" ("result"), the premise comes first,
    prefixed with "[결과]", and the choice second. For any other question
    value the choice comes first, prefixed with "[원인]" ("cause"), and the
    premise second.

    Args:
        dataset_dir: Path of the tab-separated file (despite the name, a file).

    Returns:
        The ``pandas.DataFrame`` with the added columns.
    """
    dataset = pd.read_csv(
        dataset_dir,
        delimiter="\t",
        names=["ID", "sentence", "question", "1", "2", "answer"],
        header=0,
    )
    dataset["label"] = dataset["answer"].astype(int) - 1

    pair1_first, pair1_second = [], []
    pair2_first, pair2_second = [], []
    # Walk the needed columns in lockstep instead of indexing row by row.
    rows = zip(dataset["sentence"], dataset["question"], dataset["1"], dataset["2"])
    for premise, question, choice1, choice2 in rows:
        if question == "결과":
            pair1_first.append("[결과]" + premise)
            pair1_second.append(choice1)
            pair2_first.append("[결과]" + premise)
            pair2_second.append(choice2)
        else:
            pair1_first.append("[원인]" + choice1)
            pair1_second.append(premise)
            pair2_first.append("[원인]" + choice2)
            pair2_second.append(premise)

    dataset["new_sentence1_1"] = pair1_first
    dataset["new_sentence1_2"] = pair1_second
    dataset["new_sentence2_1"] = pair2_first
    dataset["new_sentence2_2"] = pair2_second

    return dataset


def tokenized_dataset(dataset, tokenizer, arch="encoder", max_length=150):
    """Tokenize both candidate sentence pairs of every row.

    Args:
        dataset: DataFrame holding the ``new_sentence{1,2}_{1,2}`` columns.
        tokenizer: Callable invoked as ``tokenizer(first, second, **options)``
            that returns a mapping supporting ``.items()`` and item assignment.
        arch: Architecture family; accepted for call compatibility and
            currently not used.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        The tokenizer output for candidate 1, extended with the output for
        candidate 2 under the same keys suffixed with ``"2"``
        (for example ``input_ids2``).
    """

    def encode(first_column, second_column):
        # Both candidates must be tokenized with identical options.
        return tokenizer(
            dataset[first_column].tolist(),
            dataset[second_column].tolist(),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
            add_special_tokens=True,
            return_token_type_ids=True,
        )

    tokenized_sentences = encode("new_sentence1_1", "new_sentence1_2")
    tokenized_sentences2 = encode("new_sentence2_1", "new_sentence2_2")
    for key, value in tokenized_sentences2.items():
        tokenized_sentences[key + "2"] = value

    return tokenized_sentences


## 트레이닝

In [4]:
def check_arch(model_type):
    """Return the architecture family (``"encoder"`` or ``"encoder-decoder"``).

    Raises:
        ValueError: If ``model_type`` is not listed under any family.
    """
    families = (
        ("encoder", ("Bert", "Electra", "XLMRoberta", "Electra_BoolQ", "Roberta")),
        ("encoder-decoder", ("T5", "Bart", "Bart_BoolQ")),
    )
    for family, members in families:
        if model_type in members:
            return family
    raise ValueError(f"Model [{model_type}] no defined archtecture")


def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and set the cuDNN determinism flags."""
    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # covers every GPU when several are used

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def get_lr(optimizer):
    """Return the ``'lr'`` of the first parameter group, or ``None`` if there is none."""
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

def compute_metrics(pred):
    """Compute accuracy from an object exposing ``label_ids`` and ``predictions``.

    The predicted class is the argmax over the last axis of ``predictions``.

    Returns:
        ``{"accuracy": <score>}`` as computed by sklearn's ``accuracy_score``.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(gold, predicted)}

def increment_output_dir(output_path, exist_ok=False):
    """Return ``output_path``, or a numbered variant if it is already taken.

    When the path does not exist, or ``exist_ok`` is true, the path is
    returned unchanged. Otherwise siblings named ``<name><digits>`` are
    scanned and ``<path><max + 1>`` is returned; numbering starts at 2.

    Args:
        output_path: Desired output directory (string or path-like).
        exist_ok: Reuse an existing directory instead of numbering.

    Returns:
        The chosen path as a string. Nothing is created on disk.
    """
    path = Path(output_path)
    if exist_ok or not path.exists():
        return str(path)

    # Escape the prefix so characters such as "[" in the path are literal.
    candidates = glob.glob(f"{glob.escape(str(path))}*")
    # Match only the final path component, and match it completely, so digits
    # in parent directories (e.g. "run15/1") cannot be taken for a suffix.
    numbered = re.compile(re.escape(path.name) + r"(\d+)")
    taken = []
    for candidate in candidates:
        match = numbered.fullmatch(Path(candidate).name)
        if match:
            taken.append(int(match.group(1)))
    next_index = max(taken) + 1 if taken else 2
    return f"{path}{next_index}"

def _save_checkpoint(model, args, target_dir):
    """Save ``model`` (unwrapping a ``.module`` attribute if present) and ``args``."""
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(target_dir)
    torch.save(args, os.path.join(target_dir, "training_args.bin"))


def train(model_dir, args):
    """Fine-tune the configured model on COPA and keep the best checkpoint.

    Loads the train/dev TSV files, tokenizes both candidate pairs, trains
    with cross-entropy over the two candidate scores, validates after each
    epoch and writes the model with the best validation accuracy to
    ``<save_dir>/best``. Scalars are logged to TensorBoard and the run
    configuration is written to ``<save_dir>/config.json``.

    Args:
        model_dir: Root directory for results; the run directory is
            ``model_dir/args.name/args.kfold`` (numbered if it already exists).
        args: Attribute-style configuration. Fields read here: ``seed``,
            ``pretrained_model``, ``model_type``, ``batch_size``,
            ``valid_batch_size``, ``name``, ``kfold``, ``optimizer``, ``lr``,
            ``weight_decay``, ``warmup_steps``, ``epochs``, ``freeze_epoch``,
            ``log_interval``.

    Returns:
        The model in its state after the last executed epoch (the best
        checkpoint is on disk, it is not reloaded).

    Raises:
        ValueError: If ``args.model_type`` is unknown to ``check_arch`` or
            names no model class defined in this module.
    """
    seed_everything(args.seed)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"device(GPU) : {torch.cuda.is_available()}")
    num_classes = 2

    # load model and tokenizer
    MODEL_NAME = args.pretrained_model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    train_dataset = load_data("./dataset/copa/SKT_COPA_Train.tsv")
    val_dataset = load_data("./dataset/copa/SKT_COPA_Dev.tsv")

    train_label = train_dataset["label"].values
    val_label = val_dataset["label"].values

    # tokenizing dataset
    arch = check_arch(args.model_type)
    tokenized_train = tokenized_dataset(train_dataset, tokenizer, arch)
    tokenized_val = tokenized_dataset(val_dataset, tokenizer, arch)

    # make dataset for pytorch.
    train_dataset = CustomDataset(tokenized_train, train_label)
    val_dataset = CustomDataset(tokenized_val, val_label)
    # -- data_loader
    train_loader = DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True,
    )

    val_loader = DataLoader(
        val_dataset, batch_size=args.valid_batch_size, shuffle=False, drop_last=False,
    )

    # setting model hyperparameter
    # "Electra_BoolQ" has no "<model_type>Config" class of its own, so it is
    # mapped to ElectraConfig; both cases are resolved on the transformers module.
    if args.model_type == "Electra_BoolQ":
        config_name = "ElectraConfig"
    else:
        config_name = args.model_type + "Config"
    config_module = getattr(transformers, config_name)

    model_config = config_module.from_pretrained(MODEL_NAME)
    model_config.num_labels = num_classes

    # Look the class up by name instead of evaluating the config string.
    model_module = globals().get(args.model_type)
    if model_module is None:
        raise ValueError(f"Model [{args.model_type}] is not defined")

    # Wrappers that build their encoder from the config only (Bert) must be
    # loaded through from_pretrained; the others load weights in __init__.
    if args.model_type in ("Bert", "BERT", "Electra"):
        model = model_module.from_pretrained(
            MODEL_NAME, config=model_config, args=args
        )
    else:
        model = model_module(config=model_config, args=args)

    model.to(device)
    save_dir = increment_output_dir(os.path.join(model_dir, args.name, str(args.kfold)))

    # Freeze Parameter
    # Everything except the task head stays frozen until ``freeze_epoch``.
    # The wrappers in this file name their head modules "pooling" and
    # "qa_classifier"; the two older names are kept for other model classes.
    head_markers = ("pooling", "qa_classifier", "cls_fc_layer", "label_classifier")
    for name, param in model.named_parameters():
        if not any(marker in name for marker in head_markers):
            param.requires_grad = False

    # -- loss & metric
    criterion = nn.CrossEntropyLoss()

    # Prefer the optimizer class exported by transformers (previous
    # behaviour); fall back to torch.optim when transformers lacks it.
    opt_module = getattr(transformers, args.optimizer, None)
    if opt_module is None:
        opt_module = getattr(torch.optim, args.optimizer)
    optimizer = opt_module(
        model.parameters(), lr=args.lr, weight_decay=args.weight_decay, eps=1e-8
    )
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=len(train_loader) * args.epochs,
        last_epoch=-1,
    )

    # -- logging
    start_time = time.time()
    os.makedirs(save_dir, exist_ok=True)
    logger = SummaryWriter(log_dir=save_dir)
    with open(os.path.join(save_dir, "config.json"), "w", encoding="utf-8") as f:
        json.dump(vars(args), f, ensure_ascii=False, indent=4)

    best_val_acc = 0
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        # train loop
        # unFreeze parameters
        if epoch == args.freeze_epoch:
            for param in model.parameters():
                param.requires_grad = True
        model.train()
        loss_value = 0
        matches = 0
        for idx, items in enumerate(train_loader):
            item = {key: val.to(device) for key, val in items.items()}

            optimizer.zero_grad()
            outs = model(**item)
            loss = criterion(outs[0], item["labels"])

            preds = torch.argmax(outs[0], dim=-1)

            loss.backward()
            optimizer.step()
            scheduler.step()

            loss_value += loss.item()
            matches += (preds == item["labels"]).sum().item()
            if (idx + 1) % args.log_interval == 0:
                train_loss = loss_value / args.log_interval
                # Every batch is full (drop_last=True), so this is exact.
                train_acc = matches / args.batch_size / args.log_interval
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch}/{args.epochs}]({idx + 1}/{len(train_loader)}) || "
                    f"training loss {train_loss:4.4} || training accuracy {train_acc:4.2%} || lr {current_lr}"
                )

                global_step = epoch * len(train_loader) + idx
                logger.add_scalar("Train/loss", train_loss, global_step)
                logger.add_scalar("Train/accuracy", train_acc, global_step)
                logger.add_scalar("LR", current_lr, global_step)

                loss_value = 0
                matches = 0

        # val loop
        with torch.no_grad():
            print("Calculating validation results...")
            model.eval()
            val_loss_items = []
            acc_okay = 0
            count_all = 0
            for items in tqdm(val_loader):
                item = {key: val.to(device) for key, val in items.items()}

                outs = model(**item)

                preds = torch.argmax(outs[0], dim=-1)
                loss = criterion(outs[0], item["labels"]).item()

                val_loss_items.append(loss)
                acc_okay += (item["labels"] == preds).sum().item()
                count_all += len(preds)

            # Mean of per-batch losses; accuracy is over individual examples.
            val_loss = np.sum(val_loss_items) / len(val_loss_items)
            val_acc = acc_okay / count_all

            if val_acc > best_val_acc:
                print(
                    f"New best model for val acc : {val_acc:4.2%}! saving the best model.."
                )
                _save_checkpoint(model, args, f"{save_dir}/best")
                best_val_acc = val_acc

            if val_loss < best_val_loss:
                best_val_loss = val_loss
            print(
                f"[Val] acc : {val_acc:4.2%}, loss: {val_loss:4.4}|| "
                f"best acc : {best_val_acc:4.2%}, best loss: {best_val_loss:4.4}"
            )

            logger.add_scalar("Val/loss", val_loss, epoch)
            logger.add_scalar("Val/accuracy", val_acc, epoch)
            s = f"Time elapsed: {(time.time() - start_time)/60: .2f} min"
            print(s)
            print()
            # NOTE(review): hard stop after epoch index 24. This overwrites
            # "<save_dir>/best" with the current model even if it is not the
            # best one. Kept as-is; confirm whether that is intended.
            if epoch > 24:
                _save_checkpoint(model, args, f"{save_dir}/best")
                break

    # Flush pending TensorBoard events and release the writer.
    logger.close()
    return model

## Training Configuration

In [5]:
# Set before any tokenizer is created; presumably silences the tokenizers
# library's fork/parallelism warning (confirm against its documentation).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Run configuration consumed by train() and by the model wrappers.
# `val_ratio`, `dropout_rate` and `criterion` are recorded in config.json but
# are not read by train() as defined in this notebook.
args = EasyDict(
    {
        "epochs": 20,
        "model_type": "Roberta",
        "pretrained_model": "klue/roberta-large",
        "lr": 8e-6,
        "batch_size": 32,
        "freeze_epoch": 0,
        "valid_batch_size": 128,
        "val_ratio": 0.2,
        "dropout_rate": 0.1,
        "criterion": "cross_entropy",
        "optimizer": "AdamW",
        "weight_decay": 0.01,
        "warmup_steps": 500,
        "seed": 42,
        "log_interval": 20,
        "kfold": 1,
        "model_dir": "./copa_data_results/results",
    }
)

# The run name becomes a directory level under `model_dir`.
args.name = f"TrainAll_{args.model_type}_{args.lr}"

## Training

In [6]:
# Run fine-tuning with the configuration above. The returned model is in its
# state after the last epoch; the best checkpoint is written to disk by train().
model = train(args.model_dir, args)

device(GPU) : True


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

Epoch[0/20](20/96) || training loss 0.6946 || training accuracy 48.91% || lr 3.2e-07
Epoch[0/20](40/96) || training loss 0.6953 || training accuracy 45.62% || lr 6.4e-07
Epoch[0/20](60/96) || training loss 0.6916 || training accuracy 51.72% || lr 9.6e-07
Epoch[0/20](80/96) || training loss 0.6929 || training accuracy 51.88% || lr 1.28e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

New best model for val acc : 64.60%! saving the best model..
[Val] acc : 64.60%, loss: 0.6916|| best acc : 64.60%, best loss: 0.6916
Time elapsed:  0.47 min

Epoch[1/20](20/96) || training loss 0.6949 || training accuracy 51.41% || lr 1.856e-06
Epoch[1/20](40/96) || training loss 0.6866 || training accuracy 55.94% || lr 2.176e-06
Epoch[1/20](60/96) || training loss 0.6763 || training accuracy 61.09% || lr 2.496e-06
Epoch[1/20](80/96) || training loss 0.565 || training accuracy 74.53% || lr 2.816e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

New best model for val acc : 86.80%! saving the best model..
[Val] acc : 86.80%, loss: 0.3683|| best acc : 86.80%, best loss: 0.3683
Time elapsed:  1.03 min

Epoch[2/20](20/96) || training loss 0.3661 || training accuracy 84.69% || lr 3.392e-06
Epoch[2/20](40/96) || training loss 0.3448 || training accuracy 85.62% || lr 3.712e-06
Epoch[2/20](60/96) || training loss 0.3482 || training accuracy 85.94% || lr 4.032e-06
Epoch[2/20](80/96) || training loss 0.2247 || training accuracy 91.41% || lr 4.352e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

New best model for val acc : 90.20%! saving the best model..
[Val] acc : 90.20%, loss: 0.2971|| best acc : 90.20%, best loss: 0.2971
Time elapsed:  1.59 min

Epoch[3/20](20/96) || training loss 0.1969 || training accuracy 91.72% || lr 4.928e-06
Epoch[3/20](40/96) || training loss 0.1922 || training accuracy 93.28% || lr 5.248e-06
Epoch[3/20](60/96) || training loss 0.1778 || training accuracy 93.91% || lr 5.567999999999999e-06
Epoch[3/20](80/96) || training loss 0.1803 || training accuracy 92.03% || lr 5.887999999999999e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 89.60%, loss: 0.3174|| best acc : 90.20%, best loss: 0.2971
Time elapsed:  2.00 min

Epoch[4/20](20/96) || training loss 0.09187 || training accuracy 96.56% || lr 6.464e-06
Epoch[4/20](40/96) || training loss 0.09666 || training accuracy 96.09% || lr 6.784e-06
Epoch[4/20](60/96) || training loss 0.1046 || training accuracy 96.09% || lr 7.104e-06
Epoch[4/20](80/96) || training loss 0.1265 || training accuracy 95.94% || lr 7.424e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

New best model for val acc : 91.00%! saving the best model..
[Val] acc : 91.00%, loss: 0.3339|| best acc : 91.00%, best loss: 0.2971
Time elapsed:  2.55 min

Epoch[5/20](20/96) || training loss 0.06943 || training accuracy 98.12% || lr 8e-06
Epoch[5/20](40/96) || training loss 0.04864 || training accuracy 98.44% || lr 7.887323943661972e-06
Epoch[5/20](60/96) || training loss 0.06444 || training accuracy 97.34% || lr 7.774647887323943e-06
Epoch[5/20](80/96) || training loss 0.0898 || training accuracy 96.56% || lr 7.661971830985914e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

New best model for val acc : 92.40%! saving the best model..
[Val] acc : 92.40%, loss: 0.3091|| best acc : 92.40%, best loss: 0.2971
Time elapsed:  3.11 min

Epoch[6/20](20/96) || training loss 0.04308 || training accuracy 98.91% || lr 7.459154929577465e-06
Epoch[6/20](40/96) || training loss 0.03518 || training accuracy 98.59% || lr 7.3464788732394365e-06
Epoch[6/20](60/96) || training loss 0.03919 || training accuracy 98.59% || lr 7.233802816901408e-06
Epoch[6/20](80/96) || training loss 0.03839 || training accuracy 97.81% || lr 7.12112676056338e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 90.80%, loss: 0.376|| best acc : 92.40%, best loss: 0.2971
Time elapsed:  3.52 min

Epoch[7/20](20/96) || training loss 0.0218 || training accuracy 99.06% || lr 6.918309859154929e-06
Epoch[7/20](40/96) || training loss 0.02091 || training accuracy 99.53% || lr 6.805633802816901e-06
Epoch[7/20](60/96) || training loss 0.02408 || training accuracy 99.53% || lr 6.6929577464788726e-06
Epoch[7/20](80/96) || training loss 0.03042 || training accuracy 98.75% || lr 6.580281690140845e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 91.40%, loss: 0.3941|| best acc : 92.40%, best loss: 0.2971
Time elapsed:  3.93 min

Epoch[8/20](20/96) || training loss 0.02373 || training accuracy 98.91% || lr 6.377464788732395e-06
Epoch[8/20](40/96) || training loss 0.02151 || training accuracy 99.22% || lr 6.264788732394366e-06
Epoch[8/20](60/96) || training loss 0.01639 || training accuracy 99.69% || lr 6.152112676056338e-06
Epoch[8/20](80/96) || training loss 0.008522 || training accuracy 99.84% || lr 6.0394366197183095e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 92.40%, loss: 0.4694|| best acc : 92.40%, best loss: 0.2971
Time elapsed:  4.34 min

Epoch[9/20](20/96) || training loss 0.01163 || training accuracy 99.53% || lr 5.836619718309859e-06
Epoch[9/20](40/96) || training loss 0.01632 || training accuracy 99.69% || lr 5.723943661971831e-06
Epoch[9/20](60/96) || training loss 0.01219 || training accuracy 99.53% || lr 5.611267605633802e-06
Epoch[9/20](80/96) || training loss 0.0234 || training accuracy 99.53% || lr 5.4985915492957745e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 92.00%, loss: 0.4444|| best acc : 92.40%, best loss: 0.2971
Time elapsed:  4.75 min

Epoch[10/20](20/96) || training loss 0.02596 || training accuracy 98.91% || lr 5.295774647887324e-06
Epoch[10/20](40/96) || training loss 0.0148 || training accuracy 99.69% || lr 5.183098591549296e-06
Epoch[10/20](60/96) || training loss 0.01284 || training accuracy 99.53% || lr 5.070422535211268e-06
Epoch[10/20](80/96) || training loss 0.01002 || training accuracy 99.69% || lr 4.957746478873239e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 92.40%, loss: 0.4539|| best acc : 92.40%, best loss: 0.2971
Time elapsed:  5.17 min

Epoch[11/20](20/96) || training loss 0.0116 || training accuracy 99.53% || lr 4.754929577464788e-06
Epoch[11/20](40/96) || training loss 0.005938 || training accuracy 100.00% || lr 4.64225352112676e-06
Epoch[11/20](60/96) || training loss 0.01036 || training accuracy 99.69% || lr 4.529577464788733e-06
Epoch[11/20](80/96) || training loss 0.006183 || training accuracy 99.84% || lr 4.4169014084507046e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 90.80%, loss: 0.4617|| best acc : 92.40%, best loss: 0.2971
Time elapsed:  5.58 min

Epoch[12/20](20/96) || training loss 0.01075 || training accuracy 99.69% || lr 4.214084507042253e-06
Epoch[12/20](40/96) || training loss 0.005227 || training accuracy 99.84% || lr 4.101408450704225e-06
Epoch[12/20](60/96) || training loss 0.002887 || training accuracy 100.00% || lr 3.988732394366197e-06
Epoch[12/20](80/96) || training loss 0.005085 || training accuracy 100.00% || lr 3.876056338028169e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 91.80%, loss: 0.4453|| best acc : 92.40%, best loss: 0.2971
Time elapsed:  5.99 min

Epoch[13/20](20/96) || training loss 0.002965 || training accuracy 100.00% || lr 3.6732394366197183e-06
Epoch[13/20](40/96) || training loss 0.004252 || training accuracy 100.00% || lr 3.56056338028169e-06
Epoch[13/20](60/96) || training loss 0.008777 || training accuracy 99.84% || lr 3.4478873239436615e-06
Epoch[13/20](80/96) || training loss 0.00348 || training accuracy 100.00% || lr 3.335211267605634e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 91.60%, loss: 0.4442|| best acc : 92.40%, best loss: 0.2971
Time elapsed:  6.40 min

Epoch[14/20](20/96) || training loss 0.004145 || training accuracy 100.00% || lr 3.132394366197183e-06
Epoch[14/20](40/96) || training loss 0.003527 || training accuracy 99.84% || lr 3.0197183098591547e-06
Epoch[14/20](60/96) || training loss 0.002808 || training accuracy 100.00% || lr 2.9070422535211266e-06
Epoch[14/20](80/96) || training loss 0.003978 || training accuracy 100.00% || lr 2.7943661971830984e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 91.40%, loss: 0.453|| best acc : 92.40%, best loss: 0.2971
Time elapsed:  6.82 min

Epoch[15/20](20/96) || training loss 0.001923 || training accuracy 100.00% || lr 2.591549295774648e-06
Epoch[15/20](40/96) || training loss 0.003019 || training accuracy 100.00% || lr 2.4788732394366193e-06
Epoch[15/20](60/96) || training loss 0.002429 || training accuracy 100.00% || lr 2.366197183098591e-06
Epoch[15/20](80/96) || training loss 0.002802 || training accuracy 100.00% || lr 2.2535211267605635e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 91.60%, loss: 0.4582|| best acc : 92.40%, best loss: 0.2971
Time elapsed:  7.23 min

Epoch[16/20](20/96) || training loss 0.002695 || training accuracy 100.00% || lr 2.0507042253521125e-06
Epoch[16/20](40/96) || training loss 0.00178 || training accuracy 100.00% || lr 1.9380281690140844e-06
Epoch[16/20](60/96) || training loss 0.00179 || training accuracy 100.00% || lr 1.8253521126760562e-06
Epoch[16/20](80/96) || training loss 0.004201 || training accuracy 100.00% || lr 1.712676056338028e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 91.60%, loss: 0.4638|| best acc : 92.40%, best loss: 0.2971
Time elapsed:  7.64 min

Epoch[17/20](20/96) || training loss 0.004792 || training accuracy 99.84% || lr 1.5098591549295774e-06
Epoch[17/20](40/96) || training loss 0.001833 || training accuracy 100.00% || lr 1.3971830985915492e-06
Epoch[17/20](60/96) || training loss 0.002725 || training accuracy 100.00% || lr 1.284507042253521e-06
Epoch[17/20](80/96) || training loss 0.001229 || training accuracy 100.00% || lr 1.171830985915493e-06
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 91.60%, loss: 0.4605|| best acc : 92.40%, best loss: 0.2971
Time elapsed:  8.06 min

Epoch[18/20](20/96) || training loss 0.002797 || training accuracy 100.00% || lr 9.690140845070422e-07
Epoch[18/20](40/96) || training loss 0.002013 || training accuracy 100.00% || lr 8.56338028169014e-07
Epoch[18/20](60/96) || training loss 0.002531 || training accuracy 99.84% || lr 7.436619718309859e-07
Epoch[18/20](80/96) || training loss 0.002906 || training accuracy 99.84% || lr 6.309859154929577e-07
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 92.00%, loss: 0.464|| best acc : 92.40%, best loss: 0.2971
Time elapsed:  8.47 min

Epoch[19/20](20/96) || training loss 0.0007484 || training accuracy 100.00% || lr 4.28169014084507e-07
Epoch[19/20](40/96) || training loss 0.004334 || training accuracy 99.84% || lr 3.1549295774647887e-07
Epoch[19/20](60/96) || training loss 0.003066 || training accuracy 99.84% || lr 2.028169014084507e-07
Epoch[19/20](80/96) || training loss 0.003256 || training accuracy 99.69% || lr 9.014084507042254e-08
Calculating validation results...


  0%|          | 0/4 [00:00<?, ?it/s]

[Val] acc : 91.60%, loss: 0.4652|| best acc : 92.40%, best loss: 0.2971
Time elapsed:  8.88 min



## Inference : 
---
- target_dir(Best Val Accuracy model) 를 상황에 맞게 수정해야 함.

In [7]:
# Reload the checkpoint with the best validation accuracy for inference.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Must point at the "best" directory produced by the training run above.
target_dir = "copa_data_results/results/TrainAll_Roberta_8e-06/1/best"
# Look the wrapper class up by name instead of evaluating the config string.
model_module = globals()[args.model_type]
model = model_module.from_pretrained(target_dir, args=args)
model.to(device)
model.eval()
# Trailing empty string keeps the notebook from echoing the model repr.
""

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

''

In [9]:
# Tokenizer for the pretrained checkpoint named in the run configuration.
tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)

# The Dev split doubles as the evaluation set here; `label` is the 0-based answer.
dataset = load_data("dataset/copa/SKT_COPA_Dev.tsv")
test_label = dataset["label"].values

# Encode both candidate pairs per row and pair them with the labels for batching.
tokenized_test = tokenized_dataset(dataset, tokenizer, check_arch(args.model_type))
test_dataset = CustomDataset(tokenized_test, test_label)

In [10]:
def inference(model, tokenized_sent, device, batch_size=8):
    """Run the model over a dataset and collect predictions.

    Args:
        model: Module whose call returns a tuple with the logits first; it is
            invoked with each batch dict expanded into keyword arguments.
        tokenized_sent: Dataset yielding dicts of tensors (for example
            ``CustomDataset``).
        device: Device each batch is moved to before the forward pass.
        batch_size: Number of examples per forward pass.

    Returns:
        Tuple ``(answers, probabilities)`` of 1-D NumPy arrays: the argmax
        index per example, and the softmax probability of index 1 (the
        second candidate).
    """
    dataloader = DataLoader(tokenized_sent, batch_size=batch_size, shuffle=False)
    model.eval()
    results = []
    preds = []

    with torch.no_grad():
        for items in tqdm(dataloader):
            item = {key: val.to(device) for key, val in items.items()}
            logits = model(**item)[0]
            # One row of class probabilities per example, one column per candidate.
            probs = torch.softmax(logits, dim=1).cpu().numpy()
            results.extend(np.argmax(probs, axis=-1).tolist())
            preds.extend(probs[:, 1].tolist())

    return np.array(results).flatten(), np.array(preds).flatten()

In [11]:
# pred_answer: 0-based argmax per example; preds: softmax probability of the second candidate.
pred_answer, preds = inference(model, tokenized_sent=test_dataset, device=device)

  0%|          | 0/63 [00:00<?, ?it/s]

In [12]:
# make json
# Build the submission payload: one entry per example in dataset order, with
# the 0-based prediction shifted back to the 1-based answer numbering.
submission_json = {
    "copa": [
        {"idx": position, "label": int(answer + 1)}
        for position, answer in enumerate(pred_answer.tolist())
    ]
}
with open("submission.json", "w") as fp:
    json.dump(submission_json, fp)

In [13]:
# Attach predictions to the Dev dataframe for error analysis:
# `model_answer` is the 0-based argmax (comparable to the `label` column) and
# `model_pred` is the softmax probability assigned to the second candidate.
dataset["model_answer"] = pred_answer
dataset["model_pred"] = preds
# utf-8-sig writes a BOM, presumably so spreadsheet tools detect the Korean text encoding.
dataset.to_csv("copa_result.csv", index=False, encoding="utf-8-sig")