## Model architecture

In [1]:
import torch
import torch.nn as nn
from transformers import AutoModel, RobertaPreTrainedModel

class FCLayer(nn.Module):
    """Small projection head: dropout, then an optional tanh, then a linear layer."""

    def __init__(self, input_dim, output_dim, dropout_rate=0.0, use_activation=True):
        """
        :param input_dim: size of the last dimension of the input
        :param output_dim: size of the last dimension of the output
        :param dropout_rate: probability passed to nn.Dropout
        :param use_activation: when True, tanh is applied between dropout and the linear layer
        """
        super().__init__()
        self.use_activation = use_activation
        self.dropout = nn.Dropout(p=dropout_rate)
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Return linear(tanh(dropout(x))), skipping tanh when use_activation is False."""
        hidden = self.dropout(x)
        activated = self.tanh(hidden) if self.use_activation else hidden
        return self.linear(activated)


class R_RoBERTa_WiC(RobertaPreTrainedModel):
    """Word-in-Context classifier built on a pretrained encoder.

    The encoder's pooled output and the mask-averaged hidden states of the two
    marked target spans each go through their own FCLayer; the three vectors
    are concatenated and projected to ``config.num_labels`` logits.
    """

    def __init__(self,  model_name, config, dropout_rate):
        """
        :param model_name: checkpoint name/path handed to AutoModel.from_pretrained
        :param config: model config; hidden_size and num_labels are read from it
        :param dropout_rate: dropout probability used by every FCLayer head
        """
        super(R_RoBERTa_WiC, self).__init__(config)
        # Pretrained encoder identified by model_name.
        self.model = AutoModel.from_pretrained(model_name, config=config)

        self.num_labels = config.num_labels

        self.cls_fc_layer = FCLayer(config.hidden_size, config.hidden_size, dropout_rate)
        self.entity_fc_layer1 = FCLayer(config.hidden_size, config.hidden_size, dropout_rate)
        self.entity_fc_layer2 = FCLayer(config.hidden_size, config.hidden_size, dropout_rate)

        self.label_classifier = FCLayer(
            config.hidden_size * 3,
            config.num_labels,
            dropout_rate,
            use_activation=False,
        )

    @staticmethod
    def entity_average(hidden_output, e_mask):
        """
        Average the hidden state vectors selected by a 0/1 mask.

        :param hidden_output: [batch_size, max_seq_len, dim]
        :param e_mask: [batch_size, max_seq_len]
                e.g. e_mask[0] == [0, 0, 0, 1, 1, 1, 0, 0, ... 0]
        :return: [batch_size, dim]; a row whose mask is all zeros yields a zero vector
        """
        e_mask_unsqueeze = e_mask.unsqueeze(1)  # [b, 1, max_seq_len]
        # Clamp to 1 so an empty mask gives 0 / 1 = 0 instead of 0 / 0 = NaN.
        length_tensor = (e_mask != 0).sum(dim=1).unsqueeze(1).clamp(min=1)  # [batch_size, 1]

        # [b, 1, max_seq_len] * [b, max_seq_len, dim] = [b, 1, dim] -> [b, dim]
        sum_vector = torch.bmm(e_mask_unsqueeze.float(), hidden_output).squeeze(1)
        avg_vector = sum_vector.float() / length_tensor.float()  # broadcasting
        return avg_vector

    def forward(self, input_ids, attention_mask, labels, e1_mask, e2_mask):
        """
        Run the encoder and the classification head.

        :param input_ids: token ids passed to the encoder
        :param attention_mask: attention mask passed to the encoder
        :param labels: class ids, or None to skip the loss (inference)
        :param e1_mask: 0/1 mask selecting the first target span
        :param e2_mask: 0/1 mask selecting the second target span
        :return: tuple ``(loss, logits, ...)`` when labels are given, otherwise
                 ``(logits, ...)``; trailing items are whatever the encoder
                 returned after its first two outputs
        """
        outputs = self.model(
            input_ids, attention_mask=attention_mask
        )  # sequence_output, pooled_output, (hidden_states), (attentions)
        sequence_output = outputs[0]  # batch, max_len, hidden_size

        e1_h = self.entity_average(sequence_output, e1_mask)
        e2_h = self.entity_average(sequence_output, e2_mask)

        # Each FCLayer applies dropout -> tanh -> linear. The two spans use
        # separate (not shared) layers.
        sentence_representation = self.cls_fc_layer(outputs.pooler_output)
        e1_h = self.entity_fc_layer1(e1_h)
        e2_h = self.entity_fc_layer2(e2_h)

        # Concat -> fc_layer
        concat_h = torch.cat([sentence_representation, e1_h, e2_h], dim=-1)
        logits = self.label_classifier(concat_h)
        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        # Only compute the loss when labels are supplied, so the model can be
        # called with labels=None at inference time.
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

## Data Loader

In [2]:
import pickle as pickle
import os
import pandas as pd
import torch
from tqdm import tqdm

class WICDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-computed feature columns with labels.

    ``tokenized_dataset`` maps a feature name to an indexable collection
    (tensor or nested list); ``labels`` is an indexable collection of class ids.
    """

    def __init__(self, tokenized_dataset, labels):
        self.tokenized_dataset = tokenized_dataset
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of per-example tensors plus a ``labels`` tensor of dtype long."""
        item = {}
        for key, val in self.tokenized_dataset.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning for
                # every sample; clone().detach() is the documented equivalent.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

def load_data(dataset_dir, mode = 'train'):
    """Read a tab-separated WiC file into a DataFrame with an integer ANSWER column.

    :param dataset_dir: path of the TSV file
    :param mode: 'test' fills ANSWER with zeros (the file does not need an
                 ANSWER column); any other value casts the existing ANSWER
                 column to int
    :return: the loaded DataFrame
    """
    dataset = pd.read_csv(dataset_dir, delimiter='\t')
    if mode == 'test':
        # Placeholder labels; do not touch the original column, which may be
        # missing or empty in an unlabeled test file.
        dataset["ANSWER"] = [0] * len(dataset)
    else:
        dataset["ANSWER"] = dataset["ANSWER"].astype(int)
    return dataset

def convert_sentence_to_features(train_dataset, tokenizer, max_len):
    """Build padded token ids, attention masks and target-span masks for each sentence pair.

    For every row, the target word of SENTENCE1 is wrapped in ``<e1>``/``</e1>``
    and that of SENTENCE2 in ``<e2>``/``</e2>`` (character offsets come from the
    ``start_s*``/``end_s*`` columns), each sentence is wrapped in literal
    ``<s>``/``</s>`` text, and the result is tokenised. The four markers are
    then replaced by ``$`` / ``#`` tokens before conversion to ids.

    :param train_dataset: DataFrame with columns SENTENCE1, SENTENCE2, start_s1,
        end_s1, start_s2, end_s2 and ANSWER
    :param tokenizer: provides ``pad_token_id``, ``tokenize`` and
        ``convert_tokens_to_ids``; it must emit each marker as a single token
    :param max_len: length every sequence is padded to. Rows whose token count
        is not below ``max_len - 1`` are skipped (features and label together)
    :return: WICDataset holding the feature tensors and the kept labels
    :raises ValueError: if a marker is missing from the tokenised sentence
    """
    import logging  # local import: the defining cell does not import logging

    max_seq_len = max_len
    pad_token = tokenizer.pad_token_id
    special_tokens_count = 1

    # Read each column once and index it by position, so the function also
    # works when the frame's index is not 0..n-1 (e.g. after filtering).
    sentences1 = train_dataset['SENTENCE1'].tolist()
    sentences2 = train_dataset['SENTENCE2'].tolist()
    starts1 = train_dataset['start_s1'].tolist()
    ends1 = train_dataset['end_s1'].tolist()
    starts2 = train_dataset['start_s2'].tolist()
    ends2 = train_dataset['end_s2'].tolist()
    answers = train_dataset['ANSWER'].tolist()

    all_input_ids = []
    all_attention_mask = []
    all_e1_mask = []
    all_e2_mask = []
    all_label = []
    skipped = 0

    for idx in tqdm(range(len(train_dataset))):
        s1, b1, f1 = sentences1[idx], starts1[idx], ends1[idx]
        s2, b2, f2 = sentences2[idx], starts2[idx], ends2[idx]
        sentence = '<s>' + s1[:b1] \
            + ' <e1> ' + s1[b1:f1] \
            + ' </e1> ' + s1[f1:] + '</s>' \
            + ' ' \
            + '<s>' + s2[:b2] \
            + ' <e2> ' + s2[b2:f2] \
            + ' </e2> ' + s2[f2:] + '</s>'

        token = tokenizer.tokenize(sentence)
        e11_p = token.index("<e1>")  # opening marker of target 1
        e12_p = token.index("</e1>")  # closing marker of target 1
        e21_p = token.index("<e2>")  # opening marker of target 2
        e22_p = token.index("</e2>")  # closing marker of target 2

        token[e11_p] = "$"
        token[e12_p] = "$"
        token[e21_p] = "#"
        token[e22_p] = "#"

        # NOTE(review): every marker position is shifted right by one although
        # no token is prepended to `token` here, so each mask covers the span
        # one position to the right of "marker .. marker". Left unchanged on
        # purpose: training and inference features must stay identical —
        # confirm the intended span before changing it in both places.
        e11_p += 1
        e12_p += 1
        e21_p += 1
        e22_p += 1

        if len(token) >= max_seq_len - special_tokens_count:
            # Too long for max_len: drop the example (features and label).
            skipped += 1
            continue

        input_ids = tokenizer.convert_tokens_to_ids(token)
        attention_mask = [1] * len(input_ids)

        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)

        e1_mask = [0] * len(attention_mask)
        e2_mask = [0] * len(attention_mask)
        for i in range(e11_p, e12_p + 1):
            e1_mask[i] = 1
        for i in range(e21_p, e22_p + 1):
            e2_mask[i] = 1

        assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(
            len(attention_mask), max_seq_len
        )

        all_input_ids.append(input_ids)
        all_attention_mask.append(attention_mask)
        all_e1_mask.append(e1_mask)
        all_e2_mask.append(e2_mask)
        all_label.append(answers[idx])

    if skipped:
        # Make the drop visible: the returned dataset is shorter than the input frame.
        logging.getLogger(__name__).warning(
            "Skipped %d of %d examples that do not fit in max_len=%d",
            skipped, len(train_dataset), max_seq_len,
        )

    all_features = {
        'input_ids' : torch.tensor(all_input_ids),
        'attention_mask' : torch.tensor(all_attention_mask),
        'e1_mask' : torch.tensor(all_e1_mask),
        'e2_mask' : torch.tensor(all_e2_mask)
    }
    return WICDataset(all_features, all_label)

In [3]:
import os
import pandas as pd

# Sets CUDA_VISIBLE_DEVICES so that only device index 4 is exposed to CUDA
# (machine-specific; presumably must run before CUDA is first used — TODO confirm).
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
# Root directory used below to build dataset, checkpoint and result paths.
# NOTE(review): absolute, machine-specific path.
BASE_DIR = "/workspace/github/nlp_project/NIKL-KLUE/"
import torch
import numpy as np
import random
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import json
import logging
import os
import torch.nn as nn
from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup, AutoTokenizer
import torch.nn.functional as F

from transformers import AutoModel, AutoConfig
import argparse

In [4]:
# Fix the random seeds
def seed_everything(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and CUDA) and put cuDNN in deterministic mode."""
    random.seed(seed)
    np.random.seed(seed)
    # manual_seed_all covers the multi-GPU case.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def compute_metrics(preds, labels):
    """Return the metric dict for predictions vs. labels (same length required).

    :param preds: array-like of predicted class ids
    :param labels: array-like of gold class ids
    :return: ``{"acc": accuracy}``
    """
    assert len(preds) == len(labels)
    return acc_and_f1(preds, labels)

def simple_accuracy(preds, labels):
    """Fraction of positions where preds equals labels.

    Inputs are converted with np.asarray so lists, arrays and pandas Series
    are all compared element by element, by position.
    """
    return (np.asarray(preds) == np.asarray(labels)).mean()

def acc_and_f1(preds, labels, average="macro"):
    """Return ``{"acc": accuracy}``.

    ``average`` is accepted for call compatibility but is not used; no F1
    score is computed here.
    """
    acc = simple_accuracy(preds, labels)
    return {
        "acc": acc,
    }

def init_logger():
    """Configure root logging at INFO level with a timestamped message format."""
    log_format = "%(asctime)s - %(levelname)s - %(name)s -   %(message)s"
    date_format = "%m/%d/%Y %H:%M:%S"
    logging.basicConfig(level=logging.INFO, datefmt=date_format, format=log_format)

class Trainer(object):
    """Fine-tunes an R_RoBERTa_WiC model and evaluates it after every epoch.

    After each epoch the dev set is evaluated; a new best accuracy saves a
    checkpoint to ``model_dir``, and training stops early once accuracy has
    failed to improve for more than four consecutive evaluations.
    """

    # Keys of a dataset item that are forwarded to the model, by name.
    _INPUT_KEYS = ("input_ids", "attention_mask", "labels", "e1_mask", "e2_mask")

    def __init__(self, args, model_dir = None,train_dataset=None, dev_dataset=None, test_dataset=None,tokenizer=None):
        """
        :param args: hyperparameter object with attributes eval_batch_size,
            train_batch_size, max_steps, weight_decay, lr, adam_epsilon,
            warmup_steps, num_train_epochs, logging_steps, max_grad_norm,
            dropout_rate, gradient_accumulation_steps and seed
        :param model_dir: directory the best checkpoint is written to
        :param train_dataset: dataset used by train()
        :param dev_dataset: dataset used by evaluate("dev")
        :param test_dataset: dataset used by evaluate("test")
        :param tokenizer: stored on the instance; not used by the methods below
        """
        self.train_dataset = train_dataset
        self.dev_dataset = dev_dataset
        self.test_dataset = test_dataset
        self.tokenizer = tokenizer
        self.model_dir = model_dir
        self.best_score = 0
        self.hold_epoch = 0  # consecutive dev evaluations without improvement

        self.eval_batch_size = args.eval_batch_size
        self.train_batch_size = args.train_batch_size
        self.max_steps = args.max_steps
        self.weight_decay = args.weight_decay
        self.learning_rate = args.lr
        self.adam_epsilon= args.adam_epsilon
        self.warmup_steps = args.warmup_steps
        self.num_train_epochs = args.num_train_epochs
        self.logging_steps = args.logging_steps
        self.max_grad_norm = args.max_grad_norm
        self.dropout_rate = args.dropout_rate
        self.gradient_accumulation_steps = args.gradient_accumulation_steps
        # Keep the seed on the instance so train() does not read a global `args`.
        self.seed = args.seed

        self.config = AutoConfig.from_pretrained(
            "klue/roberta-large",
            num_labels = 2
        )
        self.model = R_RoBERTa_WiC(
           "klue/roberta-large",
            config=self.config,
            dropout_rate = self.dropout_rate,
        )

        # GPU or CPU
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def _batch_to_inputs(self, batch):
        """Move the model inputs of a collated batch to the device, selected by key name."""
        return {key: batch[key].to(self.device) for key in self._INPUT_KEYS}

    def train(self):
        """Run the training loop.

        :return: ``(global_step, mean training loss per optimizer step)``;
                 the mean is 0.0 when no optimizer step was taken
        """
        logger = logging.getLogger(__name__)
        init_logger()
        seed_everything(self.seed)
        train_sampler = RandomSampler(self.train_dataset)
        train_dataloader = DataLoader(
            self.train_dataset,
            sampler=train_sampler,
            batch_size=self.train_batch_size,
        )

        if self.max_steps > 0:
            t_total = self.max_steps
            self.num_train_epochs = (
                self.max_steps // (len(train_dataloader) // self.gradient_accumulation_steps) + 1
            )
        else:
            t_total = len(train_dataloader) // self.gradient_accumulation_steps * self.num_train_epochs

        # Prepare optimizer and schedule (linear warmup and decay).
        # Biases and LayerNorm weights are excluded from weight decay.
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.weight_decay,
            },
            {
                "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.learning_rate,
            eps=self.adam_epsilon,
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=t_total,
        )

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(self.train_dataset))
        logger.info("  Num Epochs = %d", self.num_train_epochs)
        logger.info("  Total train batch size = %d", self.train_batch_size)
        logger.info("  Gradient Accumulation steps = %d", self.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)
        logger.info("  Logging steps = %d", self.logging_steps)

        global_step = 0
        tr_loss = 0.0
        self.model.zero_grad()

        train_iterator = tqdm(range(int(self.num_train_epochs)), desc="Epoch")

        for epo_step in train_iterator:
            self.global_epo = epo_step
            epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):
                self.model.train()
                inputs = self._batch_to_inputs(batch)

                outputs = self.model(**inputs)
                loss = outputs[0]

                if self.gradient_accumulation_steps > 1:
                    loss = loss / self.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                if (step + 1) % self.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    self.model.zero_grad()
                    global_step += 1

                    # Log once per optimizer step that hits the interval, not
                    # once per micro-batch while the step counter stands still.
                    if self.logging_steps > 0 and global_step % self.logging_steps == 0:
                        logger.info("  global steps = %d", global_step)

                if 0 < self.max_steps < global_step:
                    epoch_iterator.close()
                    break

            self.evaluate("dev")
            # Early stopping once dev accuracy has stalled for more than four evaluations.
            if self.hold_epoch > 4:
                train_iterator.close()
                break

            if 0 < self.max_steps < global_step:
                train_iterator.close()
                break

        # Guard against division by zero when no optimizer step was taken.
        mean_loss = tr_loss / global_step if global_step else 0.0
        return global_step, mean_loss

    def evaluate(self, mode):
        """Evaluate on the train, dev or test dataset.

        In "dev" mode a new best accuracy saves the model and resets the
        early-stopping counter; otherwise the counter is incremented.

        :param mode: "train", "dev" or "test"
        :return: dict with "loss" (mean batch loss) and "acc"
        :raises Exception: for an unknown mode
        :raises ValueError: if the selected dataset yields no batches
        """
        logger = logging.getLogger(__name__)
        if mode == "test":
            dataset = self.test_dataset
        elif mode == "dev":
            dataset = self.dev_dataset
        elif mode == "train":
            dataset = self.train_dataset
        else:
            raise Exception("Only train, dev and test dataset available")

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.eval_batch_size)

        # Eval!
        logger.info('---------------------------------------------------')
        logger.info("***** Running evaluation on %s dataset *****", mode)
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", self.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        # Collect per-batch arrays and concatenate once at the end.
        logit_chunks = []
        label_chunks = []

        self.model.eval()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            inputs = self._batch_to_inputs(batch)
            with torch.no_grad():
                outputs = self.model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            logit_chunks.append(logits.detach().cpu().numpy())
            label_chunks.append(inputs["labels"].detach().cpu().numpy())

        if nb_eval_steps == 0:
            raise ValueError("Cannot evaluate: the {} dataset is empty".format(mode))

        eval_loss = eval_loss / nb_eval_steps
        results = {"loss": eval_loss}
        preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
        out_label_ids = np.concatenate(label_chunks, axis=0)
        result = compute_metrics(preds, out_label_ids)

        if mode == "dev":
            if result['acc']>self.best_score:
                self.save_model()
                self.best_score = result['acc']
                print('save new best model acc : ',str(self.best_score))
                self.hold_epoch = 0
            else:
                self.hold_epoch += 1

        results.update(result)

        logger.info("***** Eval results *****")
        for key in sorted(results.keys()):
            logger.info("  {} = {:.4f}".format(key, results[key]))
        logger.info("---------------------------------------------------")
        return results

    def save_model(self,new_dir=None):
        """Save the model with ``save_pretrained`` (overwriting an existing checkpoint).

        :param new_dir: when given, becomes ``self.model_dir`` and is used as
            the target for this and all later saves
        :raises ValueError: if neither ``new_dir`` nor ``self.model_dir`` is set
        """
        logger = logging.getLogger(__name__)
        if new_dir is not None:
            self.model_dir = new_dir
        if self.model_dir is None:
            raise ValueError("No output directory: pass model_dir to Trainer or new_dir to save_model")
        os.makedirs(self.model_dir, exist_ok=True)

        # Unwrap a DataParallel-style wrapper if there is one.
        model_to_save = self.model.module if hasattr(self.model, "module") else self.model
        model_to_save.save_pretrained(self.model_dir)

        logger.info("Saving model checkpoint to %s", self.model_dir)

  


In [5]:
# Sets TOKENIZERS_PARALLELISM=false in the environment before the tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import easydict

# Hyperparameters and data paths; Trainer.__init__ copies the training-related
# fields onto the trainer instance.
args = easydict.EasyDict({

        "num_train_epochs": 10,
        "train_batch_size": 4,
        "eval_batch_size": 4,
        "max_steps": -1,  # not > 0, so Trainer.train derives the step count from num_train_epochs
        "dropout_rate": 0.1,
        "lr" : 1e-5,
        "adam_epsilon" : 1e-8,
        "weight_decay" : 0.01,
        "warmup_steps" : 64,
        "seed" : 42,
        "logging_steps" : 500,
        "max_grad_norm" : 1.0,
        "gradient_accumulation_steps" : 1,
        "train_data_dir" : f"{BASE_DIR}WIC/Data/NIKL_SKT_WiC_Train.tsv",
        "dev_data_dir" : f"{BASE_DIR}WIC/Data/NIKL_SKT_WiC_Dev.tsv" 
})

train_dataset = load_data(args.train_data_dir)
dev_dataset = load_data(args.dev_data_dir)
# Marker strings that convert_sentence_to_features inserts around the two
# target words and then looks up in the tokenised sentence.
ADDITIONAL_SPECIAL_TOKENS = ["<e1>", "</e1>", "<e2>", "</e2>"]
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large", return_token_type_ids=False)
tokenizer.add_special_tokens({"additional_special_tokens": ADDITIONAL_SPECIAL_TOKENS})

# NOTE: this is an alias, not a copy — a column assigned on concat_dataset
# before it is rebound also appears on train_dataset.
concat_dataset = train_dataset

def make_fold(x, n_rows=None):
    """Assign an example ID to one of five contiguous folds (0-4).

    IDs up to 20% of ``n_rows`` go to fold 0, up to 40% to fold 1, and so on;
    anything above 80% goes to fold 4.
    (Assumes IDs run roughly from 1 to n_rows — TODO confirm against the data.)

    :param x: example ID
    :param n_rows: total number of rows; defaults to the row count of the
        module-level ``concat_dataset``
    :return: fold number in 0..4
    """
    if n_rows is None:
        n_rows = concat_dataset.shape[0]
    for fold_id, upper_fraction in enumerate((0.2, 0.4, 0.6, 0.8)):
        if x <= n_rows * upper_fraction:
            return fold_id
    return 4

# Tag every row with its fold, then drop the columns that are not needed afterwards.
concat_dataset['fold']= concat_dataset['ID'].apply(make_fold)
concat_dataset = concat_dataset.drop(['ID', 'Target'],axis=1)

logger = logging.getLogger(__name__)
for fold in tqdm(range(5)):
    trn_idx = concat_dataset[concat_dataset['fold'] != fold].index
    val_idx = concat_dataset[concat_dataset['fold'] == fold].index

    # The first half of the held-out fold is moved back into the training split.
    half_val_len = len(val_idx)//2
    add_trn_idx = val_idx[:half_val_len]

    # Index.append returns a new Index and does not modify in place; the
    # result must be assigned or the extra rows are silently lost.
    trn_idx = trn_idx.append(add_trn_idx)
    val_idx = val_idx[half_val_len:]

    train_folds = concat_dataset.loc[trn_idx].reset_index(drop=True).drop(['fold'],axis=1)
    valid_folds = concat_dataset.loc[val_idx].reset_index(drop=True).drop(['fold'],axis=1)

    # NOTE(review): train_folds / valid_folds are built but not used — every
    # fold is trained on the full train_dataset and validated on dev_dataset.
    # Confirm whether the per-fold frames were meant to be passed here instead.
    train_Dataset = convert_sentence_to_features(train_dataset, tokenizer, max_len = 280)
    valid_Dataset = convert_sentence_to_features(dev_dataset, tokenizer, max_len= 280)

    trainer = Trainer(args,
                      train_dataset=train_Dataset,
                      dev_dataset=valid_Dataset,
                      tokenizer =tokenizer,
                      model_dir = f'{BASE_DIR}/roberta_model_fold_{str(fold)}')

    trainer.train()
    # Saves the model as it stands after the last epoch (not necessarily the best one).
    trainer.save_model(new_dir=f'{BASE_DIR}/roberta_model_final_fold_{str(fold)}')

100%|██████████| 15496/15496 [00:15<00:00, 984.14it/s]
100%|██████████| 1166/1166 [00:01<00:00, 752.67it/s]
Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'r

KeyboardInterrupt: 

## Inference


In [6]:
import pickle as pickle
import os
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import random
from itertools import chain
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import copy
import csv
import json
import logging
import os
import torch.nn as nn
from tqdm.auto import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn.functional as F
from transformers import AutoTokenizer,AutoModel, RobertaPreTrainedModel, AutoConfig, RobertaModel
import numpy as np
import os 

class FCLayer(nn.Module):
    """Small projection head: dropout, then an optional tanh, then a linear layer."""

    def __init__(self, input_dim, output_dim, dropout_rate=0.0, use_activation=True):
        """
        :param input_dim: size of the last dimension of the input
        :param output_dim: size of the last dimension of the output
        :param dropout_rate: probability passed to nn.Dropout
        :param use_activation: when True, tanh is applied between dropout and the linear layer
        """
        super().__init__()
        self.use_activation = use_activation
        self.dropout = nn.Dropout(p=dropout_rate)
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Return linear(tanh(dropout(x))), skipping tanh when use_activation is False."""
        out = self.dropout(x)
        if not self.use_activation:
            return self.linear(out)
        return self.linear(self.tanh(out))

class Roberta_WiC(RobertaPreTrainedModel):
    """Word-in-Context classifier built on a pretrained encoder (inference copy).

    The encoder's pooled output and the mask-averaged hidden states of the two
    marked target spans each go through their own FCLayer; the three vectors
    are concatenated and projected to ``config.num_labels`` logits.
    """

    def __init__(self,  model_name, config, dropout_rate):
        """
        :param model_name: checkpoint name/path handed to AutoModel.from_pretrained
        :param config: model config; hidden_size and num_labels are read from it
        :param dropout_rate: dropout probability used by every FCLayer head
        """
        super(Roberta_WiC, self).__init__(config)
        # Pretrained encoder identified by model_name.
        self.model = AutoModel.from_pretrained(model_name, config=config)

        self.num_labels = config.num_labels

        self.cls_fc_layer = FCLayer(config.hidden_size, config.hidden_size, dropout_rate)
        self.entity_fc_layer1 = FCLayer(config.hidden_size, config.hidden_size, dropout_rate)
        self.entity_fc_layer2 = FCLayer(config.hidden_size, config.hidden_size, dropout_rate)

        self.label_classifier = FCLayer(
            config.hidden_size * 3,
            config.num_labels,
            dropout_rate,
            use_activation=False,
        )

    @staticmethod
    def entity_average(hidden_output, e_mask):
        """
        Average the hidden state vectors selected by a 0/1 mask.

        :param hidden_output: [batch_size, max_seq_len, dim]
        :param e_mask: [batch_size, max_seq_len]
                e.g. e_mask[0] == [0, 0, 0, 1, 1, 1, 0, 0, ... 0]
        :return: [batch_size, dim]; a row whose mask is all zeros yields a zero vector
        """
        e_mask_unsqueeze = e_mask.unsqueeze(1)  # [b, 1, max_seq_len]
        # Clamp to 1 so an empty mask gives 0 / 1 = 0 instead of 0 / 0 = NaN.
        length_tensor = (e_mask != 0).sum(dim=1).unsqueeze(1).clamp(min=1)  # [batch_size, 1]

        # [b, 1, max_seq_len] * [b, max_seq_len, dim] = [b, 1, dim] -> [b, dim]
        sum_vector = torch.bmm(e_mask_unsqueeze.float(), hidden_output).squeeze(1)
        avg_vector = sum_vector.float() / length_tensor.float()  # broadcasting
        return avg_vector

    def forward(self, input_ids, attention_mask, labels, e1_mask, e2_mask):
        """
        Run the encoder and the classification head.

        :param input_ids: token ids passed to the encoder
        :param attention_mask: attention mask passed to the encoder
        :param labels: targets, or None to skip the loss
        :param e1_mask: 0/1 mask selecting the first target span
        :param e2_mask: 0/1 mask selecting the second target span
        :return: tuple ``(loss, logits, ...)`` when labels are given, otherwise
                 ``(logits, ...)``; trailing items are whatever the encoder
                 returned after its first two outputs
        """
        outputs = self.model(
            input_ids, attention_mask=attention_mask
        )
        sequence_output = outputs[0]
        e1_h = self.entity_average(sequence_output, e1_mask)
        e2_h = self.entity_average(sequence_output, e2_mask)

        sentence_representation = self.cls_fc_layer(outputs.pooler_output)

        e1_h = self.entity_fc_layer1(e1_h)
        e2_h = self.entity_fc_layer2(e2_h)

        concat_h = torch.cat([sentence_representation, e1_h, e2_h], dim=-1)
        logits = self.label_classifier(concat_h)
        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                # Regression: MSE needs targets in the logits' floating dtype,
                # while the datasets in this file produce integer labels.
                loss_fct = nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1).to(logits.dtype))
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

class RE_Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-computed feature columns with labels.

    ``tokenized_dataset`` maps a feature name to an indexable collection
    (tensor or nested list); ``labels`` is an indexable collection of class ids.
    """

    def __init__(self, tokenized_dataset, labels):
        self.tokenized_dataset = tokenized_dataset
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of per-example tensors plus a ``labels`` tensor of dtype long."""
        item = {}
        for key, val in self.tokenized_dataset.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning for
                # every sample; clone().detach() is the documented equivalent.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

def init_logger():
    """Configure root logging at INFO level with a timestamped message format."""
    settings = {
        "format": "%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        "datefmt": "%m/%d/%Y %H:%M:%S",
        "level": logging.INFO,
    }
    logging.basicConfig(**settings)
    

def test_pred(test_dataset, eval_batch_size, model):
    """Run the model over a dataset in order and return its raw logits and argmax labels.

    :param test_dataset: dataset whose items contain input_ids, attention_mask,
        e1_mask and e2_mask
    :param eval_batch_size: batch size of the sequential DataLoader
    :param model: model called as ``model(input_ids=..., attention_mask=...,
        labels=None, e1_mask=..., e2_mask=...)``; its first output is taken as
        the logits (two columns are expected)
    :return: DataFrame with columns pred_0, pred_1 (logits) and label (argmax);
        empty when the dataset yields no batches
    """
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler,batch_size=eval_batch_size)

    logger = logging.getLogger(__name__)
    init_logger()

    # Eval!
    logger.info("***** Running evaluation on %s dataset *****", "test")
    logger.info("  Batch size = %d", eval_batch_size)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    # Collect per-batch logits and concatenate once at the end.
    logit_chunks = []
    for batch in tqdm(test_dataloader, desc="Predicting"):
        with torch.no_grad():
            # Select tensors by key name rather than by position in the batch dict.
            inputs = {
                "input_ids": batch["input_ids"].to(device),
                "attention_mask": batch["attention_mask"].to(device),
                "labels": None,  # no loss at prediction time
                "e1_mask": batch["e1_mask"].to(device),
                "e2_mask": batch["e2_mask"].to(device),
            }
            outputs = model(**inputs)
        logit_chunks.append(outputs[0].detach().cpu().numpy())

    if not logit_chunks:
        return pd.DataFrame(columns=['pred_0', 'pred_1', 'label'])

    preds = np.concatenate(logit_chunks, axis=0)
    df = pd.DataFrame(preds, columns=['pred_0','pred_1'])
    df['label'] = np.argmax(preds, axis=1)
    return df


def load_test_data(dataset_dir):
    """Read a tab-separated WiC file and cast its ANSWER column to int.

    :param dataset_dir: path of the TSV file; it must contain an ANSWER column
    :return: the loaded DataFrame
    """
    dataset = pd.read_csv(dataset_dir, delimiter='\t')
    dataset["ANSWER"] = dataset["ANSWER"].astype(int)
    return dataset

def convert_sentence_to_features(train_dataset, tokenizer, max_len, mode='train'):
    """Build padded token ids, attention masks and target-span masks for each sentence pair.

    For every row, the target word of SENTENCE1 is wrapped in ``<e1>``/``</e1>``
    and that of SENTENCE2 in ``<e2>``/``</e2>`` (character offsets come from the
    ``start_s*``/``end_s*`` columns), each sentence is wrapped in literal
    ``<s>``/``</s>`` text, and the result is tokenised. The four markers are
    then replaced by ``$`` / ``#`` tokens before conversion to ids.

    :param train_dataset: DataFrame with columns SENTENCE1, SENTENCE2, start_s1,
        end_s1, start_s2, end_s2 and ANSWER
    :param tokenizer: provides ``pad_token_id``, ``tokenize`` and
        ``convert_tokens_to_ids``; it must emit each marker as a single token
    :param max_len: length every sequence is padded to. Rows whose token count
        is not below ``max_len - 1`` are skipped (features and label together)
    :param mode: accepted for call compatibility; not used
    :return: RE_Dataset holding the feature tensors and the kept labels
    :raises ValueError: if a marker is missing from the tokenised sentence
    """
    max_seq_len = max_len
    pad_token = tokenizer.pad_token_id
    special_tokens_count = 1

    # Read each column once and index it by position, so the function also
    # works when the frame's index is not 0..n-1 (e.g. after filtering).
    sentences1 = train_dataset['SENTENCE1'].tolist()
    sentences2 = train_dataset['SENTENCE2'].tolist()
    starts1 = train_dataset['start_s1'].tolist()
    ends1 = train_dataset['end_s1'].tolist()
    starts2 = train_dataset['start_s2'].tolist()
    ends2 = train_dataset['end_s2'].tolist()
    answers = train_dataset['ANSWER'].tolist()

    all_input_ids = []
    all_attention_mask = []
    all_e1_mask = []
    all_e2_mask = []
    all_label = []
    skipped = 0

    for idx in tqdm(range(len(train_dataset))):
        s1, b1, f1 = sentences1[idx], starts1[idx], ends1[idx]
        s2, b2, f2 = sentences2[idx], starts2[idx], ends2[idx]
        sentence = '<s>' + s1[:b1] \
            + ' <e1> ' + s1[b1:f1] \
            + ' </e1> ' + s1[f1:] + '</s>' \
            + ' ' \
            + '<s>' + s2[:b2] \
            + ' <e2> ' + s2[b2:f2] \
            + ' </e2> ' + s2[f2:] + '</s>'

        token = tokenizer.tokenize(sentence)
        e11_p = token.index("<e1>")  # opening marker of target 1
        e12_p = token.index("</e1>")  # closing marker of target 1
        e21_p = token.index("<e2>")  # opening marker of target 2
        e22_p = token.index("</e2>")  # closing marker of target 2

        token[e11_p] = "$"
        token[e12_p] = "$"
        token[e21_p] = "#"
        token[e22_p] = "#"

        # NOTE(review): every marker position is shifted right by one although
        # no token is prepended to `token` here, so each mask covers the span
        # one position to the right of "marker .. marker". Left unchanged on
        # purpose: the saved checkpoints were trained on features built this
        # way — confirm the intended span before changing it.
        e11_p += 1
        e12_p += 1
        e21_p += 1
        e22_p += 1

        if len(token) >= max_seq_len - special_tokens_count:
            # Too long for max_len: drop the example (features and label).
            skipped += 1
            continue

        input_ids = tokenizer.convert_tokens_to_ids(token)
        attention_mask = [1] * len(input_ids)

        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)

        e1_mask = [0] * len(attention_mask)
        e2_mask = [0] * len(attention_mask)
        for i in range(e11_p, e12_p + 1):
            e1_mask[i] = 1
        for i in range(e21_p, e22_p + 1):
            e2_mask[i] = 1

        assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(
            len(attention_mask), max_seq_len
        )

        all_input_ids.append(input_ids)
        all_attention_mask.append(attention_mask)
        all_e1_mask.append(e1_mask)
        all_e2_mask.append(e2_mask)
        all_label.append(answers[idx])

    if skipped:
        # Make the drop visible: the returned dataset is shorter than the input
        # frame, so predictions no longer line up row-for-row with it.
        logging.getLogger(__name__).warning(
            "Skipped %d of %d examples that do not fit in max_len=%d",
            skipped, len(train_dataset), max_seq_len,
        )

    all_features = {
        'input_ids' : torch.tensor(all_input_ids),
        'attention_mask' : torch.tensor(all_attention_mask),
        'e1_mask' : torch.tensor(all_e1_mask),
        'e2_mask' : torch.tensor(all_e2_mask)
    }
    return RE_Dataset(all_features, all_label)

def softmax(sr):
    """Numerically stable softmax over all values of ``sr``.

    The maximum is subtracted before exponentiating so large inputs do not
    overflow; the result sums to 1.
    """
    shifted = sr - np.max(sr)
    weights = np.exp(shifted)
    return weights / np.sum(weights)

def compute_metrics(preds, labels):
    assert len(preds) == len(labels)
    return acc_and_f1(preds, labels)

def simple_accuracy(preds, labels):
    return (preds == labels).mean()

def acc_and_f1(preds, labels, average="macro"):
    """Build the metrics dictionary for a set of predictions.

    Only accuracy is reported; ``average`` is accepted but not used by this
    function.

    :param preds: predicted labels
    :param labels: reference labels
    :param average: unused
    :return: dict with the single key ``"acc"`` holding
        ``simple_accuracy(preds, labels)``
    """
    return {"acc": simple_accuracy(preds, labels)}


In [7]:
# Dev-set evaluation: run each fold's checkpoint over the dev features, save the
# per-fold outputs to CSV, then combine the folds by summing softmaxed scores.
# NOTE(review): BASE_DIR, load_test_data, convert_sentence_to_features,
# Roberta_WiC and test_pred are expected to come from earlier cells.
eval_batch_size = 4
ADDITIONAL_SPECIAL_TOKENS = ["<e1>", "</e1>", "<e2>", "</e2>"]
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large", return_token_type_ids=False)
tokenizer.add_special_tokens({"additional_special_tokens": ADDITIONAL_SPECIAL_TOKENS})

test_dataset = load_test_data(f"{BASE_DIR}WIC/Data/NIKL_SKT_WiC_Dev.tsv")
test_Dataset = convert_sentence_to_features(test_dataset, tokenizer, max_len= 280, mode='eval')

n_fold = 5
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# One pass per fold: rebuild the model, load that fold's weights, predict, save.
for fold in tqdm(range(n_fold)):
    config = AutoConfig.from_pretrained("klue/roberta-large", num_labels=2)
    model = Roberta_WiC('klue/roberta-large', config=config, dropout_rate=0.1)
    model.load_state_dict(torch.load(f'{BASE_DIR}WIC/exp2/roberta_model_final_fold_'+str(fold)+'/pytorch_model.bin', map_location=device))
    model.eval()
    result = test_pred(test_Dataset, eval_batch_size, model)
    result.to_csv(f'{BASE_DIR}{str(fold)}_rbt_result.csv', index=False)

# Read every fold's saved output once and reuse it for both tables below.
fold_results = [pd.read_csv(f'{BASE_DIR}{str(fold)}_rbt_result.csv') for fold in range(n_fold)]

# Per-fold 'label' columns side by side (label0 ... label{n_fold-1}).
ensemble = pd.DataFrame(
    {'label'+str(fold): fold_df['label'] for fold, fold_df in enumerate(fold_results)}
)

# Accumulator for the soft vote, zero-initialised on the same row index.
soft_ensemble = pd.DataFrame(0.0, index=ensemble.index, columns=['pred_0', 'pred_1'])

for fold_df in fold_results:
    # Everything except 'label' is softmaxed row by row; the 'pred_0' and
    # 'pred_1' columns of the result are then added to the running totals.
    df = fold_df.drop('label', axis=1).apply(softmax, axis=1)
    soft_ensemble['pred_0'] += df['pred_0']
    soft_ensemble['pred_1'] += df['pred_1']

# Class 1 wins only when its summed score is strictly larger; ties go to class 0.
soft_ensemble['predicted'] = (soft_ensemble['pred_1'] > soft_ensemble['pred_0']).astype(int)
result = compute_metrics(soft_ensemble['predicted'], test_dataset['ANSWER'])
print('================= devset acc =================')
print(f"accuracy : {result['acc']}")


HBox(children=(FloatProgress(value=0.0, max=1166.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

HBox(children=(FloatProgress(value=0.0, description='Predicting', max=292.0, style=ProgressStyle(description_w…






Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

HBox(children=(FloatProgress(value=0.0, description='Predicting', max=292.0, style=ProgressStyle(description_w…




Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

HBox(children=(FloatProgress(value=0.0, description='Predicting', max=292.0, style=ProgressStyle(description_w…




Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

HBox(children=(FloatProgress(value=0.0, description='Predicting', max=292.0, style=ProgressStyle(description_w…




Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

HBox(children=(FloatProgress(value=0.0, description='Predicting', max=292.0, style=ProgressStyle(description_w…



accuracy : 0.934819897084048
