In [1]:
import os 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import random
from tqdm import tqdm
import optuna
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast, BartModel
from itertools import combinations



In [6]:
# Load the evaluation set; it must provide 'premise' and 'hypothesis' columns.
test = pd.read_csv('data/test_data.csv',encoding='utf-8')
# Strip every character that is not Hangul (jamo or syllables), a space or a digit.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the text uncleaned.
test['premise'] = test['premise'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 0-9]", "", regex=True)
test['hypothesis'] = test['hypothesis'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 0-9]", "", regex=True)


  
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
def seed_everything(seed:int = 2023):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED`` and switches cuDNN into its
    deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.cuda.manual_seed_all(seed)  # type: ignore  # cover every GPU, not just the current one
    torch.backends.cudnn.deterministic = True  # type: ignore
    # benchmark=True lets cuDNN auto-tune and pick different kernels between runs,
    # which defeats the deterministic flag above, so it must be disabled.
    torch.backends.cudnn.benchmark = False  # type: ignore

# Fix all RNG seeds (default seed) before any model or data work happens.
seed_everything()

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda:0


In [9]:
class BERTDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with per-example labels.

    ``pair_dataset`` is a mapping from field name to an indexable tensor and
    ``label`` is an indexable sequence; the dataset length is ``len(label)``.
    """

    def __init__(self, pair_dataset, label):
        self.pair_dataset = pair_dataset
        self.label = label

    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        # Copy each field's row so callers never alias the stored tensors.
        sample = {}
        for name, tensor in self.pair_dataset.items():
            sample[name] = tensor[idx].clone().detach()
        sample['label'] = torch.tensor(self.label[idx])
        return sample
    
def label_to_num(label):
    """Translate an iterable of label strings into their integer class ids.

    Raises ``KeyError`` for any label that is not one of the four known names.
    """
    label_dict = {"entailment": 0, "contradiction": 1, "neutral": 2, "answer": 3}
    return [label_dict[name] for name in label]

# Integer class ids for the test set's 'label' column; passed to BERTDataset in make_prob.
test_label = label_to_num(test['label'].values)

In [10]:
def make_prob(model_str, weight_address, max_length=128, batch_size=16):
    """Score the test set with a fine-tuned classifier and return class probabilities.

    Relies on the notebook-level globals ``test`` (DataFrame with ``premise``
    and ``hypothesis`` columns), ``test_label`` and ``device``.

    Args:
        model_str: Name or path passed to ``AutoTokenizer.from_pretrained``.
        weight_address: Checkpoint name or path passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        max_length: Maximum tokenized sequence length; longer pairs are truncated.
        batch_size: Number of examples per inference batch.

    Returns:
        pd.DataFrame with one row per test example (in the order of ``test``)
        and one column per class, holding the softmax probabilities.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_str)

    model = AutoModelForSequenceClassification.from_pretrained(weight_address)
    # NOTE(review): vocab_size excludes tokens added after pretraining; this is
    # kept as-is so the embedding size matches how the checkpoints were used.
    model.resize_token_embeddings(tokenizer.vocab_size)
    model.to(device)
    model.eval()

    tokenized_test = tokenizer(
        list(test['premise']),
        list(test['hypothesis']),
        return_tensors="pt",
        max_length=max_length,
        padding=True,
        truncation=True,
        add_special_tokens=True
    )

    test_dataset = BERTDataset(tokenized_test, test_label)
    # shuffle=False keeps the output rows aligned with the rows of `test`.
    dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    output_prob = []
    for data in tqdm(dataloader):
        inputs = {
            'input_ids': data['input_ids'].to(device),
            'attention_mask': data['attention_mask'].to(device),
        }
        # Not every tokenizer emits segment ids; forward them only when present.
        if 'token_type_ids' in data:
            inputs['token_type_ids'] = data['token_type_ids'].to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        # outputs[0] holds the logits because no labels are passed to the model.
        prob = F.softmax(outputs[0], dim=-1).detach().cpu().numpy()
        output_prob.append(prob)

    return pd.DataFrame(np.concatenate(output_prob, axis=0).tolist())

In [13]:
# Test-set probabilities from the run-3 / checkpoint-7200 weights, tokenized with klue/roberta-large.
Large_data_roberta_hyperparameter = make_prob(
    model_str="klue/roberta-large",
    weight_address='result/Large_data_roberta_hyperparameter_tune/run-3/checkpoint-7200',
)

100%|██████████| 105/105 [00:07<00:00, 14.74it/s]


In [140]:
# Test-set probabilities from the Roberta_Large_Concat checkpoint-4000 weights.
Roberta_Large_Concat = make_prob(
    model_str="klue/roberta-large",
    weight_address="result/Roberta_Large_Concat/checkpoint-4000",
)

100%|██████████| 105/105 [00:07<00:00, 14.49it/s]


In [141]:
# ko_electra_with_kakao_backtrans_klue_data = make_prob("tunib/electra-ko-base",
#                                                       "result/kakao_backtrans_klue_electra/checkpoint-5000")

100%|██████████| 105/105 [00:02<00:00, 42.19it/s]


In [142]:
# ko_electra_concat = make_prob("snunlp/KR-ELECTRA-discriminator",
#                               "result/Kr_Electra_Concat/checkpoint-2000")

100%|██████████| 105/105 [00:02<00:00, 36.81it/s]


In [15]:
# Test-set probabilities from the simple_roberta_tune run-12 / checkpoint-4000 weights.
simple_roberta_tune = make_prob(
    model_str="klue/roberta-large",
    weight_address="result/simple_roberta_tune/run-12/checkpoint-4000",
)

100%|██████████| 105/105 [00:07<00:00, 14.71it/s]


In [16]:
# Test-set probabilities from the simple_roberta_more_epoch checkpoint-5000 weights.
simple_roberta_more_epoch = make_prob(
    model_str="klue/roberta-large",
    weight_address="result/simple_roberta_more_epoch/checkpoint-5000",
)

100%|██████████| 105/105 [00:07<00:00, 14.73it/s]


In [145]:
# pure_test_electra = make_prob("tunib/electra-ko-base",
#                               "result/pure_test_electra/checkpoint-4500")

100%|██████████| 105/105 [00:02<00:00, 42.28it/s]


In [12]:
# Test-set probabilities from the many_data_electra best_model weights (tunib/electra-ko-base tokenizer).
use_more_data = make_prob(
    model_str="tunib/electra-ko-base",
    weight_address="result/many_data_electra/best_model",
)

100%|██████████| 105/105 [00:02<00:00, 44.53it/s]


In [147]:
# electra_hyperparameter_tune = make_prob("tunib/electra-ko-base",
#                               "result/electra_hyperparameter_tune/run-2/checkpoint-7200")

100%|██████████| 105/105 [00:02<00:00, 43.10it/s]


In [17]:
# Probability frames that take part in the soft-voting ensemble below.
# Roberta_Large_Concat is computed earlier in the notebook but is not included here.
prob_list = [Large_data_roberta_hyperparameter,
             simple_roberta_tune,
             simple_roberta_more_epoch,
             use_more_data]

In [18]:
def soft_voting(lists, combination, submission_path="data/sample_submission.csv"):
    """Soft-vote every ``combination``-sized subset of probability tables.

    For each subset the class probabilities are averaged element-wise and the
    arg-max class of every row is written into the ``label`` column of a copy
    of the submission template.

    Args:
        lists: Sequence of equally shaped probability tables (DataFrames or
            2-D arrays); columns follow the order entailment, contradiction,
            neutral.
        combination: Number of tables averaged per ensemble; must be >= 1.
        submission_path: CSV template that receives the ``label`` column. It
            needs one row per row of the probability tables.

    Returns:
        List of DataFrames, one per subset, in ``itertools.combinations`` order.

    Raises:
        ValueError: If ``combination`` is smaller than 1, or if the tables in
            a subset have different shapes.
    """
    if combination < 1:
        raise ValueError("combination must be at least 1")

    # Column index -> label name, in the order used by the classifiers.
    labels = ["entailment", "contradiction", "neutral"]
    # Read the template once; every ensemble gets its own copy.
    template = pd.read_csv(submission_path)

    voted_list = []
    for members in combinations(lists, combination):
        # Size the accumulator from the data rather than a fixed row count.
        total = np.zeros(np.asarray(members[0]).shape, dtype=float)
        for prob in members:
            total += np.asarray(prob, dtype=float)
        mean_prob = total / len(members)

        submission = template.copy()
        submission['label'] = [labels[idx] for idx in np.argmax(mean_prob, axis=1)]
        voted_list.append(submission)
    return voted_list

In [23]:
# Ensemble over 4-model combinations of prob_list; take the first result and export it.
full_vote = soft_voting(prob_list, 4)[0]
full_vote.to_csv("full_soft_voted.csv", index=False)

In [24]:
# One ensemble per 3-model subset of prob_list, each written to its own CSV.
combination_3 = soft_voting(prob_list,3)

# to_csv does not create missing parent directories, so make sure the target exists.
os.makedirs("soft_voted", exist_ok=True)
for i, voted in enumerate(combination_3):
    voted.to_csv("soft_voted/combination_{}th_soft_voted.csv".format(i),index=False)