In [1]:
import random
import logging
from IPython.display import display, HTML
import os

import numpy as np
import pandas as pd
import datasets
from datasets import load_dataset, load_metric, ClassLabel, Sequence
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
# Run configuration. `model_checkpoint` is the pretrained checkpoint handed to
# both the tokenizer and the classification model; `batch_size` is used as the
# per-device train and eval batch size.
model_checkpoint = "klue/roberta-large"
batch_size = 32
# NOTE(review): `task` and `MODEL_P` are not referenced by any cell visible in
# this notebook; kept in case other code relies on them.
task = "nli"
MODEL_P = "models/klue-roberta-large-augmented.pth"
RANDOM_SEED = 17

# Seed every random number generator the notebook uses so that the
# augmentation, the split and the shuffle are repeatable.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
os.environ["PYTHONHASHSEED"] = str(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)  # type: ignore
torch.backends.cudnn.deterministic = True  # type: ignore
# Autotuning (benchmark=True) lets cuDNN pick a different kernel from run to
# run, which works against the deterministic flag set above, so it is disabled.
torch.backends.cudnn.benchmark = False  # type: ignore

In [3]:
# Load the tokenizer that belongs to the chosen checkpoint; use_fast=True asks
# for the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [4]:
# Read the labelled training rows and the test rows from the local data/ folder.
# index_col=False keeps pandas from using the first CSV column as the row index.
dataset = pd.read_csv("data/train_data.csv",index_col=False)
test = pd.read_csv("data/test_data.csv",index_col=False)

In [5]:
# Hold out 20% of the labelled rows for validation, seeded for repeatability.
# The split is done before augmentation, so validation rows are never augmented.
# NOTE(review): no stratify= argument is passed, so class proportions are not
# guaranteed to match between the two parts.
dataset_train, dataset_val = train_test_split(dataset,test_size = 0.2,random_state = RANDOM_SEED)

In [6]:
def random_deletion(words, p):
    """Randomly drop tokens from ``words``.

    Every token is kept only when a fresh uniform draw from [0, 1] exceeds
    ``p``, so ``p`` acts as the per-token deletion probability.

    Args:
        words: List of tokens.
        p: Deletion probability applied independently to each token.

    Returns:
        ``words`` itself when it holds zero or one token; otherwise a new list
        with the surviving tokens in their original order. If every token was
        dropped, a list holding one randomly chosen original token is returned
        so the result is never empty for non-empty input.
    """
    # Nothing can be deleted from an empty or single-token sequence. Without
    # this guard an empty list would reach random.randint(0, -1) below, which
    # raises ValueError.
    if len(words) <= 1:
        return words

    # One draw per token, in order, so the random stream is consumed exactly
    # as a token-by-token loop would consume it.
    new_words = [word for word in words if random.uniform(0, 1) > p]

    if not new_words:
        rand_int = random.randint(0, len(words) - 1)
        return [words[rand_int]]

    return new_words

def random_swap(words, n):
    """Return a copy of ``words`` after ``n`` swap attempts.

    The input list is copied first, so the caller's list is left untouched;
    each attempt is delegated to ``swap_word``.
    """
    swapped = words.copy()
    for _attempt in range(n):
        swapped = swap_word(swapped)
    return swapped

def swap_word(new_words):
    """Swap two randomly chosen positions of ``new_words`` in place.

    Args:
        new_words: List of tokens; it is modified in place.

    Returns:
        The same list object. It is unchanged when the list is empty or when
        no second index different from the first was found in time.
    """
    # An empty list has no valid index: random.randint(0, -1) raises
    # ValueError, so hand the list back untouched. Only the empty case is
    # guarded so that the sequence of random draws for every other input
    # stays exactly as before.
    if not new_words:
        return new_words

    last_idx = len(new_words) - 1
    random_idx_1 = random.randint(0, last_idx)
    random_idx_2 = random_idx_1
    counter = 0

    # Redraw the second index until it differs from the first. The function
    # gives up right after the fourth draw, so only the first three draws can
    # actually lead to a swap.
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, last_idx)
        counter += 1
        if counter > 3:
            return new_words

    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words

In [7]:
# Build one random-deletion and one random-swap variant of every training
# title. Augmentation works on tokenizer tokens, which are joined back into a
# string afterwards.
rd_augmenteds = []
rs_augmenteds = []

for title in dataset_train["title"]:
    tokens = tokenizer.tokenize(title)

    # Deletion is drawn before the swap for every title; keeping that order
    # keeps the consumption of the seeded random stream unchanged.
    deleted_tokens = random_deletion(tokens, 0.2)
    swapped_tokens = random_swap(tokens, 2)

    rd_augmenteds.append(tokenizer.convert_tokens_to_string(deleted_tokens))
    rs_augmenteds.append(tokenizer.convert_tokens_to_string(swapped_tokens))

In [8]:
# Pair every augmented title with the label of the row it was derived from.
# Because topic_idx is passed as a Series, the new frames take over the row
# index of dataset_train; they only have the title and topic_idx columns.
rd_augmentation = pd.DataFrame({'title' : rd_augmenteds, 'topic_idx': dataset_train.topic_idx})
rs_augmentation = pd.DataFrame({'title' : rs_augmenteds, 'topic_idx': dataset_train.topic_idx})

In [9]:
# Append both augmented copies to the original training rows, tripling the row
# count. Columns missing from the augmented frames are filled with NaN.
# NOTE(review): rebinding dataset_train makes this cell non-idempotent —
# running it a second time appends the augmented rows again.
dataset_train = pd.concat([dataset_train,rd_augmentation,rs_augmentation])

In [10]:
# Preview the first rows; the original rows come first in the concatenation.
dataset_train.head()

Unnamed: 0,index,title,topic_idx
25339,25339.0,더민주 서영교 여파 지역위원장 심사기준 강화,6
24704,24704.0,맛집에 너그러운 한국인 해외여행서도 JMT 찾았다,3
1834,1834.0,특징주 삼성물산 지배구조 이슈 부각에 강세종합,1
17604,17604.0,생필품난 베네수엘라 콜롬비아와의 국경 1년 만에 재개방,4
19362,19362.0,금태섭 국민 10명 중 8명 판결문 공개 원해,6


In [11]:
# Shuffle the rows with a fixed seed so original and augmented rows are mixed.
dataset_train = shuffle(dataset_train,random_state = RANDOM_SEED)

In [12]:
# Preview after shuffling; augmented rows show NaN in the `index` column.
dataset_train.head()

Unnamed: 0,index,title,topic_idx
14493,,반라엘라입 의원 이번 WJC와 대립,4
12945,,美 B 한반도후 행보는 전문가 무력시위 예상,4
3341,,여교사는 마녀 … 아동에도 페미니즘,3
778,778.0,네이버 주소록에 암호잠금 기능 도입,0
13681,,러시아군 시작 만비즈 순찰 저지 … 터키군 공격 시리아 역할,4


In [13]:
class BERTDataset(Dataset):
    """Dataset of pre-tokenized sentences with optional labels.

    All sentences are tokenized once in the constructor. When ``label_key`` is
    given the dataset is in "train" mode and each item carries a ``"label"``
    entry; when ``label_key`` is ``None`` it is in "test" mode and items are
    returned without a label.

    Args:
        dataset: Mapping-like object (for example a DataFrame) indexed by
            column name.
        sent_key: Name of the column holding the sentences.
        label_key: Name of the label column, or ``None`` for unlabelled data.
        bert_tokenizer: Callable applied to each sentence with
            ``truncation=True`` and ``return_token_type_ids=False``.
    """

    def __init__(self, dataset, sent_key, label_key, bert_tokenizer):
        self.sentences = [
            bert_tokenizer(text, truncation=True, return_token_type_ids=False)
            for text in dataset[sent_key]
        ]

        # Identity check instead of `== None`: equality can be overridden by
        # the key's type, identity cannot.
        self.mode = "train" if label_key is not None else "test"

        if self.mode == "train":
            self.labels = [np.int64(label) for label in dataset[label_key]]
        else:
            # Placeholder labels, one per sentence; __len__ relies on them.
            self.labels = [np.int64(0) for _ in self.sentences]

    def __getitem__(self, i):
        """Return the encoding of sentence ``i``, with its label in train mode."""
        item = self.sentences[i]
        if self.mode == "train":
            # The label is written into the stored encoding itself.
            item["label"] = self.labels[i]
        return item

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)


In [14]:
# Tokenize each split up front. The train and validation splits carry
# topic_idx labels; the test split is wrapped with label_key=None, so its
# items have no label.
data_train = BERTDataset(dataset_train, "title", "topic_idx", tokenizer)
data_val = BERTDataset(dataset_val, "title", "topic_idx", tokenizer)
data_test = BERTDataset(test, "title", None, tokenizer)

In [15]:
# Load the pretrained encoder with a classification head of num_labels outputs.
# NOTE(review): num_labels is hard-coded; presumably it equals the number of
# distinct topic_idx values — verify against the data.
num_labels = 7
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

In [16]:
# Metric object used when scoring evaluation predictions; the recorded runs
# report a single "accuracy" value from it.
# NOTE(review): the GLUE "qnli" configuration belongs to a different task and
# is presumably used here only for its accuracy computation. load_metric is
# deprecated in newer `datasets` releases — confirm against the installed version.
metric = load_metric("glue", "qnli")

In [17]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Accuracy is computed directly with numpy, so the function does not depend
    on a separately loaded metric object.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            one row of per-class scores per example and ``labels`` holds the
            reference class ids.

    Returns:
        ``{"accuracy": <fraction of examples whose arg-max class equals the label>}``
        with the value as a Python float.
    """
    predictions, labels = eval_pred
    predicted_classes = np.argmax(predictions, axis=1)
    accuracy = (predicted_classes == np.asarray(labels)).mean()
    # The key must stay "accuracy": it is the name used to select the best model.
    return {"accuracy": float(accuracy)}

In [18]:
# Name of the evaluation metric used to pick the best checkpoint.
metric_name = "accuracy"

# Training configuration: evaluate once per epoch and reload the checkpoint
# with the best accuracy when training ends.
args = TrainingArguments(
    "test-nli",
    evaluation_strategy="epoch",
    # load_best_model_at_end needs a checkpoint for every evaluation, so
    # checkpoints are saved on the same per-epoch schedule as evaluation.
    # Newer transformers releases are expected to reject mismatched strategies.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [19]:
# Wire the model, the training configuration, both labelled splits and the
# metric function into one Trainer. Passing the tokenizer lets the Trainer
# handle batching of the variable-length encodings.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=data_train,
    eval_dataset=data_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [20]:
# Fine-tune for the configured number of epochs, evaluating after each one.
# In the recorded run the validation loss rises after the first epoch while
# the training loss keeps falling, and accuracy peaks at epoch 1.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.295861,0.377157,0.879641
2,0.142511,0.53347,0.875151
3,0.07103,0.809973,0.866608
4,0.026835,0.967651,0.869237
5,0.008699,1.065835,0.872303


TrainOutput(global_step=17125, training_loss=0.129010278854927)

In [26]:
# Score the model currently held by the trainer on the validation split. The
# recorded metrics equal the epoch-1 row of the training log.
trainer.evaluate()

{'eval_loss': 0.3771573603153229,
 'eval_accuracy': 0.879640784141934,
 'epoch': 5.0}

In [22]:
# Run inference on the unlabelled test split.
pred = trainer.predict(data_test)

In [23]:
# Keep only the first element of the prediction output (presumably the raw
# per-class scores — the next step takes an argmax over it).
# NOTE(review): rebinding pred makes this cell unsafe to re-run on its own.
pred = pred[0]

In [24]:
# Reduce each row to the index of its largest value along axis 1, i.e. the
# predicted class id per test row.
pred = np.argmax(pred,1)

In [25]:
# Fill the sample submission with the predicted class ids and write it out.
submission = pd.read_csv('data/sample_submission.csv')
submission['topic_idx'] = pred
# to_csv does not create missing folders; make sure the output directory
# exists so a long training run is not lost on the final write.
os.makedirs("results", exist_ok=True)
submission.to_csv("results/klue-roberta-large-simple-rd-rs.csv",index=False)