In [44]:
import random
import logging
from IPython.display import display, HTML

import numpy as np
import pandas as pd
import datasets
from datasets import load_dataset, load_metric, ClassLabel, Sequence
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
# Hugging Face checkpoint used for both the tokenizer and the classifier.
model_checkpoint = "klue/bert-base"
# Per-device batch size handed to TrainingArguments for training and evaluation.
batch_size = 32
# NOTE(review): `task` is not read by any other visible cell — confirm it is still needed.
task = "nli"
# NOTE(review): MODEL_P is not read by any visible cell; presumably a path for saved weights — confirm.
MODEL_P = "models/klue-bert-base-augmented.pth"
# Seed shared by the train/validation split and the post-augmentation shuffle.
RANDOM_SEED = 17
# The augmentation helpers draw from the stdlib `random` module; seed it here so that
# Restart & Run All reproduces the same augmented training data.
random.seed(RANDOM_SEED)

In [3]:
# Load the tokenizer that belongs to the checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [4]:
# Labelled data (split into train/validation below) and the test set that is later
# fed to the model without labels. index_col=False keeps the first CSV column as data
# instead of promoting it to the index.
dataset = pd.read_csv("data/train_data.csv",index_col=False)
test = pd.read_csv("data/test_data.csv",index_col=False)

In [5]:
# Hold out 20% of the labelled rows for validation; random_state makes the split repeatable.
# Augmentation below is applied to dataset_train only, so validation rows stay untouched.
dataset_train, dataset_val = train_test_split(dataset,test_size = 0.2,random_state = RANDOM_SEED)

In [26]:
def random_deletion(words, p):
    """Randomly drop tokens from ``words``, each independently with probability ``p``.

    Args:
        words: Sequence of tokens. It is never modified.
        p: Deletion probability applied to every token.

    Returns:
        A list of the surviving tokens in their original order. Inputs with at
        most one token are returned as-is (the same object). If every token
        would have been deleted, a list holding one randomly chosen token is
        returned so the result is never empty for non-empty input.
    """
    # Nothing can be deleted from an empty or single-token input. The empty case
    # must stop here: the fallback below would otherwise have to pick an element
    # from an empty sequence and raise.
    if len(words) <= 1:
        return words

    # A token survives when its uniform draw exceeds the deletion probability.
    new_words = [word for word in words if random.uniform(0, 1) > p]

    # Everything was deleted: fall back to a single random token.
    if not new_words:
        return [random.choice(words)]

    return new_words

def random_swap(words, n):
    """Return a copy of ``words`` after ``n`` random pairwise swaps.

    Args:
        words: List of tokens. It is copied first, so the caller's list is
            left untouched.
        n: Number of times ``swap_word`` is applied to the copy.

    Returns:
        A new list containing the same tokens, possibly in a different order.
        Lists with fewer than two tokens are returned as an unchanged copy.
    """
    new_words = words.copy()

    # With fewer than two tokens there is no pair to swap, and drawing a random
    # index from an empty list would raise ValueError.
    if len(new_words) < 2:
        return new_words

    for _ in range(n):
        new_words = swap_word(new_words)

    return new_words

def swap_word(new_words):
    """Swap two distinct, randomly chosen positions of ``new_words`` in place.

    Args:
        new_words: Mutable list of tokens; it is modified in place.

    Returns:
        The same list object that was passed in. Lists with fewer than two
        items are returned unchanged because no distinct pair exists.
    """
    # No distinct pair of positions exists; also avoids drawing an index from
    # an empty range.
    if len(new_words) < 2:
        return new_words

    # random.sample yields two different indices in one call, so no retry loop
    # is needed and a swap is always performed.
    first, second = random.sample(range(len(new_words)), 2)
    new_words[first], new_words[second] = new_words[second], new_words[first]
    return new_words

In [11]:
# Pick one training title as the sample for the scratch cells below.
# NOTE(review): `[1]` on a Series looks up by index label, not position — this only
# works if the row labelled 1 ended up in the training split; confirm or use .iloc.
test_text = dataset_train["title"][1]

In [14]:
# Split the sample title into tokens with the checkpoint's tokenizer.
test_text_tokenized = tokenizer.tokenize(test_text)

In [15]:
# Scratch check: calling the tokenizer on the token list encodes every token as its
# own sequence (the recorded output has one id list per token, each wrapped in ids 2 and 3).
tokenizer(test_text_tokenized)

{'input_ids': [[2, 18111, 3], [2, 8692, 3], [2, 7, 7, 18175, 3], [2, 121, 3], [2, 7116, 3], [2, 3749, 3], [2, 7, 7, 1552, 3], [2, 7, 7, 1478, 3], [2, 5055, 3], [2, 466, 3], [2, 8334, 3], [2, 8386, 3], [2, 7, 7, 1933, 3]], 'token_type_ids': [[0, 0, 0], [0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1], [1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1, 1, 1]]}

In [19]:
# Sanity check: join the tokens back into a single string.
tokenizer.convert_tokens_to_string(test_text_tokenized)

'실리콘밸리 넘어서겠다 … 구글 15조원 들여 美 전역 거점화'

In [24]:
# Try random deletion on the sample tokens with a deletion probability of 0.2.
random_deletion(test_text_tokenized,0.2)

['실리콘밸리', '##겠다', '…', '구글', '15', '##조', '美', '전역', '거점', '##화']

In [30]:
# Try three random swaps on the sample tokens.
rs = random_swap(test_text_tokenized,3)

In [31]:
# Show the swapped tokens as text; because the swap works on tokens, pieces prefixed
# with '##' can end up attached to a different word (visible in the recorded output).
tokenizer.convert_tokens_to_string(rs)

'실리콘밸리 … 거점 넘어서 구글 15조원 들여 美겠다 전역화'

In [32]:
# NOTE(review): rebinds test_text_tokenized to the result of tokenizing a two-item list.
# No later visible cell reads the variable — looks like a leftover experiment; confirm and remove.
test_text_tokenized = tokenizer.tokenize([test_text,test_text])

In [35]:
# Build two augmented variants of every training title: one with random token
# deletion (p=0.2) and one with two random token swaps.
rd_augmenteds, rs_augmenteds = [], []

for headline in dataset_train["title"]:
    tokens = tokenizer.tokenize(headline)

    # Both helpers draw from the shared `random` state, so the call order
    # (deletion first, then swap) is kept fixed.
    deleted_tokens = random_deletion(tokens, 0.2)
    swapped_tokens = random_swap(tokens, 2)

    # Store the augmented token lists as plain strings again.
    rd_augmenteds.append(tokenizer.convert_tokens_to_string(deleted_tokens))
    rs_augmenteds.append(tokenizer.convert_tokens_to_string(swapped_tokens))

In [38]:
# Pair each augmented title with the label of the row it was derived from. The lists
# were built by iterating dataset_train in order, so they line up row-for-row with its labels.
train_labels = dataset_train["topic_idx"]
rd_augmentation = pd.DataFrame({"title": rd_augmenteds, "topic_idx": train_labels})
rs_augmentation = pd.DataFrame({"title": rs_augmenteds, "topic_idx": train_labels})

In [41]:
# Append both augmented copies to the original training rows.
# NOTE: this rebinds dataset_train, so re-running the cell appends the copies again;
# re-run from the train/validation split cell to start clean.
dataset_train = pd.concat([dataset_train,rd_augmentation,rs_augmentation])

In [43]:
# Peek at the first rows of the enlarged training frame.
dataset_train.head()

Unnamed: 0,index,title,topic_idx
25339,25339.0,더민주 서영교 여파 지역위원장 심사기준 강화,6
24704,24704.0,맛집에 너그러운 한국인 해외여행서도 JMT 찾았다,3
1834,1834.0,특징주 삼성물산 지배구조 이슈 부각에 강세종합,1
17604,17604.0,생필품난 베네수엘라 콜롬비아와의 국경 1년 만에 재개방,4
19362,19362.0,금태섭 국민 10명 중 8명 판결문 공개 원해,6


In [45]:
# Shuffle so original and augmented rows are interleaved; seeded for repeatability.
dataset_train = shuffle(dataset_train,random_state = RANDOM_SEED)

In [46]:
# Peek again after shuffling. The augmented frames only carry `title` and `topic_idx`,
# so augmented rows show NaN in the `index` column of the recorded output.
dataset_train.head()

Unnamed: 0,index,title,topic_idx
14493,,반이스라엘 틀라입 의원 이번엔 WJC 대립,4
12945,,美 B52 한반도 출동는 … 전문가 무력시위 예상,4
3341,,여교사는 마녀 … 아동문학에도 페미니즘을,3
778,778.0,네이버 주소록에 암호잠금 기능 도입,0
13681,,러시아 시리아군비 만즈 순찰 시작 … 터키군 공격 저지 역할,4


In [47]:
class BERTDataset(Dataset):
    """Map-style dataset of tokenized sentences with optional labels.

    Args:
        dataset: Mapping-like table (e.g. a DataFrame) indexed by column name.
        sent_key: Name of the column holding the raw sentences.
        label_key: Name of the label column, or ``None`` for unlabelled data.
        bert_tokenizer: Callable applied to each sentence with
            ``truncation=True`` and ``return_token_type_ids=False``.

    Attributes are set as follows: ``sentences`` holds one tokenizer output per
    row, ``mode`` is ``"train"`` when a label column was given and ``"test"``
    otherwise, and ``labels`` holds one ``np.int64`` per row (all zeros in
    ``"test"`` mode, where they only serve to define the length).
    """

    def __init__(self, dataset, sent_key, label_key, bert_tokenizer):
        self.sentences = [
            bert_tokenizer(text, truncation=True, return_token_type_ids=False)
            for text in dataset[sent_key]
        ]

        # Identity test against None is the idiomatic check and cannot be
        # affected by a custom __eq__ on the key object.
        self.mode = "train" if label_key is not None else "test"

        if self.mode == "train":
            self.labels = [np.int64(label) for label in dataset[label_key]]
        else:
            # Placeholder labels, one per sentence, so __len__ keeps working.
            self.labels = [np.int64(0) for _ in self.sentences]

    def __getitem__(self, i):
        """Return the i-th tokenized sentence, with ``"label"`` set in train mode."""
        item = self.sentences[i]
        if self.mode == "train":
            # The label is attached to the stored encoding itself (same object
            # on every access), which is the structure returned to the caller.
            item["label"] = self.labels[i]
        return item

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.labels)


In [48]:
# Tokenize the three splits. Train and validation carry labels from `topic_idx`;
# the test split is built without a label column.
data_train = BERTDataset(
    dataset=dataset_train, sent_key="title", label_key="topic_idx", bert_tokenizer=tokenizer
)
data_val = BERTDataset(
    dataset=dataset_val, sent_key="title", label_key="topic_idx", bert_tokenizer=tokenizer
)
data_test = BERTDataset(
    dataset=test, sent_key="title", label_key=None, bert_tokenizer=tokenizer
)

In [49]:
# Size of the classification head.
# NOTE(review): hard-coded; presumably the number of distinct topic_idx values — confirm against the data.
num_labels = 7
# Load the pretrained encoder with a sequence-classification head; the recorded warning
# below notes that some weights of the classification model were not initialized from the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [50]:
# Load the GLUE metric in its QNLI configuration; the training log below reports its
# result as an "Accuracy" column.
# NOTE(review): the notebook classifies titles into topics, so the QNLI config name is
# presumably used only for its accuracy computation — confirm.
metric = load_metric("glue", "qnli")

In [51]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            per-class scores with classes along axis 1 and ``labels`` holds the
            reference class ids.

    Returns:
        ``{"accuracy": float}`` — the fraction of rows whose highest-scoring
        class equals the reference label.
    """
    predictions, labels = eval_pred
    predicted_classes = np.argmax(predictions, axis=1)
    # Accuracy is computed directly so this function does not rely on a
    # module-level metric object having been loaded by another cell.
    accuracy = float((predicted_classes == np.asarray(labels)).mean())
    return {"accuracy": accuracy}

In [52]:
# Metric key used to pick the best checkpoint; the training log reports it as "Accuracy".
metric_name = "accuracy"

args = TrainingArguments(
    "test-nli",  # output directory for checkpoints
    evaluation_strategy="epoch",
    # Save on the same schedule as evaluation: load_best_model_at_end can only
    # restore a checkpoint that was written when that evaluation happened.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [53]:
# Wire the model, training configuration, datasets and metric function together.
# The tokenizer is handed over as well so the Trainer has it available when forming batches.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=data_train,
    eval_dataset=data_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [54]:
# Fine-tune for the configured number of epochs, evaluating after each one.
# In the recorded log, validation accuracy is highest after epoch 1 while validation loss keeps rising.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.291972,0.3473,0.883912
2,0.137793,0.4657,0.87745
3,0.067254,0.712026,0.871865
4,0.033267,0.880427,0.873836
5,0.014348,0.947481,0.874822


TrainOutput(global_step=17125, training_loss=0.12816186701642335)

In [55]:
# Evaluate on the validation set. The recorded eval_accuracy and eval_loss match epoch 1
# of the training log, consistent with load_best_model_at_end restoring the best checkpoint.
trainer.evaluate()

{'eval_loss': 0.3472997844219208,
 'eval_accuracy': 0.8839119483079619,
 'epoch': 5.0}

In [56]:
# Run inference on the test split (built without labels above).
pred = trainer.predict(data_test)

In [57]:
# Keep only the first field of the prediction output — presumably the per-class
# scores with shape (n_samples, num_labels); confirm.
# NOTE: rebinds `pred`, so re-running this cell alone would index a second time.
pred = pred[0]

In [58]:
# Predicted class = position of the largest score along axis 1.
pred = np.argmax(pred,1)

In [59]:
# Fill the sample submission's `topic_idx` column with the predicted classes and
# write the result without the DataFrame index.
sample_submission = pd.read_csv('data/sample_submission.csv')
submission = sample_submission.assign(topic_idx=pred)
submission.to_csv("results/klue-bert-base-simple-rd-rs.csv", index=False)