In [1]:
import os
import random
import logging
from IPython.display import display, HTML
from tqdm import tqdm, tqdm_notebook, tnrange

import numpy as np
import pandas as pd
import datasets
from datasets import load_dataset, load_metric, ClassLabel, Sequence
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, AutoModelForMaskedLM, AutoModelForPreTraining, TrainingArguments, Trainer

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from transformers import get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset,TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score

print(torch.cuda.device_count())

device = torch.device("cuda:0")

1


In [2]:
def seed_everything(seed: int = 17):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore
    
seed_everything(17)

In [3]:
model_checkpoint = "klue/bert-base"
batch_size = 8
MODEL_P = "models/klue-bert-base-mlm.pth"
RANDOM_SEED = 17

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
dataset = pd.read_csv("data/train_data.csv",index_col=False)
dataset_augmented = pd.read_csv("data/train_data_m2m_translation.csv",index_col=False)
test = pd.read_csv("data/test_data.csv",index_col=False)

In [6]:
dataset_augmented["topic_idx"] = dataset["topic_idx"]

In [7]:
dataset_train, dataset_val = train_test_split(dataset,test_size = 0.1,random_state = RANDOM_SEED)

In [8]:
train_dataset_augmented_title = dataset_augmented["title"][dataset_train["index"]]
train_dataset_augmented_topic_idx = dataset_augmented["topic_idx"][dataset_train["index"]]
train_dataset_augmented = pd.DataFrame({'title' : train_dataset_augmented_title.tolist(), "topic_idx" : train_dataset_augmented_topic_idx.tolist()})

In [9]:
dataset_train = pd.concat([dataset_train,train_dataset_augmented])

In [10]:
dataset_train.head()

Unnamed: 0,index,title,topic_idx
36615,36615.0,이란 외무 트럼프 볼턴에 들볶여 알렉산더도 못한 일 하려해,4
16758,16758.0,영상 한국 부도위험지표 12년 만에 최저…북미회담 덕분,6
30712,30712.0,도이치모터스 도이치파이낸셜 주식 160억원에 추가취득,1
1407,1407.0,서울 출신 학자의 외침 사대문 안만 서울이 아니다,3
36067,36067.0,내일날씨 전국 대체로 맑고 더워…낮 최고 22∼31도,3


In [11]:
topic_token_dict = {0:4038,1:3674,2:3647,3:3697,4:3665,5:4559,6:3713}
token_topic_dict = {4038 : 0, 3674 : 1, 3647 : 2, 3697 : 3, 3665 : 4, 4559 : 5, 3713 : 6}
topic_dict = {0: "과학", 1:"경제", 2:"사회", 3:"문화", 4:"세계", 5:"스포츠", 6 : "정치"}

tmp = []

for title in dataset_train["title"]:
    sentence = title + ".[SEP] 이 문장은 [MASK]"
    tmp.append(sentence)
dataset_train["title"] = tmp

tmp = []

for title in dataset_val["title"]:
    sentence = title + ".[SEP] 이 문장은 [MASK]"
    tmp.append(sentence)
dataset_val["title"] = tmp
    
tmp = []
for title in test["title"]:
    sentence = title + ".[SEP] 이 문장은 [MASK]"
    tmp.append(sentence)

test["title"] = tmp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [12]:
dataset_train.head()

Unnamed: 0,index,title,topic_idx
36615,36615.0,이란 외무 트럼프 볼턴에 들볶여 알렉산더도 못한 일 하려해.[SEP] 이 문장은 [...,4
16758,16758.0,영상 한국 부도위험지표 12년 만에 최저…북미회담 덕분.[SEP] 이 문장은 [MASK],6
30712,30712.0,도이치모터스 도이치파이낸셜 주식 160억원에 추가취득.[SEP] 이 문장은 [MASK],1
1407,1407.0,서울 출신 학자의 외침 사대문 안만 서울이 아니다.[SEP] 이 문장은 [MASK],3
36067,36067.0,내일날씨 전국 대체로 맑고 더워…낮 최고 22∼31도.[SEP] 이 문장은 [MASK],3


In [13]:
def bert_tokenize(dataset,sent_key,label_key,tokenizer):
    if label_key is None :
        labels = [np.int64(0) for i in dataset[sent_key]]
    else :
        labels = [np.int64(i) for i in dataset[label_key]]
    
    sentences = tokenizer(dataset[sent_key].tolist(),truncation=True,padding=True)

    input_ids = sentences.input_ids
    token_type_ids = sentences.token_type_ids
    attention_mask = sentences.attention_mask
    masked_token_idx = []
    
    for input_id in input_ids:
        masked_token_idx.append(input_id.index(4))
    
    return list([input_ids, token_type_ids, attention_mask, labels, masked_token_idx])

In [14]:
train_inputs = bert_tokenize(dataset_train,"title","topic_idx",tokenizer)
validation_inputs = bert_tokenize(dataset_val,"title","topic_idx",tokenizer)
test_inputs = bert_tokenize(test,"title",None,tokenizer)

In [15]:
len(test_inputs[0][1]), len(validation_inputs[0][1]), len(train_inputs[0][1])

(36, 35, 93)

In [16]:
# train_inputs[0][1], train_inputs[1][1], train_inputs[2][1], train_inputs[3][1], train_inputs[4][1]

In [17]:
for i in range(len(train_inputs)):
    train_inputs[i] = torch.tensor(train_inputs[i])
    
for i in range(len(validation_inputs)):
    validation_inputs[i] = torch.tensor(validation_inputs[i])
    
for i in range(len(test_inputs)):
    test_inputs[i] = torch.tensor(test_inputs[i])

In [18]:
train_data = TensorDataset(*train_inputs)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,sampler=train_sampler,batch_size=batch_size)

validation_data = TensorDataset(*validation_inputs)
validation_sampler = RandomSampler(validation_data)
validation_dataloader = DataLoader(validation_data,sampler=validation_sampler,batch_size=batch_size)

test_data = TensorDataset(*test_inputs)
test_dataloader = DataLoader(test_data,batch_size=batch_size)

In [19]:
model = AutoModelForPreTraining.from_pretrained(model_checkpoint)
model.to(device)

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [25]:
# Parameters:
lr = 2e-5
adam_epsilon = 1e-8

# Number of training epochs (authors recommend between 2 and 4)
epochs = 1

num_warmup_steps = 0

warmup_ratio = 0.1
num_training_steps = len(train_dataloader)*epochs
warmup_step = int(num_training_steps * warmup_ratio)

### In Transformers, optimizer and schedules are splitted and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr,eps=adam_epsilon)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=num_training_steps)  # PyTorch scheduler

In [28]:
train_loss_set = []
learning_rate = []

criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
criterion_cls = torch.nn.CrossEntropyLoss()

model.zero_grad()

for _ in tnrange(1,epochs+1,desc='Epoch'):
    print("<" + "="*22 + F" Epoch {_} "+ "="*22 + ">")
    batch_loss = 0
    
    # train
    model.train()
    for step, batch in enumerate(tqdm_notebook(train_dataloader)):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_token_type_ids, b_input_mask, b_labels, b_masked_token_idx = batch
        
        outputs = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)
        
        # calculate loss
        logits_cls, logits_lm = outputs[1], outputs[0]
        
        # topic -> token 
        # Ex.6 -> 3713
        mask_label = [topic_token_dict[lb] for lb in b_labels.to('cpu').numpy() ]
        
#         label_lms = []
        
        labels_lms = []
        SEQUENCE_LENGTH = 93
        
        # label 만들기
        for idx, label in zip(b_masked_token_idx.to('cpu').numpy(),mask_label):
            labels_lm = np.full(SEQUENCE_LENGTH, dtype=np.int, fill_value=-1)
            labels_lm[idx] = label
            labels_lms.append(labels_lm)
        label_lms_pt = torch.tensor(labels_lms,dtype=torch.int64).to(device)
        
        # lm loss 계산
        loss_lm = criterion_lm(logits_lm.view(-1, logits_lm.size(2)), label_lms_pt.view(-1))
        
        # cls loss 계산
        labels_cls = [1 for _ in range(len(b_input_ids))]
        labels_cls = torch.tensor(labels_cls).to(device)
        loss_cls = criterion_cls(logits_cls, labels_cls)
        
        loss = loss_cls + loss_lm
        loss.backward()        
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        scheduler.step()
        optimizer.zero_grad()

        batch_loss += loss.item()
        
        if step % 100 == 0:
            print('\n\tBatch : {} Average Training loss : {}'.format(step, batch_loss / (batch_size * step+1) ))
        
    avg_train_loss = batch_loss / len(train_dataloader)
    train_loss_set.append(avg_train_loss)
    print(F'\n\tAverage Training loss: {avg_train_loss}')
    
    # eval
    model.eval()
    
    predict_li = []
    label_li = []
    
    for step, batch in enumerate(tqdm_notebook(validation_dataloader)):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_token_type_ids, b_input_mask, b_labels, b_masked_token_idx = batch

        outputs = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)

        logits_cls, logits_lm = outputs[1], outputs[0]
        logits_lm_np = logits_lm.to('cpu').detach().numpy()
        pred = np.argmax(logits_lm_np,axis=2)

        masked_token_idx_np = b_masked_token_idx.to('cpu').numpy()
        
        labels_np = b_labels.to('cpu').numpy()

        for i in range(len(pred)):
            l = token_topic_dict[pred[i][masked_token_idx_np[i]]]
            predict_li.append(l)

        for l in labels_np:
            label_li.append(l)

        mask_label = [ topic_token_dict[lb] for lb in labels_np ]
        
        labels_lms = []
        SEQUENCE_LENGTH = 35

        for idx, label in zip(masked_token_idx_np,mask_label):
            
            labels_lm = np.full(SEQUENCE_LENGTH, dtype=np.int, fill_value=-1)
            labels_lm[idx] = label
            
            labels_lms.append(labels_lm)

        labels_lms_pt = torch.tensor(labels_lms,dtype=torch.int64).to(device)
        loss_lm = criterion_lm(logits_lm.view(-1, logits_lm.size(2)), labels_lms_pt.view(-1))

        labels_cls = [1 for _ in range(len(b_input_ids))]
        labels_cls = torch.tensor(labels_cls).to(device)
        loss_cls = criterion_cls(logits_cls, labels_cls)

        loss = loss_cls + loss_lm
        
        batch_loss += loss.item()

    print("\n\tAccuracy : {}".format(accuracy_score(predict_li,label_li)))
    avg_validation_loss = batch_loss / len(validation_dataloader)
    print(F'\n\tAverage validation loss: {avg_validation_loss}')

  for _ in tnrange(1,epochs+1,desc='Epoch'):


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/10272 [00:00<?, ?it/s]


	Batch : 0 Average Training loss : 0.010925743728876114

	Batch : 100 Average Training loss : 0.032129501970136046

	Batch : 200 Average Training loss : 0.03195290811558246

	Batch : 300 Average Training loss : 0.032417204623277

	Batch : 400 Average Training loss : 0.033534044069268555

	Batch : 500 Average Training loss : 0.03368914400642169

	Batch : 600 Average Training loss : 0.03387283127912249

	Batch : 700 Average Training loss : 0.03481746071300618

	Batch : 800 Average Training loss : 0.03412003802843281

	Batch : 900 Average Training loss : 0.03383348607732359

	Batch : 1000 Average Training loss : 0.03444243954336174

	Batch : 1100 Average Training loss : 0.03436295940588773

	Batch : 1200 Average Training loss : 0.034590639710288

	Batch : 1300 Average Training loss : 0.03406860911706064

	Batch : 1400 Average Training loss : 0.033646114022956086

	Batch : 1500 Average Training loss : 0.03424390977108746

	Batch : 1600 Average Training loss : 0.034230135370708833

	Batch 

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in enumerate(tqdm_notebook(validation_dataloader)):


  0%|          | 0/571 [00:00<?, ?it/s]


	Accuracy : 0.8817345597897503

	Average validation loss: 6.471789539336864


In [22]:
predict_li = []

model.eval()
for step, batch in enumerate(tqdm_notebook(test_dataloader)):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_token_type_ids, b_input_mask, b_labels, b_masked_token_idx = batch
    
    outputs = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)
    
    logits_cls, logits_lm = outputs[1], outputs[0]
    
    logits_lm = logits_lm.to('cpu').detach().numpy()
    out = np.argmax(logits_lm,axis=2)
    
    masked_token_idx_np = b_masked_token_idx.to('cpu').numpy()
    
#     print(out)
    
    for i in range(len(out)):
        l = token_topic_dict[out[i][masked_token_idx_np[i]]]
        predict_li.append(l)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1142 [00:00<?, ?it/s]

In [23]:
len(predict_li)

9131

In [24]:
# submission = pd.read_csv('data/sample_submission.csv')
# submission['topic_idx'] = predict_li
# submission.to_csv("results/klue-bert-mlm-classification-1epoch-augmented-0805.csv",index=False)