In [1]:
import random
import logging
from IPython.display import display, HTML
from tqdm import tqdm, tqdm_notebook, tnrange

import numpy as np
import pandas as pd
import datasets
from datasets import load_dataset, load_metric, ClassLabel, Sequence
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, AutoModelForMaskedLM, AutoModelForPreTraining, TrainingArguments, Trainer

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from transformers import get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset,TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score

print(torch.cuda.device_count())

device = torch.device("cuda:0")

1


In [2]:
torch.cuda.is_available()

True

In [3]:
model_checkpoint = "klue/bert-base"
# model_checkpoint = "bert-base-multilingual-cased"
batch_size = 64
task = "nli"
MODEL_P = "models/klue-bert-base-mlm.pth"
RANDOM_SEED = 17

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
dataset = pd.read_csv("data/train_data.csv",index_col=False)
test = pd.read_csv("data/test_data.csv",index_col=False)

In [6]:
topic_token_dict = {0:4038,1:3674,2:3647,3:3697,4:3665,5:4559,6:3713}
topic_dict = {0: "과학", 1:"경제", 2:"사회", 3:"문화", 4:"세계", 5:"스포츠", 6 : "정치"}
tmp = []

for title, topic_idx in zip(dataset["title"],dataset["topic_idx"]):
    sentence = title + ".[SEP] 이 문장은 [MASK]"
    tmp.append(sentence)
dataset["title"] = tmp
    
tmp = []
for title in test["title"]:
    sentence = title + ".[SEP] 이 문장은 [MASK]"
    tmp.append(sentence)

test["title"] = tmp

In [7]:
dataset_train, dataset_val = train_test_split(dataset,test_size = 0.2,random_state = RANDOM_SEED)

In [8]:
dataset_train.head()

Unnamed: 0,index,title,topic_idx
25339,25339,더민주 서영교 여파 지역위원장 심사기준 강화.[SEP] 이 문장은 [MASK],6
24704,24704,맛집에 너그러운 한국인 해외여행서도 JMT 찾았다.[SEP] 이 문장은 [MASK],3
1834,1834,특징주 삼성물산 지배구조 이슈 부각에 강세종합.[SEP] 이 문장은 [MASK],1
17604,17604,생필품난 베네수엘라 콜롬비아와의 국경 1년 만에 재개방.[SEP] 이 문장은 [MASK],4
19362,19362,금태섭 국민 10명 중 8명 판결문 공개 원해.[SEP] 이 문장은 [MASK],6


In [9]:
# dataset_train = dataset_train.head()

In [10]:
def bert_tokenize(dataset,sent_key,label_key,tokenizer):
    if label_key is None :
        labels = [np.int64(0) for i in dataset[sent_key]]
    else :
        labels = [np.int64(i) for i in dataset[label_key]]
    
    sentences = tokenizer(dataset[sent_key].tolist(),truncation=True,padding=True)
#     sentences = tokenizer(dataset[sent_key].tolist(),truncation=True)

    input_ids = sentences.input_ids
    token_type_ids = sentences.token_type_ids
    attention_mask = sentences.attention_mask
    masked_token_idx = []
    
    for input_id in input_ids:
        masked_token_idx.append(input_id.index(4))
        
    
    return list([input_ids, token_type_ids, attention_mask, labels, masked_token_idx])

In [11]:
train_inputs = bert_tokenize(dataset_train,"title","topic_idx",tokenizer)
validation_inputs = bert_tokenize(dataset_val,"title","topic_idx",tokenizer)
test_inputs = bert_tokenize(test,"title",None,tokenizer)

In [12]:
for i in range(len(train_inputs)):
    train_inputs[i] = torch.tensor(train_inputs[i])
    
for i in range(len(validation_inputs)):
    validation_inputs[i] = torch.tensor(validation_inputs[i])
    
for i in range(len(test_inputs)):
    test_inputs[i] = torch.tensor(test_inputs[i])

In [13]:
train_data = TensorDataset(*train_inputs)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,sampler=train_sampler,batch_size=batch_size)

validation_data = TensorDataset(*validation_inputs)
validation_sampler = RandomSampler(validation_data)
validation_dataloader = DataLoader(validation_data,sampler=validation_sampler,batch_size=batch_size)

test_data = TensorDataset(*test_inputs)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data,sampler=test_sampler,batch_size=batch_size)

In [14]:
data = next(iter(train_dataloader))
data[3][0]

tensor(6)

In [15]:
data

[tensor([[    2,  3833,  5499,  ...,     0,     0,     0],
         [    2, 16070, 10234,  ...,     0,     0,     0],
         [    2,  8967,  2265,  ...,     0,     0,     0],
         ...,
         [    2,  1726, 11235,  ...,     0,     0,     0],
         [    2, 20544,  3698,  ...,     0,     0,     0],
         [    2,  4048, 18600,  ...,     0,     0,     0]]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([6, 4, 1, 3, 1, 0, 5, 4, 3, 2, 0, 5, 3, 5, 6, 0, 2, 2, 0, 0, 4, 2, 2, 6,
         3, 6, 4, 4, 6, 4, 6, 2, 6, 2, 1, 1, 6, 4, 2, 3, 5, 4, 0, 6, 4, 6, 0, 5,
         0, 6, 6

In [14]:
# model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint,num_labels=7)
# model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model = AutoModelForPreTraining.from_pretrained(model_checkpoint)
model.to(device)

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [108]:
model.train()
data = tuple(t.to(device) for t in data)
input_ids, token_ids, mask, label, masked_token_idx = data
outputs = model(input_ids, token_type_ids=token_ids, attention_mask=mask)
print(outputs)

BertForPreTrainingOutput(loss=None, prediction_logits=tensor([[[ -6.0406,   3.1712,  -4.7043,  ...,  -6.7852,  -6.0219,  -6.4100],
         [ -6.5202,   6.0657,  -7.2749,  ...,  -6.4878,  -8.5244,  -6.5877],
         [ -8.7517,   7.6491,  -6.4469,  ...,  -2.6874,  -5.8746,  -6.4186],
         ...,
         [ -7.9496,   2.6318,  -6.0783,  ...,  -5.1762,  -6.1159,  -7.9961],
         [ -6.8191,   5.3924,  -4.8108,  ...,  -4.5111,  -5.3303,  -6.0203],
         [ -7.1067,   5.0341,  -4.6863,  ...,  -5.6264,  -5.4639,  -6.7108]],

        [[ -6.5271,   3.6221,  -5.4519,  ...,  -5.8584,  -5.7478,  -5.1125],
         [ -5.3117,   8.8945,  -4.5603,  ...,  -4.1524,  -2.3396,  -1.6568],
         [ -6.4958,   5.0262,  -3.3150,  ...,  -5.1708,  -4.7854,  -4.2634],
         ...,
         [ -6.1085,   4.6625,  -5.5953,  ...,  -5.1957,  -3.4544,  -3.6032],
         [ -5.6021,   8.5952,  -4.4087,  ...,  -4.1917,  -3.5906,  -4.4249],
         [ -6.1020,   7.6569,  -4.2453,  ...,  -4.7865,  -4.0938,  -4

In [109]:
logits_cls, logits_lm = outputs[1], outputs[0]

In [110]:
logits_lm.view(-1, logits_lm.size(2)).shape

torch.Size([2240, 32000])

In [111]:
logits_lm.shape

torch.Size([64, 35, 32000])

In [112]:
masked_token_idx

tensor([22, 24, 16, 13, 21, 21, 25, 19, 23, 18, 22, 22, 16, 21, 19, 17, 23, 20,
        21, 16, 18, 18, 21, 18, 20, 19, 19, 22, 19, 19, 22, 10, 23, 16, 25, 25,
        20, 27, 21, 16, 20, 26, 19, 21, 19, 17, 26, 21, 20, 20, 21, 20, 26, 18,
        15, 22, 19, 20, 17, 19, 24, 26, 21, 23], device='cuda:0')

In [113]:
SEQUENCE_LENGTH = 35
mask_label = [topic_token_dict[lb] for lb in label.to('cpu').numpy() ]
label_lms = []
for idx, label in zip(masked_token_idx.to('cpu').numpy(),mask_label):
    label_lm = np.full(SEQUENCE_LENGTH, dtype=np.int, fill_value=-1)
    print(idx)
    print(type(idx))
    print(label_lm)
    label_lm[idx] = label
    label_lms.append(label_lm)

22
<class 'numpy.int64'>
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
24
<class 'numpy.int64'>
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
16
<class 'numpy.int64'>
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
13
<class 'numpy.int64'>
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
21
<class 'numpy.int64'>
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
21
<class 'numpy.int64'>
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
25
<class 'numpy.int64'>
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
19
<class 'numpy.int64'>
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -

In [114]:
# print(label_lms)
label_lms_np = np.array(label_lms)
label_lms_np.shape

(64, 35)

In [115]:
label_lms_pt = torch.tensor(label_lms_np,dtype=torch.int64).to(device)
label_lms_pt.view(-1)

tensor([-1, -1, -1,  ..., -1, -1, -1], device='cuda:0')

In [116]:
print(logits_lm.view(-1, logits_lm.size(2)).shape , label_lms_pt.view(-1).shape)
criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
loss_lm = criterion_lm(logits_lm.view(-1, logits_lm.size(2)), label_lms_pt.view(-1))
print(loss_lm)

torch.Size([2240, 32000]) torch.Size([2240])
tensor(7.8615, device='cuda:0', grad_fn=<NllLossBackward>)


In [128]:
len(input_ids)

64

In [129]:
criterion_cls = torch.nn.CrossEntropyLoss()
labels_cls = [1 for _ in range(len(input_ids))]
labels_cls = torch.tensor(labels_cls).to(device)
loss_cls = criterion_cls(logits_cls, labels_cls)

ValueError: Expected input batch_size (43) to match target batch_size (64).

In [118]:
loss_cls

tensor(0.7543, device='cuda:0', grad_fn=<NllLossBackward>)

In [119]:
loss = loss_cls + loss_lm

In [120]:
loss.backward()

In [54]:
# masked_token_ids.to('cpu').numpy(), label.to('cpu').numpy()

AttributeError: 'int' object has no attribute 'to'

In [20]:
# logits = outputs[0].to("cpu").detach().numpy()
# print(logits.shape)
# print(len(logits[0][0]))
# print(np.argmax(logits,axis=2).shape)
# out = np.argmax(logits,axis=2)

In [21]:
def get_predict_tensor(logits,idx):
    tmp = []
    for b,i in zip(logits,idx):
        tmp.append(b[i])
    
    pred = torch.tensor(tmp,requires_grad=True)
    
    return pred

# pred = get_predict_tensor(logits,[18])
# pred

In [22]:
# pred = get_predict_tensor(logits,masked_token_ids.to('cpu').numpy())
# pred

In [23]:
# answer = [topic_token_dict[lb] for lb in label.to('cpu').numpy() ]

In [24]:
# ans = torch.tensor(answer)
# loss_fn(pred,ans)

In [25]:
# torch.argmax(outputs[0],axis=2)

In [26]:
# print(tokenizer.convert_ids_to_tokens(data[0][0]))
# print(data[0][0])
# print(data[1][0])
# print(data[2][0])
# print(tokenizer.convert_ids_to_tokens(out[0]))
# print(out[0])

In [15]:
# Parameters:
lr = 2e-5
adam_epsilon = 1e-8

# Number of training epochs (authors recommend between 2 and 4)
epochs = 100

num_warmup_steps = 0

warmup_ratio = 0.1
num_training_steps = len(train_dataloader)*epochs
warmup_step = int(num_training_steps * warmup_ratio)

### In Transformers, optimizer and schedules are splitted and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr,eps=adam_epsilon)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=num_training_steps)  # PyTorch scheduler

In [16]:
train_loss_set = []
learning_rate = []

# criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
loss_fn = nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
criterion_cls = torch.nn.CrossEntropyLoss()

model.zero_grad()

for _ in tnrange(1,epochs+1,desc='Epoch'):
    print("<" + "="*22 + F" Epoch {_} "+ "="*22 + ">")
    batch_loss = 0
    
    # train
    model.train()
    for step, batch in enumerate(tqdm_notebook(train_dataloader)):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_token_type_ids, b_input_mask, b_labels, b_masked_token_idx = batch
#         print(b_input_ids)
#         print(b_token_type_ids)
#         print(b_input_mask)
#         print(b_labels)
#         print(b_masked_token_idx)
        outputs = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)
        
        # calculate loss
        logits_cls, logits_lm = outputs[1], outputs[0]
        
        mask_label = [topic_token_dict[lb] for lb in b_labels.to('cpu').numpy() ]
        label_lms = []
        SEQUENCE_LENGTH = 35
        
        for idx, label in zip(b_masked_token_idx.to('cpu').numpy(),mask_label):
            label_lm = np.full(SEQUENCE_LENGTH, dtype=np.int, fill_value=-1)
            label_lm[idx] = label
            label_lms.append(label_lm)
            
        label_lms_np = np.array(label_lms)
        label_lms_pt = torch.tensor(label_lms_np,dtype=torch.int64).to(device)
        loss_lm = criterion_lm(logits_lm.view(-1, logits_lm.size(2)), label_lms_pt.view(-1))
        
        labels_cls = [1 for _ in range(len(b_input_ids))]
        labels_cls = torch.tensor(labels_cls).to(device)
        loss_cls = criterion_cls(logits_cls, labels_cls)
        
        loss = loss_cls + loss_lm
        loss.backward()
        print(loss)
        
#         logits = outputs[0].to("cpu").detach().numpy()
        
#         pred = get_predict_tensor(logits,b_masked_token_idx.to('cpu').numpy())
#         answer = [topic_token_dict[lb] for lb in b_labels.to('cpu').numpy() ]
        
#         answer = torch.tensor(answer)
#         loss = loss_fn(pred,answer)
#         loss = loss.to(device)
#         loss.backward()
#         print(loss)
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        scheduler.step()
        optimizer.zero_grad()

        batch_loss += loss.item()
        
    avg_train_loss = batch_loss / len(train_dataloader)
    train_loss_set.append(avg_train_loss)
    print(F'\n\tAverage Training loss: {avg_train_loss}')
        


  for _ in tnrange(1,epochs+1,desc='Epoch'):


Epoch:   0%|          | 0/100 [00:00<?, ?it/s]



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/571 [00:00<?, ?it/s]

tensor(8.7876, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.9956, device='cuda:0', grad_fn=<AddBackward0>)
tensor(9.0803, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.9618, device='cuda:0', grad_fn=<AddBackward0>)
tensor(9.4560, device='cuda:0', grad_fn=<AddBackward0>)
tensor(9.0171, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.8775, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.9959, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.7198, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.8343, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.8219, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.7221, device='cuda:0', grad_fn=<AddBackward0>)
tensor(9.0508, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.5869, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.7670, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.6760, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.9379, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.8623, device='cuda:0', grad_fn=<AddBack

tensor(3.6805, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3679, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4668, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5279, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3765, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3848, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3968, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2669, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4146, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0581, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1277, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1799, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0658, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0044, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7228, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6376, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8287, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9932, device='cuda:0', grad_fn=<AddBack

tensor(0.7121, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.6018, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.6477, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.6315, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.7492, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4330, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4979, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5748, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.6766, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4716, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.7119, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.7629, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.7098, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.7063, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.6970, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.6058, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.7102, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5547, device='cuda:0', grad_fn=<AddBack

tensor(0.5565, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3886, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5024, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.7187, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4763, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4409, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.7178, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3994, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5739, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.9279, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4214, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.6591, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4346, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3340, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3329, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3840, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4510, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4916, device='cuda:0', grad_fn=<AddBack

  0%|          | 0/571 [00:00<?, ?it/s]

tensor(0.1708, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4405, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3838, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.1799, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4593, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4082, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3426, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3034, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4077, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3765, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4921, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4199, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5206, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2472, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5138, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3023, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4999, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2659, device='cuda:0', grad_fn=<AddBack

tensor(0.6504, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3675, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4853, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3776, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2414, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3738, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2899, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2107, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4541, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4941, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3278, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4462, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4690, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3460, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3357, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4352, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3672, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4414, device='cuda:0', grad_fn=<AddBack

tensor(0.5139, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3295, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4510, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2320, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2722, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4609, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3565, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3427, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3795, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4403, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2875, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2771, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3014, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3194, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3255, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3463, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3445, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3236, device='cuda:0', grad_fn=<AddBack

tensor(0.4847, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2692, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4167, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3215, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4664, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4692, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3154, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4939, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3514, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4261, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2778, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4904, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.6417, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5608, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2817, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3891, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3442, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3143, device='cuda:0', grad_fn=<AddBack

  0%|          | 0/571 [00:00<?, ?it/s]

tensor(0.2710, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4347, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2286, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2873, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5832, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5929, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2571, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4819, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3481, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.1548, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3273, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3106, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3041, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3884, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3370, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5107, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3887, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3107, device='cuda:0', grad_fn=<AddBack

tensor(0.3162, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2328, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2221, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3772, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3770, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.1047, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3218, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4514, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3197, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3130, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2111, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2404, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4831, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4386, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4094, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2406, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.1377, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2911, device='cuda:0', grad_fn=<AddBack

tensor(0.1450, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.1743, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4020, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.1673, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2856, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2840, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2265, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.1967, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2513, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4708, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5573, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5624, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2894, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2646, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2852, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3545, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2007, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4529, device='cuda:0', grad_fn=<AddBack

tensor(0.2447, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3480, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2209, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4110, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2748, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.1358, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4635, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4365, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2789, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4270, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.1741, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4839, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.6247, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2037, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.1857, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5230, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.1613, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.1580, device='cuda:0', grad_fn=<AddBack

  0%|          | 0/571 [00:00<?, ?it/s]

tensor(0.3157, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.1414, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2465, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.1441, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2377, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3229, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2275, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3582, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.4268, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3687, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2625, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2087, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2329, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3601, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3832, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.2261, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3547, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.3938, device='cuda:0', grad_fn=<AddBack

KeyboardInterrupt: 

In [17]:
data = next(iter(validation_dataloader))
data[3][0]

tensor(3)

In [18]:
model.eval()
data = tuple(t.to(device) for t in data)
input_ids, token_ids, mask, label, masked_token_idx = data
outputs = model(input_ids, token_type_ids=token_ids, attention_mask=mask)
print(outputs)

BertForPreTrainingOutput(loss=None, prediction_logits=tensor([[[ -7.5324,   4.7769,  -5.8239,  ...,  -8.4287,  -9.7536,  -7.3671],
         [ -2.6904,   5.2286,  -7.6973,  ...,  -4.0463,  -4.3348,  -2.6985],
         [-10.5770,   3.3772,  -8.3321,  ...,  -7.8780,  -8.0841,  -7.6637],
         ...,
         [ -6.8220,   4.7606,  -7.0810,  ...,  -6.2672,  -7.0951,  -5.3995],
         [ -8.4229,   4.9718,  -5.2277,  ...,  -6.9559,  -7.2738,  -7.1324],
         [ -7.0484,   6.0130,  -4.7348,  ...,  -6.9919,  -6.7588,  -6.2325]],

        [[ -7.1345,   4.0415,  -5.9587,  ...,  -9.2324, -10.2824,  -6.4977],
         [ -7.1295,   3.5847,  -7.3339,  ...,  -7.0343,  -7.6060,  -5.6498],
         [ -7.9517,   5.5693,  -7.9232,  ...,  -7.9006,  -8.1612,  -7.1819],
         ...,
         [ -7.5906,   4.8099,  -7.0088,  ...,  -8.9138,  -7.2824,  -6.7442],
         [ -6.7460,   6.2802,  -6.3477,  ...,  -9.5336,  -8.1012,  -5.8430],
         [ -5.6934,   6.4480,  -7.0061,  ...,  -8.9291,  -7.5552,  -5

In [19]:
logits_cls, logits_lm = outputs[1], outputs[0]

In [20]:
logits_lm.shape

torch.Size([64, 35, 32000])

In [24]:
logits_lm = logits_lm.to('cpu').detach().numpy()
print(logits_lm.shape)
print(len(logits_lm[0][0]))
print(np.argmax(logits_lm,axis=2).shape)
out = np.argmax(logits_lm,axis=2)

AttributeError: 'numpy.ndarray' object has no attribute 'to'

In [33]:
out[3]

array([ 3647,  3665, 13723, 26602, 18556,  1570,  2260,   121, 12461,
       11647, 23679,  2307, 18928,  5754,  3665,  2302,  2178,    18,
          18,  1504,  6265,  2073,  3665,    18,  3665,  3665,  3665,
        3665,  3665,  3665,  3665,  3665,  3665,  3665,  3665],
      dtype=int64)

In [34]:
print(tokenizer.convert_ids_to_tokens(data[0][3]))
print(data[0][3])
print(data[1][3])
print(data[2][3])
print(data[3][3])
print(data[4][3])
print(tokenizer.convert_ids_to_tokens(out[3]))
print(out[3])

['[CLS]', '홍콩', '시위대', '실탄', '맞아', '중', '##태', '…', '시진핑', '초상', '불태우', '##며', '애도', '시위', '##종합', '##2', '##보', '.', '[SEP]', '이', '문장', '##은', '[MASK]', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
tensor([    2,  6616, 13723, 26602, 18556,  1570,  2260,   121, 12461, 11647,
        23679,  2307, 18928,  5754, 27854,  2302,  2178,    18,     3,  1504,
         6265,  2073,     4,     3,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
tensor(4, device='cuda:0')
tensor(22, device='cuda:0')
['사회', '세계', '시위대', '실탄', '맞아', '중', '##태', '…', '시진핑', '초상', '불태우', '##며', '애도', '시위', '세계', '##2', '##보', '