In [1]:
import random
import logging
from IPython.display import display, HTML
from tqdm import tqdm, tqdm_notebook, tnrange

import numpy as np
import pandas as pd
import datasets
from datasets import load_dataset, load_metric, ClassLabel, Sequence
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, AutoModelForMaskedLM, AutoModelForPreTraining, TrainingArguments, Trainer

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from transformers import get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset,TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score

print(torch.cuda.device_count())

device = torch.device("cuda:0")

1


In [2]:
torch.cuda.is_available()

True

In [3]:
model_checkpoint = "klue/bert-base"
# model_checkpoint = "bert-base-multilingual-cased"
batch_size = 32
task = "nli"
MODEL_P = "models/klue-bert-base-mlm.pth"
RANDOM_SEED = 17

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
dataset = pd.read_csv("data/train_data.csv",index_col=False)
test = pd.read_csv("data/test_data.csv",index_col=False)

In [6]:
topic_token_dict = {0:4038,1:3674,2:3647,3:3697,4:3665,5:4559,6:3713}
token_topic_dict = {4038 : 0, 3674 : 1, 3647 : 2, 3697 : 3, 3665 : 4, 4559 : 5, 3713 : 6}
topic_dict = {0: "과학", 1:"경제", 2:"사회", 3:"문화", 4:"세계", 5:"스포츠", 6 : "정치"}
tmp = []

for title, topic_idx in zip(dataset["title"],dataset["topic_idx"]):
    sentence = title + ".[SEP] 이 문장은 [MASK]"
    tmp.append(sentence)
dataset["title"] = tmp
    
tmp = []
for title in test["title"]:
    sentence = title + ".[SEP] 이 문장은 [MASK]"
    tmp.append(sentence)

test["title"] = tmp

In [7]:
dataset_train, dataset_val = train_test_split(dataset,test_size = 0.2,random_state = RANDOM_SEED)

In [8]:
dataset_train.head()

Unnamed: 0,index,title,topic_idx
25339,25339,더민주 서영교 여파 지역위원장 심사기준 강화.[SEP] 이 문장은 [MASK],6
24704,24704,맛집에 너그러운 한국인 해외여행서도 JMT 찾았다.[SEP] 이 문장은 [MASK],3
1834,1834,특징주 삼성물산 지배구조 이슈 부각에 강세종합.[SEP] 이 문장은 [MASK],1
17604,17604,생필품난 베네수엘라 콜롬비아와의 국경 1년 만에 재개방.[SEP] 이 문장은 [MASK],4
19362,19362,금태섭 국민 10명 중 8명 판결문 공개 원해.[SEP] 이 문장은 [MASK],6


In [9]:
# dataset_train = dataset_train.head()

In [10]:
def bert_tokenize(dataset,sent_key,label_key,tokenizer):
    if label_key is None :
        labels = [np.int64(0) for i in dataset[sent_key]]
    else :
        labels = [np.int64(i) for i in dataset[label_key]]
    
    sentences = tokenizer(dataset[sent_key].tolist(),truncation=True,padding=True)
#     sentences = tokenizer(dataset[sent_key].tolist(),truncation=True)

    input_ids = sentences.input_ids
    token_type_ids = sentences.token_type_ids
    attention_mask = sentences.attention_mask
    masked_token_idx = []
    
    for input_id in input_ids:
        masked_token_idx.append(input_id.index(4))
        
    
    return list([input_ids, token_type_ids, attention_mask, labels, masked_token_idx])

In [11]:
train_inputs = bert_tokenize(dataset_train,"title","topic_idx",tokenizer)
validation_inputs = bert_tokenize(dataset_val,"title","topic_idx",tokenizer)
test_inputs = bert_tokenize(test,"title",None,tokenizer)

In [34]:
test_inputs[0][1], test_inputs[4][1]


(tensor([    2, 24905,  1042,  4795, 19982,  2129,   121,  6904, 16311,     1,
         14392,    18,     3,  1504,  6265,  2073,     4,     3,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]),
 tensor(16))

In [12]:
for i in range(len(train_inputs)):
    train_inputs[i] = torch.tensor(train_inputs[i])
    
for i in range(len(validation_inputs)):
    validation_inputs[i] = torch.tensor(validation_inputs[i])
    
for i in range(len(test_inputs)):
    test_inputs[i] = torch.tensor(test_inputs[i])

In [35]:
train_data = TensorDataset(*train_inputs)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,sampler=train_sampler,batch_size=batch_size)

validation_data = TensorDataset(*validation_inputs)
validation_sampler = RandomSampler(validation_data)
validation_dataloader = DataLoader(validation_data,sampler=validation_sampler,batch_size=batch_size)

test_data = TensorDataset(*test_inputs)
# test_sampler = RandomSampler(test_data)
# test_dataloader = DataLoader(test_data,sampler=test_sampler,batch_size=batch_size)
test_dataloader = DataLoader(test_data,batch_size=batch_size)

In [14]:
data = next(iter(train_dataloader))
data[3][0]

tensor(1)

In [15]:
data

[tensor([[    2,  1804,  2146,  ...,     0,     0,     0],
         [    2, 10898, 11079,  ...,     0,     0,     0],
         [    2,   594,  2398,  ...,     0,     0,     0],
         ...,
         [    2,  4004, 11298,  ...,     0,     0,     0],
         [    2, 16761, 29712,  ...,     0,     0,     0],
         [    2, 30072,    25,  ...,     0,     0,     0]]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([1, 1, 3, 3, 6, 6, 1, 5, 4, 5, 4, 1, 1, 1, 3, 6, 5, 1, 1, 5, 2, 4, 2, 4,
         1, 6, 0, 6, 1, 1, 0, 1, 5, 2, 0, 4, 6, 2, 4, 5, 2, 2, 0, 5, 5, 6, 3, 0,
         2, 6, 3

In [14]:
# model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint,num_labels=7)
# model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model = AutoModelForPreTraining.from_pretrained(model_checkpoint)
model.to(device)

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [108]:
model.train()
data = tuple(t.to(device) for t in data)
input_ids, token_ids, mask, label, masked_token_idx = data
outputs = model(input_ids, token_type_ids=token_ids, attention_mask=mask)
print(outputs)

BertForPreTrainingOutput(loss=None, prediction_logits=tensor([[[ -6.0406,   3.1712,  -4.7043,  ...,  -6.7852,  -6.0219,  -6.4100],
         [ -6.5202,   6.0657,  -7.2749,  ...,  -6.4878,  -8.5244,  -6.5877],
         [ -8.7517,   7.6491,  -6.4469,  ...,  -2.6874,  -5.8746,  -6.4186],
         ...,
         [ -7.9496,   2.6318,  -6.0783,  ...,  -5.1762,  -6.1159,  -7.9961],
         [ -6.8191,   5.3924,  -4.8108,  ...,  -4.5111,  -5.3303,  -6.0203],
         [ -7.1067,   5.0341,  -4.6863,  ...,  -5.6264,  -5.4639,  -6.7108]],

        [[ -6.5271,   3.6221,  -5.4519,  ...,  -5.8584,  -5.7478,  -5.1125],
         [ -5.3117,   8.8945,  -4.5603,  ...,  -4.1524,  -2.3396,  -1.6568],
         [ -6.4958,   5.0262,  -3.3150,  ...,  -5.1708,  -4.7854,  -4.2634],
         ...,
         [ -6.1085,   4.6625,  -5.5953,  ...,  -5.1957,  -3.4544,  -3.6032],
         [ -5.6021,   8.5952,  -4.4087,  ...,  -4.1917,  -3.5906,  -4.4249],
         [ -6.1020,   7.6569,  -4.2453,  ...,  -4.7865,  -4.0938,  -4

In [109]:
logits_cls, logits_lm = outputs[1], outputs[0]

In [110]:
logits_lm.view(-1, logits_lm.size(2)).shape

torch.Size([2240, 32000])

In [111]:
logits_lm.shape

torch.Size([64, 35, 32000])

In [112]:
masked_token_idx

tensor([22, 24, 16, 13, 21, 21, 25, 19, 23, 18, 22, 22, 16, 21, 19, 17, 23, 20,
        21, 16, 18, 18, 21, 18, 20, 19, 19, 22, 19, 19, 22, 10, 23, 16, 25, 25,
        20, 27, 21, 16, 20, 26, 19, 21, 19, 17, 26, 21, 20, 20, 21, 20, 26, 18,
        15, 22, 19, 20, 17, 19, 24, 26, 21, 23], device='cuda:0')

In [113]:
SEQUENCE_LENGTH = 35
mask_label = [topic_token_dict[lb] for lb in label.to('cpu').numpy() ]
label_lms = []
for idx, label in zip(masked_token_idx.to('cpu').numpy(),mask_label):
    label_lm = np.full(SEQUENCE_LENGTH, dtype=np.int, fill_value=-1)
    print(idx)
    print(type(idx))
    print(label_lm)
    label_lm[idx] = label
    label_lms.append(label_lm)

22
<class 'numpy.int64'>
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
24
<class 'numpy.int64'>
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
16
<class 'numpy.int64'>
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
13
<class 'numpy.int64'>
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
21
<class 'numpy.int64'>
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
21
<class 'numpy.int64'>
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
25
<class 'numpy.int64'>
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
19
<class 'numpy.int64'>
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -

In [114]:
# print(label_lms)
label_lms_np = np.array(label_lms)
label_lms_np.shape

(64, 35)

In [115]:
label_lms_pt = torch.tensor(label_lms_np,dtype=torch.int64).to(device)
label_lms_pt.view(-1)

tensor([-1, -1, -1,  ..., -1, -1, -1], device='cuda:0')

In [116]:
print(logits_lm.view(-1, logits_lm.size(2)).shape , label_lms_pt.view(-1).shape)
criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
loss_lm = criterion_lm(logits_lm.view(-1, logits_lm.size(2)), label_lms_pt.view(-1))
print(loss_lm)

torch.Size([2240, 32000]) torch.Size([2240])
tensor(7.8615, device='cuda:0', grad_fn=<NllLossBackward>)


In [128]:
len(input_ids)

64

In [129]:
criterion_cls = torch.nn.CrossEntropyLoss()
labels_cls = [1 for _ in range(len(input_ids))]
labels_cls = torch.tensor(labels_cls).to(device)
loss_cls = criterion_cls(logits_cls, labels_cls)

ValueError: Expected input batch_size (43) to match target batch_size (64).

In [118]:
loss_cls

tensor(0.7543, device='cuda:0', grad_fn=<NllLossBackward>)

In [119]:
loss = loss_cls + loss_lm

In [120]:
loss.backward()

In [54]:
# masked_token_ids.to('cpu').numpy(), label.to('cpu').numpy()

AttributeError: 'int' object has no attribute 'to'

In [20]:
# logits = outputs[0].to("cpu").detach().numpy()
# print(logits.shape)
# print(len(logits[0][0]))
# print(np.argmax(logits,axis=2).shape)
# out = np.argmax(logits,axis=2)

In [21]:
def get_predict_tensor(logits,idx):
    tmp = []
    for b,i in zip(logits,idx):
        tmp.append(b[i])
    
    pred = torch.tensor(tmp,requires_grad=True)
    
    return pred

# pred = get_predict_tensor(logits,[18])
# pred

In [22]:
# pred = get_predict_tensor(logits,masked_token_ids.to('cpu').numpy())
# pred

In [23]:
# answer = [topic_token_dict[lb] for lb in label.to('cpu').numpy() ]

In [24]:
# ans = torch.tensor(answer)
# loss_fn(pred,ans)

In [25]:
# torch.argmax(outputs[0],axis=2)

In [26]:
# print(tokenizer.convert_ids_to_tokens(data[0][0]))
# print(data[0][0])
# print(data[1][0])
# print(data[2][0])
# print(tokenizer.convert_ids_to_tokens(out[0]))
# print(out[0])

In [15]:
# Parameters:
lr = 2e-5
adam_epsilon = 1e-8

# Number of training epochs (authors recommend between 2 and 4)
epochs = 1

num_warmup_steps = 0

warmup_ratio = 0.1
num_training_steps = len(train_dataloader)*epochs
warmup_step = int(num_training_steps * warmup_ratio)

### In Transformers, optimizer and schedules are splitted and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr,eps=adam_epsilon)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=num_training_steps)  # PyTorch scheduler

In [23]:
train_loss_set = []
learning_rate = []

# criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
loss_fn = nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
criterion_cls = torch.nn.CrossEntropyLoss()

model.zero_grad()

for _ in tnrange(1,epochs+1,desc='Epoch'):
    print("<" + "="*22 + F" Epoch {_} "+ "="*22 + ">")
    batch_loss = 0
    
    # train
    model.train()
    for step, batch in enumerate(tqdm_notebook(train_dataloader)):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_token_type_ids, b_input_mask, b_labels, b_masked_token_idx = batch
#         print(b_input_ids)
#         print(b_token_type_ids)
#         print(b_input_mask)
#         print(b_labels)
#         print(b_masked_token_idx)
        outputs = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)
        
        # calculate loss
        logits_cls, logits_lm = outputs[1], outputs[0]
        
        mask_label = [topic_token_dict[lb] for lb in b_labels.to('cpu').numpy() ]
        label_lms = []
        SEQUENCE_LENGTH = 35
        
        for idx, label in zip(b_masked_token_idx.to('cpu').numpy(),mask_label):
            label_lm = np.full(SEQUENCE_LENGTH, dtype=np.int, fill_value=-1)
            label_lm[idx] = label
            label_lms.append(label_lm)
            
        label_lms_np = np.array(label_lms)
        label_lms_pt = torch.tensor(label_lms_np,dtype=torch.int64).to(device)
        loss_lm = criterion_lm(logits_lm.view(-1, logits_lm.size(2)), label_lms_pt.view(-1))
        
        labels_cls = [1 for _ in range(len(b_input_ids))]
        labels_cls = torch.tensor(labels_cls).to(device)
        loss_cls = criterion_cls(logits_cls, labels_cls)
        
        loss = loss_cls + loss_lm
        loss.backward()        
#         logits = outputs[0].to("cpu").detach().numpy()
        
#         pred = get_predict_tensor(logits,b_masked_token_idx.to('cpu').numpy())
#         answer = [topic_token_dict[lb] for lb in b_labels.to('cpu').numpy() ]
        
#         answer = torch.tensor(answer)
#         loss = loss_fn(pred,answer)
#         loss = loss.to(device)
#         loss.backward()
#         print(loss)
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        scheduler.step()
        optimizer.zero_grad()

        batch_loss += loss.item()
        
    avg_train_loss = batch_loss / len(train_dataloader)
    train_loss_set.append(avg_train_loss)
    print(F'\n\tAverage Training loss: {avg_train_loss}')
        


  for _ in tnrange(1,epochs+1,desc='Epoch'):


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/1142 [00:00<?, ?it/s]


	Average Training loss: 0.2860492701510101


In [30]:
predict_li = []
label_li = []

# eval
model.eval()
for step, batch in enumerate(tqdm_notebook(validation_dataloader)):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_token_type_ids, b_input_mask, b_labels, b_masked_token_idx = batch
    
    outputs = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)
    
    logits_cls, logits_lm = outputs[1], outputs[0]
    
    logits_lm_cpu = logits_lm.to('cpu').detach().numpy()
    out = np.argmax(logits_lm_cpu,axis=2)
    
    masked_token_idx_np = b_masked_token_idx.to('cpu').numpy()
    
    for i in range(len(out)):
        l = token_topic_dict[out[i][masked_token_idx_np[i]]]
        predict_li.append(l)
        
    for l in b_labels.to('cpu').numpy():
        label_li.append(l)
    
    mask_label = [topic_token_dict[lb] for lb in b_labels.to('cpu').numpy() ]
    label_lms = []
    SEQUENCE_LENGTH = 35

    for idx, label in zip(b_masked_token_idx.to('cpu').numpy(),mask_label):
        label_lm = np.full(SEQUENCE_LENGTH, dtype=np.int, fill_value=-1)
        label_lm[idx] = label
        label_lms.append(label_lm)

    label_lms_np = np.array(label_lms)
    label_lms_pt = torch.tensor(label_lms_np,dtype=torch.int64).to(device)
    loss_lm = criterion_lm(logits_lm.view(-1, logits_lm.size(2)), label_lms_pt.view(-1))

    labels_cls = [1 for _ in range(len(b_input_ids))]
    labels_cls = torch.tensor(labels_cls).to(device)
    loss_cls = criterion_cls(logits_cls, labels_cls)
    
    loss = loss_cls + loss_lm
    
    batch_loss += loss.item()
    
avg_validation_loss = batch_loss / len(validation_dataloader)
print(F'\n\tAverage validation loss: {avg_validation_loss}')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in enumerate(tqdm_notebook(validation_dataloader)):


  0%|          | 0/286 [00:00<?, ?it/s]


	Average validation loss: 1.839752100798842


In [31]:
accuracy_score(label_li, predict_li)

0.8774504435439711

In [36]:
predict_li = []

model.eval()
for step, batch in enumerate(tqdm_notebook(test_dataloader)):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_token_type_ids, b_input_mask, b_labels, b_masked_token_idx = batch
    
    outputs = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)
    
    logits_cls, logits_lm = outputs[1], outputs[0]
    
    logits_lm = logits_lm.to('cpu').detach().numpy()
    out = np.argmax(logits_lm,axis=2)
    
    masked_token_idx_np = b_masked_token_idx.to('cpu').numpy()
    
#     print(out)
    
    for i in range(len(out)):
        l = token_topic_dict[out[i][masked_token_idx_np[i]]]
        predict_li.append(l)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/286 [00:00<?, ?it/s]

In [37]:
len(predict_li)

9131

In [38]:
predict_li

[2,
 3,
 2,
 2,
 3,
 0,
 5,
 3,
 4,
 1,
 4,
 6,
 4,
 5,
 6,
 1,
 6,
 2,
 4,
 4,
 4,
 4,
 4,
 0,
 0,
 3,
 6,
 0,
 5,
 1,
 3,
 1,
 4,
 5,
 2,
 4,
 5,
 4,
 6,
 5,
 5,
 5,
 5,
 3,
 0,
 5,
 3,
 6,
 1,
 6,
 0,
 4,
 6,
 4,
 5,
 1,
 4,
 6,
 3,
 3,
 4,
 4,
 4,
 5,
 3,
 1,
 5,
 5,
 2,
 0,
 3,
 1,
 3,
 3,
 0,
 5,
 1,
 1,
 1,
 2,
 6,
 1,
 5,
 4,
 6,
 5,
 4,
 1,
 6,
 1,
 5,
 5,
 4,
 4,
 4,
 1,
 6,
 4,
 1,
 1,
 5,
 5,
 4,
 4,
 5,
 4,
 3,
 4,
 0,
 3,
 2,
 2,
 2,
 4,
 3,
 5,
 3,
 6,
 0,
 4,
 1,
 0,
 6,
 0,
 2,
 1,
 5,
 4,
 0,
 4,
 0,
 2,
 6,
 6,
 4,
 2,
 6,
 2,
 1,
 0,
 5,
 2,
 3,
 3,
 1,
 2,
 1,
 2,
 3,
 2,
 6,
 0,
 1,
 6,
 4,
 3,
 5,
 2,
 0,
 6,
 1,
 2,
 3,
 5,
 5,
 3,
 6,
 1,
 1,
 4,
 1,
 2,
 2,
 1,
 3,
 4,
 2,
 3,
 6,
 2,
 5,
 1,
 5,
 1,
 5,
 1,
 2,
 0,
 3,
 5,
 2,
 6,
 0,
 4,
 6,
 2,
 6,
 1,
 3,
 0,
 1,
 5,
 4,
 6,
 5,
 3,
 5,
 5,
 5,
 1,
 1,
 1,
 3,
 0,
 1,
 0,
 4,
 0,
 6,
 2,
 4,
 3,
 5,
 1,
 0,
 5,
 0,
 3,
 4,
 1,
 6,
 6,
 5,
 6,
 4,
 4,
 5,
 2,
 2,
 1,
 5,
 5,
 1,
 4,
 0,
 4,
 5,
 4,
 6,
 4,


In [39]:
submission = pd.read_csv('data/sample_submission.csv')
submission['topic_idx'] = predict_li
submission.to_csv("results/klue-bert-mlm-classification-2epoch-norandom.csv",index=False)

In [24]:
data = next(iter(validation_dataloader))
data[3][0]

tensor(1)

In [42]:
model.eval()
data = tuple(t.to(device) for t in data)
input_ids, token_ids, mask, label, masked_token_idx = data
outputs = model(input_ids, token_type_ids=token_ids, attention_mask=mask)
print(outputs)

BertForPreTrainingOutput(loss=None, prediction_logits=tensor([[[ -7.5750,   4.0583,  -5.7115,  ...,  -8.4080, -10.1831,  -6.9428],
         [ -5.7624,   6.3970,  -5.7372,  ...,  -8.4756,  -7.7428,  -6.1240],
         [ -6.1303,   4.9622,  -5.0594,  ...,  -7.7942,  -7.3606,  -8.2195],
         ...,
         [ -6.7994,   5.5160,  -4.6602,  ...,  -7.2120,  -6.9814,  -6.1145],
         [ -6.7101,   5.8034,  -4.9487,  ...,  -7.9012,  -7.3610,  -5.1996],
         [ -7.5329,   5.2829,  -4.8685,  ...,  -8.5925,  -7.0635,  -6.2882]],

        [[ -7.3972,   4.0784,  -4.7739,  ...,  -7.8792, -10.2142,  -7.2260],
         [ -5.4033,   7.5581,  -4.3354,  ...,  -7.0303,  -7.8286,  -5.9605],
         [ -4.9273,   7.4306,  -4.3602,  ...,  -4.0567,  -4.7388,  -3.5071],
         ...,
         [ -6.2394,   4.7532,  -3.5773,  ...,  -7.1833,  -8.6217,  -5.8134],
         [ -6.1106,   4.8454,  -4.0421,  ...,  -7.2750,  -8.5017,  -5.9248],
         [ -5.9491,   6.4712,  -3.8178,  ...,  -5.6931,  -6.9110,  -5

In [43]:
logits_cls, logits_lm = outputs[1], outputs[0]

In [44]:
logits_lm.shape

torch.Size([64, 35, 32000])

In [45]:
logits_lm = logits_lm.to('cpu').detach().numpy()
print(logits_lm.shape)
print(len(logits_lm[0][0]))
print(np.argmax(logits_lm,axis=2).shape)
out = np.argmax(logits_lm,axis=2)

(64, 35, 32000)
32000
(64, 35)


In [46]:
out[3]

array([ 4559,  4559,  2630,  4559, 22550,  8387,  4559,  4559, 21155,
        2016,  2170,  5002, 10470,  5419,    18,    18,    18,  4559,
          18,  1504,  4559,  4559,  4559,    18,  4559,  4559,  4559,
        4559,  4559,  4559,  4559,  4559,  4559,  4559,  4559],
      dtype=int64)

In [47]:
masked_token_idx_np = masked_token_idx.to('cpu').numpy()

In [48]:
flatten = []
for i in range(len(out)):
    l = token_topic_dict[out[i][masked_token_idx_np[i]]]
    flatten.append(l)
    


In [49]:
flatten

[1,
 6,
 3,
 5,
 6,
 2,
 1,
 6,
 6,
 4,
 5,
 4,
 4,
 2,
 0,
 4,
 1,
 3,
 6,
 0,
 6,
 3,
 6,
 4,
 2,
 3,
 1,
 5,
 3,
 5,
 5,
 6,
 3,
 1,
 4,
 6,
 1,
 1,
 2,
 5,
 2,
 2,
 6,
 1,
 1,
 1,
 2,
 5,
 5,
 3,
 2,
 3,
 4,
 2,
 4,
 4,
 2,
 2,
 4,
 5,
 3,
 1,
 1,
 0]

In [52]:
label_np = label.to("cpu")
label_np

tensor([1, 6, 3, 5, 6, 2, 1, 6, 6, 4, 5, 4, 4, 2, 0, 4, 1, 3, 6, 0, 6, 3, 6, 4,
        2, 3, 1, 5, 3, 5, 5, 6, 2, 1, 4, 6, 1, 1, 2, 5, 2, 2, 6, 1, 2, 1, 2, 5,
        5, 3, 2, 3, 4, 3, 4, 5, 2, 2, 4, 5, 3, 1, 6, 0])

In [53]:
accuracy_score(label_np, flatten)

0.921875

In [54]:
# print(tokenizer.convert_ids_to_tokens(data[0][3]))
# print(data[0][3])
# print(data[1][3])
# print(data[2][3])
# print(data[3][3])
# print(data[4][3])
# print(tokenizer.convert_ids_to_tokens(out[3]))
# print(out[3])

In [57]:
data = next(iter(test_dataloader))
for i in data:
    print(i)

tensor([[    2, 20134,  8527,  ...,     0,     0,     0],
        [    2,   393,  3698,  ...,     0,     0,     0],
        [    2,   268,  7054,  ...,     0,     0,     0],
        ...,
        [    2,  5621, 13344,  ...,     0,     0,     0],
        [    2,  1485, 29945,  ...,     0,     0,     0],
        [    2,  4989,  3698,  ...,     0,     0,     0]])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [67]:
# del batch
# del data
# del outputs
# del logits_lm
# del logits_cls
# del label_lms_pt
# del b_input_ids, b_token_type_ids, b_input_mask, b_labels, b_masked_token_idx 
del train_dataloader

In [66]:
predict_li = []

model.eval()
for step, batch in enumerate(tqdm_notebook(test_dataloader)):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_token_type_ids, b_input_mask, b_labels, b_masked_token_idx = batch
    
    outputs = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)
    
    logits_cls, logits_lm = outputs[1], outputs[0]
    
    logits_lm = logits_lm.to('cpu').detach().numpy()
    out = np.argmax(logits_lm,axis=2)
    
    masked_token_idx_np = b_masked_token_idx.to('cpu').numpy()
    
    print(out)
    
    for i in range(len(out)):
        l = token_topic_dict[out[i][masked_token_idx_np[i]]]
        predict_li.append(l)
        
    
#     mask_label = [ topic_token_dict[lb] for lb in b_labels.to('cpu').numpy() ]
#     label_lms = []
#     SEQUENCE_LENGTH = 35

#     for idx, label in zip(b_masked_token_idx.to('cpu').numpy(),mask_label):
#         label_lm = np.full(SEQUENCE_LENGTH, dtype=np.int, fill_value=-1)
#         label_lm[idx] = label
#         label_lms.append(label_lm)

#     label_lms_np = np.array(label_lms)
#     label_lms_pt = torch.tensor(label_lms_np,dtype=torch.int64).to(device)
#     loss_lm = criterion_lm(logits_lm.view(-1, logits_lm.size(2)), label_lms_pt.view(-1))

#     labels_cls = [1 for _ in range(len(b_input_ids))]
#     labels_cls = torch.tensor(labels_cls).to(device)
#     loss_cls = criterion_cls(logits_cls, labels_cls)
    
#     loss = loss_cls + loss_lm
    
#     batch_loss += loss.item()
    
# avg_validation_loss = batch_loss / len(validation_dataloader)
# print(F'\n\tAverage validation loss: {avg_validation_loss}')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/143 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 8.00 GiB total capacity; 6.57 GiB already allocated; 0 bytes free; 6.78 GiB reserved in total by PyTorch)