In [24]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

from KoBERT.kobert.utils import get_tokenizer
from KoBERT.kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

import pandas as pd
from sklearn.model_selection import train_test_split

##GPU 사용 시
device = torch.device("cuda:0")

bertmodel, vocab = get_pytorch_kobert_model()

using cached model
using cached model


In [25]:
## Setting parameters
max_len = 40
batch_size = 16
warmup_ratio = 0.1
num_epochs = 3
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5
NUM_CLASS = 7

In [26]:
RANDOM_SEED = 17
dataset = pd.read_csv("data/train_data.csv",index_col=False)
dataset_train, dataset_test = train_test_split(dataset,test_size = 0.2,random_state = RANDOM_SEED)

In [27]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

class BERTDataset(Dataset):
    def __init__(self, dataset, sent_key, label_key, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i]) for i in dataset[sent_key]]
        
        if not label_key == None:
            self.mode = "train"
        else:
            self.mode = "test"
            
        if self.mode == "train":
            self.labels = [np.int32(i) for i in dataset[label_key]]
        else:
            self.labels = [np.int32(0) for i in dataset[sent_key]]

    def __getitem__(self, i):
        if self.mode == "train":
            return (self.sentences[i] + (self.labels[i], ))
        else:
            return self.sentences[i]

    def __len__(self):
        return (len(self.labels))


using cached model


In [28]:
data_train = BERTDataset(dataset_train, "title", "topic_idx", tok, max_len, True, False)
data_test = BERTDataset(dataset_test, "title", "topic_idx", tok, max_len, True, False)

train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

In [29]:
dataset_test = pd.read_csv("data/test_data.csv",index_col=False)
data_test = BERTDataset(dataset_test, "title", None, tok, max_len, True, False)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

submission = pd.read_csv('data/sample_submission.csv')

In [30]:
print(data_train[0])
for i in train_dataloader:
    print(i[3][0])
    break

(array([   2, 1698, 6264, 2718, 6951, 5488, 3298, 7682, 4329, 7053, 3065,
       5581,  824,    3,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1], dtype=int32), array(14, dtype=int32), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32), 6)
tensor(6, dtype=torch.int32)


In [31]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=NUM_CLASS,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [32]:
model = torch.load("models/kobert-4-final.pth")

In [33]:
model.to(device)

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [34]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [53]:
model.eval()
pred = []
outs = []

for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(test_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length= valid_length
    out = model(token_ids, valid_length, segment_ids)
        
    _,max_indices = torch.max(out,1)
    for idx in max_indices.cpu().numpy():
        pred.append(idx)
        
    for o in out.detach():
        outs.append(o)
    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/571 [00:00<?, ?it/s]

In [36]:
pred

[2,
 3,
 2,
 2,
 3,
 0,
 5,
 3,
 4,
 4,
 4,
 6,
 4,
 5,
 6,
 1,
 6,
 2,
 4,
 4,
 4,
 4,
 4,
 3,
 0,
 3,
 6,
 2,
 5,
 1,
 3,
 4,
 4,
 5,
 2,
 4,
 5,
 4,
 6,
 5,
 5,
 5,
 5,
 3,
 0,
 5,
 3,
 6,
 1,
 6,
 0,
 4,
 6,
 4,
 5,
 1,
 4,
 6,
 3,
 3,
 4,
 4,
 4,
 5,
 3,
 1,
 5,
 5,
 2,
 0,
 3,
 4,
 3,
 3,
 1,
 5,
 1,
 0,
 1,
 2,
 6,
 1,
 5,
 4,
 6,
 5,
 4,
 1,
 6,
 1,
 5,
 5,
 4,
 4,
 4,
 1,
 6,
 4,
 1,
 1,
 5,
 5,
 4,
 4,
 5,
 4,
 3,
 4,
 0,
 3,
 2,
 2,
 2,
 4,
 3,
 5,
 3,
 6,
 2,
 4,
 5,
 0,
 6,
 0,
 2,
 1,
 5,
 4,
 0,
 4,
 0,
 2,
 6,
 4,
 4,
 2,
 6,
 2,
 1,
 0,
 5,
 2,
 3,
 3,
 4,
 4,
 1,
 2,
 3,
 2,
 6,
 0,
 1,
 6,
 4,
 3,
 5,
 2,
 0,
 6,
 1,
 2,
 3,
 5,
 5,
 3,
 6,
 1,
 1,
 4,
 1,
 2,
 2,
 1,
 3,
 4,
 2,
 3,
 6,
 2,
 5,
 1,
 5,
 1,
 5,
 1,
 2,
 0,
 3,
 5,
 2,
 1,
 0,
 4,
 2,
 2,
 6,
 1,
 3,
 0,
 1,
 5,
 4,
 6,
 5,
 3,
 5,
 5,
 5,
 1,
 1,
 1,
 3,
 0,
 1,
 0,
 4,
 0,
 6,
 2,
 4,
 3,
 5,
 1,
 0,
 5,
 0,
 3,
 4,
 1,
 6,
 6,
 3,
 6,
 4,
 4,
 5,
 2,
 2,
 1,
 5,
 5,
 1,
 4,
 0,
 4,
 5,
 4,
 6,
 4,


In [78]:
outs = np.array(outs)
outs, len(outs), outs[0], outs[0].cpu().numpy()

(array([tensor([ 2.6485,  0.6508,  3.7859,  1.7606, -2.1103, -2.2866, -2.8808],
        device='cuda:0'),
        tensor([-1.2475, -1.4552, -0.2895,  6.5390, -1.0645, -1.1071, -1.4559],
        device='cuda:0'),
        tensor([-0.8278,  0.5448,  4.7876, -0.9837, -2.2014, -2.7468,  2.3614],
        device='cuda:0'),
        ...,
        tensor([-1.2840, -1.2306,  4.5834,  3.6261, -1.2074, -1.9716, -1.1157],
        device='cuda:0'),
        tensor([-6.4991e-01,  2.7113e-02,  4.2975e+00,  1.3415e+00,  7.6421e-04,
         -2.3367e+00, -1.1355e+00], device='cuda:0'),
        tensor([-1.5681, -0.6418,  3.3331, -1.6345, -1.2429, -2.1326,  4.5700],
        device='cuda:0')], dtype=object),
 9131,
 tensor([ 2.6485,  0.6508,  3.7859,  1.7606, -2.1103, -2.2866, -2.8808],
        device='cuda:0'),
 array([ 2.6484816 ,  0.65084934,  3.785947  ,  1.7605944 , -2.1102579 ,
        -2.2865846 , -2.8807566 ], dtype=float32))

In [86]:
import torch.nn.functional as F

tmp = []
for out in outs:
    data = F.softmax(out,dim=0)
    tmp.append(data.cpu().numpy())

tmp

[array([0.21341214, 0.02895066, 0.6656012 , 0.08782428, 0.00183031,
        0.00153443, 0.00084704], dtype=float32),
 array([4.1400135e-04, 3.3638271e-04, 1.0791459e-03, 9.9686074e-01,
        4.9716298e-04, 4.7644059e-04, 3.3614220e-04], dtype=float32),
 array([3.2777481e-03, 1.2932602e-02, 9.0012664e-01, 2.8046814e-03,
        8.2995661e-04, 4.8101929e-04, 7.9547353e-02], dtype=float32),
 array([1.3989006e-01, 8.3875516e-03, 8.3208442e-01, 1.2606352e-02,
        2.6308345e-03, 8.0804003e-04, 3.5927398e-03], dtype=float32),
 array([3.8103349e-04, 3.4736894e-04, 1.2172321e-03, 9.9669492e-01,
        6.3000678e-04, 4.3255882e-04, 2.9692595e-04], dtype=float32),
 array([0.7842915 , 0.1833878 , 0.01405823, 0.01390892, 0.00122281,
        0.00219535, 0.00093536], dtype=float32),
 array([2.9274664e-04, 2.5508893e-04, 5.4674939e-04, 3.1002413e-04,
        3.6614856e-03, 9.9427044e-01, 6.6348439e-04], dtype=float32),
 array([4.0122712e-04, 2.9558319e-04, 1.9042873e-03, 9.9621826e-01,
        

In [87]:
kobert_y_df = pd.DataFrame(tmp)

In [88]:
kobert_y_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.213412,0.028951,0.665601,0.087824,0.00183,0.001534,0.000847
1,0.000414,0.000336,0.001079,0.996861,0.000497,0.000476,0.000336
2,0.003278,0.012933,0.900127,0.002805,0.00083,0.000481,0.079547
3,0.13989,0.008388,0.832084,0.012606,0.002631,0.000808,0.003593
4,0.000381,0.000347,0.001217,0.996695,0.00063,0.000433,0.000297


In [89]:
kobert_y_df.to_csv("ensemble/kobert.csv")

In [12]:
model.eval()
val_loss = 0
test_acc = 0.0
wrong_answer = []

def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):

    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length= valid_length
    label = label.long().to(device)

    outs = model(token_ids, valid_length, segment_ids)
    loss = loss_fn(outs,label)
    
    max_vals, max_indices = torch.max(outs, 1)
    
    for tok, out, lab in zip(token_ids,max_indices, label):
        if out != lab:
            wrong_answer.append((tok,out,lab))

    val_loss += loss.data.cpu().numpy()
    test_acc += calc_accuracy(outs, label)

# print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
# print("epoch {} val_loss {}".format(e+1, val_loss / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


  0%|          | 0/571 [00:00<?, ?it/s]

In [17]:
wrong_answer

[(tensor([   2, 4525,  589, 5713, 2816, 6238, 7953, 5474, 1986, 7741,  994, 4257,
           517,  210,  517,   11,  517, 5330, 3312,    3,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1], device='cuda:0'),
  tensor(2, device='cuda:0'),
  tensor(3, device='cuda:0')),
 (tensor([   2, 1073, 7970, 7570, 4438, 6137,  517,  194, 7279, 1263, 6730,    3,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1], device='cuda:0'),
  tensor(2, device='cuda:0'),
  tensor(3, device='cuda:0')),
 (tensor([   2, 4858,  299,  351, 4264, 2777, 7227, 6441, 4907, 4112, 7636, 4577,
             3,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1],

In [38]:
wrong_answer[2]

for token_ids, out, lab in wrong_answer:
    print(token_ids, out, lab)
    for ele in token_ids:
        print(vocab.idx_to_token[ele],end="")
    print()

tensor([   2, 4525,  589, 5713, 2816, 6238, 7953, 5474, 1986, 7741,  994, 4257,
         517,  210,  517,   11,  517, 5330, 3312,    3,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(2, device='cuda:0') tensor(3, device='cuda:0')
[CLS]▁최근▁3년간▁세종문화회관▁매표▁고객▁중▁71▁%▁가▁여성[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 1073, 7970, 7570, 4438, 6137,  517,  194, 7279, 1263, 6730,    3,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(2, device='cuda:0') tensor(3, device='cuda:0')
[CLS]▁관훈클럽▁창립▁62주년▁기념식[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
t

[CLS]▁박정호▁SKT▁사장▁5G▁네트워크▁품질▁두▁달▁내▁안정화▁하겠다[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 3724, 6664, 6896, 7096, 6896, 2485, 6573, 6738, 5452, 6705, 6338,
        4339, 3397,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(2, device='cuda:0') tensor(1, device='cuda:0')
[CLS]▁이에스에이에▁불성실공시법인▁지정▁예고[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2,  612, 5712, 4587, 6995, 5585, 4958, 4587, 5436,  862, 6217, 2169,
        7089, 6493, 2199, 6177, 7925, 5163, 2348, 6579, 7270,    3,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(2, device='cuda:0') tensor(3, device='cuda:0'

[CLS]▁박지원▁vs▁.▁이재오▁...▁더▁독해지는▁KBS▁더▁라이브[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2,  517, 5371,  343,  105, 4821, 1926,  824,  517,   55, 1138, 6553,
        4821, 7682, 7673,  517,  478, 2150, 6559,  517, 6690, 3432,    3,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(0, device='cuda:0') tensor(4, device='cuda:0')
[CLS]▁갤S10▁팬▁마케팅▁강화▁...▁국내서▁팬파티▁·▁미국선▁스토어▁오픈[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 2651, 7736,  553, 6418, 3047, 4213, 7469,  517,   55,  687,  105,
        4809, 1778, 6896, 4257, 7199, 5330, 7736, 2539,  507,    3,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(0, device='cuda:0') tensor(1, device='cuda:0')
[CLS]▁삼성폰▁2분기▁실적▁주춤▁...▁S10▁판매▁둔

[CLS]▁이마트▁온라인쇼핑몰▁사업▁물적분할▁이마트몰▁신설[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2,  517, 7107, 5468, 1900, 6445, 2618,  517,   55,  517, 5281, 1667,
        4998, 6335, 7086, 2465, 6730, 1586, 7202, 4087,    3,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(4, device='cuda:0') tensor(6, device='cuda:0')
[CLS]▁이란과▁리비아▁사이▁...▁文▁대통령▁해법은▁북한식▁단계적▁접근[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 4917, 7847, 3790, 6364, 7932, 4349, 7846, 7720, 3014,  844,    3,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(3, device='cuda:0') tensor(4, device='cuda:0')
[CLS]▁필

[CLS]▁돈▁되는▁비트코인이▁사상▁최악▁랜섬웨어▁키웠다[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 2856, 6364, 1110, 4259,  879, 1680, 3574, 7825,  517,  478, 1103,
        7343,  553, 7276, 1815, 5488, 4257, 7318,    3,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(4, device='cuda:0') tensor(2, device='cuda:0')
[CLS]▁속보▁교육부▁중국▁거친▁대학▁유학생▁·▁교직원▁2주▁등교▁중지[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 3552, 6190, 7753, 4856, 5479, 7136, 5550, 7234, 4855, 4482, 3206,
        5546, 6706, 4070, 4252, 6855,    3,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(2, device='cuda:0') tensor(1, device='cuda:0')
[CLS]▁위메프▁포괄임금

tensor([   2, 1470, 2053, 4657, 6037,  839, 2938, 7659, 6873,  517, 6983, 7096,
        5920, 6347, 7673, 7318, 4768,    3,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(0, device='cuda:0') tensor(1, device='cuda:0')
[CLS]▁네이버▁모바일▁컨트롤러▁개발▁스타트업▁와이드벤티지▁투자[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 4955, 6573, 6642, 1542, 7720, 7354, 1820, 2584,  849, 3298, 7318,
        3119, 3270, 6838,  905,    3,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(2, device='cuda:0') tensor(6, device='cuda:0')
[CLS]▁한성숙▁뉴스편집▁등에▁사람▁개입▁여지▁아예▁없앨▁것[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tenso

           1,    1,    1,    1], device='cuda:0') tensor(0, device='cuda:0') tensor(3, device='cuda:0')
[CLS]▁게시판▁SK텔레콤▁5GX▁부스트파크에서▁성탄절▁행사[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 1132, 5579,  953, 6078, 7794, 3574, 7086, 7922, 2457, 6983, 4597,
        5524, 1109, 5341, 5931,    3,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(2, device='cuda:0') tensor(6, device='cuda:0')
[CLS]▁국기에▁경례하는▁유은혜▁부총리와▁충청권▁교육감들[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 1316, 7925, 7427, 3969, 6745, 6553,  517, 5859, 7095, 7202, 3943,
        3785,  517,   55,  517, 7918, 6896, 4897, 7881, 4555, 3173,    3,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         

[CLS]▁경영난에▁임대료도▁못내는▁부산북항▁컨부두▁...▁800억원대▁체납[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 3200, 1262,  727, 4949, 6553, 4425, 6304,  517,   55, 1117,  517,
         478,  517,  321, 5561, 5579, 2182,  589, 7044,    3,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(4, device='cuda:0') tensor(0, device='cuda:0')
[CLS]▁애플▁기기▁美▁학교서▁찬밥▁...▁구글▁·▁MS기기에▁밀려▁3위[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 3224,  983, 5812,  517, 5330, 6553, 4252, 7088, 2718, 6844,  760,
        2554, 6116, 5591,  517,   55,  972, 7821, 7167, 1615, 7086,    3,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(1, device='cuda:0') tensor(3, device='cuda:0')
[CLS]▁어떤▁계산대로▁가서▁줄을▁서야▁가장▁빠를까▁...▁경제학자의▁답은[SEP][PAD]

           1,    1,    1,    1], device='cuda:0') tensor(3, device='cuda:0') tensor(2, device='cuda:0')
[CLS]▁장항화물역▁도시탐험역으로▁변신[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 3010, 5712, 6493, 1476, 6557, 7946, 1073, 6579, 7431, 7178, 2906,
         517,  478, 3140, 7202, 2801, 6629, 1787,    3,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(2, device='cuda:0') tensor(1, device='cuda:0')
[CLS]▁신년사▁노석환▁관세청장▁수출▁·▁안정적▁세수▁뒷받침[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2,  689, 5386, 2440, 2207, 6916,  539, 5499, 4243, 7191, 5362, 2609,
        2902,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         

[CLS]▁한계기업▁주식▁불공정거래▁주의▁거래소▁올해▁적발▁18곳[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2,  846, 4854, 4955, 5793, 3574, 6077, 5863, 6079, 2339, 7828,  845,
           3,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(6, device='cuda:0') tensor(2, device='cuda:0')
[CLS]▁개성공단▁폐쇄▁한달▁유령도시로▁변한▁개성[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 2465, 6165, 6204, 7716, 6593, 2944, 6553,  517, 5256, 3761, 2627,
        7207, 6705, 7953,  854,    3,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(6, device='cuda:0') ten

tensor([   2, 4955, 6638, 3139, 1023, 7118, 5482, 6079, 2726, 6952, 5482, 7234,
         517, 7086, 6527, 2884,    3,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(2, device='cuda:0') tensor(3, device='cuda:0')
[CLS]▁한수원▁안전▁공익광고로▁서울영상광고제▁은상▁수상[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 1636, 6710, 3190, 2734, 1633, 7895, 6797, 4162, 7903, 7005,  793,
        7843,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(2, device='cuda:0') tensor(5, device='cuda:0')
[CLS]▁대구시민▁앞에▁선▁대헤아▁조현우▁감사합니다[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]

           1,    1,    1,    1], device='cuda:0') tensor(1, device='cuda:0') tensor(2, device='cuda:0')
[CLS]▁대신지배구조▁硏▁도▁현대모비스▁분할▁·▁합병에▁반대▁권고[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 2150,  517, 5313, 1330, 7086, 3469, 2401, 6745,  517, 5256, 1326,
        7427,  517,   55, 3224, 3330, 7836, 5591,    3,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(4, device='cuda:0') tensor(6, device='cuda:0')
[CLS]▁미국▁行▁김정은▁외교▁복심▁北▁김영철▁...▁어떤▁역할할까[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 3447, 3447, 3358, 5561, 1871,  813, 7095, 2265,    3,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,

           1,    1,    1,    1], device='cuda:0') tensor(6, device='cuda:0') tensor(2, device='cuda:0')
[CLS]▁6▁.▁15북측위▁민족공동행사▁결렬에▁우리▁정부▁비난[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 4777, 5392, 1317, 7467,  517, 5255,  517, 6741,  517,  478, 2120,
        7436, 6398, 1077, 3928, 1815, 3183,  119, 6364,    3,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(2, device='cuda:0') tensor(6, device='cuda:0')
[CLS]▁특검▁김기춘▁前▁실장▁·▁문체부▁관계자▁자택▁등▁압수수색2보[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 3010, 7828, 5555,  554,  116, 1233, 3919, 5808, 7466, 4370,    3,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1, 

[CLS]▁거래소▁돈육선물▁시장▁휴장▁...▁아프리카돼지열병▁여파종합[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 3093, 6113, 7896, 7673, 5655, 6553, 1579,  553, 7126, 4257, 5666,
        6255,  659, 7692, 1694,  517,   55,  543, 5357, 7678, 4428,    3,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(4, device='cuda:0') tensor(3, device='cuda:0')
[CLS]▁아르헨티나서▁다음달▁2일▁중남미▁K팝▁대회▁...▁15개팀▁참가[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2,  727, 1132, 6228, 6398, 1648, 7231, 2423, 5822,  517, 6037, 7479,
        5468, 4970, 2125, 1023, 7253, 1502,    3,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(4, device='cuda:0') tensor(6, device='cuda:0')
[CLS]▁美▁국무부▁대북정책▁부대표▁러측과▁한반도▁문제▁공조▁논의[SEP

[CLS]▁김광현은▁왜▁세인트루이스를▁택했나[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 3396, 7005, 6293, 2573, 6273, 6714,  609, 6328, 1585, 4773, 6003,
        7005, 7659,  517,   55,  791, 7095,  671,  311,  270, 3449,    3,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(5, device='cuda:0') tensor(4, device='cuda:0')
[CLS]▁예우받은▁사바시아▁45번▁단▁트라우트▁...▁감동의▁MLB▁올스타전[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 1010, 5808, 6016,  756, 7941,  553, 7028, 2609, 7436, 4197, 6493,
        7147, 4287,  526,  517,   54,  627,  517,   11,  517, 6079, 3323, 4537,
           3,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(1, device='cuda:0') tensor(2, device='cuda:0')
[CLS]▁고용대란▁가시화▁2월▁사업체▁종사자▁증가율▁0▁.

           1,    1,    1,    1], device='cuda:0') tensor(1, device='cuda:0') tensor(4, device='cuda:0')
[CLS]▁특징주▁아난티▁짐▁로저스▁사외이사▁선임에▁급등종합[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2,  689,  351,  611,  290, 3485, 3758, 5330, 3969, 6736,  517,   55,
         611, 6153, 5808, 3484, 4868, 5943, 7270,    3,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(1, device='cuda:0') tensor(0, device='cuda:0')
[CLS]▁SKT▁5G▁요금제▁인가▁재신청▁...▁5만원대▁요금▁포함한듯종합[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 1542,  517,  478, 3036, 5558, 5392, 2573, 6003, 7344, 1470, 2053,
        4481, 5114,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,

[CLS]▁윤석열▁검찰총장▁마이크▁켜주는▁손학규▁대표[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2,  517, 5283,  517,  478,  517,    0, 4904, 6255, 6855,  109,  838,
        7207, 6553, 2333, 5702, 6629,  517,  478, 4886, 6896, 6113, 7628, 4128,
        6825,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(5, device='cuda:0') tensor(4, device='cuda:0')
[CLS]▁日▁·▁[UNK]▁프리미어12▁개막전서▁베네수▁·▁푸에르토▁제압[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2,  668,  356, 2150, 2938, 7659, 6873, 5468,  517,  267, 4804, 6749,
        4492,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(0, device='cuda:0') tensor(1, device='cuda:0')
[CLS]▁LGU▁미국▁스타트업과▁AR▁파트너십▁체결[SEP][PAD][PAD][PAD][

[CLS]▁주민센터서▁휴면재산▁검색▁...▁금융고객보호▁책임▁CEO가▁진다[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 2251, 7794, 5108, 6263, 7227, 2573, 5491, 5383, 7227, 6882, 6579,
        6527, 1674,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(6, device='cuda:0') tensor(2, device='cuda:0')
[CLS]▁발표하는▁홍민정▁사교육걱정없는세상▁대표[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 4635, 7631, 6305,  695,  342,  311, 1489,  848, 2322, 5532, 3555,
        2822, 7318, 3211, 7848, 7270,    3,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(2, device='cuda:0') tensor(0, device='cuda:0')
[C

[CLS]▁아프리카▁돼지열병▁예방관리▁안내[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2,  665, 3574, 6745, 3969, 5439, 2373, 2485, 5330, 2282, 6896, 3595,
        7220, 2423, 5547, 6398, 5547,    3,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(2, device='cuda:0') tensor(1, device='cuda:0')
[CLS]▁KT▁유심▁재고▁보상▁불가▁방침에▁유통점▁부글부글[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 3827, 2781, 7389, 6356, 3494, 5669, 2086, 4998,  637,  271, 2665,
        3298, 7343, 2889, 6204, 2839, 4249,    3,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(2, device='cuda:0') tensor(4, device

           1,    1,    1,    1], device='cuda:0') tensor(0, device='cuda:0') tensor(1, device='cuda:0')
[CLS]▁KT▁화웨이▁비와이폰3▁출시▁...▁출고가▁33만원[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 4106,  554,  137, 5714, 1708, 2991,  517,  165, 7258,  517,  478,
        3819,  628, 6150, 5357, 4443,    3,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1], device='cuda:0') tensor(1, device='cuda:0') tensor(0, device='cuda:0')
[CLS]▁정부▁2025년까지▁데이터▁시장▁43조원▁·▁일자리▁90만개▁창출[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor([   2, 4640, 4635, 7631, 2001, 3332,  617, 7422, 6858, 4027,  517,   55,
        1692, 6217, 6083, 1097, 4937,  581, 6858,    3,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
    

In [17]:
submission['topic_idx'] = pred

In [21]:
len(pred)

9131

In [18]:
submission.head()

Unnamed: 0,index,topic_idx
0,45654,0
1,45655,3
2,45656,2
3,45657,2
4,45658,3


In [20]:
submission.to_csv("results/kobert-2.csv",index=False)