In [4]:
import torch 
import torch.nn as nn 
import torch.functional as f
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
import time
import datetime
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import random 
import re 
import math 
from tqdm import tqdm 
import sklearn
from transformers import *
from kobert_tokenizer import KoBERTTokenizer

PyTorch version 1.7.0+cu110 available.
TensorFlow version 2.5.0 available.


In [5]:
train = pd.read_csv("open/train.csv") 
test = pd.read_csv("open/test.csv") 

train.shape, test.shape, train['label'].nunique()


((174304, 13), (43576, 12), 46)

In [7]:
submission = pd.read_csv("open/sample_submission.csv")

In [8]:
def clean_text(sent):
    sent_clean=re.sub("[^가-힣ㄱ-하-ㅣ]", " ", sent)
    return sent_clean

In [9]:
def split_text(s, overlap = 20, chunk_size = 50): 
    total = [] 
    partial = [] 
    if len(s.split()) // (chunk_size - overlap) > 0:  
        n = len(s.split()) // (chunk_size - overlap) 
    else: 
        n = 1 
    for w in range(n): 
        if w == 0: 
            partial = s.split()[:chunk_size] 
            total.append(" ".join(partial)) 
        else:  
            partial = s.split()[w*(chunk_size - overlap):w*(chunk_size - overlap) + chunk_size]
            total.append(" ".join(partial)) 
    return total


In [10]:
train['요약문_내용'] = train['요약문_연구목표'] + train['요약문_연구내용'] + train['요약문_기대효과'] 
test['요약문_내용'] = test['요약문_연구목표'] + test['요약문_연구내용'] + test['요약문_기대효과']

In [11]:
train['요약문_내용'].fillna('NAN',inplace=True) 
test['요약문_내용'].fillna('NAN',inplace=True)

In [12]:
train['사업명'].fillna('NAN',inplace=True) 
train['사업_부처명'].fillna('NAN',inplace=True) 
train['내역사업명'].fillna('NAN',inplace=True) 
train['과제명'].fillna('NAN',inplace=True) 
train['요약문_한글키워드'].fillna('NAN',inplace=True)

In [13]:
contents = train['요약문_내용'].values 
feature1 = train['사업명'].values 
feature2 = train['사업_부처명'].values 
feature3 = train['내역사업명'].values 
feature4 = train['과제명'].values 
feature5 = train['요약문_한글키워드'].values 
feature6 = train['label'].values 

train_data = {'사업명':[],'사업_부처명':[],'내역사업명':[],'과제명':[],'한글키워드':[],'요약문':[],'label':[]} 

for i in tqdm(range(contents.shape[0]), position = 0, leave = True): 
    sample = str(contents[i]) 
    splitted_text = split_text(clean_text(sample)) 
    for t in splitted_text: 
        train_data['요약문'].append(t) 
        train_data['사업명'].append(clean_text(str(feature1[i])))
        train_data['사업_부처명'].append(clean_text(str(feature2[i]))) 
        train_data['내역사업명'].append(clean_text(str(feature3[i]))) 
        train_data['과제명'].append(clean_text(str(feature4[i])))  
        train_data['한글키워드'].append(feature5[i]) # no cleaning for this one
        train_data['label'].append(feature6[i])

100%|██████████| 174304/174304 [03:07<00:00, 929.93it/s] 


In [14]:
train_data = pd.DataFrame(train_data)

In [17]:
## Now we tokenize each data and make sure they all lie within the 512 tokenization range 
## if not check how many have token length greater than 512 

tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')


def bert_tokenizer(sent, MAX_LEN):  
    encoded_dict = tokenizer.encode_plus(
        text = sent, 
        add_special_tokens = True, # add [CLS] and [SEP]
        pad_to_max_length = False, 
        return_attention_mask = True # constructing attention_masks 
    )  
    
    input_id = encoded_dict['input_ids'] 
    attention_mask = encoded_dict['attention_mask'] # differentiate padding from non padding 
    token_type_id = encoded_dict['token_type_ids'] # differentiate two sentences, not "really" necessary for now    
    
    if len(input_id) > 512: # head + tail methodology 
        input_id = input_id[:512]  
        attention_mask = attention_mask[:512]  
        token_type_id = token_type_id[:512]     
        print("Long Text!! Using the first 512 tokens")
    elif len(input_id) <= 512: 
        input_id = input_id + [0]*(512 - len(input_id)) 
        attention_mask = attention_mask + [0]*(512 - len(attention_mask))
        token_type_id = token_type_id + [0]*(512 - len(token_type_id))  
        
    return input_id, attention_mask, token_type_id

In [16]:
train_data['data'] = train_data['사업명'] + " " + train_data['사업_부처명'] + " " + train_data['내역사업명'] + " " + train_data['과제명'] + " " + train_data['한글키워드'] + " " + train_data['요약문'] 

train_data.head(2)

Unnamed: 0,사업명,사업_부처명,내역사업명,과제명,한글키워드,요약문,label,data
0,농업기초기반연구,농촌진흥청,농산물안전성연구,유전정보를 활용한 새로운 해충 분류군 동정기술 개발,"뉴클레오티드 염기서열, 분자마커, 종 동정, 침샘, 전사체",새로운 해충분류군의 동정기술 개발 및 유입확산 추적 가 외래 및 돌발해충의 발생조사...,24,농업기초기반연구 농촌진흥청 농산물안전성연구 유전정보를 활용한 새로운 해충 분류군 동...
1,농업기초기반연구,농촌진흥청,농산물안전성연구,유전정보를 활용한 새로운 해충 분류군 동정기술 개발,"뉴클레오티드 염기서열, 분자마커, 종 동정, 침샘, 전사체",의 돌발 및 외래해충 다 외래 및 돌발해충의 유전적 다양성 조사 시험곤충 나 의 해...,24,농업기초기반연구 농촌진흥청 농산물안전성연구 유전정보를 활용한 새로운 해충 분류군 동...


In [18]:
train_text = train_data['data'].values 
train_labels = train_data['label'].values

In [19]:
BATCH_SIZE = 32
NUM_EPOCHS = 30
VALID_SPLIT = 0.1 
MAX_LEN = 512 # max token size for BERT, ELECTRA

In [20]:
N = train_data.shape[0] 

input_ids = np.zeros((N, MAX_LEN),dtype=int)
attention_masks = np.zeros((N, MAX_LEN),dtype=int)
token_type_ids = np.zeros((N, MAX_LEN),dtype=int) 
labels = np.zeros((N),dtype=int)

for i in tqdm(range(N), position=0, leave=True): 
    try:
        cur_str = train_text[i]
        cur_label = train_labels[i]
        input_id, attention_mask, token_type_id = bert_tokenizer(cur_str, MAX_LEN=MAX_LEN) 
        input_ids[i,] = input_id 
        attention_masks[i,] = attention_mask 
        token_type_ids[i,] = token_type_id
        labels[i] = cur_label 
    except Exception as e: 
        print(e)
        print(cur_str)
        pass

  5%|▍         | 79939/1638867 [02:14<44:54, 578.58it/s]  

Long Text!! Using the first 512 tokens


  8%|▊         | 122944/1638867 [03:25<39:41, 636.45it/s]

Long Text!! Using the first 512 tokens


  8%|▊         | 138415/1638867 [03:50<44:06, 566.94it/s]

Long Text!! Using the first 512 tokens


 11%|█         | 173087/1638867 [04:52<43:14, 565.02it/s]  

Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens


 11%|█         | 178421/1638867 [05:01<43:33, 558.73it/s]

Long Text!! Using the first 512 tokens


 14%|█▍        | 233820/1638867 [06:35<41:29, 564.34it/s]

Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens


 15%|█▌        | 251064/1638867 [07:05<46:31, 497.14it/s]

Long Text!! Using the first 512 tokens


 16%|█▌        | 258665/1638867 [07:18<39:36, 580.83it/s]

Long Text!! Using the first 512 tokens


 29%|██▉       | 475568/1638867 [13:22<31:28, 615.87it/s]

Long Text!! Using the first 512 tokens


 30%|██▉       | 489033/1638867 [13:45<34:00, 563.38it/s]

Long Text!! Using the first 512 tokens


 33%|███▎      | 545079/1638867 [15:22<29:11, 624.54it/s]

Long Text!! Using the first 512 tokens


 35%|███▍      | 569102/1638867 [16:01<30:13, 589.74it/s]

Long Text!! Using the first 512 tokens


 35%|███▌      | 576093/1638867 [16:13<28:20, 624.84it/s]

Long Text!! Using the first 512 tokens


 36%|███▌      | 582067/1638867 [16:24<28:29, 618.11it/s]

Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens


 38%|███▊      | 624191/1638867 [17:33<29:42, 569.24it/s]

Long Text!! Using the first 512 tokens


 40%|████      | 657183/1638867 [18:30<29:09, 561.09it/s]

Long Text!! Using the first 512 tokens


 44%|████▎     | 713166/1638867 [20:06<26:40, 578.34it/s]

Long Text!! Using the first 512 tokens


 45%|████▍     | 733685/1638867 [20:41<27:58, 539.26it/s]

Long Text!! Using the first 512 tokens


 45%|████▌     | 743931/1638867 [20:59<28:21, 526.06it/s]

Long Text!! Using the first 512 tokens


 46%|████▌     | 748278/1638867 [21:06<25:39, 578.48it/s]

Long Text!! Using the first 512 tokens


 49%|████▊     | 796773/1638867 [22:30<23:35, 594.89it/s]

Long Text!! Using the first 512 tokens


 49%|████▉     | 805568/1638867 [22:45<23:11, 599.05it/s]

Long Text!! Using the first 512 tokens


 52%|█████▏    | 854252/1638867 [24:07<21:52, 597.86it/s]

Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens


 55%|█████▍    | 895825/1638867 [25:19<23:04, 536.87it/s]

Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens


 58%|█████▊    | 947976/1638867 [26:51<20:12, 569.67it/s]

Long Text!! Using the first 512 tokens


 65%|██████▌   | 1068793/1638867 [30:15<13:56, 681.69it/s]

Long Text!! Using the first 512 tokens


 66%|██████▋   | 1087490/1638867 [30:46<15:33, 590.88it/s]

Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens


 72%|███████▏  | 1185207/1638867 [33:32<11:39, 648.85it/s]

Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens


 72%|███████▏  | 1186957/1638867 [33:35<13:16, 567.12it/s]

Long Text!! Using the first 512 tokens


 73%|███████▎  | 1199358/1638867 [33:56<13:11, 555.43it/s]

Long Text!! Using the first 512 tokens


 75%|███████▍  | 1222390/1638867 [34:36<13:06, 529.52it/s]

Long Text!! Using the first 512 tokens


 75%|███████▍  | 1225518/1638867 [34:41<11:45, 585.58it/s]

Long Text!! Using the first 512 tokens


 75%|███████▌  | 1233813/1638867 [34:55<12:49, 526.55it/s]

Long Text!! Using the first 512 tokens


 83%|████████▎ | 1353133/1638867 [38:19<07:57, 597.79it/s]

Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens


 84%|████████▍ | 1373809/1638867 [38:54<07:47, 567.37it/s]

Long Text!! Using the first 512 tokens


 86%|████████▌ | 1410435/1638867 [39:58<06:13, 611.22it/s]

Long Text!! Using the first 512 tokens


 87%|████████▋ | 1422052/1638867 [40:17<06:08, 588.26it/s]

Long Text!! Using the first 512 tokens


 89%|████████▉ | 1454967/1638867 [41:13<05:10, 593.05it/s]

Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens


 95%|█████████▍| 1554722/1638867 [44:07<02:34, 543.97it/s]

Long Text!! Using the first 512 tokens


100%|██████████| 1638867/1638867 [46:32<00:00, 586.81it/s]


In [21]:
input_ids = torch.tensor(input_ids, dtype=int)
attention_masks = torch.tensor(attention_masks, dtype=int)
token_type_ids = torch.tensor(token_type_ids, dtype=int) 
labels = torch.tensor(labels, dtype=int)


In [22]:
train_inputs, val_inputs, train_labels, val_labels = train_test_split(input_ids, labels, random_state = 5678, test_size = VALID_SPLIT, stratify=labels) 

train_attention_mask, val_attention_mask, _, _ = train_test_split(attention_masks, labels, random_state = 5678, test_size = VALID_SPLIT, stratify=labels) 

train_token_ids, val_token_ids, _, _ = train_test_split(token_type_ids, labels, random_state = 5678, test_size = VALID_SPLIT, stratify=labels) 


train_inputs.shape, train_attention_mask.shape, train_token_ids.shape, train_labels.shape, val_inputs.shape, val_attention_mask.shape, val_token_ids.shape, val_labels.shape


(torch.Size([1474980, 512]),
 torch.Size([1474980, 512]),
 torch.Size([1474980, 512]),
 torch.Size([1474980]),
 torch.Size([163887, 512]),
 torch.Size([163887, 512]),
 torch.Size([163887, 512]),
 torch.Size([163887]))

In [23]:
batch_size = 32 
train_data = TensorDataset(train_inputs, train_attention_mask, train_token_ids, train_labels) 
train_sampler = RandomSampler(train_data) 
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) 

validation_data = TensorDataset(val_inputs, val_attention_mask, val_token_ids, val_labels) 
validation_sampler = SequentialSampler(validation_data) 
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [24]:
model = BertForSequenceClassification.from_pretrained("skt/kobert-base-v1", num_labels=46)
model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=535.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=368792544.0, style=ProgressStyle(descri…




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [25]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

epochs = 10

total_steps = len(train_dataloader) * epochs 

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)


def flat_accuracy(preds, labels): 
    pred_flat = np.argmax(preds, axis=1).flatten() 
    labels_flat = labels.flatten() 
    return np.sum(pred_flat == labels_flat) / len(labels_flat) 

def format_time(elapsed):
    # 반올림
    elapsed_rounded = int(round((elapsed)))
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))


device = torch.device("cuda")


# 그래디언트 초기화
model.zero_grad()

# 에폭만큼 반복
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # 시작 시간 설정
    t0 = time.time()

    # 로스 초기화
    total_loss = 0

    # 훈련모드로 변경
    model.train()
        
    # 데이터로더에서 배치만큼 반복하여 가져옴
    for step, batch in enumerate(train_dataloader):
        # 경과 정보 표시
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            print('  current average loss = {}'.format(total_loss / step))

        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch

        # Forward 수행                
        outputs = model(b_input_ids, 
                        token_type_ids=b_token_type_ids, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        # 로스 구함
        loss = outputs[0]

        # 총 로스 계산
        total_loss += loss.item()

        # Backward 수행으로 그래디언트 계산
        loss.backward()

        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 그래디언트를 통해 가중치 파라미터 업데이트
        optimizer.step()

        # 스케줄러로 학습률 감소
        scheduler.step()

        # 그래디언트 초기화
        model.zero_grad()

    # 평균 로스 계산
    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    #시작 시간 설정
    t0 = time.time()

    # 평가모드로 변경
    model.eval()

    # 변수 초기화
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # 데이터로더에서 배치만큼 반복하여 가져옴
    for batch in validation_dataloader:
        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch
        
        # 그래디언트 계산 안함
        with torch.no_grad():     
            # Forward 수행
            outputs = model(b_input_ids, 
                            token_type_ids=b_token_type_ids, 
                            attention_mask=b_input_mask, 
                            labels = b_labels)
    
        
        loss = outputs[0] 
        logits = outputs[1] 
        
        # 로스 구함 
        eval_loss += loss.item() 
        
        # CPU로 데이터 이동
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # 출력 로짓과 라벨을 비교하여 정확도 계산
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    
    avg_val_loss = eval_loss / len(validation_dataloader)            
    print("  Average validation loss: {}".format(avg_val_loss))
    print("  Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
    
    torch.save(model.state_dict(), "KoBERT_DIFFERENT_CONFIG_" + str(epoch_i + 1)) 
    

print("")
print("Training complete!")



Training...
  Batch   500  of  46,094.    Elapsed: 0:08:22.
  current average loss = 1.0394093351811171
  Batch 1,000  of  46,094.    Elapsed: 0:16:43.
  current average loss = 0.8381642836257815
  Batch 1,500  of  46,094.    Elapsed: 0:25:04.
  current average loss = 0.7416638403125108
  Batch 2,000  of  46,094.    Elapsed: 0:33:26.
  current average loss = 0.6740569929247722
  Batch 2,500  of  46,094.    Elapsed: 0:41:48.
  current average loss = 0.6300022935830056
  Batch 3,000  of  46,094.    Elapsed: 0:50:09.
  current average loss = 0.5910384592091044
  Batch 3,500  of  46,094.    Elapsed: 0:58:30.
  current average loss = 0.5605949303901622
  Batch 4,000  of  46,094.    Elapsed: 1:06:52.
  current average loss = 0.5360350472899154
  Batch 4,500  of  46,094.    Elapsed: 1:15:15.
  current average loss = 0.5153200819359885
  Batch 5,000  of  46,094.    Elapsed: 1:23:35.
  current average loss = 0.49687080581914633
  Batch 5,500  of  46,094.    Elapsed: 1:31:56.
  current average 

  Batch 44,500  of  46,094.    Elapsed: 12:23:18.
  current average loss = 0.1899930844064573
  Batch 45,000  of  46,094.    Elapsed: 12:31:40.
  current average loss = 0.18873211899776038
  Batch 45,500  of  46,094.    Elapsed: 12:40:00.
  current average loss = 0.18748732914242913
  Batch 46,000  of  46,094.    Elapsed: 12:48:20.
  current average loss = 0.18634890301577917

  Average training loss: 0.1861113428295841
  Training epoch took: 12:49:53

Running Validation...
  Average validation loss: 0.06438649728138421
  Accuracy: 0.9831425712612261
  Validation took: 0:27:26

Training...
  Batch   500  of  46,094.    Elapsed: 0:08:11.
  current average loss = 0.06600610079642502
  Batch 1,000  of  46,094.    Elapsed: 0:16:17.
  current average loss = 0.067710450602739
  Batch 1,500  of  46,094.    Elapsed: 0:24:23.
  current average loss = 0.06737658379823551
  Batch 2,000  of  46,094.    Elapsed: 0:32:29.
  current average loss = 0.06834925241265591
  Batch 2,500  of  46,094.    Ela

  Batch 41,000  of  46,094.    Elapsed: 11:03:42.
  current average loss = 0.054669142743217755
  Batch 41,500  of  46,094.    Elapsed: 11:11:47.
  current average loss = 0.05452871523957312
  Batch 42,000  of  46,094.    Elapsed: 11:19:54.
  current average loss = 0.05436408282584212
  Batch 42,500  of  46,094.    Elapsed: 11:27:59.
  current average loss = 0.054278060767984936
  Batch 43,000  of  46,094.    Elapsed: 11:36:04.
  current average loss = 0.054115120612926254
  Batch 43,500  of  46,094.    Elapsed: 11:44:09.
  current average loss = 0.05396399882595376
  Batch 44,000  of  46,094.    Elapsed: 11:52:15.
  current average loss = 0.05386819957132477
  Batch 44,500  of  46,094.    Elapsed: 12:00:20.
  current average loss = 0.05369499149301955
  Batch 45,000  of  46,094.    Elapsed: 12:08:24.
  current average loss = 0.05358809171523438
  Batch 45,500  of  46,094.    Elapsed: 12:16:32.
  current average loss = 0.05346374924572696
  Batch 46,000  of  46,094.    Elapsed: 12:24:3

  Batch 37,500  of  46,094.    Elapsed: 10:05:36.
  current average loss = 0.03283405258456963
  Batch 38,000  of  46,094.    Elapsed: 10:13:40.
  current average loss = 0.032780946522839305
  Batch 38,500  of  46,094.    Elapsed: 10:21:46.
  current average loss = 0.032770042202409824
  Batch 39,000  of  46,094.    Elapsed: 10:29:50.
  current average loss = 0.03273770584886388
  Batch 39,500  of  46,094.    Elapsed: 10:37:54.
  current average loss = 0.03267756866744301
  Batch 40,000  of  46,094.    Elapsed: 10:45:57.
  current average loss = 0.03265709445078137
  Batch 40,500  of  46,094.    Elapsed: 10:54:02.
  current average loss = 0.03259372003371903
  Batch 41,000  of  46,094.    Elapsed: 11:02:06.
  current average loss = 0.03254446825784216
  Batch 41,500  of  46,094.    Elapsed: 11:10:10.
  current average loss = 0.03245914917617816
  Batch 42,000  of  46,094.    Elapsed: 11:18:14.
  current average loss = 0.03239633762990473
  Batch 42,500  of  46,094.    Elapsed: 11:26:18

  Batch 33,500  of  46,094.    Elapsed: 9:00:30.
  current average loss = 0.02275658724043219
  Batch 34,000  of  46,094.    Elapsed: 9:08:33.
  current average loss = 0.02272111285237952
  Batch 34,500  of  46,094.    Elapsed: 9:16:36.
  current average loss = 0.022736152013885208
  Batch 35,000  of  46,094.    Elapsed: 9:24:41.
  current average loss = 0.022705622164306
  Batch 35,500  of  46,094.    Elapsed: 9:32:45.
  current average loss = 0.022675974060059802
  Batch 36,000  of  46,094.    Elapsed: 9:40:49.
  current average loss = 0.022606908850986707
  Batch 36,500  of  46,094.    Elapsed: 9:48:53.
  current average loss = 0.022607344109733463
  Batch 37,000  of  46,094.    Elapsed: 9:56:57.
  current average loss = 0.022618271769158052
  Batch 38,000  of  46,094.    Elapsed: 10:13:05.
  current average loss = 0.022605720232340087
  Batch 38,500  of  46,094.    Elapsed: 10:21:10.
  current average loss = 0.022611596524168828
  Batch 39,000  of  46,094.    Elapsed: 10:29:13.
  c

  Batch 30,500  of  46,094.    Elapsed: 8:12:28.
  current average loss = 0.01699164782109366
  Batch 31,000  of  46,094.    Elapsed: 8:20:31.
  current average loss = 0.0169542149918045
  Batch 31,500  of  46,094.    Elapsed: 8:28:35.
  current average loss = 0.016946855733192413
  Batch 32,000  of  46,094.    Elapsed: 8:36:41.
  current average loss = 0.016978701326026965
  Batch 32,500  of  46,094.    Elapsed: 8:44:44.
  current average loss = 0.016948554242182525
  Batch 33,000  of  46,094.    Elapsed: 8:52:49.
  current average loss = 0.01692145025176703
  Batch 33,500  of  46,094.    Elapsed: 9:00:54.
  current average loss = 0.016856696222321832
  Batch 34,000  of  46,094.    Elapsed: 9:09:00.
  current average loss = 0.016863405845211298
  Batch 34,500  of  46,094.    Elapsed: 9:17:05.
  current average loss = 0.016833106087321262
  Batch 35,000  of  46,094.    Elapsed: 9:25:09.
  current average loss = 0.016851408787568654
  Batch 35,500  of  46,094.    Elapsed: 9:33:15.
  cur

  Batch 27,000  of  46,094.    Elapsed: 7:18:45.
  current average loss = 0.012686926726273062
  Batch 27,500  of  46,094.    Elapsed: 7:26:55.
  current average loss = 0.012640525932612284
  Batch 28,000  of  46,094.    Elapsed: 7:35:05.
  current average loss = 0.012621094191769868
  Batch 28,500  of  46,094.    Elapsed: 7:43:15.
  current average loss = 0.012659999775755989
  Batch 29,000  of  46,094.    Elapsed: 7:51:25.
  current average loss = 0.012760657999813515
  Batch 29,500  of  46,094.    Elapsed: 7:59:36.
  current average loss = 0.012771225776152462
  Batch 30,000  of  46,094.    Elapsed: 8:07:46.
  current average loss = 0.012829335567360855
  Batch 30,500  of  46,094.    Elapsed: 8:15:58.
  current average loss = 0.012802476792099607
  Batch 31,000  of  46,094.    Elapsed: 8:24:07.
  current average loss = 0.01276229858104172
  Batch 31,500  of  46,094.    Elapsed: 8:32:18.
  current average loss = 0.012709699723553057
  Batch 32,000  of  46,094.    Elapsed: 8:40:28.
  

  Batch 23,500  of  46,094.    Elapsed: 6:22:21.
  current average loss = 0.010000031937306553
  Batch 24,000  of  46,094.    Elapsed: 6:30:24.
  current average loss = 0.009956037282895675
  Batch 24,500  of  46,094.    Elapsed: 6:38:27.
  current average loss = 0.009973593374276696
  Batch 25,000  of  46,094.    Elapsed: 6:46:31.
  current average loss = 0.0100251623261434
  Batch 25,500  of  46,094.    Elapsed: 6:54:37.
  current average loss = 0.010048599723883622
  Batch 26,000  of  46,094.    Elapsed: 7:02:39.
  current average loss = 0.010066063236704184
  Batch 26,500  of  46,094.    Elapsed: 7:10:42.
  current average loss = 0.010053441841265047
  Batch 27,000  of  46,094.    Elapsed: 7:18:45.
  current average loss = 0.010048184325160575


KeyboardInterrupt: 

# Prediction Code

In [26]:
test_contents = test['요약문_내용'].values 
test_feature1 = test['사업명'].values 
test_feature2 = test['사업_부처명'].values 
test_feature3 = test['내역사업명'].values
test_feature4 = test['과제명'].values
test_feature5 = test['요약문_한글키워드'].values 
test_feature6 = test['요약문_영문키워드'].values 

In [27]:
# Load KoBERT
bert_checkpoint = torch.load('KoBERT_DIFFERENT_CONFIG_6') 
test_bert = BertForSequenceClassification.from_pretrained("skt/kobert-base-v1", num_labels=46) 
test_bert.load_state_dict(bert_checkpoint) 
test_bert.cuda() 
print() 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.





In [29]:
import torch.nn.functional as nnf

bert_probsum = [] 
bert_chunkparts = [] 

# change to eval mode 
test_bert.eval() 


for i in tqdm(range(test_contents.shape[0]), position = 0, leave=True): 
    f1 = clean_text(str(test_feature1[i])) 
    f2 = clean_text(str(test_feature2[i])) 
    f3 = clean_text(str(test_feature3[i])) 
    f4 = clean_text(str(test_feature4[i])) 
    f5 = str(test_feature5[i]) 
    splitted = split_text(clean_text(str(test_contents[i])))
    probabilities_bert = [] 
    for text in splitted: 
        test_text = f1 + " " + f2 + " " + f3 + " " + f4 + " " + f5 + " " + text 
        # tokenize test text 
        input_id, attention_mask, token_type_id = bert_tokenizer(test_text, MAX_LEN=MAX_LEN) 
        input_id = torch.tensor(input_id) 
        attention_mask = torch.tensor(attention_mask) 
        token_type_id = torch.tensor(token_type_id) 
        # reshape into (batch, MAX_LEN) 
        input_id = torch.reshape(input_id, (-1, MAX_LEN)) 
        attention_mask = torch.reshape(attention_mask, (-1, MAX_LEN))  
        token_type_id = torch.reshape(token_type_id, (-1, MAX_LEN)) 
        # move tensor to cuda 
        input_id = input_id.to(device) 
        attention_mask = attention_mask.to(device) 
        token_type_id = token_type_id.to(device) 
        
        with torch.no_grad(): 
            outputs = test_bert(input_id, 
                                token_type_ids=token_type_id, 
                                attention_mask = attention_mask) 
            
        logits = outputs[0] 
        
        # obtain softmax probabilities 
        prob = nnf.softmax(logits, dim=1).flatten() 
        probabilities_bert.append(prob) 
        
    # soft voting 
    prob_sum = np.zeros(46) 
    for i in range(len(probabilities_bert)): 
        for j in range(46):  
            prob_sum[j] += probabilities_bert[i][j] 
    
    bert_probsum.append(prob_sum)
    bert_chunkparts.append(len(probabilities_bert)) 
    

bert_probsum = np.asarray(bert_probsum) 
bert_chunkparts = np.asarray(bert_chunkparts) 



  7%|▋         | 3141/43576 [10:42<2:07:11,  5.30it/s]

Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens


 20%|█▉        | 8594/43576 [30:11<2:17:41,  4.23it/s]

Long Text!! Using the first 512 tokens


 62%|██████▏   | 27182/43576 [1:34:28<44:07,  6.19it/s]  

Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens
Long Text!! Using the first 512 tokens


100%|██████████| 43576/43576 [2:31:28<00:00,  4.79it/s]  

saving predicted probability values





NameError: name 'bert_probum' is not defined

In [30]:
print("saving predicted probability values") 
np.save('bert_probsum_2.npy', bert_probsum) 
np.save('bert_chunkparts_2.npy', bert_chunkparts) 
        

saving predicted probability values


In [32]:
bert_probsum

array([[1.39996986e+01, 2.92763457e-06, 2.31417268e-08, ...,
        8.18598153e-07, 3.20058433e-08, 1.07785363e-05],
       [1.59998016e+01, 1.82669692e-06, 6.93749271e-08, ...,
        1.44548790e-06, 3.81685567e-07, 9.28362078e-06],
       [5.99995518e+00, 3.86897739e-07, 9.11096620e-09, ...,
        3.82379937e-07, 3.51047014e-08, 3.50270693e-06],
       ...,
       [1.45434072e-07, 5.20826507e-06, 7.99969864e+00, ...,
        1.17021982e-05, 4.48026949e-05, 1.67731051e-09],
       [8.99993134e+00, 8.92271430e-07, 2.80280119e-08, ...,
        7.02174191e-07, 8.29793265e-08, 3.31755064e-06],
       [3.99960995e+00, 4.35817157e-07, 9.73338032e-09, ...,
        7.03599198e-07, 1.64671476e-08, 2.16815374e-06]])