In [16]:
import torch 
import torch.nn as nn 
import torch.functional as f
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
import time
import datetime
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import random 
import re 
import math 
from tqdm import tqdm 
import sklearn
from transformers import *
from kobert_tokenizer import KoBERTTokenizer

In [5]:
train = pd.read_csv("train.csv") 
test = pd.read_csv("test.csv") 

train.shape, test.shape, train['label'].nunique()

((174304, 13), (43576, 12), 46)

In [6]:
submission = pd.read_csv("sample_submission.csv")


In [7]:
def clean_text(sent):
    sent_clean=re.sub("[^가-힣ㄱ-하-ㅣ]", " ", sent)
    return sent_clean

In [8]:
def split_text(s, overlap = 20, chunk_size = 50): 
    total = [] 
    partial = [] 
    if len(s.split()) // (chunk_size - overlap) > 0:  
        n = len(s.split()) // (chunk_size - overlap) 
    else: 
        n = 1 
    for w in range(n): 
        if w == 0: 
            partial = s.split()[:chunk_size] 
            total.append(" ".join(partial)) 
        else:  
            partial = s.split()[w*(chunk_size - overlap):w*(chunk_size - overlap) + chunk_size]
            total.append(" ".join(partial)) 
    return total


In [9]:
train['요약문_내용'] = train['요약문_연구목표'] + train['요약문_연구내용'] + train['요약문_기대효과'] 
test['요약문_내용'] = test['요약문_연구목표'] + test['요약문_연구내용'] + test['요약문_기대효과']

In [10]:
train['요약문_내용'].fillna('NAN',inplace=True) 
test['요약문_내용'].fillna('NAN',inplace=True)

In [11]:
train['사업명'].fillna('NAN',inplace=True) 
train['사업_부처명'].fillna('NAN',inplace=True) 
train['내역사업명'].fillna('NAN',inplace=True) 
train['과제명'].fillna('NAN',inplace=True) 
train['요약문_한글키워드'].fillna('NAN',inplace=True)

In [12]:
contents = train['요약문_내용'].values 
feature1 = train['사업명'].values 
feature2 = train['사업_부처명'].values 
feature3 = train['내역사업명'].values 
feature4 = train['과제명'].values 
feature5 = train['요약문_한글키워드'].values 
feature6 = train['label'].values 

train_data = {'사업명':[],'사업_부처명':[],'내역사업명':[],'과제명':[],'한글키워드':[],'요약문':[],'label':[]} 

for i in tqdm(range(contents.shape[0]), position = 0, leave = True): 
    sample = str(contents[i]) 
    splitted_text = split_text(clean_text(sample)) 
    for t in splitted_text: 
        train_data['요약문'].append(t) 
        train_data['사업명'].append(clean_text(str(feature1[i])))
        train_data['사업_부처명'].append(clean_text(str(feature2[i]))) 
        train_data['내역사업명'].append(clean_text(str(feature3[i]))) 
        train_data['과제명'].append(clean_text(str(feature4[i])))  
        train_data['한글키워드'].append(feature5[i]) # no cleaning for this one
        train_data['label'].append(feature6[i])

100%|██████████| 174304/174304 [02:46<00:00, 1047.21it/s]


In [13]:
train_data = pd.DataFrame(train_data)

In [17]:
## Now we tokenize each data and make sure they all lie within the 512 tokenization range 
## if not check how many have token length greater than 512 

tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')


def bert_tokenizer(sent, MAX_LEN):  
    encoded_dict = tokenizer.encode_plus(
        text = sent, 
        add_special_tokens = True, # add [CLS] and [SEP]
        pad_to_max_length = False, 
        return_attention_mask = True # constructing attention_masks 
    )  
    
    input_id = encoded_dict['input_ids'] 
    attention_mask = encoded_dict['attention_mask'] # differentiate padding from non padding 
    token_type_id = encoded_dict['token_type_ids'] # differentiate two sentences, not "really" necessary for now    
    
    if len(input_id) > 512: # head + tail methodology 
        input_id = input_id[:129] + input_id[-383:] 
        attention_mask = attention_mask[:129] + attention_mask[-383:]  
        token_type_id = token_type_id[:129] + token_type_id[-383:]    
        print("Long Text!! Using Head+Tail Truncation")
    elif len(input_id) <= 512: 
        input_id = input_id + [0]*(512 - len(input_id)) 
        attention_mask = attention_mask + [0]*(512 - len(attention_mask))
        token_type_id = token_type_id + [0]*(512 - len(token_type_id))  
        
    return input_id, attention_mask, token_type_id


HBox(children=(IntProgress(value=0, description='Downloading', max=371427, style=ProgressStyle(description_wid…




In [18]:
train_data['data'] = train_data['사업명'] + " " + train_data['사업_부처명'] + " " + train_data['내역사업명'] + " " + train_data['과제명'] + " " + train_data['한글키워드'] + " " + train_data['요약문'] 

train_data.head(2)


Unnamed: 0,사업명,사업_부처명,내역사업명,과제명,한글키워드,요약문,label,data
0,농업기초기반연구,농촌진흥청,농산물안전성연구,유전정보를 활용한 새로운 해충 분류군 동정기술 개발,"뉴클레오티드 염기서열, 분자마커, 종 동정, 침샘, 전사체",새로운 해충분류군의 동정기술 개발 및 유입확산 추적 가 외래 및 돌발해충의 발생조사...,24,농업기초기반연구 농촌진흥청 농산물안전성연구 유전정보를 활용한 새로운 해충 분류군 동...
1,농업기초기반연구,농촌진흥청,농산물안전성연구,유전정보를 활용한 새로운 해충 분류군 동정기술 개발,"뉴클레오티드 염기서열, 분자마커, 종 동정, 침샘, 전사체",의 돌발 및 외래해충 다 외래 및 돌발해충의 유전적 다양성 조사 시험곤충 나 의 해...,24,농업기초기반연구 농촌진흥청 농산물안전성연구 유전정보를 활용한 새로운 해충 분류군 동...


In [19]:
train_text = train_data['data'].values 
train_labels = train_data['label'].values

In [20]:
BATCH_SIZE = 32
NUM_EPOCHS = 30
VALID_SPLIT = 0.1 
MAX_LEN = 512 # max token size for BERT, ELECTRA

In [21]:
N = train_data.shape[0] 

input_ids = np.zeros((N, MAX_LEN),dtype=int)
attention_masks = np.zeros((N, MAX_LEN),dtype=int)
token_type_ids = np.zeros((N, MAX_LEN),dtype=int) 
labels = np.zeros((N),dtype=int)

for i in tqdm(range(N), position=0, leave=True): 
    try:
        cur_str = train_text[i]
        cur_label = train_labels[i]
        input_id, attention_mask, token_type_id = bert_tokenizer(cur_str, MAX_LEN=MAX_LEN) 
        input_ids[i,] = input_id 
        attention_masks[i,] = attention_mask 
        token_type_ids[i,] = token_type_id
        labels[i] = cur_label 
    except Exception as e: 
        print(e)
        print(cur_str)
        pass

  5%|▍         | 80049/1638867 [01:52<22:59, 1130.27it/s] 

Long Text!! Using Head+Tail Truncation


  8%|▊         | 122947/1638867 [02:29<21:42, 1163.42it/s]

Long Text!! Using Head+Tail Truncation


  8%|▊         | 138519/1638867 [02:42<21:37, 1156.23it/s]

Long Text!! Using Head+Tail Truncation


 11%|█         | 173164/1638867 [03:13<21:30, 1135.35it/s]

Long Text!! Using Head+Tail Truncation
Long Text!! Using Head+Tail Truncation
Long Text!! Using Head+Tail Truncation


 11%|█         | 178516/1638867 [03:17<20:53, 1165.11it/s]

Long Text!! Using Head+Tail Truncation


 14%|█▍        | 233938/1638867 [04:07<20:50, 1123.53it/s]

Long Text!! Using Head+Tail Truncation
Long Text!! Using Head+Tail Truncation
Long Text!! Using Head+Tail Truncation


 15%|█▌        | 251108/1638867 [04:23<34:49, 664.00it/s] 

Long Text!! Using Head+Tail Truncation


 16%|█▌        | 258672/1638867 [04:30<21:53, 1051.06it/s]

Long Text!! Using Head+Tail Truncation


 29%|██▉       | 475668/1638867 [07:45<18:33, 1044.67it/s]

Long Text!! Using Head+Tail Truncation


 30%|██▉       | 489135/1638867 [07:58<16:34, 1156.11it/s]

Long Text!! Using Head+Tail Truncation


 33%|███▎      | 545212/1638867 [08:48<16:07, 1130.26it/s]

Long Text!! Using Head+Tail Truncation


 35%|███▍      | 569162/1638867 [09:10<16:07, 1106.13it/s]

Long Text!! Using Head+Tail Truncation


 35%|███▌      | 576159/1638867 [09:17<15:55, 1112.58it/s]

Long Text!! Using Head+Tail Truncation


 36%|███▌      | 582171/1638867 [09:23<16:44, 1052.24it/s]

Long Text!! Using Head+Tail Truncation
Long Text!! Using Head+Tail Truncation
Long Text!! Using Head+Tail Truncation


 38%|███▊      | 624326/1638867 [10:02<14:27, 1169.19it/s]

Long Text!! Using Head+Tail Truncation


 40%|████      | 657227/1638867 [10:32<18:01, 907.36it/s] 

Long Text!! Using Head+Tail Truncation


 44%|████▎     | 713308/1638867 [11:22<14:37, 1054.30it/s]

Long Text!! Using Head+Tail Truncation


 45%|████▍     | 733749/1638867 [11:41<12:52, 1171.60it/s]

Long Text!! Using Head+Tail Truncation


 45%|████▌     | 743962/1638867 [11:50<13:09, 1133.96it/s]

Long Text!! Using Head+Tail Truncation


 46%|████▌     | 748408/1638867 [11:54<12:55, 1148.91it/s]

Long Text!! Using Head+Tail Truncation


 49%|████▊     | 796909/1638867 [12:39<11:31, 1218.29it/s]

Long Text!! Using Head+Tail Truncation


 49%|████▉     | 805647/1638867 [12:47<15:17, 908.06it/s] 

Long Text!! Using Head+Tail Truncation


 52%|█████▏    | 854255/1638867 [13:29<10:44, 1217.72it/s]

Long Text!! Using Head+Tail Truncation
Long Text!! Using Head+Tail Truncation


 55%|█████▍    | 895946/1638867 [14:07<10:31, 1176.35it/s]

Long Text!! Using Head+Tail Truncation
Long Text!! Using Head+Tail Truncation


 58%|█████▊    | 947992/1638867 [14:54<10:12, 1127.90it/s]

Long Text!! Using Head+Tail Truncation


 65%|██████▌   | 1068835/1638867 [17:18<10:01, 948.43it/s]

Long Text!! Using Head+Tail Truncation


 66%|██████▋   | 1087517/1638867 [17:44<15:02, 610.67it/s] 

Long Text!! Using Head+Tail Truncation
Long Text!! Using Head+Tail Truncation
Long Text!! Using Head+Tail Truncation


 72%|███████▏  | 1185175/1638867 [20:00<17:22, 435.03it/s] 

Long Text!! Using Head+Tail Truncation
Long Text!! Using Head+Tail Truncation


 72%|███████▏  | 1186971/1638867 [20:03<08:53, 846.25it/s]

Long Text!! Using Head+Tail Truncation


 73%|███████▎  | 1199393/1638867 [20:22<11:09, 656.06it/s]

Long Text!! Using Head+Tail Truncation


 75%|███████▍  | 1222443/1638867 [20:56<12:19, 562.80it/s] 

Long Text!! Using Head+Tail Truncation


 75%|███████▍  | 1225591/1638867 [21:02<11:05, 620.93it/s]

Long Text!! Using Head+Tail Truncation


 75%|███████▌  | 1233850/1638867 [21:14<07:52, 857.39it/s]

Long Text!! Using Head+Tail Truncation


 83%|████████▎ | 1353132/1638867 [24:18<06:57, 684.62it/s] 

Long Text!! Using Head+Tail Truncation
Long Text!! Using Head+Tail Truncation


 84%|████████▍ | 1373834/1638867 [24:52<07:13, 611.38it/s] 

Long Text!! Using Head+Tail Truncation


 86%|████████▌ | 1410443/1638867 [25:54<06:09, 618.28it/s]

Long Text!! Using Head+Tail Truncation


 87%|████████▋ | 1422020/1638867 [26:12<04:16, 844.49it/s]

Long Text!! Using Head+Tail Truncation


 89%|████████▉ | 1454962/1638867 [27:05<03:55, 779.53it/s]

Long Text!! Using Head+Tail Truncation
Long Text!! Using Head+Tail Truncation


 95%|█████████▍| 1554766/1638867 [29:46<02:24, 581.80it/s] 

Long Text!! Using Head+Tail Truncation


100%|██████████| 1638867/1638867 [32:06<00:00, 850.76it/s] 


In [22]:
input_ids = torch.tensor(input_ids, dtype=int)
attention_masks = torch.tensor(attention_masks, dtype=int)
token_type_ids = torch.tensor(token_type_ids, dtype=int) 
labels = torch.tensor(labels, dtype=int)

In [23]:
train_inputs, val_inputs, train_labels, val_labels = train_test_split(input_ids, labels, random_state = 42, test_size = VALID_SPLIT) 

train_attention_mask, val_attention_mask, _, _ = train_test_split(attention_masks, labels, random_state = 42, test_size = VALID_SPLIT) 

train_token_ids, val_token_ids, _, _ = train_test_split(token_type_ids, labels, random_state = 42, test_size = VALID_SPLIT) 


train_inputs.shape, train_attention_mask.shape, train_token_ids.shape, train_labels.shape, val_inputs.shape, val_attention_mask.shape, val_token_ids.shape, val_labels.shape

(torch.Size([1474980, 512]),
 torch.Size([1474980, 512]),
 torch.Size([1474980, 512]),
 torch.Size([1474980]),
 torch.Size([163887, 512]),
 torch.Size([163887, 512]),
 torch.Size([163887, 512]),
 torch.Size([163887]))

In [24]:
batch_size = 32 
train_data = TensorDataset(train_inputs, train_attention_mask, train_token_ids, train_labels) 
train_sampler = RandomSampler(train_data) 
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) 

validation_data = TensorDataset(val_inputs, val_attention_mask, val_token_ids, val_labels) 
validation_sampler = SequentialSampler(validation_data) 
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [25]:
model = BertForSequenceClassification.from_pretrained("skt/kobert-base-v1", num_labels=46)
model.cuda()


HBox(children=(IntProgress(value=0, description='Downloading', max=535, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=368792544, style=ProgressStyle(description_…




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [27]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

epochs = 10

total_steps = len(train_dataloader) * epochs 

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)


def flat_accuracy(preds, labels): 
    pred_flat = np.argmax(preds, axis=1).flatten() 
    labels_flat = labels.flatten() 
    return np.sum(pred_flat == labels_flat) / len(labels_flat) 

def format_time(elapsed):
    # 반올림
    elapsed_rounded = int(round((elapsed)))
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))


device = torch.device("cuda")


# 재현을 위해 랜덤시드 고정
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# 그래디언트 초기화
model.zero_grad()

# 에폭만큼 반복
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # 시작 시간 설정
    t0 = time.time()

    # 로스 초기화
    total_loss = 0

    # 훈련모드로 변경
    model.train()
        
    # 데이터로더에서 배치만큼 반복하여 가져옴
    for step, batch in enumerate(train_dataloader):
        # 경과 정보 표시
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            print('  current average loss = {}'.format(total_loss / step))

        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch

        # Forward 수행                
        outputs = model(b_input_ids, 
                        token_type_ids=b_token_type_ids, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        # 로스 구함
        loss = outputs[0]

        # 총 로스 계산
        total_loss += loss.item()

        # Backward 수행으로 그래디언트 계산
        loss.backward()

        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 그래디언트를 통해 가중치 파라미터 업데이트
        optimizer.step()

        # 스케줄러로 학습률 감소
        scheduler.step()

        # 그래디언트 초기화
        model.zero_grad()

    # 평균 로스 계산
    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    #시작 시간 설정
    t0 = time.time()

    # 평가모드로 변경
    model.eval()

    # 변수 초기화
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # 데이터로더에서 배치만큼 반복하여 가져옴
    for batch in validation_dataloader:
        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch
        
        # 그래디언트 계산 안함
        with torch.no_grad():     
            # Forward 수행
            outputs = model(b_input_ids, 
                            token_type_ids=b_token_type_ids, 
                            attention_mask=b_input_mask, 
                            labels = b_labels)
    
        
        loss = outputs[0] 
        logits = outputs[1] 
        
        # 로스 구함 
        eval_loss += loss.item() 
        
        # CPU로 데이터 이동
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # 출력 로짓과 라벨을 비교하여 정확도 계산
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    
    avg_val_loss = eval_loss / len(validation_dataloader)            
    print("  Average validation loss: {}".format(avg_val_loss))
    print("  Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
    
    torch.save(model.state_dict(), "KoBERT_MORE_FEATURES_" + str(epoch_i + 1)) 
    

print("")
print("Training complete!")



Training...
  Batch   500  of  46,094.    Elapsed: 0:08:03.
  current average loss = 1.012136717170477
  Batch 1,000  of  46,094.    Elapsed: 0:16:06.
  current average loss = 0.8487846918702125
  Batch 1,500  of  46,094.    Elapsed: 0:24:01.
  current average loss = 0.7619319985459249
  Batch 2,000  of  46,094.    Elapsed: 0:31:58.
  current average loss = 0.6991074663512409
  Batch 2,500  of  46,094.    Elapsed: 0:39:59.
  current average loss = 0.6520356897130608
  Batch 3,000  of  46,094.    Elapsed: 0:48:00.
  current average loss = 0.6144530848897993
  Batch 3,500  of  46,094.    Elapsed: 0:55:59.
  current average loss = 0.5845445481411048
  Batch 4,000  of  46,094.    Elapsed: 1:03:58.
  current average loss = 0.5583495307192207
  Batch 4,500  of  46,094.    Elapsed: 1:11:56.
  current average loss = 0.536209449806975
  Batch 5,000  of  46,094.    Elapsed: 1:19:59.
  current average loss = 0.5163870525781066
  Batch 5,500  of  46,094.    Elapsed: 1:28:02.
  current average los

  Batch 44,500  of  46,094.    Elapsed: 11:55:10.
  current average loss = 0.19834581152815373
  Batch 45,000  of  46,094.    Elapsed: 12:03:14.
  current average loss = 0.19703827457262021
  Batch 45,500  of  46,094.    Elapsed: 12:11:18.
  current average loss = 0.1958123529157064
  Batch 46,000  of  46,094.    Elapsed: 12:19:22.
  current average loss = 0.19453318259684285

  Average training loss: 0.19427929408840847
  Training epoch took: 12:20:52

Running Validation...
  Average validation loss: 0.06462574088643923
  Accuracy: 0.9828192112456072
  Validation took: 0:27:01

Training...
  Batch   500  of  46,094.    Elapsed: 0:08:04.
  current average loss = 0.0735813626376912
  Batch 1,000  of  46,094.    Elapsed: 0:16:08.
  current average loss = 0.07348795195561252
  Batch 1,500  of  46,094.    Elapsed: 0:24:11.
  current average loss = 0.07189535331759059
  Batch 2,000  of  46,094.    Elapsed: 0:32:13.
  current average loss = 0.07123592855608149
  Batch 2,500  of  46,094.    E

  Batch 41,500  of  46,094.    Elapsed: 11:04:08.
  current average loss = 0.05708305598017185
  Batch 42,000  of  46,094.    Elapsed: 11:12:08.
  current average loss = 0.0568367514252251
  Batch 42,500  of  46,094.    Elapsed: 11:20:08.
  current average loss = 0.056669108492812065
  Batch 43,000  of  46,094.    Elapsed: 11:28:09.
  current average loss = 0.05659289677012397
  Batch 43,500  of  46,094.    Elapsed: 11:36:08.
  current average loss = 0.056490203324088994
  Batch 44,000  of  46,094.    Elapsed: 11:44:07.
  current average loss = 0.056349227775597685
  Batch 44,500  of  46,094.    Elapsed: 11:52:07.
  current average loss = 0.05622186168383747
  Batch 45,000  of  46,094.    Elapsed: 12:00:08.
  current average loss = 0.05605352304925165
  Batch 45,500  of  46,094.    Elapsed: 12:08:07.
  current average loss = 0.055918187949147616
  Batch 46,000  of  46,094.    Elapsed: 12:16:09.
  current average loss = 0.055825726943660295

  Average training loss: 0.055811768940015896

  Batch 38,000  of  46,094.    Elapsed: 10:07:08.
  current average loss = 0.03372720736918717
  Batch 38,500  of  46,094.    Elapsed: 10:15:04.
  current average loss = 0.033669133375709884
  Batch 39,000  of  46,094.    Elapsed: 10:23:00.
  current average loss = 0.033653126131244604
  Batch 39,500  of  46,094.    Elapsed: 10:30:56.
  current average loss = 0.03358774067340756
  Batch 40,000  of  46,094.    Elapsed: 10:38:53.
  current average loss = 0.0335658948686314
  Batch 40,500  of  46,094.    Elapsed: 10:46:49.
  current average loss = 0.03349182855560898
  Batch 41,000  of  46,094.    Elapsed: 10:54:46.
  current average loss = 0.03344140454306336
  Batch 41,500  of  46,094.    Elapsed: 11:02:43.
  current average loss = 0.0334113194935791
  Batch 42,000  of  46,094.    Elapsed: 11:10:40.
  current average loss = 0.033370883509014244
  Batch 42,500  of  46,094.    Elapsed: 11:18:37.
  current average loss = 0.033306171617194284
  Batch 43,000  of  46,094.    Elapsed: 11:26:33

  Batch 34,500  of  46,094.    Elapsed: 9:06:49.
  current average loss = 0.023453376737286792
  Batch 35,000  of  46,094.    Elapsed: 9:14:45.
  current average loss = 0.02341307201980027
  Batch 35,500  of  46,094.    Elapsed: 9:22:40.
  current average loss = 0.023384578650245548
  Batch 36,000  of  46,094.    Elapsed: 9:30:36.
  current average loss = 0.023348301164953327
  Batch 36,500  of  46,094.    Elapsed: 9:38:32.
  current average loss = 0.02331298953354028
  Batch 37,000  of  46,094.    Elapsed: 9:46:28.
  current average loss = 0.023272247537511812
  Batch 37,500  of  46,094.    Elapsed: 9:54:24.
  current average loss = 0.023213474455382334
  Batch 38,000  of  46,094.    Elapsed: 10:02:19.
  current average loss = 0.02321040679187215
  Batch 38,500  of  46,094.    Elapsed: 10:10:16.
  current average loss = 0.023141091609283966
  Batch 39,000  of  46,094.    Elapsed: 10:18:11.
  current average loss = 0.023136299335012107
  Batch 39,500  of  46,094.    Elapsed: 10:26:08.


  Batch 30,500  of  46,094.    Elapsed: 8:03:46.
  current average loss = 0.0174807159249714
  Batch 31,000  of  46,094.    Elapsed: 8:11:42.
  current average loss = 0.01744277164569159
  Batch 31,500  of  46,094.    Elapsed: 8:19:37.
  current average loss = 0.017451268908946365
  Batch 32,000  of  46,094.    Elapsed: 8:27:33.
  current average loss = 0.017404676073835845
  Batch 32,500  of  46,094.    Elapsed: 8:35:29.
  current average loss = 0.017354537796498894
  Batch 33,000  of  46,094.    Elapsed: 8:43:25.
  current average loss = 0.017363794091607567
  Batch 33,500  of  46,094.    Elapsed: 8:51:20.
  current average loss = 0.017302216591050624
  Batch 34,000  of  46,094.    Elapsed: 8:59:16.
  current average loss = 0.017290502672276917
  Batch 34,500  of  46,094.    Elapsed: 9:07:13.
  current average loss = 0.0172806361475777
  Batch 35,000  of  46,094.    Elapsed: 9:15:08.
  current average loss = 0.01725040773587295
  Batch 35,500  of  46,094.    Elapsed: 9:23:04.
  curre

  Batch 26,500  of  46,094.    Elapsed: 6:59:02.
  current average loss = 0.013208998992952197
  Batch 27,000  of  46,094.    Elapsed: 7:06:59.
  current average loss = 0.013184609280249553
  Batch 27,500  of  46,094.    Elapsed: 7:14:54.
  current average loss = 0.013158093444159188
  Batch 28,000  of  46,094.    Elapsed: 7:22:48.
  current average loss = 0.01311977599105288
  Batch 28,500  of  46,094.    Elapsed: 7:30:43.
  current average loss = 0.013063515865803422
  Batch 29,000  of  46,094.    Elapsed: 7:38:37.
  current average loss = 0.013053384128782234
  Batch 29,500  of  46,094.    Elapsed: 7:46:33.
  current average loss = 0.0130552005806354
  Batch 30,000  of  46,094.    Elapsed: 7:54:29.
  current average loss = 0.013050165781059574
  Batch 30,500  of  46,094.    Elapsed: 8:02:25.
  current average loss = 0.013045083078542642
  Batch 31,000  of  46,094.    Elapsed: 8:10:19.
  current average loss = 0.013075703313337374
  Batch 31,500  of  46,094.    Elapsed: 8:18:12.
  cu

  Batch 22,500  of  46,094.    Elapsed: 5:56:53.
  current average loss = 0.010378909692242005
  Batch 23,000  of  46,094.    Elapsed: 6:04:48.
  current average loss = 0.010405461286220089
  Batch 23,500  of  46,094.    Elapsed: 6:12:44.
  current average loss = 0.010457647192702388
  Batch 24,000  of  46,094.    Elapsed: 6:20:40.
  current average loss = 0.010487719408225


KeyboardInterrupt: 

# Make Prediction

In [None]:
# load best checkpoint model
checkpoint = torch.load('') 
test_model = BertForSequenceClassification.from_pretrained("skt/kobert-base-v1", num_labels=46)
test_model.load_state_dict(checkpoint) 
test_model.cuda()


In [None]:
import torch.nn.functional as nnf

test['사업명'].fillna('NAN',inplace=True) 
test['사업_부처명'].fillna('NAN',inplace=True) 
test['내역사업명'].fillna('NAN',inplace=True) 
test['과제명'].fillna('NAN',inplace=True) 
test['요약문_한글키워드'].fillna('NAN',inplace=True)


test_contents = test['요약문_내용'].values 
test_feature1 = test['사업명'].values 
test_feature2 = test['사업_부처명'].values 
test_feature3 = test['내역사업명'].values 
test_feature4 = test['과제명'].values 
test_feature5 = test['요약문_한글키워드'].values 

predicted_classes = []

# change to eval mode 
test_model.eval() 

for i in tqdm(range(test_contents.shape[0]), position=0, leave=True):
    f1 = clean_text(str(test_feature1[i])) 
    f2 = clean_text(str(test_feature2[i])) 
    f3 = clean_text(str(test_feature3[i])) 
    f4 = clean_text(str(test_feature4[i])) 
    f5 = str(test_feature5[i]) 
    splitted = split_text(str(test_contents[i])) 
    # make predictions for each splitted text 
    probabilities = [] 
    for text in splitted: 
        test_text = f1 + " " + f2 + " " + f3 + " " + f4 + " " + f5 + " " + text 
        # tokenize test text 
        input_id, attention_mask, token_type_id = electra_tokenizer(test_text, MAX_LEN=MAX_LEN) 
        input_id = torch.tensor(input_id)
        attention_mask = torch.tensor(attention_mask) 
        token_type_id = torch.tensor(token_type_id) 
        # reshape into (batch, MAX_LEN)
        input_id = torch.reshape(input_id, (-1,MAX_LEN)) 
        attention_mask = torch.reshape(attention_mask, (-1,MAX_LEN)) 
        token_type_id = torch.reshape(token_type_id, (-1,MAX_LEN)) 
        # move tensor to cuda 
        input_id = input_id.to(device) 
        attention_mask = attention_mask.to(device) 
        token_type_id = token_type_id.to(device) 
        
        with torch.no_grad(): 
            outputs = test_model(input_id, 
                            token_type_ids=token_type_id, 
                            attention_mask=attention_mask) 
        
        logits = outputs[0]
        
        # obtain softmax probabilities 
        prob = nnf.softmax(logits, dim=1).flatten()
        probabilities.append(prob)
    
    # soft voting 
    prob_sum = np.zeros(46) 
    for i in range(len(probabilities)): 
        for j in range(46): 
            prob_sum[j] += probabilities[i][j] 
            
    
    prob_sum /= len(probabilities)  
    
    predicted_class = np.argmax(prob_sum)
    
    predicted_classes.append(predicted_class)


In [None]:
submission.iloc[:,1] = predicted_classes 


submission.to_csv("KOBERT_voting.csv",index=False)


submission
