In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

from KoBERT.kobert.utils import get_tokenizer
from KoBERT.kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

import pandas as pd
from sklearn.model_selection import train_test_split

print(torch.cuda.device_count())

device = torch.device("cuda:0")

ModuleNotFoundError: No module named 'gluonnlp'

In [2]:
bertmodel, vocab = get_pytorch_kobert_model()

using cached model
using cached model


In [3]:
RANDOM_SEED = 17

In [4]:
dataset = pd.read_csv("data/train_data.csv",index_col=False)
# rd_augmentation = pd.read_csv("data/train_rd_augmentation.csv",index_col=False)
# rs_augmentation = pd.read_csv("data/train_rs_augmentation.csv",index_col=False)
test = pd.read_csv("data/test_data.csv",index_col=False)

In [5]:
# total = pd.concat([dataset,rd_augmentation,rs_augmentation])

In [5]:
# total = total[["title","topic_idx"]]

In [7]:
# total

Unnamed: 0,title,topic_idx
0,인천→핀란드 항공기 결항…휴가철 여행객 분통,4
1,실리콘밸리 넘어서겠다…구글 15조원 들여 美전역 거점화,4
2,이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것,4
3,NYT 클린턴 측근韓기업 특수관계 조명…공과 사 맞물려종합,4
4,시진핑 트럼프에 중미 무역협상 조속 타결 희망,4
...,...,...
45649,KB 공략 미국 선진국 스티펠 과 제휴 … IB 시장 금융,1
45650,1 보 서울시 교육청 신종 코로나 확산 에 개학 연기 · 휴업 검토,2
45651,게시판 키움증권 키움 2020 투자 실전 영웅전 대회,1
45652,는 답변 하 배기동 국립 중앙 박물 관장,2


In [5]:
dataset_train, dataset_val = train_test_split(dataset,test_size = 0.2,random_state = RANDOM_SEED)

In [6]:
dataset_train.head

<bound method NDFrame.head of        index                            title  topic_idx
25339  25339         더민주 서영교 여파 지역위원장 심사기준 강화          6
24704  24704      맛집에 너그러운 한국인 해외여행서도 JMT 찾았다          3
1834    1834        특징주 삼성물산 지배구조 이슈 부각에 강세종합          1
17604  17604   생필품난 베네수엘라 콜롬비아와의 국경 1년 만에 재개방          4
19362  19362        금태섭 국민 10명 중 8명 판결문 공개 원해          6
...      ...                              ...        ...
25631  25631      7번째 개관하는 소극장 운동 발원지 삼일로창고극장          3
42297  42297    삼강엠앤티 516억원 규모 케미컬 탱크 3척 공급계약          1
33174  33174    이란 외무장관 美 제재 해제하면 협상의 문 활짝 열려          4
34959  34959     제주·남부 지방에 호우특보…완도 165.5㎜ 장대비          3
10863  10863  영천 새마을금고 강도 범행 6시간만에 검거…범행동기 조사          2

[36523 rows x 3 columns]>

In [7]:
from koeda import EasyDataAugmentation

EDA = EasyDataAugmentation(
    morpheme_analyzer=None, alpha_sr=0, alpha_ri=0.2, alpha_rs=0.2, prob_rd=0.2
)

def augment_data(dataset_df,EDA,repetition_num):

    augmented_list = []
    label_list = []

    for text, label in zip(dataset_df["title"],dataset_df["topic_idx"]):
        augmenteds = EDA(data=text, p=None, repetition=repetition_num)

        for aug in augmenteds:
            augmented_list.append(aug)
            label_list.append(label)

    new_df = pd.DataFrame({
        'title' : augmented_list,
        'topic_idx' : label_list
    })

    return new_df

In [8]:
aug_df = augment_data(dataset_train,EDA,4)

In [9]:
dataset_train = pd.concat([dataset_train,aug_df])

In [10]:
dataset_val

Unnamed: 0,index,title,topic_idx
33284,33284,아시안게임 손흥민 보자…교민 응원 속에 태극전사 ...,5
20725,20725,최근 3년간 세종문화회관 매표 고객 중 71%가 여성,3
44150,44150,오케스트라 연주로 만나는 픽사 애니메이션 대표작 16편,3
8817,8817,관훈클럽 창립 62주년 기념식,3
886,886,파키스탄 시장에서 폭탄테러…최소 16명 사망,4
...,...,...,...
280,280,손흥민 선발로 73분 활약…리그 8호골은 다음에,5
7239,7239,힐러리 IS 트럼프가 대통령 되길 간절히 기도하고 있다,4
29933,29933,손흥민의 토트넘 UEFA챔스리그서 레알 마드리드와 한 조,5
10038,10038,신간 개헌전쟁·나의 형 체 게바라·핸드 투 마우스,3


In [11]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

class BERTDataset(Dataset):
    def __init__(self, dataset, sent_key, label_key, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i]) for i in dataset[sent_key]]
        
        if not label_key == None:
            self.mode = "train"
        else:
            self.mode = "test"
            
        if self.mode == "train":
            self.labels = [np.int32(i) for i in dataset[label_key]]
        else:
            self.labels = [np.int32(0) for i in dataset[sent_key]]

    def __getitem__(self, i):
        if self.mode == "train":
            return (self.sentences[i] + (self.labels[i], ))
        else:
            return self.sentences[i]

    def __len__(self):
        return (len(self.labels))


using cached model


In [13]:
## Setting parameters
max_len = 40
batch_size = 16
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5
NUM_CLASS = 7
MODEL_P = "models/kobert-0724-2.pth"

epochs_no_improve = 0
min_val_loss = np.Inf
n_epochs_stop = 2


In [14]:
# data_total = BERTDataset(dataset, "title", "topic_idx", tok, max_len, True, False)
data_train = BERTDataset(dataset_train, "title", "topic_idx", tok, max_len, True, False)
data_val = BERTDataset(dataset_val, "title", "topic_idx", tok, max_len, True, False)
data_test = BERTDataset(test, "title", None, tok, max_len, True, False)

# total_dataloader = torch.utils.data.DataLoader(data_total, batch_size=batch_size, num_workers=5)
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
val_dataloader = torch.utils.data.DataLoader(data_val, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

In [15]:


class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=NUM_CLASS,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)


In [16]:
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

In [17]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        
        label = label.long().to(device)
        
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    
    model.eval()
    val_loss = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(val_dataloader)):
        
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out,label)
        
        val_loss += loss.data.cpu().numpy()
        test_acc += calc_accuracy(out, label)
    
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
    print("epoch {} val_loss {}".format(e+1, val_loss / (batch_id+1)))
        
    if val_loss < min_val_loss:
        torch.save(model, MODEL_P)
        epochs_no_improve = 0
        min_val_loss = val_loss
    else :
        epochs_no_improve += 1

    if epochs_no_improve == n_epochs_stop:
        print('Early stopping!')
        early_stop = True
        break
    else:
        print("Keep going!")
        continue

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/11414 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.942183017730713 train acc 0.25
epoch 1 batch id 201 loss 1.8889559507369995 train acc 0.15640547263681592
epoch 1 batch id 401 loss 1.6556187868118286 train acc 0.23675187032418954
epoch 1 batch id 601 loss 0.8542299270629883 train acc 0.39548668885191346
epoch 1 batch id 801 loss 0.6733460426330566 train acc 0.5038233458177278
epoch 1 batch id 1001 loss 0.528282642364502 train acc 0.5720529470529471
epoch 1 batch id 1201 loss 0.3402792811393738 train acc 0.6200041631973355
epoch 1 batch id 1401 loss 0.32762253284454346 train acc 0.6550678087080657
epoch 1 batch id 1601 loss 0.16090533137321472 train acc 0.6808635227982511
epoch 1 batch id 1801 loss 0.5848179459571838 train acc 0.7021446418656302
epoch 1 batch id 2001 loss 0.6720952987670898 train acc 0.7188280859570215
epoch 1 batch id 2201 loss 0.803919792175293 train acc 0.73284870513403
epoch 1 batch id 2401 loss 0.9471951723098755 train acc 0.7421386922115785
epoch 1 batch id 2601 loss 0.20263555645942688

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/571 [00:00<?, ?it/s]

epoch 1 test acc 0.8588003502626971
epoch 1 val_loss 0.5371990839929537
Keep going!


  0%|          | 0/11414 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.5677416920661926 train acc 0.8125
epoch 2 batch id 201 loss 0.37380337715148926 train acc 0.8852611940298507
epoch 2 batch id 401 loss 0.32049328088760376 train acc 0.8820137157107232
epoch 2 batch id 601 loss 0.5624613761901855 train acc 0.8825915141430949
epoch 2 batch id 801 loss 0.47818702459335327 train acc 0.8847534332084894
epoch 2 batch id 1001 loss 0.33665427565574646 train acc 0.8873626373626373
epoch 2 batch id 1201 loss 0.05148313194513321 train acc 0.891184429641965
epoch 2 batch id 1401 loss 0.6474764347076416 train acc 0.8931566738044254
epoch 2 batch id 1601 loss 0.050547949969768524 train acc 0.8941286695815116
epoch 2 batch id 1801 loss 0.6650224924087524 train acc 0.8964117157134925
epoch 2 batch id 2001 loss 0.8324832916259766 train acc 0.8969577711144427
epoch 2 batch id 2201 loss 0.21268124878406525 train acc 0.898682417083144
epoch 2 batch id 2401 loss 1.146361231803894 train acc 0.8979591836734694
epoch 2 batch id 2601 loss 0.0319000370

  0%|          | 0/571 [00:00<?, ?it/s]

epoch 2 test acc 0.8627408056042032
epoch 2 val_loss 0.6258215382287439
Keep going!


  0%|          | 0/11414 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.07210821658372879 train acc 1.0
epoch 3 batch id 201 loss 0.4218697249889374 train acc 0.9402985074626866
epoch 3 batch id 401 loss 0.08517388999462128 train acc 0.934071072319202
epoch 3 batch id 601 loss 0.5581591725349426 train acc 0.9355241264559068
epoch 3 batch id 801 loss 0.13398027420043945 train acc 0.9371878901373284
epoch 3 batch id 1001 loss 0.3495432734489441 train acc 0.9381868131868132
epoch 3 batch id 1201 loss 0.5713797807693481 train acc 0.9400499583680266
epoch 3 batch id 1401 loss 0.15126366913318634 train acc 0.9412473233404711
epoch 3 batch id 1601 loss 0.038682159036397934 train acc 0.9421064959400375
epoch 3 batch id 1801 loss 0.10061424970626831 train acc 0.9433300943920044
epoch 3 batch id 2001 loss 0.07882269471883774 train acc 0.9441529235382309
epoch 3 batch id 2201 loss 0.047816287726163864 train acc 0.94525215810995
epoch 3 batch id 2401 loss 0.41999951004981995 train acc 0.9437473969179508
epoch 3 batch id 2601 loss 0.0066842981

  0%|          | 0/571 [00:00<?, ?it/s]

epoch 3 test acc 0.8661339754816112
epoch 3 val_loss 0.6897610888181127
Early stopping!


In [18]:
model.eval()
outs = []

for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(test_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length= valid_length
    out = model(token_ids, valid_length, segment_ids)
        
    for o in out.detach():
        outs.append(o.tolist())

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/571 [00:00<?, ?it/s]

In [38]:
kobert_test_pred = pd.DataFrame(outs)
kobert_test_pred.to_csv("results/kobert_test_pred.csv",index=False)

In [33]:
model.eval()
outs = []

for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(total_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length= valid_length
    out = model(token_ids, valid_length, segment_ids)
        
    for o in out.detach():
        outs.append(o.tolist())

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/2854 [00:00<?, ?it/s]

In [34]:
len(outs[0])

7

In [35]:
kobert_train_pred = pd.DataFrame(outs)
kobert_train_pred.to_csv("results/kobert_train_pred.csv",index=False)

In [21]:
model = torch.load(MODEL_P)

In [26]:
model.eval()
outs = []

for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(test_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length= valid_length
    out = model(token_ids, valid_length, segment_ids)
        
    for o in out.detach():
        outs.append(o.tolist())

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/571 [00:00<?, ?it/s]

In [27]:
outs[0]

[7.34231424331665,
 -0.20525412261486053,
 1.046838402748108,
 0.006420684978365898,
 -2.0384185314178467,
 -3.0738162994384766,
 -2.5687575340270996]

In [28]:
kobert_best_test_pred = pd.DataFrame(outs)
kobert_best_test_pred.to_csv("results/kobert_best_test_pred.csv",index=False)

In [29]:
model.eval()
outs = []

for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(total_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length= valid_length
    out = model(token_ids, valid_length, segment_ids)
        
    for o in out.detach():
        outs.append(o.tolist())

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/2854 [00:00<?, ?it/s]

In [30]:
kobert_train_pred = pd.DataFrame(outs)
kobert_train_pred.to_csv("results/kobert_best_train_pred.csv",index=False)

In [22]:
model.eval()
pred = []

for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(test_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length= valid_length
    out = model(token_ids, valid_length, segment_ids)
        
    _,max_indices = torch.max(out,1)
    for idx in max_indices.cpu().numpy():
        pred.append(idx)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/571 [00:00<?, ?it/s]

In [23]:
submission = pd.read_csv('data/sample_submission.csv')
submission['topic_idx'] = pred
submission.to_csv("results/kobert-aumented2.csv",index=False)