In [1]:
import pandas as pd
import numpy as np
import re
import gluonnlp as nlp
from gluonnlp.data import SentencepieceTokenizer
from kobert.utils import get_tokenizer
import torch
from torch import nn
from kobert.pytorch_kobert import get_pytorch_kobert_model
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from transformers.optimization import WarmupLinearSchedule

In [32]:
from tqdm import tqdm, tqdm_notebook

In [36]:
!pip install mxnet-cu101
!pip install gluonnlp pandas tqdm
!pip install sentencepiece==0.1.85
!pip install transformers==2.1.1
!pip install torch==1.3.1

Collecting torch==1.3.1
  Using cached https://files.pythonhosted.org/packages/88/95/90e8c4c31cfc67248bf944ba42029295b77159982f532c5689bcfe4e9108/torch-1.3.1-cp36-cp36m-manylinux1_x86_64.whl
[31mERROR: torchvision 0.8.1+cu101 has requirement torch==1.7.0, but you'll have torch 1.3.1 which is incompatible.[0m
Installing collected packages: torch
  Found existing installation: torch 1.7.0
    Uninstalling torch-1.7.0:
      Successfully uninstalled torch-1.7.0
Successfully installed torch-1.3.1


In [2]:
from google.colab import drive
drive.mount("/gdrive", force_remount = True)

Mounted at /gdrive


In [3]:
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-wc002yp3
  Running command git clone -q 'https://****@github.com/SKTBrain/KoBERT.git' /tmp/pip-req-build-wc002yp3
Building wheels for collected packages: kobert
  Building wheel for kobert (setup.py) ... [?25l[?25hdone
  Created wheel for kobert: filename=kobert-0.1.1-cp36-none-any.whl size=12825 sha256=b558efc899de415e50a8326a91358d5e8fa8cdfd2790e2c8eac0fd92ad32da91
  Stored in directory: /tmp/pip-ephem-wheel-cache-8s_z4vao/wheels/a2/b0/41/435ee4e918f91918be41529283c5ff86cd010f02e7525aecf3
Successfully built kobert


In [4]:
cd /gdrive/MyDrive/NLP

/gdrive/MyDrive/NLP


In [5]:
device = torch.device("cuda:0")

In [6]:
# pandas 설정: 각 column 별 (혹은 한 cell에서) 출력되는 글자수 제한을 없애기
pd.set_option('display.max_colwidth', -1)
df = pd.read_csv('./data/몽데이크_Open.csv')

  


In [7]:
# 개수 체크하기
print(len(df[df['text']=='다음 중 옳은 것은?']))
print(len(df[df['text']=='다음 중 옳지 않은 것은?']))

10
5


In [8]:
# 15개 밖에 없기 때문에 해도 무방할듯?
df = df[(df['text'] !='다음 중 옳은 것은?') & (df['text'] !='다음 중 옳지 않은 것은?')]

In [9]:
# qtid값 체크
df['qtid'] = df['qtid'].apply(lambda x: x[0:7])

In [10]:
# Math symbols
math_terms = ['sin', 'cos', 'tan', #Triangle function
             'alpha', 'beta', 'theta', 'gamma', 'omega', 'phi', #Roman letters
             'uu', 'nn', 'sup', 'sub', '^C', #Sets
             'pi', 'abs', 'sqrt', 'cdots', #Etc
             '!', '(', ')', '{', '}', '[', ']', '+', '|', '/_', '@', '=' 
             ]

# 좌표 괄호와 일반 괄호 구분 : 추가 예정

# Math symbols which need regular expression
math_terms_complex = [r'_[0-9]+C_[0-9+]', #Combination
                     r'_[0-9]+P_[0-9+]' #Permutation
                     ] 

# Symbols which are not related to math
non_math_terms = ['<br/>', '<fieldset>', '<legend>', '</fieldset>', '</legend>', '\{::\}']

In [11]:
# Add spaces to the next of math symbols
for math in math_terms:
    df['text'] = df['text'].apply(lambda x: x.replace(math, ' '+math+' '))

# Remove non-math terms
for non_math in non_math_terms:
    df['text'] = df['text'].apply(lambda x: x.replace(non_math, ' '))

# Handle exceptions
df['text'] = df['text'].apply(lambda x: re.sub(r'_[0-9]+C_[0-9+]', 'Combination', x))
df['text'] = df['text'].apply(lambda x: re.sub(r'_[0-9]+P_[0-9+]', 'Permutation', x))
df['text'] = df['text'].apply(lambda x: re.sub(r'<보기>', '보기', x))

In [12]:
df = df[['text', 'qtid']]

In [13]:
df.reset_index(inplace=True, drop=True)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['qtid'], test_size=0.2, random_state=307, stratify=df['qtid'])

In [45]:
X_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

In [46]:
print(len(y_test.unique()))
print(len(y_train.unique()))

37
37


In [47]:
indexer = {df['qtid'].unique()[i]: i for i in range(len(df['qtid'].unique()))}

In [50]:
# 추후에 dataloader에 추가할 에정
# bert 데이터 형식에 맞춰주기
data_train = []
for i_index in range(len(X_train)):
    temp = []
    temp.append(X_train[i_index])
    temp.append(str(indexer[y_train[i_index]]))
    data_train.append(temp)
    
data_test = []
for i_index in range(len(X_test)):
    temp = []
    temp.append(X_test[i_index])
    temp.append(str(indexer[y_test[i_index]]))
    data_test.append(temp)

In [51]:
model_bert, vocab = get_pytorch_kobert_model()

using cached model
using cached model


In [52]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


In [74]:
## Setting parameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 15
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [54]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))


In [55]:
train = BERTDataset(data_train, 0, 1, tok, max_len, True, False)
test = BERTDataset(data_test, 0, 1, tok, max_len, True, False)

In [56]:
train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=5)

In [57]:
# bert 모델
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=37,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [75]:
model = BERTClassifier(model_bert,  dr_rate=0.5).to(device)

In [76]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [77]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [78]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [79]:
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_step, t_total=t_total)

In [80]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [81]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))

epoch 1 batch id 1 loss 3.6264402866363525 train acc 0.046875

epoch 1 train acc 0.2795157734056988


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


epoch 1 test acc 0.616687979539642


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))

epoch 2 batch id 1 loss 2.5522260665893555 train acc 0.59375

epoch 2 train acc 0.6599940637720488


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


epoch 2 test acc 0.667039641943734


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))

epoch 3 batch id 1 loss 1.6502113342285156 train acc 0.6875

epoch 3 train acc 0.7241222862957938


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


epoch 3 test acc 0.6964514066496164


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))

epoch 4 batch id 1 loss 1.1607141494750977 train acc 0.75

epoch 4 train acc 0.7641197421981004


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


epoch 4 test acc 0.7234654731457801


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))

epoch 5 batch id 1 loss 1.0008471012115479 train acc 0.78125

epoch 5 train acc 0.7983972184531886


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


epoch 5 test acc 0.748641304347826


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))

epoch 6 batch id 1 loss 0.7328384518623352 train acc 0.8125

epoch 6 train acc 0.8302281207598372


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


epoch 6 test acc 0.7734175191815857


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))

epoch 7 batch id 1 loss 0.6307983994483948 train acc 0.859375

epoch 7 train acc 0.8430546132971506


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


epoch 7 test acc 0.7879635549872123


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))

epoch 8 batch id 1 loss 0.5696201920509338 train acc 0.84375

epoch 8 train acc 0.8787737449118046


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


epoch 8 test acc 0.8026694373401535


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))

epoch 9 batch id 1 loss 0.4024849236011505 train acc 0.875

epoch 9 train acc 0.902132801899593


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


epoch 9 test acc 0.8048673273657289


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))

epoch 10 batch id 1 loss 0.35187238454818726 train acc 0.921875

epoch 10 train acc 0.9240544436906377


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


epoch 10 test acc 0.8162563938618925


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))

epoch 11 batch id 1 loss 0.32378894090652466 train acc 0.9375

epoch 11 train acc 0.9423210651289009


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


epoch 11 test acc 0.8202925191815857


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))

epoch 12 batch id 1 loss 0.28143569827079773 train acc 0.953125

epoch 12 train acc 0.9581411126187247


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


epoch 12 test acc 0.8248881074168798


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))

epoch 13 batch id 1 loss 0.22962018847465515 train acc 0.953125

epoch 13 train acc 0.9672744233378563


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


epoch 13 test acc 0.8322410485933505


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))

epoch 14 batch id 1 loss 0.19920259714126587 train acc 0.96875

epoch 14 train acc 0.973804274084125


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


epoch 14 test acc 0.8171755115089514


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))

epoch 15 batch id 1 loss 0.19755446910858154 train acc 0.96875

epoch 15 train acc 0.9789730325644506


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


epoch 15 test acc 0.8258072250639387


In [88]:
# 최대 값과 최대 값의 index 추출
max_vals, max_indices = torch.max(out, 1)

In [89]:
max_vals

tensor([6.7202, 7.4898, 6.4944, 6.0919, 7.5431, 7.5366, 7.0303, 5.3125, 6.4005,
        7.5324, 5.8579, 6.2511, 5.2962, 7.1076, 4.7326, 7.2492, 7.0209, 7.2463,
        6.9200, 7.2162, 5.9225, 7.1865, 5.9626, 6.5802, 6.3075, 6.9723, 6.3020,
        7.2019, 7.1202, 6.7419, 6.2986, 7.1431, 6.3545, 5.1534, 7.1038, 7.1286,
        7.1809, 7.2512, 6.0089, 7.2354, 7.0129, 7.2605, 6.2629, 4.3713, 6.4299,
        6.0628], device='cuda:0', grad_fn=<MaxBackward0>)

In [90]:
max_indices

tensor([ 1, 32, 27,  5, 34, 32,  0, 22, 27, 34, 14, 34, 26, 30, 32, 33, 24, 12,
         2, 12, 21, 33, 14,  0, 25, 12, 25, 35, 34,  1, 25, 35,  3, 36, 30, 35,
        35, 33, 10, 12,  0, 12, 25, 11, 27,  5], device='cuda:0')

In [91]:
label

tensor([ 1, 32, 27,  4, 34, 32,  0, 22, 20, 34, 14,  0, 26, 30, 22, 33, 24, 12,
         2, 12, 14, 33, 14, 34, 12, 25, 25, 35, 34,  1, 25, 35,  3, 36, 30, 35,
        35, 34, 10, 12,  0, 12, 25, 11, 27,  5], device='cuda:0')

In [92]:
# F1 Score로 성능 평가 
# accuracy에서 0.82 나옴
# 파라미터 수정해서 진행해도 좋을듯
from sklearn.metrics import f1_score

In [98]:
print(f1_score(label.tolist(), max_indices.tolist(), average='weighted'))
print(f1_score(label.tolist(), max_indices.tolist(), average='macro'))
print(f1_score(label.tolist(), max_indices.tolist(), average='micro'))

0.7942028985507248
0.7572463768115942
0.8043478260869565
