In [1]:
import pymysql, os, copy, json, time, openpyxl, argparse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import torch
from torch import nn
from torch.utils.data import  Dataset, DataLoader, TensorDataset
from tqdm import tqdm
from transformers import BertForTokenClassification, AdamW
from tokenization_kobert import KoBertTokenizer

In [2]:
def token_generate(sent, tok, MAX_LEN):
    """Encode one sentence with ``tok.encode_plus`` and unpack the model inputs.

    Args:
        sent: Sentence text handed to the tokenizer as ``text``.
        tok: Tokenizer object exposing ``encode_plus``.
        MAX_LEN: Value forwarded as ``max_length``; padding to ``'max_length'``
            and truncation are both requested, so the returned sequences are
            presumably exactly this long (depends on the tokenizer).

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids)`` taken from the
        dictionary returned by the tokenizer.
    """
    encode_options = dict(
        text=sent,
        add_special_tokens=True,
        max_length=MAX_LEN,
        return_token_type_ids=True,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
    )
    encoded = tok.encode_plus(**encode_options)
    return (
        encoded['input_ids'],
        encoded['attention_mask'],
        encoded['token_type_ids'],
    )

def convert_label(words, labels_idx, tok, ner_b_label, max_seq_len,
                  cls_label_id=None, sep_label_id=None, pad_label_id=None):
    """Expand word-level label ids to sub-token level and pad to a fixed length.

    Each word is tokenized on its own with ``tok.tokenize``; a word that yields
    no pieces is counted as a single piece. The word's label is repeated for
    every piece, except that for a "begin" label only the first piece keeps the
    label and the remaining pieces receive ``label + 1``.
    NOTE(review): this assumes the matching continuation label sits directly
    after its begin label in the label vocabulary -- confirm against the
    vocabulary file.

    The expanded sequence is cut to ``max_seq_len - 2`` entries, wrapped with
    one leading and one trailing special-position label, and right-padded to
    ``max_seq_len``.

    Args:
        words: Iterable of word strings.
        labels_idx: Iterable of label ids (anything ``int()`` accepts), paired
            with ``words`` by ``zip`` (extra items on either side are ignored).
        tok: Tokenizer exposing ``tokenize(word)``.
        ner_b_label: Collection of label ids that count as "begin" labels.
        max_seq_len: Length of the returned list.
        cls_label_id: Label for the leading special position. Defaults to the
            notebook-level ``cls_token_label_id``.
        sep_label_id: Label for the trailing special position. Defaults to the
            notebook-level ``sep_token_label_id``.
        pad_label_id: Label for padding positions. Defaults to the
            notebook-level ``pad_token_label_id``.

    Returns:
        List of ``max_seq_len`` integer label ids.
    """
    # Fall back to the notebook globals only when the caller gave no override,
    # so existing five-argument calls behave exactly as before.
    if cls_label_id is None:
        cls_label_id = cls_token_label_id
    if sep_label_id is None:
        sep_label_id = sep_token_label_id
    if pad_label_id is None:
        pad_label_id = pad_token_label_id

    # Set membership keeps the per-word begin-label check O(1).
    begin_labels = set(ner_b_label)

    labels_ids = []
    for word, slot_label in zip(words, labels_idx):
        # Only the number of pieces matters here; an empty tokenization is
        # treated as one (unknown) piece.
        n_pieces = len(tok.tokenize(word)) or 1
        label = int(slot_label)
        if label in begin_labels:
            labels_ids.append(label)
            labels_ids.extend([label + 1] * (n_pieces - 1))
        else:
            labels_ids.extend([label] * n_pieces)

    # Leave room for the two special positions added below.
    special_tokens_cnt = 2
    labels_ids = labels_ids[:max_seq_len - special_tokens_cnt]

    labels_ids = [cls_label_id] + labels_ids + [sep_label_id]
    padding_len = max_seq_len - len(labels_ids)
    return labels_ids + [pad_label_id] * padding_len

def generate_input(df, tok, ner_b_label, max_seq_len, label_vocab=None):
    """Turn a sentence/label DataFrame into fixed-length model input arrays.

    For every row, the whitespace-separated tags in ``label`` are mapped to
    vocabulary indices by looking up ``tag + '-B'``; tags without such an entry
    map to the index of ``"UNK"``. The sentence is encoded with
    ``token_generate`` and the labels are expanded with ``convert_label``.

    Args:
        df: DataFrame with string columns ``'sentence'`` and ``'label'``.
        tok: Tokenizer passed through to ``token_generate`` / ``convert_label``.
        ner_b_label: Collection of begin-label ids, passed to ``convert_label``.
        max_seq_len: Sequence length passed to both helpers.
        label_vocab: Sequence of label names. Defaults to the notebook-level
            ``train_label_l1`` list.

    Returns:
        ``(inputs, label_list)`` where ``inputs`` is the tuple
        ``(input_ids, attention_masks, token_type_ids)``; all four are int64
        NumPy arrays with one row per DataFrame row.

    Raises:
        ValueError: If a tag is missing from the vocabulary and the vocabulary
            has no ``"UNK"`` entry either.
    """
    if label_vocab is None:
        label_vocab = train_label_l1

    # Build the name -> index map once instead of scanning the list twice per
    # tag. setdefault keeps the FIRST position, matching list.index().
    label_to_idx = {}
    for position, name in enumerate(label_vocab):
        label_to_idx.setdefault(name, position)
    unk_idx = label_to_idx.get("UNK")

    input_ids = []
    attention_masks = []
    token_type_ids = []
    label_list = []
    for sentence, labels in df[['sentence', 'label']].values:
        words = sentence.split()
        labels_idx = []
        for tag in labels.split():
            idx = label_to_idx.get(tag + '-B', unk_idx)
            if idx is None:
                # Same exception type list.index("UNK") would have raised.
                raise ValueError("'UNK' is not in the label vocabulary")
            labels_idx.append(idx)
        input_id, attention_mask, token_type_id = token_generate(sentence, tok, max_seq_len)
        convert_label_id = convert_label(words, labels_idx, tok, ner_b_label, max_seq_len)
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        label_list.append(convert_label_id)

    # Explicit int64: plain `int` is 32-bit on some platforms (Windows with
    # NumPy < 2), while torch expects 64-bit indices and class targets.
    input_ids = np.array(input_ids, dtype=np.int64)
    attention_masks = np.array(attention_masks, dtype=np.int64)
    token_type_ids = np.array(token_type_ids, dtype=np.int64)
    label_list = np.asarray(label_list, dtype=np.int64)
    inputs = (input_ids, attention_masks, token_type_ids)
    return inputs, label_list

class custom_set(Dataset):
    """Dataset over parallel input-id, attention-mask and token-type sequences.

    Args:
        dX: Indexable whose items 0, 1 and 2 are the input ids, attention masks
            and token type ids respectively (each indexable by sample).
        dY: Optional per-sample labels; when omitted, samples carry no label.
    """

    def __init__(self, dX, dY=None):
        self.input_id, self.attention_mask, self.token_type_id = dX[0], dX[1], dX[2]
        self.label = dY

    def __len__(self):
        # The number of samples is defined by the input-id container.
        return len(self.input_id)

    def __getitem__(self, idx):
        # Features are always returned; the label is appended only if present.
        sample = (
            self.input_id[idx],
            self.attention_mask[idx],
            self.token_type_id[idx],
        )
        if self.label is not None:
            return sample + (self.label[idx],)
        return sample

In [3]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Sentences, their level-1 NER tag strings, and the level-1 label vocabulary.
train_sent = _load_pickle('LGES_sent_210804.dta')
train_ner_l1 = _load_pickle('LGES_sent_ner_level1_210804.dta')
train_label_l1 = _load_pickle('LGES_label_l1_210804.dta')

# Notebook-wide settings.
args = argparse.Namespace(max_seq_len=128)

In [4]:
# Pair every sentence with its tag string (index-based, one entry per sentence).
sentences = [train_sent[i] for i in range(len(train_sent))]
labels = [train_ner_l1[i] for i in range(len(train_sent))]
train_dict = {"sentence": sentences, "label": labels}
train_df = pd.DataFrame(train_dict)

# Indices of the "begin" labels. Begin labels are identified by their "-B"
# suffix (the same suffix the label lookup appends to each tag); a plain
# substring test for "B" would also match any label that merely contains the
# letter B. enumerate avoids a list.index scan per label.
ner_begin_label = [
    idx for idx, label_name in enumerate(train_label_l1)
    if label_name.endswith("-B")
]
print("Train Set : ", len(sentences))

Train Set :  605785


In [5]:
s_time = time.time()
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
pad_token_id = tokenizer.pad_token_id

# Label ids written at the [CLS], [SEP] and padding positions.
# NOTE(review): 0 is also a regular class index for the model. Confirm that
# entry 0 of the label vocabulary is meant to absorb these positions; otherwise
# consider the loss's ignore index so they do not contribute to training.
pad_token_label_id = 0
cls_token_label_id = 0
sep_token_label_id = 0

# Use the configured sequence length rather than repeating the literal.
train_inputs, train_labels = generate_input(
    train_df, tokenizer, ner_begin_label, args.max_seq_len
)
print(time.time() - s_time)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


370.183789730072


In [6]:
train_set = custom_set(train_inputs, train_labels)
# Shuffle every epoch: this loader feeds training, and iterating in fixed
# storage order makes each mini-batch depend on how the source file was sorted.
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)

In [7]:
# Seed torch before the model is built; the classifier head is newly
# initialised, so its starting weights depend on the RNG state.
torch.manual_seed(1234)

# Prefer the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# One output class per entry of the level-1 label vocabulary.
model = BertForTokenClassification.from_pretrained(
    'monologg/kobert',
    num_labels=len(train_label_l1),
).to(device)

# NOTE(review): lr=1e-3 / eps=1e-3 are far from the values usually used when
# fine-tuning BERT-style encoders -- confirm these are intentional.
optimizer = AdamW(model.parameters(), lr=1e-3, eps=1e-3)

# NOTE(review): this criterion is not used by the training cell, which takes
# the loss from the model output and rebinds the name `loss`.
loss = nn.CrossEntropyLoss()

Downloading:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.train()
epoch_loss = 0  # mean training loss of the most recent epoch
for epoch in tqdm(range(0, 1)):
    s_time = time.time()
    batch_loss = 0
    n_batches = 0
    outputs = None
    for i, [tmp_i, tmp_a, tmp_t, tmp_l] in enumerate(train_loader):
        # Move the batch to the target device once.
        tmp_input = tmp_i.to(device)
        tmp_attention = tmp_a.to(device)
        tmp_token = tmp_t.to(device)
        tmp_label = tmp_l.to(device)

        optimizer.zero_grad()
        # Passing `labels` makes the model compute and return the loss itself.
        outputs = model(input_ids=tmp_input, attention_mask=tmp_attention,
                        token_type_ids=tmp_token, labels=tmp_label)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        batch_loss += loss.item()
        n_batches = i + 1

    # Predicted class ids for the last batch only. Computing this inside the
    # loop forced a device-to-host copy on every step although the result was
    # never used there; the value left in `logit` afterwards is the same.
    if outputs is not None:
        logit = np.argmax(outputs.logits.detach().cpu().numpy(), axis=2)

    e_time = time.time()
    # Guard against an empty loader (no batches seen -> report 0 instead of
    # failing on an undefined loop index).
    epoch_loss = batch_loss / n_batches if n_batches else 0.0
    tqdm.write('[Epoch : %d] train_loss: %.5f / Time : %f' % (epoch, epoch_loss, e_time - s_time))

  0%|          | 0/1 [00:00<?, ?it/s]