In [1]:
import pymysql, os, copy, json, time, openpyxl, argparse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import torch
from torch import nn
from torch.utils.data import  Dataset, DataLoader, TensorDataset
from tqdm import tqdm

In [2]:
from transformers import BertForTokenClassification, AdamW
from tokenization_kobert import KoBertTokenizer

In [3]:
def token_generate(sent, tok, MAX_LEN):
    encode_dict = tok.encode_plus(text=sent, 
                                    add_special_tokens=True, max_length=MAX_LEN,
                                    return_token_type_ids=True, padding='max_length', #pad_to_max_length=True,
                                    return_attention_mask=True, truncation=True) # return_tensors='pt',

    input_id = encode_dict['input_ids']
    attention_mask = encode_dict['attention_mask']
    token_type_id = encode_dict['token_type_ids']
    return input_id, attention_mask, token_type_id  

def convert_label(words, labels_idx, tok,  ner_b_label, max_seq_len):
    cls_token = tok.cls_token
    sep_token = tok.sep_token
    unk_token = tok.unk_token
    pad_token_id = tok.pad_token_id
    tokens = []
    labels_ids = []
    
    for word, slot_label in zip(words, labels_idx):
        w_token = tok.tokenize(word)
        if not w_token:
            w_token = [unk_token]
        tokens.extend(w_token)
        #labels_ids.extend([int(slot_label)] * len(w_token))
        if int(slot_label) in ner_b_label:
            labels_ids.extend([int(slot_label)] + [int(slot_label) + 1] * (len(w_token)-1))
        else:
            labels_ids.extend([int(slot_label)] * len(w_token))
            
    special_tokens_cnt = 2
    if len(labels_ids) > max_seq_len - special_tokens_cnt:
        labels_ids = labels_ids[:(max_seq_len - special_tokens_cnt)]
        
    labels_ids += [sep_token_label_id]
    labels_ids = [cls_token_label_id] + labels_ids
    
    padding_len = max_seq_len - len(labels_ids)
    labels_ids = labels_ids + ([pad_token_label_id] * padding_len)
    
    return labels_ids

def generate_input(df, tok, ner_b_label, max_seq_len):
    input_ids = []
    attention_masks = []
    token_type_ids = []
    label_list = []
    
    for i, data in enumerate(df[['sentence', 'label']].values):
        sentence, labels = data
        words = sentence.split()
        labels = labels.split()
        labels_idx = []
        
        for l in labels:    
            labels_idx.append(train_label_l1.index(l+'-B') if l+'-B' in train_label_l1 else train_label_l1.index("UNK"))
        
        input_id, attention_mask, token_type_id = token_generate(sentence, tok, max_seq_len)
        convert_label_id = convert_label(words, labels_idx, tok, ner_b_label, max_seq_len)
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        label_list.append(convert_label_id)
        
    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)
    label_list = np.asarray(label_list, dtype=int)
    inputs = (input_ids, attention_masks, token_type_ids)
    
    return inputs, label_list

class custom_set(Dataset):
    def __init__(self, dX, dY=None):
        self.input_id = dX[0]
        self.attention_mask = dX[1]
        self.token_type_id = dX[2]
        self.label = dY
        
    
    def __len__(self):
        return len(self.input_id)
    
    def __getitem__(self, idx):
        input_ids = self.input_id[idx]
        attention_masks = self.attention_mask[idx]
        token_type_ids = self.token_type_id[idx]
        
        if self.label is None:            
            return input_ids, attention_masks, token_type_ids            
        else :            
            label = self.label[idx]        
            return input_ids, attention_masks, token_type_ids, label

In [4]:
def _read_file(input_file):
    with open(input_file, "r", encoding="utf-8") as f:
        sentences = []
        labels = []
        for line in f:
            split_line = line.strip().split('\t')
            sentences.append(split_line[0])
            labels.append(split_line[1])
        return sentences, labels
    
def eval_input(test, token, args, pad_token_label_id, mask_padding_with_zero = True):

    words = test.split()
    tokens = []
    slot_label_mask = []
    for word in words:
        word = word.strip()
        word_tokens = token.tokenize(word)
        if not word_tokens:
            word_tokens = [unk_token]  # For handling the bad-encoded word
        tokens.extend(word_tokens)
        slot_label_mask.extend([0] + [pad_token_label_id] * (len(word_tokens) - 1))

    # Account for [CLS] and [SEP]
    special_tokens_count = 2
    if len(tokens) > args.max_seq_len - special_tokens_count:
        slot_label_mask = slot_label_mask[:(args.max_seq_len - special_tokens_count)]

    # Add [SEP] token
    slot_label_mask += [pad_token_label_id]
    slot_label_mask = [pad_token_label_id] + slot_label_mask
    padding_length = args.max_seq_len - len(slot_label_mask)
    slot_label_mask = slot_label_mask + ([pad_token_label_id] * padding_length)
    input_id, attention_mask, token_type_id = token_generate(test, token, args.max_seq_len)

    input_ids = torch.tensor(input_id, dtype=torch.long).reshape(1,-1)
    attention_mask = torch.tensor(attention_mask, dtype=torch.long).reshape(1,-1)
    token_type_ids = torch.tensor(token_type_id, dtype=torch.long).reshape(1,-1)
    slot_label_mask = torch.tensor(slot_label_mask, dtype=torch.long).reshape(1,-1)
    
    return input_ids, attention_mask, token_type_ids, slot_label_mask    

def eval_ft(test_sent, label_lst_, model_, tok, dev, args):
    preds = None
    pad_token = torch.nn.CrossEntropyLoss().ignore_index    
    with torch.no_grad():
        tmp_input, tmp_attention, tmp_token, tmp_slot = eval_input(test_sent, tok, args, pad_token)    
        inputs = {'input_ids':tmp_input.to(device), 'attention_mask':tmp_attention.to(device), 
                  'labels' : None,
                  'token_type_ids': tmp_token.to(device)}
        output = model_(**inputs)
        logits = output[0]

        if preds is None:
            preds = logits.detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)

        preds = np.argmax(preds, axis=2)
        slot_label_map = {i : label for i, label in enumerate(label_lst_)}   
    preds_list = []
    for j in range(preds.shape[1]):
        if tmp_slot[0,j] != pad_token:
            preds_list.append(slot_label_map[preds[0][j]])            

    line = ""
    for w, p in zip(test_sent.split(), preds_list):
        line = line + " {}".format(p)
        #if p == "O":
        #    line = line + w + " "
        #else :
        #    line = line + "{}[{}] ".format(w, p)

    return line#, logits

def get_labels(label_path):
    return [label.strip() for label in open(os.path.join(label_path), 'r', encoding='utf-8')]

In [5]:
with open('LGES_sent_210804.dta', 'rb') as a_sent:
    train_sent = pickle.load(a_sent)
with open('LGES_sent_ner_level1_210804.dta', 'rb') as a_sent_l1:
    train_ner_l1 = pickle.load(a_sent_l1)
with open('LGES_label_l1_210804.dta', 'rb') as a_label1:
    train_label_l1 = pickle.load(a_label1)
    
args = argparse.Namespace(  
    max_seq_len = 128
)
#args.model_name_or_path = MODEL_PATH_MAP[args.model_type]

In [6]:
o_ratio = []
for k in range(0, len(train_ner_l1)):
    #print(k)
    sent_total_cnt = len(train_ner_l1[k].split(' '))
    sent_o_cnt = np.sum([1 if x =='O' else 0 for x in train_ner_l1[k].split(' ') ])
    o_ratio.append( sent_o_cnt  / sent_total_cnt)
#select_sent_id = [i for i,z in enumerate(o_ratio) if z < 0.7]

In [7]:
sentences, labels = [], []
for i in range(0, len(train_sent)): #select_sent_id:#
    sentences.append(train_sent[i])#.split(' '))
    labels.append(train_ner_l1[i])#.split(' '))
train_dict = {"sentence": sentences, "label":labels}
train_df = pd.DataFrame(train_dict)
ner_begin_label = [train_label_l1.index(begin_label) for begin_label in train_label_l1 if "B" in begin_label]
print("Train Set : ", len(sentences))

Train Set :  605785


In [8]:
s_time =time.time()
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
pad_token_id = tokenizer.pad_token_id
pad_token_label_id = 0
cls_token_label_id = 0
sep_token_label_id = 0
train_inputs, train_labels = generate_input(train_df, tokenizer, ner_begin_label, 128)
print(time.time() - s_time)

Downloading:   0%|          | 0.00/371k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


371.885374546051


In [9]:
train_set = custom_set(train_inputs, train_labels)
train_loader = DataLoader(train_set, batch_size=32)

In [10]:
torch.manual_seed(1234)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = BertForTokenClassification.from_pretrained('monologg/kobert', num_labels=len(train_label_l1))#
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-3, eps=1e-3)
loss = nn.CrossEntropyLoss()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
save_folder = './save/'
if not os.path.exists(save_folder):
        os.makedirs(save_folder)

model.train()
epoch_loss = 0
for epoch in tqdm(range(0, 1)):
    #tqdm_loader = tqdm(train_loader, total=len(train_loader), leave=False)
    s_time = time.time()
    batch_loss = 0
    for i, [tmp_i, tmp_a, tmp_t, tmp_l] in enumerate(train_loader):
        tmp_input = tmp_i.to(device)
        tmp_attention = tmp_a.to(device)
        tmp_token = tmp_t.to(device)
        tmp_label = tmp_l.to(device)
        optimizer.zero_grad() 
        outputs = model(input_ids=tmp_input.to(device), attention_mask=tmp_attention.to(device), 
                    token_type_ids=tmp_token.to(device), labels=tmp_label.to(device))
        loss = outputs.loss 
        logit = np.argmax(outputs.logits.detach().cpu().numpy(), axis=2) 
        loss.backward()
        optimizer.step()          
        batch_loss += loss.item()
    e_time = time.time()
    tqdm.write('[Epoch : %d] train_loss: %.5f / Time : %f' % (epoch, batch_loss / (i+1), e_time - s_time))    
    if epoch % 10 == 0:
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(save_folder+'epoch'+str(epoch))#,state_dict=save_folder+'epoch'+str(epoch))
    #torch.save(args, os.path.join(save_folder+'epoch'+str(epoch)+'/train_args.bin'))
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(save_folder)
#torch.save(os.path.join('./save/train_args.bin'))


  0%|          | 0/1 [05:02<?, ?it/s]

[Epoch : 0] train_loss: 0.93507 / Time : 302.104509


100%|██████████| 1/1 [05:02<00:00, 302.68s/it]
