In [1]:
def recover_label_train(tags, id2tag):
    """Translate sequences of numeric tag ids into sequences of label strings.

    Args:
        tags: iterable of sequences; every element must be accepted by ``int``.
        id2tag: mapping whose keys are the ids rendered as decimal strings.

    Returns:
        One list per input sequence. Ids that are missing from ``id2tag``
        become ``None`` (no default label is substituted here).
    """
    return [
        [id2tag.get(str(int(tag_id))) for tag_id in sequence]
        for sequence in tags
    ]


def recover_label(tags, id2tag):
    """Translate sequences of tag ids into label strings, defaulting to 'O'.

    Each id is looked up as ``str(id)`` (no ``int`` conversion is applied),
    and ids that are not present in ``id2tag`` are mapped to ``'O'``.

    Args:
        tags: iterable of id sequences.
        id2tag: mapping from stringified id to label.

    Returns:
        A list with one list of labels per input sequence.
    """
    def to_label(tag_id):
        return id2tag.get(str(tag_id), 'O')

    return [[to_label(tag_id) for tag_id in sequence] for sequence in tags]


def recover_label_test(tags, id2tag, sens_len):
    """Translate id sequences into labels after trimming each to its length.

    Args:
        tags: indexable collection of id sequences.
        id2tag: mapping whose keys are the ids rendered as decimal strings.
        sens_len: ``sens_len[i]`` is the number of leading ids of ``tags[i]``
            that are kept; the rest of the sequence is discarded.

    Returns:
        One list of labels per sequence. Unknown ids become ``None``.
    """
    labels = []
    for position, sequence in enumerate(tags):
        kept_ids = sequence[:sens_len[position]]
        labels.append([id2tag.get(str(int(tag_id))) for tag_id in kept_ids])
    return labels


def recover_bert_label(tags, id2tag):
    """Translate id sequences into labels, skipping the first and last id.

    The first and last element of every sequence are dropped before lookup
    (presumably the positions of BERT's special tokens - confirm with the
    caller). Ids are looked up as ``str(id)``; unknown ids map to ``'O'``.

    Args:
        tags: iterable of sliceable id sequences.
        id2tag: mapping from stringified id to label.

    Returns:
        A list with one list of labels per input sequence.
    """
    return [
        list(map(lambda tag_id: id2tag.get(str(tag_id), 'O'), sequence[1:-1]))
        for sequence in tags
    ]


def get_ner_fmeasure(golden_lists, predict_lists, label_type="BMES"):
    """Compute token accuracy and entity-level precision / recall / F1.

    Entities are extracted per sentence with ``get_ner_BMES`` when
    ``label_type == "BMES"`` and with ``get_ner_BIO`` otherwise. A predicted
    entity counts as correct when the identical span string is also present
    in the gold entities of the same sentence.

    Args:
        golden_lists: list of gold label sequences, one per sentence.
        predict_lists: list of predicted label sequences, aligned with
            ``golden_lists``.
        label_type: ``"BMES"`` selects the BMES extractor; any other value
            selects the BIO extractor.

    Returns:
        ``(accuracy, precision, recall, f_measure)``, each rounded to four
        decimals. A value of ``-1`` marks a metric that is undefined because
        its denominator is zero. This now includes accuracy when there are no
        gold tokens at all (previously a ZeroDivisionError).

    Side effects:
        Prints the gold, predicted and correct entity counts.
    """
    right_tag = 0
    all_tag = 0
    right_num = 0
    golden_num = 0
    predict_num = 0
    for golden_list, predict_list in zip(golden_lists, predict_lists):
        # Token-level agreement; zip stops at the shorter of the two sequences.
        right_tag += sum(
            1 for golden_tag, predict_tag in zip(golden_list, predict_list)
            if golden_tag == predict_tag
        )
        all_tag += len(golden_list)
        if label_type == "BMES":
            gold_matrix = get_ner_BMES(golden_list)
            pred_matrix = get_ner_BMES(predict_list)
        else:
            gold_matrix = get_ner_BIO(golden_list)
            pred_matrix = get_ner_BIO(predict_list)
        # Matching is done per sentence, so equal spans in different
        # sentences are never confused with each other.
        right_num += len(set(gold_matrix).intersection(pred_matrix))
        golden_num += len(gold_matrix)
        predict_num += len(pred_matrix)

    precision = -1 if predict_num == 0 else (right_num + 0.0) / predict_num
    recall = -1 if golden_num == 0 else (right_num + 0.0) / golden_num
    if (precision == -1) or (recall == -1) or (precision + recall) <= 0.:
        f_measure = -1
    else:
        f_measure = 2 * precision * recall / (precision + recall)
    # Same sentinel convention as the other metrics instead of crashing on
    # empty input.
    accuracy = -1 if all_tag == 0 else (right_tag + 0.0) / all_tag
    print("gold_num = ", golden_num, " pred_num = ", predict_num, " right_num = ", right_num)
    return round(accuracy, 4), round(precision, 4), round(recall, 4), round(f_measure, 4)


def reverse_style(input_string):
    """Rotate a string so that it begins at its first '[' character.

    Example: ``'PER[0,1]'`` becomes ``'[0,1]PER'``.

    Raises:
        ValueError: if ``input_string`` contains no ``'['``.
    """
    split_at = input_string.index('[')
    head = input_string[:split_at]
    tail = input_string[split_at:]
    return tail + head


def get_ner_BMES(label_list):
    """Extract entity spans from a BMES-style label sequence.

    Labels are upper-cased and split on ``'-'``; the first piece is the
    position marker (``B`` begin, ``E`` end, ``S`` single) and the last piece
    is the entity type. The marker is compared for equality. The previous
    substring test treated e.g. ``'M-PER'`` as an end tag because ``'PER'``
    contains the letter ``E``.

    Args:
        label_list: sequence of label strings for one sentence.

    Returns:
        A list of span strings in order of appearance:
        ``'[start,end]TYPE'`` for a closed multi-token span and ``'[i]TYPE'``
        for a single-token entity. A span that is still open when the
        sequence ends is emitted as ``'[start]TYPE'`` (no end index). Labels
        with any other marker (e.g. ``M`` or ``O``) leave an open span
        untouched.
    """
    begin_label = 'B'
    end_label = 'E'
    single_label = 'S'
    stand_matrix = []
    open_type = ''   # entity type of the span being built; '' means none is open
    open_start = 0
    for i, label in enumerate(label_list):
        pieces = label.upper().split('-')
        marker = pieces[0]
        entity_type = pieces[-1]
        if marker == begin_label:
            if open_type != '':
                # A new begin implicitly closes the previous span at i - 1.
                stand_matrix.append(f'[{open_start},{i - 1}]{open_type}')
            open_type = entity_type
            open_start = i
        elif marker == single_label:
            if open_type != '':
                stand_matrix.append(f'[{open_start},{i - 1}]{open_type}')
            stand_matrix.append(f'[{i}]{entity_type}')
            open_type = ''
        elif marker == end_label:
            if open_type != '':
                stand_matrix.append(f'[{open_start},{i}]{open_type}')
            open_type = ''
    if open_type != '':
        stand_matrix.append(f'[{open_start}]{open_type}')
    return stand_matrix


def get_ner_BIO(label_list):
    """Extract entity spans from a BIO label sequence.

    Labels are upper-cased. A label starting with ``'B-'`` opens a span
    (closing any span that is still open), a label starting with ``'I-'``
    whose type equals the open span's type continues it, and every other
    label closes the open span. Prefixes are matched with ``startswith``. The
    previous substring test treated labels such as ``'I-WEB-SITE'`` as begin
    tags because they contain ``'B-'``.

    Args:
        label_list: sequence of label strings for one sentence.

    Returns:
        A list of span strings in order of appearance, ``'[start,end]TYPE'``
        for closed spans. A span that is still open when the sequence ends is
        emitted as ``'[start]TYPE'`` (no end index). ``I-`` labels that do not
        follow a matching ``B-`` never start a span.
    """
    begin_label = 'B-'
    inside_label = 'I-'
    stand_matrix = []
    open_type = ''   # entity type of the span being built; '' means none is open
    open_start = 0
    for i, label in enumerate(label_list):
        current_label = label.upper()
        if current_label.startswith(begin_label):
            if open_type != '':
                stand_matrix.append(f'[{open_start},{i - 1}]{open_type}')
            open_type = current_label[len(begin_label):]
            open_start = i
        elif (current_label.startswith(inside_label)
              and current_label[len(inside_label):] == open_type):
            # Continuation of the currently open span: nothing to record yet.
            continue
        else:
            # 'O', an unknown label, or an I- tag of a different type.
            if open_type != '':
                stand_matrix.append(f'[{open_start},{i - 1}]{open_type}')
            open_type = ''
    if open_type != '':
        stand_matrix.append(f'[{open_start}]{open_type}')
    return stand_matrix

In [2]:
from transformers import BertModel
from torch import nn


class Bertencoder(nn.Module):
    """Token classifier: pretrained BERT -> bidirectional LSTM -> linear layer.

    Args:
        bert_dim: input size of the LSTM; it receives BERT's
            ``last_hidden_state``, so this must equal that tensor's last
            dimension.
        output_dim: number of output classes of the linear layer.
        num_layers: number of stacked LSTM layers.
        rnn_dim: hidden size of each LSTM direction (the linear layer takes
            ``2 * rnn_dim`` features because the LSTM is bidirectional).
        pretrained_name: checkpoint passed to ``BertModel.from_pretrained``.
            Defaults to ``'bert-base-uncased'``, the value that used to be
            hard-coded.
    """

    def __init__(self, bert_dim, output_dim, num_layers, rnn_dim, pretrained_name='bert-base-uncased'):
        super(Bertencoder, self).__init__()
        self.bert_model = BertModel.from_pretrained(pretrained_name)
        self.rnn = nn.LSTM(bert_dim, rnn_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(rnn_dim*2, output_dim)
        self.loss_function = nn.CrossEntropyLoss()

    def _token_logits(self, x, seg_ids, mask):
        """Run BERT, the LSTM and the linear layer; returns (batch, seq, classes) scores."""
        bert_output = self.bert_model(x, token_type_ids=seg_ids, attention_mask=mask)
        lstm_output, _ = self.rnn(bert_output.last_hidden_state)
        return self.linear(lstm_output)

    def forward(self, x, y, seg_ids, mask):
        """Return the cross-entropy loss of the per-token scores against ``y``.

        Args:
            x: input passed positionally to the BERT model.
            y: target class indices, one per token position.
            seg_ids: passed to BERT as ``token_type_ids``.
            mask: passed to BERT as ``attention_mask``.
        """
        logits = self._token_logits(x, seg_ids, mask)
        # CrossEntropyLoss wants the class dimension second: (batch, classes, seq).
        return self.loss_function(logits.transpose(1, 2), y)

    def test(self, x, y, seg_ids, mask):
        """Return ``(loss, predictions)`` for one batch.

        ``predictions`` has the format produced by ``predict_label``.
        Arguments are the same as for ``forward``.
        """
        logits = self._token_logits(x, seg_ids, mask)
        loss = self.loss_function(logits.transpose(1, 2), y)
        predict = self.predict_label(logits, mask)
        return loss, predict

    def predict_label(self, output, mask):
        """Convert per-token scores into lists of predicted class indices.

        Args:
            output: tensor of scores shaped (batch_size, max_sentence_len, tag_num).
            mask: tensor shaped (batch_size, max_sentence_len); positions equal
                to 1 are kept.

        Returns:
            One Python list of ints per batch row: the arg-max class of every
            kept position, with the first and last kept position removed
            (presumably the special tokens around the sentence - confirm
            against the tokenisation used by the caller).
        """
        pre_label_pad = output.argmax(2)
        pre_label = []
        for i in range(mask.shape[0]):
            # Boolean indexing selects all kept positions of the row at once
            # instead of calling .item() per token.
            sent_label = pre_label_pad[i][mask[i] == 1].tolist()
            pre_label.append(sent_label[1:-1])

        return pre_label


In [27]:
# Label given to tokens that are not associated with any punctuation mark.
normal_label = 'O'

# Punctuation character -> label name used for it.
special_labels = {
    ',': 'I-COMMA',
    '.': 'I-DOT',
    '?': 'I-QMARK',
    '!': 'I-EMARK',
    ':': 'I-COLON',
    ';': 'I-SEMICOLON',
}

# Complete label inventory: all punctuation labels followed by the default one.
labels_set = [*special_labels.values(), normal_label]

In [4]:
from simpletransformers.ner import NERModel, NERArgs
import numpy as np
import pandas as pd

In [41]:
# simpletransformers NER wrapper configured with the punctuation label set;
# in this notebook it is used to turn the data frames into datasets.
model_args = NERArgs()
model_args.save_steps = -1
model_args.overwrite_output_dir = True
ner_model = NERModel(
    'bert',
    'bert-base-uncased',
    labels=labels_set,
    args=model_args,
    use_cuda=False,
)

# Training / validation tables; rows containing missing values are dropped.
train_df = pd.read_csv('./wikitext/train0-10.csv').dropna()
validation_df = pd.read_csv('./wikitext/validation.csv').dropna()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [42]:
# Convert both data frames into datasets with the NER wrapper (training first,
# then validation, as before).
train_dataset, validation_dataset = [
    ner_model.load_and_cache_examples(frame)
    for frame in (train_df, validation_df)
]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [46]:
# Number of examples in the training and validation datasets.
tuple(map(len, (train_dataset, validation_dataset)))

(106953, 2285)

In [51]:
from tqdm.auto import tqdm, trange

In [58]:
from torch.optim import Adam, SGD
import torch
from torch.utils.data import DataLoader, TensorDataset

# --- Model architecture (arguments of Bertencoder) ---
bert_dim = 768                  # input size of the LSTM that consumes BERT's hidden states
rnn_dim = 512                   # hidden size of each LSTM direction
num_layers = 1                  # number of stacked LSTM layers
output_dim = len(labels_set)    # one output class per label

# --- Optimisation ---
epoch = 1                       # number of passes over the training data
batch_size = 32
adam_lr = 3e-5                  # learning rate passed to Adam

def train(tr_ds, val_ds):
    """Train a new ``Bertencoder`` on ``tr_ds`` and return it.

    Uses the module-level settings ``bert_dim``, ``output_dim``,
    ``num_layers``, ``rnn_dim``, ``batch_size``, ``adam_lr`` and ``epoch``.
    The model is moved to ``cuda:0`` when CUDA is available, otherwise it
    stays on the CPU.

    Args:
        tr_ds: training dataset. Each batch is indexed as ``batch[0]`` (model
            input), ``batch[1]`` (attention mask), ``batch[2]`` (segment ids)
            and ``batch[3]`` (targets).
        val_ds: validation dataset. Currently unused: no validation pass is
            run. Kept so existing callers keep working.
            TODO: add an evaluation loop that feeds the *validation* batches
            to ``model.test`` under ``torch.no_grad()``.

    Returns:
        The trained ``Bertencoder`` instance.
    """
    use_gpu = torch.cuda.is_available()
    device = torch.device('cuda:0' if use_gpu else 'cpu')
    # Training batches are reshuffled every epoch; previously the training
    # loader was unshuffled while the (unused) validation loader was shuffled.
    train_dataloader = DataLoader(tr_ds, batch_size=batch_size, shuffle=True)

    model = Bertencoder(bert_dim, output_dim, num_layers, rnn_dim)
    model.to(device)

    optimizer = Adam(model.parameters(), lr=adam_lr)

    # train model
    for i in range(epoch):
        epoch_loss = 0
        num_batches = 0
        batch_iterator = tqdm(train_dataloader, desc=f"Running Epoch {i+1} of {epoch}", mininterval=0)

        for batch in batch_iterator:
            batch_x = batch[0].to(device)
            batch_msk = batch[1].to(device)
            batch_seg = batch[2].to(device)
            batch_y = batch[3].to(device)
            loss = model(batch_x, batch_y, batch_seg, batch_msk)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            num_batches += 1

        # Counting batches explicitly keeps this safe for an empty loader,
        # where the old loop index would have been undefined.
        mean_loss = epoch_loss / num_batches if num_batches else float('nan')
        # 1-based epoch number, consistent with the progress-bar description.
        print(f"Epochs {i+1}/{epoch}. Running Loss: {mean_loss:9.4f}")
    return model

In [59]:
# Run training on the cached datasets and keep the resulting model.
model = train(tr_ds=train_dataset, val_ds=validation_dataset)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Running Epoch 0 of 1:   0%|          | 0/3343 [00:00<?, ?it/s]

In [38]:
# Print the module tree of the trained model for inspection.
print(model)


Bertencoder(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=

In [None]:
# Persist only the model's parameters (its state_dict) to PATH.
# NOTE(review): PATH is not assigned in any cell visible in this notebook -
# confirm it is defined before this cell is run, otherwise this raises NameError.
torch.save(model.state_dict(), PATH)