# Named Entity Recognition
This notebook demonstrates how to train an NER model: building the input data, creating the model, fine-tuning it, and testing it.

In [1]:
import os
import pandas as pd
import math
import numpy as np
from seqeval.metrics import classification_report,accuracy_score,f1_score

import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, Dataset, DataLoader
from torchcrf import CRF
from transformers import BertModel, BertTokenizer, BertConfig, BertForTokenClassification, \
                            get_linear_schedule_with_warmup

from sklearn import metrics


# Define parameters

In [2]:
# Hugging Face checkpoint used for both the tokenizer and the encoder.
BASE_MODEL = 'emilyalsentzer/Bio_ClinicalBERT'

# Training hyper-parameters.
MAX_LEN = 64  # Maximum input sequence length, [CLS] and [SEP] included
BATCH_SIZE = 32
EPOCHS = 50
LEARNING_RATE = 3e-5

# Run on the GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Where the trained checkpoint is written, and its base file name.
NER_MODEL_SAVED_DIR = 'trained_models/NER/'
MODEL_NAME = 'NER_Bert-CRF-test'

# Entity labels that appear in the final classification report.
LABELS = ['B-problem', 'B-test', 'B-treatment',
          'I-problem', 'I-test', 'I-treatment']

# Full tag inventory: the entity labels first (ids 0-5), then 'O', the
# word-piece continuation tag 'X' and the two special-token tags.
tag2idx = {tag: idx for idx, tag in enumerate(LABELS + ['O', 'X', '[CLS]', '[SEP]'])}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}


## Create data loader

In [3]:

class i2b2Dataset(Dataset):
    """Token-classification dataset built from a one-word-per-row dataframe.

    The dataframe must provide the columns ``"sentence #"``, ``"word"`` and
    ``"tag"``; rows sharing the same ``"sentence #"`` form one example.

    Args:
        dataframe: pandas DataFrame with the three columns above.
        tokenizer: object exposing ``tokenize(word)`` and
            ``convert_tokens_to_ids(tokens)``. Defaults to the
            ``BertTokenizer`` of ``BASE_MODEL``.
        tag_map: mapping from tag string to integer id. It must contain
            ``'X'``, ``'[CLS]'`` and ``'[SEP]'`` in addition to the data tags.
            Defaults to the module-level ``tag2idx``.
        max_len: maximum sequence length including ``[CLS]`` and ``[SEP]``.
            Defaults to the module-level ``MAX_LEN``.
    """

    def __init__(self, dataframe, tokenizer=None, tag_map=None, max_len=None):
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained(BASE_MODEL)
        self.tokenizer = tokenizer
        self.tag_map = tag2idx if tag_map is None else tag_map
        # Two positions are reserved for the [CLS] and [SEP] markers.
        self.MAX_LEN = (MAX_LEN if max_len is None else max_len) - 2

        self.sentences = []
        self.labels = []
        # Iterating the groups (instead of groupby.apply) keeps the same
        # sorted-by-key order and also yields empty lists for an empty frame.
        for _, rows in dataframe.groupby("sentence #"):
            self.sentences.append(rows["word"].tolist())
            self.labels.append(rows["tag"].tolist())

    def __getitem__(self, idx):
        """Return ``(token_ids, label_ids, length)`` for sentence ``idx``.

        Raises:
            KeyError: if a tag of the sentence is missing from the tag map.
        """
        tokens, tags = [], []
        # Split every word into word pieces, e.g.
        #   Admission -> Ad ##mi ##ssion
        #   O         -> O  X    X
        # The first piece keeps the word's tag, the remaining pieces get 'X'.
        for word, tag in zip(self.sentences[idx], self.labels[idx]):
            pieces = self.tokenizer.tokenize(word)
            if not pieces:
                # The tokenizer produced nothing for this word; it contributes
                # neither tokens nor tags.
                continue
            tokens.extend(pieces)
            tags.append(tag)
            tags.extend(['X'] * (len(pieces) - 1))

        # Truncate, then wrap the sequence in the begin/end markers.
        tokens = ['[CLS]'] + tokens[:self.MAX_LEN] + ['[SEP]']
        tags = ['[CLS]'] + tags[:self.MAX_LEN] + ['[SEP]']

        # Fail here with a clear message instead of emitting None ids that
        # only blow up later when the batch is turned into a tensor.
        unknown = sorted({t for t in tags if t not in self.tag_map})
        if unknown:
            raise KeyError("Unknown tag(s) {} in sentence index {}".format(unknown, idx))

        sentence_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        label_ids = [self.tag_map[t] for t in tags]
        return sentence_ids, label_ids, len(label_ids)

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

# Collate function: pad every sequence of a batch to the batch's longest one.
def pad_batch(batch):
    """Collate ``(token_ids, label_ids, length)`` samples into padded tensors.

    Token ids are right-padded with 0 and label ids with the id of 'O'.

    Returns:
        token_tensors: LongTensor of shape (batch, longest length).
        label_tensors: LongTensor of the same shape.
        mask: boolean tensor of the same shape, True where the token id is
            greater than 0 (i.e. not padding).
    """
    width = max(sample[2] for sample in batch)
    # Padding positions carry the 'O' label; they are masked out anyway.
    pad_label = tag2idx.get('O')

    token_rows, label_rows = [], []
    for sample in batch:
        ids, tags = sample[0], sample[1]
        token_rows.append(ids + [0] * (width - len(ids)))
        label_rows.append(tags + [pad_label] * (width - len(tags)))

    token_tensors = torch.LongTensor(token_rows)
    label_tensors = torch.LongTensor(label_rows)
    mask = token_tensors > 0
    return token_tensors, label_tensors, mask

## Create input data

In [4]:
data_path_train = 'Data/processed/NER/merged/train.tsv'
data_path_dev = 'Data/processed/NER/merged/dev.tsv'
data_path_test = 'Data/processed/NER/merged/test.tsv'


def _read_split(path):
    """Read one tab-separated split, keeping every cell as a string."""
    # With pandas' defaults, literal tokens such as "NA", "null" or "N/A" are
    # parsed as missing values and would end up as the string "nan".
    # dtype=str + keep_default_na=False keep the tokens exactly as written.
    # The trailing astype(str) guarantees string cells even if pandas still
    # leaves a non-string value in the frame.
    return pd.read_csv(path, sep="\t", dtype=str, keep_default_na=False).astype(str)


train_data = _read_split(data_path_train)
dev_data = _read_split(data_path_dev)
test_data = _read_split(data_path_test)

In [5]:
def _build_loader(dataset, shuffle):
    """Wrap a dataset in a DataLoader that pads each batch with pad_batch."""
    return DataLoader(dataset=dataset,
                      batch_size=BATCH_SIZE,
                      shuffle=shuffle,
                      collate_fn=pad_batch,
                      pin_memory=True
                      )


# Only the training data is shuffled. Evaluation loaders keep a fixed order so
# that batch composition, and therefore the averaged per-batch validation loss
# used for model selection, is the same at every evaluation.
train_dataset = i2b2Dataset(train_data)
train_iter = _build_loader(train_dataset, shuffle=True)
dev_dataset = i2b2Dataset(dev_data)
dev_iter = _build_loader(dev_dataset, shuffle=False)
test_dataset = i2b2Dataset(test_data)
test_iter = _build_loader(test_dataset, shuffle=False)
print('Data loaded')

Data loaded


## Create an NER model

In [6]:
EMBEDDING_DIM = 768
HIDDEN_DIM = 256


class Bert_BiLSTM_CRF(nn.Module):
    """Sequence tagger: BERT encoder -> 2-layer BiLSTM -> linear layer -> CRF.

    Args:
        tag2idx: mapping from tag string to integer id; its size defines the
            number of CRF states.
        embedding_dim: size of the BERT output vectors fed to the LSTM.
        hidden_dim: size of the concatenated BiLSTM output. Each direction
            uses ``hidden_dim // 2`` units, so this should be an even number
            for the linear layer's input size to match.
    """

    def __init__(self, tag2idx, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM):
        super().__init__()
        self.tag_to_ix = tag2idx
        self.tagset_size = len(tag2idx)
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.bert = BertModel.from_pretrained(BASE_MODEL)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim // 2,
                            num_layers=2, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(p=0.1)
        self.linear = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def getfeature(self, sentence, mask=None):
        """Compute the per-token emission scores fed to the CRF.

        Args:
            sentence: tensor of token ids, one row per sequence.
            mask: optional tensor of the same shape, non-zero on real tokens.
                When given it is forwarded to BERT as ``attention_mask`` so
                that padding positions are not attended to. When omitted,
                BERT attends to every position (the previous behaviour).

        Returns:
            Tensor whose last dimension has ``tagset_size`` entries.
        """
        attention_mask = None if mask is None else mask.long()
        # No gradients are recorded for the BERT pass, so the encoder weights
        # are never updated; only the LSTM, linear and CRF layers are trained.
        with torch.no_grad():
            # With return_dict=False BERT returns a tuple; the first element is
            # the per-position hidden state, the second (pooled) one is unused.
            embeds, _ = self.bert(sentence, attention_mask=attention_mask,
                                  return_dict=False)
        # nn.LSTM returns (output, (h_n, c_n)). Only the per-position output is
        # needed; its last dimension is 2 * (hidden_dim // 2) because the LSTM
        # is bidirectional.
        out, _ = self.lstm(embeds)
        out = self.dropout(out)
        return self.linear(out)

    def forward(self, sentence, tags, mask, is_test=False):
        """Return the training loss, or the decoded tag paths when ``is_test``.

        Args:
            sentence: tensor of token ids.
            tags: tensor of gold tag ids (ignored when ``is_test`` is True).
            mask: tensor marking the real (non-padding) positions.
            is_test: False -> return the negated mean CRF log-likelihood;
                True -> return the result of ``CRF.decode``.
        """
        feature = self.getfeature(sentence, mask)
        if is_test:
            return self.crf.decode(feature, mask)
        # The CRF returns a log-likelihood; negate it to obtain a loss.
        return -self.crf(feature, tags, mask, reduction='mean')


## Load Model and define optimizer

In [7]:

model = Bert_BiLSTM_CRF(tag2idx).to(DEVICE)
optimizer = Adam(model.parameters(), lr=LEARNING_RATE, eps=1e-6)

# Linear warm-up followed by linear decay over the whole run.
len_dataset = len(train_dataset)
# A trailing partial batch still counts as one optimisation step.
steps_per_epoch = math.ceil(len_dataset / BATCH_SIZE)
total_steps = steps_per_epoch * EPOCHS
warm_up_ratio = 0.1
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warm_up_ratio * total_steps,
    num_training_steps=total_steps,
)


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Best model seen so far and the validation scores it reached.
best_model = None
_best_val_loss, _best_val_acc = math.inf, -math.inf


## Training and test

In [9]:
def train(epoch, model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and print the mean per-batch loss.

    Args:
        epoch: epoch number, used only in the printed message.
        model: module called as ``model(contexts, labels, masks)`` that
            returns a scalar loss tensor.
        data_loader: iterable of ``(contexts, labels, masks)`` tensor batches.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        device: device the batch tensors are moved to.
    """
    model.train()
    running_loss = 0.0
    num_batches = 0
    for contexts, labels, masks in data_loader:
        optimizer.zero_grad()
        num_batches += 1
        loss = model(contexts.to(device), labels.to(device), masks.to(device))
        running_loss += loss.item()

        loss.backward()
        optimizer.step()
        # The schedule is defined in optimisation steps, not epochs.
        scheduler.step()

    print("Epoch: {}, Loss:{:.4f}".format(epoch, running_loss / num_batches))

def validate(epoch, model, data_loader, device):
    """Evaluate the model on a data loader and print loss and accuracy.

    The accuracy is computed over every unmasked position, i.e. it compares
    the decoded tag ids with the gold ids selected by the batch masks.

    Args:
        epoch: epoch number, used only in the printed message.
        model: module returning decoded paths when called with
            ``is_test=True`` and a scalar loss tensor otherwise.
        data_loader: iterable of ``(contexts, labels, masks)`` tensor batches.
        device: device the batch tensors are moved to.

    Returns:
        Tuple ``(model, mean batch loss, accuracy in percent)``; ``model`` is
        the very object that was passed in.
    """
    model.eval()
    gold_chunks, predicted = [], []
    total_loss = 0
    num_batches = 0
    with torch.no_grad():
        for contexts, labels, masks in data_loader:
            num_batches += 1
            contexts = contexts.to(device)
            labels = labels.to(device)
            masks = masks.to(device)

            # One pass for the decoded paths, one for the loss.
            decoded = model(contexts, labels, masks, is_test=True)
            total_loss += model(contexts, labels, masks).item()

            # Flatten the per-sentence paths into one long prediction list.
            for path in decoded:
                predicted.extend(path)
            # Keep only the gold ids at unmasked positions.
            gold_chunks.append(torch.masked_select(labels, masks == 1).cpu())

    gold = torch.cat(gold_chunks, dim=0).numpy()
    acc = (np.array(predicted) == gold).mean() * 100
    mean_loss = total_loss / num_batches

    print("Epoch: {}, Val Loss:{:.4f}, Val Acc:{:.3f}%".format(epoch, mean_loss, acc))
    return model, mean_loss, acc


def test(model, data, device):
    """Decode a data loader and return aligned gold / predicted tag strings.

    Positions whose gold tag is "X", "[CLS]" or "[SEP]" are dropped from both
    lists, so only positions carrying a real word-level tag are kept.

    Args:
        model: module returning decoded paths when called with ``is_test=True``.
        data: iterable of ``(contexts, labels, masks)`` tensor batches.
        device: device the batch tensors are moved to.

    Returns:
        Tuple ``(y_true, y_pred)`` of equally long lists of tag strings.
    """
    model.eval()
    gold_chunks, predicted = [], []
    with torch.no_grad():
        for contexts, labels, masks in data:
            contexts = contexts.to(device)
            labels = labels.to(device)
            masks = masks.to(device)

            # Flatten the per-sentence paths into one long prediction list.
            for path in model(contexts, labels, masks, is_test=True):
                predicted.extend(path)
            # Keep only the gold ids at unmasked positions.
            gold_chunks.append(torch.masked_select(labels, masks == 1).cpu())

    gold = torch.cat(gold_chunks, dim=0).numpy()

    skipped = ("X", "[CLS]", "[SEP]")
    y_true, y_pred = [], []
    for gold_id, pred_id in zip(gold, predicted):
        gold_tag = idx2tag[gold_id]
        if gold_tag in skipped:
            continue
        y_true.append(gold_tag)
        y_pred.append(idx2tag[pred_id])

    return y_true, y_pred

In [10]:
import copy

print("Num examples = %d"%(len(train_dataset)))
print("Batch size = %d"%(BATCH_SIZE))
print("Epochs = %d"%(EPOCHS))
print('Train Start ...')

for epoch in range(1, EPOCHS + 1):
    train(epoch, model, train_iter, optimizer, scheduler, DEVICE)
    # Validate every 5th epoch.
    if epoch % 5 == 0:
        print('valid-->', end='')
        candidate_model, loss, acc = validate(epoch, model, dev_iter, DEVICE)
        # A candidate replaces the best model only when it improves BOTH the
        # validation loss and the validation accuracy.
        if loss < _best_val_loss and acc > _best_val_acc:
            # validate() returns the live model object, which keeps training.
            # Snapshot it, otherwise "best_model" would silently follow the
            # weights of the final epoch.
            best_model = copy.deepcopy(candidate_model)
            _best_val_loss = loss
            _best_val_acc = acc

# No validation ran (e.g. EPOCHS < 5): fall back to the final model.
if best_model is None:
    best_model = model

# Save before evaluating so that a failure during testing cannot lose the
# trained weights; create the target directory if it does not exist yet.
os.makedirs(NER_MODEL_SAVED_DIR, exist_ok=True)
torch.save({'model': best_model.state_dict()}, os.path.join(NER_MODEL_SAVED_DIR, MODEL_NAME + '.ckpt'))
print('Train End ... Model saved')

y_test, y_pred = test(best_model, test_iter, DEVICE)

print("Accuracy score: %f"%(metrics.accuracy_score(y_test, y_pred)))
print(metrics.classification_report(y_test, y_pred, labels=LABELS, digits=4))

Num examples = 14387
Batch size = 32
Epochs = 50
Train Start ...
Epoch: 1, Loss:33.7310
Epoch: 2, Loss:24.0599
Epoch: 3, Loss:17.7559
Epoch: 4, Loss:12.3068
Epoch: 5, Loss:8.5998
valid-->Epoch: 5, Val Loss:7.1443, Val Acc:85.833%
Epoch: 6, Loss:6.3561
Epoch: 7, Loss:5.1377
Epoch: 8, Loss:4.3769
Epoch: 9, Loss:3.8559
Epoch: 10, Loss:3.4792
valid-->Epoch: 10, Val Loss:3.0715, Val Acc:94.077%
Epoch: 11, Loss:3.1971
Epoch: 12, Loss:2.9764
Epoch: 13, Loss:2.7936
Epoch: 14, Loss:2.6477
Epoch: 15, Loss:2.5151
valid-->Epoch: 15, Val Loss:2.3384, Val Acc:95.141%
Epoch: 16, Loss:2.4192
Epoch: 17, Loss:2.3401
Epoch: 18, Loss:2.2411
Epoch: 19, Loss:2.1940
Epoch: 20, Loss:2.1300
valid-->Epoch: 20, Val Loss:2.1025, Val Acc:95.551%
Epoch: 21, Loss:2.0721
Epoch: 22, Loss:2.0265
Epoch: 23, Loss:1.9502
Epoch: 24, Loss:1.9317
Epoch: 25, Loss:1.8846
valid-->Epoch: 25, Val Loss:1.9526, Val Acc:95.788%
Epoch: 26, Loss:1.8510
Epoch: 27, Loss:1.8160
Epoch: 28, Loss:1.7804
Epoch: 29, Loss:1.7549
Epoch: 30, Los