# Entity Recognition: NER with BERT
## Set up environment
Clear environment

In [1]:
# Clear every user-defined name from the kernel namespace ('-f' skips the
# confirmation prompt) so the notebook starts from a clean state.
get_ipython().run_line_magic('reset', '-f')

Define data path and load

In [2]:
import pandas as pd
# Location of the labelled, token-level dataset.
data_df_path = "data_labelled.csv"
data_df = pd.read_csv(data_df_path, encoding="utf-8")

# Strip every non-ASCII character from the words: encode to ASCII while
# ignoring unencodable characters, then decode back to text.
ascii_word_bytes = data_df['Word'].str.encode('ascii', 'ignore')
data_df['Word'] = ascii_word_bytes.str.decode('ascii')

## Data Prep
Print dataset size & tag distributions

In [3]:
for data in [data_df]:
    # Number of distinct emails, then the five most frequent POS / entity tags.
    n_emails = data["EmailID"].nunique()
    top_pos_tags = data["PosTag"].value_counts().head()
    top_entity_tags = data["EntTag"].value_counts().head()

    print("Data size: ", n_emails)
    print("POS tag distribution: \n", top_pos_tags)
    print("Entity tag distribution: \n", top_entity_tags)

Data size:  2654
POS tag distribution: 
 NNP    151132
NN      73938
IN      41114
DT      37234
CD      34724
Name: PosTag, dtype: int64
Entity tag distribution: 
 O                598696
I-AuthRepName      1814
B-AuthRepName      1766
B-AccountNum       1099
I-AccountNum        228
Name: EntTag, dtype: int64


### Parse data using sentence parser
Define sentence parser

In [4]:
class SentenceGetter(object):
    """Group token-level rows into one list of (word, POS tag, entity tag) per email.

    Attributes:
        n_sent: 1-based counter of the next email returned by ``get_next``.
        data: the token-level DataFrame passed to the constructor.
        empty: flag initialised to ``False``; not modified by this class.
        grouped: Series indexed by ``EmailID`` whose values are lists of
            ``(Word, PosTag, EntTag)`` tuples.
        sentences: the values of ``grouped`` as a plain list, in group order.
    """

    def __init__(self, data):
        """Build the per-email token lists.

        Args:
            data: DataFrame with the columns ``EmailID``, ``Word``, ``PosTag``
                and ``EntTag`` (one row per token).
        """
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # One (word, pos, tag) tuple per row of the group, in row order.
            return list(zip(s["Word"].values.tolist(),
                            s["PosTag"].values.tolist(),
                            s["EntTag"].values.tolist()))

        self.grouped = self.data.groupby("EmailID").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the token list of the next email, or ``None`` when exhausted.

        Emails are looked up under the key ``"Email: <n_sent>"``; the counter
        only advances when the lookup succeeds.

        Returns:
            The list of ``(word, pos, tag)`` tuples for the next email, or
            ``None`` if no email exists under the next key.
        """
        key = "Email: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing key means "no more emails"; any other error
            # (including KeyboardInterrupt) is a real problem and propagates.
            return None
        self.n_sent += 1
        return s

In [5]:
# Group the token rows into one list of (word, pos, tag) tuples per email.
getter = SentenceGetter(data_df)

# One whitespace-joined string per email (words are cast to str first).
sentences = [" ".join(str(word) for word, _pos, _tag in sent)
             for sent in getter.sentences]

# POS tags per email, one per word.
poses = [[pos for _word, pos, _tag in sent] for sent in getter.sentences]

# Entity tags per email, one per word.
labels = [[tag for _word, _pos, tag in sent] for sent in getter.sentences]

### Map entity tags to an index

In [6]:
# Sort the distinct tags so the tag <-> index mapping is identical on every run.
# Iterating a bare set of strings follows hash order, which changes between
# kernel sessions and would make saved models and predictions incomparable.
# key=str keeps the sort well-defined even if a non-string value (e.g. NaN) is present.
tags_vals = sorted(set(data_df["EntTag"].values), key=str)
tag2idx = {t: i for i, t in enumerate(tags_vals)}

### Prepare the sentences and labels
Import libraries

In [7]:
import numpy as np
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pytorch_pretrained_bert
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam
print("pytorch_pretrained_bert version: ", pytorch_pretrained_bert.__version__)

Using TensorFlow backend.


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
pytorch_pretrained_bert version:  0.4.0


Here we fix some configurations. We limit our sequence length to 300 tokens (`MAX_LEN`) and use a batch size of 32, as suggested by the BERT paper. Note that BERT natively supports sequences of up to 512 tokens.

In [8]:
# Maximum number of positions kept per sequence; used as `maxlen` when the
# token-id and tag sequences are padded / truncated.
MAX_LEN = 300
# Mini-batch size passed to the DataLoaders.
bs = 32

Set up GPU environment, if there is one

In [9]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

Import BERT tokeniser and tokenise all sentences

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

# WordPiece may split one word into several tokens (e.g. 'Tel', '##stra'), but
# the entity tags are given per word. Tokenise word by word and repeat each
# word's tag for every piece it produces, so that `labels[i][k]` is the tag of
# `tokenized_texts[i][k]`. A word that yields no tokens contributes no label.
# NOTE(review): a split 'B-X' word repeats 'B-X' on every piece; converting the
# continuation pieces to 'I-X' may be preferable -- confirm against the
# evaluation scheme.
tokenized_texts = []
token_labels = []
for sent in getter.sentences:
    sent_tokens, sent_labels = [], []
    for word, _pos, tag in sent:
        pieces = tokenizer.tokenize(str(word))
        sent_tokens.extend(pieces)
        sent_labels.extend([tag] * len(pieces))
    tokenized_texts.append(sent_tokens)
    token_labels.append(sent_labels)

# Rebuilt from `getter.sentences` each time, so re-running this cell is safe.
# From here on `labels` is token-aligned rather than word-aligned.
labels = token_labels

print(tokenized_texts[0])

['Good', 'morning', ',', 'Tel', '##stra', 'account', '335', '35', '##5', '##8', '97', '##0', 'Could', 'you', 'please', 'action', 'the', 'following', 'items', 'related', 'to', 'services', 'associated', 'with', 'Tel', '##stra', 'account', 'number', '335', '35', '##5', '##8', '97', '##0', ':', 'Service', 'Action', 'required', '04', '##37', '65', '##8', '75', '##3', 'CA', '##NC', '##EL', 'SE', '##R', '##VI', '##CE', 'If', 'you', 'require', 'any', 'additional', 'information', 'or', 'c', '##lar', '##ification', 'please', 'let', 'me', 'know', '.', 'Kind', 'regards', ',', 'Anthony', '.', 'Anthony', 'Chambers', 'Head', 'of', 'Technology', 'Para', '##way', 'Pastor', '##al', 'Company', 'Limited', '70', 'M', '##c', '##N', '##ama', '##ra', 'St', '|', 'P', '##O', 'Box', '22', ',', 'Orange', 'NSW', '280', '##0', 'T', ':', '04', '##16', '218', '59', '##8', 'Anthony', '.', 'Chambers', '@', 'para', '##way', '.', 'com', '.', 'au', 'www', '.', 'para', '##way', '.', 'com', '.', 'au']


Cut, pad and label sequences to the desired length

In [11]:
# Settings shared by both padding calls: fixed length MAX_LEN, integer dtype,
# and both padding and truncation applied at the end of the sequence.
pad_kwargs = dict(maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Vocabulary ids for every token sequence (pad_sequences' default fill value is used).
token_id_seqs = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]
input_ids = pad_sequences(token_id_seqs, **pad_kwargs)

# Tag indices for every label sequence; padded positions get the index of "O".
tag_id_seqs = [[tag2idx.get(tag) for tag in seq_labels] for seq_labels in labels]
tags = pad_sequences(tag_id_seqs, value=tag2idx["O"], **pad_kwargs)

The Bert model supports something called `attention_mask`, which is similar to the masking in keras. So here we create the mask to ignore the padded elements in the sequences.

In [12]:
# 1.0 where the token id is positive, 0.0 elsewhere, so that positions filled
# with id 0 by the padding step are masked out.
attention_masks = [[float(token_id > 0) for token_id in id_seq] for id_seq in input_ids]

### Split the dataset

In [13]:
# Fraction of the examples held out for validation.
test_size = 0.1

# Split inputs, tags and masks in a single call: every array is indexed with
# the same shuffled indices, so the three stay aligned by construction instead
# of relying on two separate calls sharing a random_state. This also avoids
# re-splitting `input_ids` and overwriting IPython's `_` with throw-away values.
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = train_test_split(input_ids,
                                         tags,
                                         attention_masks,
                                         random_state=2018,
                                         test_size=test_size)

### Convert dataset to torch tensors

In [None]:
# Convert each train/validation split to a LongTensor, keeping the same names.
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = (
    torch.LongTensor(split)
    for split in (tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks)
)

### Define dataloaders

In [None]:
# Each dataset item is an (input ids, attention mask, tag ids) triple.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
valid_data = TensorDataset(val_inputs, val_masks, val_tags)

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(train_data)
valid_sampler = SequentialSampler(valid_data)

train_dataloader = DataLoader(dataset=train_data, batch_size=bs, sampler=train_sampler)
valid_dataloader = DataLoader(dataset=valid_data, batch_size=bs, sampler=valid_sampler)

## Prepare the BERT model for training

In [None]:
# Load the pretrained cased BERT with a token-classification head sized to the
# number of entity tags, and move it to the same device the batches are sent
# to in the training/evaluation loops (otherwise a GPU run fails with a
# device mismatch between the model weights and the inputs).
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(tag2idx))
model = model.to(device)

Before we can start the fine-tuning process, we have to set up the optimizer and add the parameters it should update. A common choice is the Adam optimizer. We also add some weight decay as regularization to the main weight matrices. If you have limited resources, you can also try to train just the linear classifier on top of BERT and keep all other weights fixed. This will still give you good performance.

In [None]:
# True: update every model parameter. False: update only the classifier head.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Parameters whose name contains any of these substrings get no weight decay.
    # 'gamma'/'beta' and 'LayerNorm.weight' are both listed because the
    # layer-norm parameter names differ between library versions -- TODO confirm
    # against model.named_parameters() for the installed version.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    # torch.optim.Adam reads the per-group option `weight_decay`; the key
    # `weight_decay_rate` is not one of its options, so no decay was applied.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [None]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    """Return the fraction of positions whose arg-max prediction equals the label.

    Args:
        preds: array of scores; the arg-max is taken over axis 2.
        labels: array of integer label ids with one entry per scored position.

    Returns:
        Number of matching positions divided by the total number of labels.
    """
    predicted_ids = np.argmax(preds, axis=2).flatten()
    gold_ids = labels.flatten()
    n_correct = np.sum(predicted_ids == gold_ids)
    return n_correct / len(gold_ids)

### Train (fine-tune) the model

In [None]:
from tqdm import tqdm, trange
# Number of passes over the training set and the gradient-norm clipping threshold.
epochs = 5
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # ---- TRAIN ----
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # Move the batch to the selected device.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Forward pass: called with labels, the model returns the loss.
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # Backward pass.
        loss.backward()
        # Track the running training loss.
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # Clip gradients before the update to limit the step size.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # Update parameters, then reset gradients for the next batch.
        optimizer.step()
        model.zero_grad()
    # Mean training loss over the batches of this epoch.
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    # ---- VALIDATION ----
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            # Two forward passes: with labels the model returns the loss,
            # without labels it returns the logits.
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)

        # NOTE(review): accuracy is computed over every position, including
        # the padded ones (whose label is the index of "O").
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    # Keep one tag list per sequence (same layout as the final evaluation cell)
    # so entities cannot be merged across sequence boundaries, and pass the
    # arguments in seqeval's (y_true, y_pred) order.
    pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
    valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l]
    print("F1-Score: {}".format(f1_score(valid_tags, pred_tags)))

Epoch:   0%|                                                                                     | 0/5 [00:00<?, ?it/s]

### Evaluate the model

In [None]:
# Final pass over the validation set with the model in evaluation mode.
model.eval()
predictions = []
true_labels = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in valid_dataloader:
    # Move the batch to the selected device.
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        # Two forward passes: with labels the model returns the loss,
        # without labels it returns the logits.
        tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                              attention_mask=b_input_mask, labels=b_labels)
        logits = model(b_input_ids, token_type_ids=None,
                       attention_mask=b_input_mask)

    logits = logits.detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
    # NOTE(review): accuracy is computed over every position, including the
    # padded ones (whose label is the index of "O").
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

# Map indices back to tag strings, one list per sequence.
pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l ]
print("Validation loss: {}".format(eval_loss/nb_eval_steps))
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
# seqeval expects (y_true, y_pred).
print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags)))