In [31]:
pip install pytorch-pretrained-bert==0.4.0

Note: you may need to restart the kernel to use updated packages.


In [32]:
import pandas as pd
import numpy as np
from tqdm import trange
from tqdm import tqdm_notebook as tqdm

data = pd.read_csv("ner_dataset.csv", encoding="latin1").fillna(method="ffill")
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [33]:
print(data.shape)
data = data.head(50000)
print(data.shape)

(1048575, 4)
(50000, 4)


In [34]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [35]:
getter = SentenceGetter(data)

This is how the sentences in the dataset look like.

In [36]:
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
sentences[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

The sentences are annotated with the BIO-schema and the labels look like this.

In [37]:
labels = [[s[2] for s in sent] for sent in getter.sentences]
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [38]:
tags_vals = list(set(data["Tag"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}

Before we can start fine-tuning the model, we have to prepare the data set for the use with pytorch and bert.

In [39]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

Here we fix some configurations. We will limit our sequence length to 75 tokens and we will use a batch size of 32 as suggested by the Bert paper. Note, that Bert natively supports sequences of up to 512 tokens.

In [40]:
MAX_LEN = 75
bs = 32

In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

The Bert implementation comes with a pretrained tokenizer and a definied vocabulary. We load the one related to the smallest pre-trained model bert-base-uncased. Try also the cased variate since it is well suited for NER.

In [42]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Now we tokenize all sentences

In [43]:
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print(tokenized_texts[0])

['thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'london', 'to', 'protest', 'the', 'war', 'in', 'iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'british', 'troops', 'from', 'that', 'country', '.']


Next, we cut and pad the token and label sequences to our desired length.

In [44]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [45]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")

The Bert model supports something called attention_mask, which is similar to the masking in keras. So here we create the mask to ignore the padded elements in the sequences.

In [46]:
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

In [47]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

Since we’re operating in pytorch, we have to convert the dataset to torch tensors.

In [48]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

The last step is to define the dataloaders. We shuffle the data at training time with the RandomSampler and at test time we just pass them sequentially with the SequentialSampler.

In [49]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

The pytorch-pretrained-bert package provides a BertForTokenClassification class for token-level predictions. BertForTokenClassification is a fine-tuning model that wraps BertModel and adds token-level classifier on top of the BertModel. The token-level classifier is a linear layer that takes as input the last hidden state of the sequence. We load the pre-trained bert-base-uncased model and provide the number of possible labels.

In [50]:
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))

Now we have to pass the model parameters to the GPU.

Before we can start the fine-tuning process, we have to setup the optimizer and add the parameters it should update. A common choice is the Adam optimizer. We also add some weight_decay as regularization to the main weight matrices. If you have limited resources, you can also try to just train the linear classifier on top of Bert and keep all other weights fixed. This will still give you a good performance.

In [51]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [54]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [58]:
epochs = 10
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in tqdm(enumerate(train_dataloader)):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]





Epoch:   0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Train loss: 0.14689894311595708






Epoch:  10%|█         | 1/10 [08:19<1:14:54, 499.43s/it][A[A[A[A

Validation loss: 0.14355559833347797
Validation Accuracy: 0.9599305555555556


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Train loss: 0.1139173333067447






Epoch:  20%|██        | 2/10 [16:38<1:06:33, 499.22s/it][A[A[A[A

Validation loss: 0.1364841479808092
Validation Accuracy: 0.9613194444444445


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Train loss: 0.09384752612095326






Epoch:  30%|███       | 3/10 [24:47<57:53, 496.17s/it]  [A[A[A[A

Validation loss: 0.12852323800325394
Validation Accuracy: 0.9607465277777778


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Train loss: 0.07790683099301532






Epoch:  40%|████      | 4/10 [33:58<51:16, 512.78s/it][A[A[A[A

Validation loss: 0.12984734680503607
Validation Accuracy: 0.96453125


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Train loss: 0.06684341566869989






Epoch:  50%|█████     | 5/10 [42:12<42:15, 507.00s/it][A[A[A[A

Validation loss: 0.1300798375159502
Validation Accuracy: 0.9621874999999999


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Train loss: 0.05698208016110584






Epoch:  60%|██████    | 6/10 [56:00<40:13, 603.41s/it][A[A[A[A

Validation loss: 0.13079484924674034
Validation Accuracy: 0.9655902777777778


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Train loss: 0.048491025547264144






Epoch:  70%|███████   | 7/10 [1:09:09<32:57, 659.06s/it][A[A[A[A

Validation loss: 0.13354886323213577
Validation Accuracy: 0.9661111111111111


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Train loss: 0.0410277959308587






Epoch:  80%|████████  | 8/10 [1:12:35<17:26, 523.08s/it][A[A[A[A

Validation loss: 0.13710338715463877
Validation Accuracy: 0.9646701388888889


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Train loss: 0.036087200249312446






Epoch:  90%|█████████ | 9/10 [1:15:56<07:06, 426.61s/it][A[A[A[A

Validation loss: 0.15141575783491135
Validation Accuracy: 0.9647395833333334


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Train loss: 0.03073726553702727






Epoch: 100%|██████████| 10/10 [1:19:16<00:00, 475.65s/it][A[A[A[A

Validation loss: 0.1522522158920765
Validation Accuracy: 0.9667361111111111





### Evaluate

In [59]:
model.eval()
predictions = []
true_labels = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                              attention_mask=b_input_mask, labels=b_labels)
        logits = model(b_input_ids, token_type_ids=None,
                       attention_mask=b_input_mask)
        
    logits = logits.detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l ]
print("Validation loss: {}".format(eval_loss/nb_eval_steps))
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

Validation loss: 0.1522522158920765
Validation Accuracy: 0.9667361111111111


As you can see, this works quite amazing! This approach will give you very strong performing models for named entity recognition. Since Bert is available as a multilingual model in 102 languages, you can use it for a wide variety of tasks. 