In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import math
import numpy as np
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report,accuracy_score,f1_score
import torch.nn.functional as F

In [3]:
import torch
import os
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_transformers import BertTokenizer, BertConfig
from pytorch_transformers import BertForTokenClassification, AdamW

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Introduction

In this notebook, will introduce how to do NER with BERT, including:

- Load and preprocess data
- Parser data
- Make training data
- Train model
- Evaluate result
- Predict result

Tips:

- Update to pytorch-transformer==1.0.0
- When come across OOV,you will find that BERT word piece tokenize method can help a lot
- Case model will be litter better than uncase model for English

**Also this notebook come with a post [NER with BERT in Action](https://medium.com/@yingbiao/ner-with-bert-in-action-936ff275bc73)**<br>
**Feel free to check it, hope that it could help you.**

## Load data

**Load CSV data**

In [4]:
data_path = "data/" 

In [5]:
data_file_address = "data/ner_dataset.csv"

In [6]:
# Fillna method can make same sentence with same sentence name
df_data = pd.read_csv(data_file_address,sep=",",encoding="latin1").fillna(method='ffill')

In [7]:
df_data.columns

Index(['Sentence #', 'Word', 'POS', 'Tag'], dtype='object')

In [8]:
df_data.head(n=20)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


**Have a look POS cat**

In [9]:
df_data.POS.unique()

array(['NNS', 'IN', 'VBP', 'VBN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'CC',
       'JJ', '.', 'VBD', 'WP', '``', 'CD', 'PRP', 'VBZ', 'POS', 'VBG',
       'RB', ',', 'WRB', 'PRP$', 'MD', 'WDT', 'JJR', ':', 'JJS', 'WP$',
       'RP', 'PDT', 'NNPS', 'EX', 'RBS', 'LRB', 'RRB', '$', 'RBR', ';',
       'UH', 'FW'], dtype=object)

**Have a look TAG cat**

In [10]:
df_data.Tag.unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [11]:
# Analyse summary of data
df_data['Sentence #'].nunique(), df_data.Word.nunique(), df_data.POS.nunique(), df_data.Tag.nunique()

(47959, 35178, 42, 17)

In [12]:
# Analyse the Tag distribution
df_data.Tag.value_counts()

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

### Explain tag
As show above, there are two parts for the tag name: "position"-"meaning"
- B: begin, word at the first  position
- I: middle, word not at the first position,especially for phase
- time: time, meaning time
- per: person, meaning people name
- geo: geography, meaning location name
- O: mean other, set as a default tag
<br>......

## Parser data

**Parser data into document structure**

In [13]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [14]:
# Get full document data struce
getter = SentenceGetter(df_data)

In [15]:
# Get sentence data
sentences = [[s[0] for s in sent] for sent in getter.sentences]
sentences[0]

['Thousands',
 'of',
 'demonstrators',
 'have',
 'marched',
 'through',
 'London',
 'to',
 'protest',
 'the',
 'war',
 'in',
 'Iraq',
 'and',
 'demand',
 'the',
 'withdrawal',
 'of',
 'British',
 'troops',
 'from',
 'that',
 'country',
 '.']

In [16]:
# Get pos data
poses = [[s[1] for s in sent] for sent in getter.sentences]
print(poses[0])

['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN', '.']


In [17]:
# Get tag labels data
labels = [[s[2] for s in sent] for sent in getter.sentences]
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


**Make TAG name into index for training**

In [18]:
tags_vals = list(set(df_data["Tag"].values))

In [19]:
# Add X  label for word piece support
# Add [CLS] and [SEP] as BERT need
tags_vals.append('X')
tags_vals.append('[CLS]')
tags_vals.append('[SEP]')

In [20]:
tags_vals = set(tags_vals)

In [21]:
tags_vals

{'B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O',
 'X',
 '[CLS]',
 '[SEP]'}

In [22]:
# Set a dict for mapping id to tag name
#tag2idx = {t: i for i, t in enumerate(tags_vals)}

# Recommend to set it by manual define, good for reusing
tag2idx={'B-art': 14,
 'B-eve': 16,
 'B-geo': 0,
 'B-gpe': 13,
 'B-nat': 12,
 'B-org': 10,
 'B-per': 4,
 'B-tim': 2,
 'I-art': 5,
 'I-eve': 7,
 'I-geo': 15,
 'I-gpe': 8,
 'I-nat': 11,
 'I-org': 3,
 'I-per': 6,
 'I-tim': 1,
 'X':17,
 'O': 9,
 '[CLS]':18,
 '[SEP]':19}

In [23]:
tag2idx

{'B-art': 14,
 'B-eve': 16,
 'B-geo': 0,
 'B-gpe': 13,
 'B-nat': 12,
 'B-org': 10,
 'B-per': 4,
 'B-tim': 2,
 'I-art': 5,
 'I-eve': 7,
 'I-geo': 15,
 'I-gpe': 8,
 'I-nat': 11,
 'I-org': 3,
 'I-per': 6,
 'I-tim': 1,
 'X': 17,
 'O': 9,
 '[CLS]': 18,
 '[SEP]': 19}

In [24]:
# Mapping index to name
tag2name={tag2idx[key] : key for key in tag2idx.keys()}

## Make tranning data

Make raw data into trainable data for BERT, including:

- Set gpu environment
- Load tokenizer and tokenize
- Set 3 embedding, token embedding, mask word embedding, segmentation embedding
- Split data set into train and validate, then send them to dataloader

**Set up gpu environment**

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [26]:
n_gpu

1

### Load tokenizer

In [27]:
# Manual define vocabulary address, if you download the model in local
vocabulary = "models/bert-base-cased/vocab.txt"

In [28]:
# Len of the sentence must be the same as the training model
# See model's 'max_position_embeddings' = 512
max_len  = 45

In [29]:
# load tokenizer, with manual file address or pretrained address
tokenizer=BertTokenizer(vocab_file=vocabulary,do_lower_case=False)

**Tokenizer text**

- In hunggieface for bert, when come across OOV, will word piece the word
- We need to adjust the labels base on the tokenize result, “##abc” need to set label "X" 
- Need to set "[CLS]" at front and "[SEP]" at the end, as what the paper do, [BERT indexer should add [CLS] and [SEP] tokens](https://github.com/allenai/allennlp/issues/2141)


In [30]:
tokenized_texts = []
word_piece_labels = []
i_inc = 0
for word_list,label in (zip(sentences,labels)):
    temp_lable = []
    temp_token = []
    
    # Add [CLS] at the front 
    temp_lable.append('[CLS]')
    temp_token.append('[CLS]')
    
    for word,lab in zip(word_list,label):
        token_list = tokenizer.tokenize(word)
        for m,token in enumerate(token_list):
            temp_token.append(token)
            if m==0:
                temp_lable.append(lab)
            else:
                temp_lable.append('X')  
                
    # Add [SEP] at the end
    temp_lable.append('[SEP]')
    temp_token.append('[SEP]')
    
    tokenized_texts.append(temp_token)
    word_piece_labels.append(temp_lable)
    
    if 5 > i_inc:
        print("No.%d,len:%d"%(i_inc,len(temp_token)))
        print("texts:%s"%(" ".join(temp_token)))
        print("No.%d,len:%d"%(i_inc,len(temp_lable)))
        print("lables:%s"%(" ".join(temp_lable)))
    i_inc +=1
    
    
    

No.0,len:28
texts:[CLS] Thousands of demons ##tra ##tors have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country . [SEP]
No.0,len:28
lables:[CLS] O O O X X O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O [SEP]
No.1,len:29
texts:[CLS] Iranian officials say they expect to get access to sealed sensitive parts of the plant Wednesday , after an I ##A ##EA surveillance system begins functioning . [SEP]
No.1,len:29
lables:[CLS] B-gpe O O O O O O O O O O O O O O B-tim O O O B-org X X O O O O O [SEP]
No.2,len:44
texts:[CLS] He ##lic ##op ##ter guns ##hips Saturday pounded militant hide ##outs in the Or ##ak ##zai tribal region , where many Taliban militants are believed to have fled to avoid an earlier military offensive in nearby South W ##azi ##rist ##an . [SEP]
No.2,len:44
lables:[CLS] O X X X O X B-tim O O O X O O B-geo X X O O O O O B-org O O O O O O O O O O O O O O B-geo I-geo X X X O [SEP]
No.3,len:16
texts:[CLS] They lef

### Set token embedding

Pad or trim the text and label to fit the need for max len

In [31]:
# Make text token into id
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=max_len, dtype="long", truncating="post", padding="post")
print(input_ids[0])

[  101 26159  1104  8568  4487  5067  1138  9639  1194  1498  1106  5641
  1103  1594  1107  5008  1105  4555  1103 10602  1104  1418  2830  1121
  1115  1583   119   102     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0]


In [32]:
# Make label into id, pad with "O" meaning others
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in word_piece_labels],
                     maxlen=max_len, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")
print(tags[0])

[18  9  9  9 17 17  9  9  9  0  9  9  9  9  9  0  9  9  9  9  9 13  9  9
  9  9  9 19  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9]


### Set mask word embedding

In [33]:
# For fine tune of predict, with token mask is 1,pad token is 0
attention_masks = [[int(i>0) for i in ii] for ii in input_ids]
attention_masks[0];

### Set segment embedding(Seem like for sequance tagging task, it's not necessary to make this embedding)

In [34]:
# Since only one sentence, all the segment set to 0
segment_ids = [[0] * len(input_id) for input_id in input_ids]
segment_ids[0];

## Split data into train and validate

70% for training, 30% for validation

**Split all data**

In [35]:
tr_inputs, val_inputs, tr_tags, val_tags,tr_masks, val_masks,tr_segs, val_segs = train_test_split(input_ids, tags,attention_masks,segment_ids, 
                                                            random_state=4, test_size=0.3)

In [36]:
len(tr_inputs),len(val_inputs),len(tr_segs),len(val_segs)

(33571, 14388, 33571, 14388)

**Set data into tensor**

Not recommend tensor.to(device) at this process, since it will run out of GPU memory

In [37]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
tr_segs = torch.tensor(tr_segs)
val_segs = torch.tensor(val_segs)

**Put data into data loader**

In [38]:
# Set batch num
batch_num = 32

In [39]:
# Only set token embedding, attention embedding, no segment embedding
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
# Drop last can make batch training better for the last one
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_num,drop_last=True)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_num)

## Train model

**Load BERT model**

In [40]:
# In this document, contain confg(txt) and weight(bin) files
model_file_address = 'models/bert-base-cased'

In [41]:
# Will load config and weight with from_pretrained()
model = BertForTokenClassification.from_pretrained(model_file_address,num_labels=len(tag2idx))

In [42]:
model;

In [43]:
# Set model to GPU,if you are using GPU machine
model.cuda();

In [44]:
# Add multi GPU support
if n_gpu >1:
    model = torch.nn.DataParallel(model)

In [45]:
# Set epoch and grad max num
epochs = 5
max_grad_norm = 1.0

In [46]:
# Cacluate train optimiazaion num
num_train_optimization_steps = int( math.ceil(len(tr_inputs) / batch_num) / 1) * epochs

### Set fine tuning method

**Manual optimizer**

In [47]:
# True: fine tuning all the layers 
# False: only fine tuning the classifier layers
FULL_FINETUNING = True

In [48]:
if FULL_FINETUNING:
    # Fine tune model all layer parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    # Only fine tune classifier parameters
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)

### Fine-tuing model

In [49]:
# TRAIN loop
model.train();

In [50]:
print("***** Running training *****")
print("  Num examples = %d"%(len(tr_inputs)))
print("  Batch size = %d"%(batch_num))
print("  Num steps = %d"%(num_train_optimization_steps))
for _ in trange(epochs,desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        # forward pass
        outputs = model(b_input_ids, token_type_ids=None,
        attention_mask=b_input_mask, labels=b_labels)
        loss, scores = outputs[:2]
        if n_gpu>1:
            # When multi gpu, average it
            loss = loss.mean()
        
        # backward pass
        loss.backward()
        
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        
        # update parameters
        optimizer.step()
        optimizer.zero_grad()
        
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
        

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 33571
  Batch size = 32
  Num steps = 5250


Epoch:  20%|██        | 1/5 [02:16<09:05, 136.43s/it]

Train loss: 0.13975251472428143


Epoch:  40%|████      | 2/5 [04:32<06:49, 136.37s/it]

Train loss: 0.06801466842149131


Epoch:  60%|██████    | 3/5 [06:49<04:33, 136.57s/it]

Train loss: 0.05254692953253951


Epoch:  80%|████████  | 4/5 [09:06<02:16, 136.70s/it]

Train loss: 0.04020711769562062


Epoch: 100%|██████████| 5/5 [11:23<00:00, 136.64s/it]

Train loss: 0.02990101350940146





## Save model 

In [51]:
bert_out_address = 'models/bert_out_model/en09'

In [52]:
# Make dir if not exits
if not os.path.exists(bert_out_address):
        os.makedirs(bert_out_address)

In [53]:
# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

In [54]:
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(bert_out_address, "pytorch_model.bin")
output_config_file = os.path.join(bert_out_address, "config.json")

In [55]:
# Save model into file
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(bert_out_address)

('models/bert_out_model/en09/vocab.txt',)

## Load model

In [56]:
model = BertForTokenClassification.from_pretrained(bert_out_address,num_labels=len(tag2idx))

In [57]:
# Set model to GPU
model.cuda();

In [58]:
if n_gpu >1:
    model = torch.nn.DataParallel(model)

## Eval model

In [59]:
# Evalue loop
model.eval();

In [60]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
y_true = []
y_pred = []

print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch
    
#     if step > 2:
#         break
    
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None,
        attention_mask=input_mask,)
        # For eval mode, the first result of outputs is logits
        logits = outputs[0] 
    
    # Get NER predict result
    logits = torch.argmax(F.log_softmax(logits,dim=2),dim=2)
    logits = logits.detach().cpu().numpy()
    
    
    # Get NER true result
    label_ids = label_ids.to('cpu').numpy()
    
    
    # Only predict the real word, mark=0, will not calculate
    input_mask = input_mask.to('cpu').numpy()
    
    # Compare the valuable predict result
    for i,mask in enumerate(input_mask):
        # Real one
        temp_1 = []
        # Predict one
        temp_2 = []
        
        for j, m in enumerate(mask):
            # Mark=0, meaning its a pad word, dont compare
            if m:
                if tag2name[label_ids[i][j]] != "X" and tag2name[label_ids[i][j]] != "[CLS]" and tag2name[label_ids[i][j]] != "[SEP]" : # Exclude the X label
                    temp_1.append(tag2name[label_ids[i][j]])
                    temp_2.append(tag2name[logits[i][j]])
            else:
                break
        
            
        y_true.append(temp_1)
        y_pred.append(temp_2)

        

print("f1 socre: %f"%(f1_score(y_true, y_pred)))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))

# Get acc , recall, F1 result report
report = classification_report(y_true, y_pred,digits=4)

# Save the report into file
output_eval_file = os.path.join(bert_out_address, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    print("\n%s"%(report))
    print("f1 socre: %f"%(f1_score(y_true, y_pred)))
    print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))
    
    writer.write("f1 socre:\n")
    writer.write(str(f1_score(y_true, y_pred)))
    writer.write("\n\nAccuracy score:\n")
    writer.write(str(accuracy_score(y_true, y_pred)))
    writer.write("\n\n")  
    writer.write(report)

***** Running evaluation *****
  Num examples =14388
  Batch size = 32
f1 socre: 0.833776
Accuracy score: 0.971076
***** Eval results *****

           precision    recall  f1-score   support

      org     0.7315    0.6889    0.7096      5954
      geo     0.8434    0.9010    0.8712     11066
      gpe     0.9709    0.9327    0.9514      4830
      per     0.7529    0.7947    0.7732      5123
      tim     0.8728    0.8659    0.8693      6016
      eve     0.3854    0.4066    0.3957        91
      art     0.2258    0.1069    0.1451       131
      nat     0.3030    0.4082    0.3478        49

micro avg     0.8279    0.8397    0.8338     33260
macro avg     0.8288    0.8397    0.8336     33260

f1 socre: 0.833776
Accuracy score: 0.971076
