In [None]:
import os

# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
# os.environ["CUDA_VISIBLE_DEVICES"]="2,1"  

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import pandas as pd
import math
import numpy as np
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report,accuracy_score,f1_score
import torch.nn.functional as F

import torch
from tqdm import tqdm, trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW

Using TensorFlow backend.


This notebook work with env:

- Keras                2.3.1                 
- torch                1.1.0                 
- transformers         2.5.1      

# Introduction

In this notebook, will introduce how to do NER with BERT, including:

- Load and preprocess data
- Parser data
- Make training data
- Train model
- Evaluate result
- **Inference**

Tips:

- Update to transformer==2.5.1
- When come across OOV,you will find that BERT word piece tokenize method can help a lot
- Case model will be litter better than uncase model for English

**Also this notebook come with a post [NER with BERT in Action](https://medium.com/@yingbiao/ner-with-bert-in-action-936ff275bc73)**<br>
**Feel free to check it, hope that it could help you.**

## Load data

**Load CSV data**

In [4]:
data_path_train = "./processed/merged/train.tsv" 
df_data = pd.read_csv(data_path_train, sep="\t").astype(str)

In [5]:
df_data.head(n=20)

Unnamed: 0.1,Unnamed: 0,sentence #,word,tag
0,0,sentence# 0,245317863,O
1,0,sentence# 0,WGH,O
2,1,sentence# 1,9156109,O
3,2,sentence# 2,780576,O
4,3,sentence# 3,0720567,O
5,4,sentence# 4,04/20/2005,O
6,4,sentence# 4,12:00:00,O
7,4,sentence# 4,AM,O
8,5,sentence# 5,ED,O
9,5,sentence# 5,Discharge,O


**Have a look POS cat**

**Have a look TAG cat**

In [6]:
df_data['tag'].unique()

array(['O', 'B-problem', 'I-problem', 'B-test', 'I-test', 'B-treatment',
       'I-treatment'], dtype=object)

In [7]:
# Analyse the Tag distribution
df_data['tag'].value_counts()

O              103039
I-problem        9201
B-problem        6308
B-treatment      4359
B-test           4160
I-treatment      3696
I-test           3491
Name: tag, dtype: int64

## Parser data

**Parser data into document structure**

In [8]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["word"].values.tolist(),
                                                        #    s["POS"].values.tolist(),
                                                           s["tag"].values.tolist())]
        self.grouped = self.data.groupby("sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [9]:
# Get full document data struce
getter = SentenceGetter(df_data)

In [10]:
getter.sentences

[[('245317863', 'O'), ('WGH', 'O')],
 [('9156109', 'O')],
 [('ED', 'O'),
  ('DISCHARGE', 'O'),
  ('NOTIFICATION', 'O'),
  ('/', 'O'),
  ('SUMMARY', 'O')],
 [('She', 'O'),
  ('reports', 'O'),
  ('that', 'O'),
  ('it', 'O'),
  ('had', 'O'),
  ('essentially', 'O'),
  ('started', 'O'),
  ('earlier', 'O'),
  ('in', 'O'),
  ('the', 'O'),
  ('day', 'O'),
  ('with', 'O'),
  ('some', 'B-problem'),
  ('GERD-like', 'I-problem'),
  ('symptoms', 'I-problem'),
  ('.', 'O')],
 [('9.', 'O'),
  ('Lopressor', 'B-treatment'),
  ('50', 'O'),
  ('mg', 'O'),
  ('PO', 'O'),
  ('b.i.d.', 'O'),
  (',', 'O')],
 [('were', 'O'),
  ('clear', 'O'),
  ('to', 'O'),
  ('auscultation', 'B-test'),
  ('.', 'O')],
 [('HEART', 'O'), (':', 'O')],
 [('had', 'O'),
  ('a', 'O'),
  ('regular', 'O'),
  ('rate', 'O'),
  ('and', 'O'),
  ('rhythm', 'O'),
  ('with', 'O'),
  ('a', 'B-problem'),
  ('II', 'I-problem'),
  ('/', 'I-problem'),
  ('VI', 'I-problem'),
  ('systolic', 'I-problem'),
  ('murmur', 'I-problem'),
  ('at', 'I-probl

In [11]:
# Get sentence data
sentences = [[s[0] for s in sent] for sent in getter.sentences]
sentences[102]

['Normal', 'Delivering', 'Resident', '/', 'Staff', ':']

In [12]:
# Get tag labels data
labels = [[s[1] for s in sent] for sent in getter.sentences]
print(labels[102])

['O', 'O', 'O', 'O', 'O', 'O']


**Make TAG name into index for training**

In [13]:
tags_vals = list(set(df_data["tag"].values))

In [14]:
# Add X  label for word piece support
# Add [CLS] and [SEP] as BERT need
tags_vals.append('X')
tags_vals.append('[CLS]')
tags_vals.append('[SEP]')

In [15]:
tags_vals = set(tags_vals)

In [16]:
tags_vals

{'B-problem',
 'B-test',
 'B-treatment',
 'I-problem',
 'I-test',
 'I-treatment',
 'O',
 'X',
 '[CLS]',
 '[SEP]'}

In [17]:
# Set a dict for mapping id to tag name
#tag2idx = {t: i for i, t in enumerate(tags_vals)}

# Recommend to set it by manual define, good for reusing
tag2idx={'B-problem': 0,
 'B-test': 1,
 'B-treatment': 2,
 'I-problem': 3,
 'I-test': 4,
 'I-treatment': 5,
 'O': 6,
 'X': 7,
 '[CLS]': 8,
 '[SEP]': 9
 }

In [18]:
tag2idx

{'B-problem': 0,
 'B-test': 1,
 'B-treatment': 2,
 'I-problem': 3,
 'I-test': 4,
 'I-treatment': 5,
 'O': 6,
 'X': 7,
 '[CLS]': 8,
 '[SEP]': 9}

In [19]:
# Mapping index to name
tag2name={tag2idx[key] : key for key in tag2idx.keys()}

In [20]:
tag2name

{0: 'B-problem',
 1: 'B-test',
 2: 'B-treatment',
 3: 'I-problem',
 4: 'I-test',
 5: 'I-treatment',
 6: 'O',
 7: 'X',
 8: '[CLS]',
 9: '[SEP]'}

## Make training data

Make raw data into trainable data for BERT, including:

- Set gpu environment
- Load tokenizer and tokenize
- Set 3 embedding, token embedding, mask word embedding, segmentation embedding
- Split data set into train and validate, then send them to dataloader

**Set up gpu environment**

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [22]:
n_gpu

3

### Load tokenizer

You can download the tokenizer file into local folder first :
- [vocab.txt](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt)

In [23]:
# Manual define vocabulary address, if you download the tokenzier file in local
# vocab.txt, download from: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt
vocabulary = "/users/home/jwu51/Programs/Projects/NER/Bio_ClinicalBERT/vocab.txt"


In [24]:
# Len of the sentence must be not bigger than the training model
# See model's 'max_position_embeddings' = 512
max_len  = 128

In [25]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [26]:
# load tokenizer, with manual file address or pretrained address
tokenizer=BertTokenizer(vocab_file=vocabulary, do_lower_case=False)

**Tokenizer text**

- In hunggieface for bert, when come across OOV, will word piece the word
- We need to adjust the labels base on the tokenize result, “##abc” need to set label "X" 
- Need to set "[CLS]" at front and "[SEP]" at the end, as what the paper do, [BERT indexer should add [CLS] and [SEP] tokens](https://github.com/allenai/allennlp/issues/2141)


In [27]:
tokenized_texts = []
word_piece_labels = []
i_inc = 0
for word_list,label in (zip(sentences,labels)):
    temp_lable = []
    temp_token = []
    
    # Add [CLS] at the front 
    temp_lable.append('[CLS]')
    temp_token.append('[CLS]')
    
    for word,lab in zip(word_list,label):
        token_list = tokenizer.tokenize(word)
        for m,token in enumerate(token_list):
            temp_token.append(token)
            if m==0:
                temp_lable.append(lab)
            else:
                temp_lable.append('X')  
                
    # Add [SEP] at the end
    temp_lable.append('[SEP]')
    temp_token.append('[SEP]')
    
    tokenized_texts.append(temp_token)
    word_piece_labels.append(temp_lable)
    
    if 5 > i_inc:
        print("No.%d,len:%d"%(i_inc,len(temp_token)))
        print("texts:%s"%(" ".join(temp_token)))
        print("No.%d,len:%d"%(i_inc,len(temp_lable)))
        print("lables:%s"%(" ".join(temp_lable)))
    i_inc +=1
    
    
    

No.0,len:10
texts:[CLS] 245 ##31 ##7 ##86 ##3 W ##G ##H [SEP]
No.0,len:10
lables:[CLS] O X X X X O X X [SEP]
No.1,len:7
texts:[CLS] 91 ##5 ##6 ##10 ##9 [SEP]
No.1,len:7
lables:[CLS] O X X X X [SEP]
No.2,len:21
texts:[CLS] E ##D D ##IS ##CH ##AR ##GE NO ##TI ##FI ##CA ##TI ##ON / S ##UM ##MA ##R ##Y [SEP]
No.2,len:21
lables:[CLS] O X O X X X X O X X X X X O O X X X X [SEP]
No.3,len:21
texts:[CLS] She reports that it had essentially started earlier in the day with some GE ##RD - like symptoms . [SEP]
No.3,len:21
lables:[CLS] O O O O O O O O O O O O B-problem I-problem X X X I-problem O [SEP]
No.4,len:18
texts:[CLS] 9 . Lo ##press ##or 50 mg P ##O b . i . d . , [SEP]
No.4,len:18
lables:[CLS] O X B-treatment X X O O O X O X X X X X O [SEP]


### Set token embedding

Pad or trim the text and label to fit the need for max len

In [28]:
# Make text token into id
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=max_len, dtype="long", truncating="post", padding="post")
print(input_ids[0])

[  101 23067 22639  1559 22392  1495   160  2349  3048   102     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]


In [29]:
print(input_ids[3])

[  101  1153  3756  1115  1122  1125  7588  1408  2206  1107  1103  1285
  1114  1199 25075 23354   118  1176  8006   119   102     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]


In [30]:
# Make label into id, pad with "O" meaning others
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in word_piece_labels],
                     maxlen=max_len, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")
print(tags[3])

[8 6 6 6 6 6 6 6 6 6 6 6 6 0 3 7 7 7 3 6 9 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6]


### Set mask word embedding

In [31]:
# For fine tune of predict, with token mask is 1,pad token is 0
attention_masks = [[int(i>0) for i in ii] for ii in input_ids]
attention_masks[0]

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

### Set segment embedding(Seem like for sequance tagging task, it's not necessary to make this embedding)

In [32]:
# Since only one sentence, all the segment set to 0
segment_ids = [[0] * len(input_id) for input_id in input_ids]
segment_ids[0];

## Split data into train and validate

70% for training, 30% for validation

**Split all data**

In [33]:
tr_inputs, val_inputs, tr_tags, val_tags,tr_masks, val_masks,tr_segs, val_segs = train_test_split(input_ids, tags, attention_masks, segment_ids, 
                                                            random_state=4, test_size=0.2)

In [34]:
len(tr_inputs),len(val_inputs),len(tr_segs),len(val_segs)

(11620, 2906, 11620, 2906)

**Set data into tensor**

Not recommend tensor.to(device) at this process, since it will run out of GPU memory

In [35]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
tr_segs = torch.tensor(tr_segs)
val_segs = torch.tensor(val_segs)

**Put data into data loader**

In [36]:
# Set batch num
batch_num = 32

In [37]:
# Only set token embedding, attention embedding, no segment embedding
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
# Drop last can make batch training better for the last one
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_num,drop_last=True)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_num)

## Train model

- You can download the model require files into local folder first
- pytorch_model.bin: [pytorch_model.bin](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin)
- config.json: [config.json](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json)    

**Load BERT model**

In [38]:
# In this folder, contain model confg(json) and model weight(bin) files
# pytorch_model.bin, download from: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin
# config.json, downlaod from: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json
model_file_address = 'emilyalsentzer/Bio_ClinicalBERT'

In [39]:
# Will load config and weight with from_pretrained()
model = BertForTokenClassification.from_pretrained(model_file_address, num_labels=len(tag2idx))

In [40]:
# Set model to GPU,if you are using GPU machine
model.cuda();

In [41]:
# Add multi GPU support
if n_gpu >1:
    model = torch.nn.DataParallel(model)

    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


In [42]:
# Set epoch and grad max num
epochs = 5
max_grad_norm = 1.0

In [43]:
# Cacluate train optimiazaion num
num_train_optimization_steps = int(math.ceil(len(tr_inputs) / batch_num) / 1) * epochs

### Set fine tuning method

**Manual optimizer**

In [44]:
# True: fine tuning all the layers 
# False: only fine tuning the classifier layers
FULL_FINETUNING = True

In [45]:
if FULL_FINETUNING:
    # Fine tune model all layer parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    # Only fine tune classifier parameters
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
    
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)

### Fine-tuning model

In [45]:
# TRAIN loop
model.train();

In [47]:
print("***** Running training *****")
print("  Num examples = %d"%(len(tr_inputs)))
print("  Batch size = %d"%(batch_num))
print("  Num steps = %d"%(num_train_optimization_steps))
for _ in trange(epochs,desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # print(b_input_ids.shape)
        # print(b_input_mask.shape)
        # print(b_labels.shape)
        # print('===')
        # break
        # forward pass
        outputs = model(b_input_ids, token_type_ids=None,
        attention_mask=b_input_mask, labels=b_labels)
        loss, scores = outputs[:2]
        if n_gpu>1:
            # When multi gpu, average it
            loss = loss.mean()
        
        # backward pass
        loss.backward()
        
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        
        # update parameters
        optimizer.step()
        optimizer.zero_grad()
        
    # print train loss per epoch
    # print("Train loss: {}".format(tr_loss/nb_tr_steps))
        

***** Running training *****
  Num examples = 11620
  Batch size = 32
  Num steps = 1820




## Save model 

In [47]:
bert_out_address = 'models/C-Bert-01'

In [48]:
# Make dir if not exits
if not os.path.exists(bert_out_address):
        os.makedirs(bert_out_address)

In [49]:
# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

In [50]:
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(bert_out_address, "pytorch_model.bin")
output_config_file = os.path.join(bert_out_address, "config.json")

In [51]:
# Save model into file
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(bert_out_address)

('models/C-Bert-01/vocab.txt',)

## Load model

In [52]:
model = BertForTokenClassification.from_pretrained(bert_out_address, num_labels=len(tag2idx))

In [53]:
# Set model to GPU
model.cuda();

In [54]:
if n_gpu >1:
    model = torch.nn.DataParallel(model)

## Eval model

In [55]:
# Evalue loop
model.eval();

In [56]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
y_true = []
y_pred = []

print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch
    
#     if step > 2:
#         break
    
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None,
        attention_mask=input_mask,)
        # For eval mode, the first result of outputs is logits
        logits = outputs[0] 
    
    # Get NER predict result
    logits = torch.argmax(F.log_softmax(logits,dim=2),dim=2)
    logits = logits.detach().cpu().numpy()
    
    
    # Get NER true result
    label_ids = label_ids.to('cpu').numpy()
    
    
    # Only predict the real word, mark=0, will not calculate
    input_mask = input_mask.to('cpu').numpy()
    
    # Compare the valuable predict result
    for i,mask in enumerate(input_mask):
        # Real one
        temp_1 = []
        # Predict one
        temp_2 = []
        
        for j, m in enumerate(mask):
            # Mark=0, meaning its a pad word, dont compare
            if m:
                if tag2name[label_ids[i][j]] != "X" and tag2name[label_ids[i][j]] != "[CLS]" and tag2name[label_ids[i][j]] != "[SEP]" : # Exclude the X label
                    temp_1.append(tag2name[label_ids[i][j]])
                    temp_2.append(tag2name[logits[i][j]])
            else:
                break
        
            
        y_true.append(temp_1)
        y_pred.append(temp_2)

        

print("f1 socre: %f"%(f1_score(y_true, y_pred)))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))

# Get acc , recall, F1 result report
report = classification_report(y_true, y_pred,digits=4)

# Save the report into file
output_eval_file = os.path.join(bert_out_address, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    print("\n%s"%(report))
    print("f1 socre: %f"%(f1_score(y_true, y_pred)))
    print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))
    
    writer.write("f1 socre:\n")
    writer.write(str(f1_score(y_true, y_pred)))
    writer.write("\n\nAccuracy score:\n")
    writer.write(str(accuracy_score(y_true, y_pred)))
    writer.write("\n\n")  
    writer.write(report)

***** Running evaluation *****
  Num examples =2948
  Batch size = 32




f1 socre: 0.848935
Accuracy score: 0.953714


  _warn_prf(average, modifier, msg_start, len(result))


***** Eval results *****

              precision    recall  f1-score   support

           _     0.0000    0.0000    0.0000         0
     problem     0.8039    0.8867    0.8432      1271
        test     0.8860    0.8773    0.8816       815
   treatment     0.8191    0.8637    0.8408       844

   micro avg     0.8222    0.8775    0.8489      2930
   macro avg     0.6272    0.6569    0.6414      2930
weighted avg     0.8311    0.8775    0.8532      2930

f1 socre: 0.848935
Accuracy score: 0.953714


## Inference

After we trained a model, we can make it into service ---- sending a new sentence to the model,then get the prediction.

The process contains:

- Load model
- Load tokenizer
- Set test query
- Make query into embedding
- Predict with model
- Parser result

### Set tag index

In [5]:
# Tag to index, must be the same as we training
tag2idx={'B-art': 14,
 'B-eve': 16,
 'B-geo': 0,
 'B-gpe': 13,
 'B-nat': 12,
 'B-org': 10,
 'B-per': 4,
 'B-tim': 2,
 'I-art': 5,
 'I-eve': 7,
 'I-geo': 15,
 'I-gpe': 8,
 'I-nat': 11,
 'I-org': 3,
 'I-per': 6,
 'I-tim': 1,
 'X':17,
 'O': 9,
 '[CLS]':18,
 '[SEP]':19}

In [6]:
# Mapping index to name
tag2name={tag2idx[key] : key for key in tag2idx.keys()}

### Load model

In [7]:
# Model we trained before, the dir containing pytorch_model.bin and vocab.txt
save_model_address = 'models/bert_out_model/en09'

In [8]:
save_model = BertForTokenClassification.from_pretrained(save_model_address,num_labels=len(tag2idx))

I0507 16:25:01.311969 139888124225280 configuration_utils.py:254] loading configuration file models/bert_out_model/en09/config.json
I0507 16:25:01.313669 139888124225280 configuration_utils.py:292] Model config BertConfig {
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "do_sample": false,
  "eos_token_ids": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
 

### Load Tokenizer

In [9]:
# Here, our save model address containing pytorch_model.bin and vocab.txt
# So, we can load the tokenzier from the same dir as the save model address
tokenizer = BertTokenizer.from_pretrained(save_model_address,do_lower_case=False)

I0507 16:25:05.673797 139888124225280 tokenization_utils.py:417] Model name 'models/bert_out_model/en09' not found in model shortcut name list (bert-base-chinese, bert-base-finnish-cased-v1, bert-base-german-dbmdz-uncased, bert-large-cased-whole-word-masking-finetuned-squad, bert-large-cased, bert-base-uncased, bert-large-uncased-whole-word-masking, bert-base-multilingual-cased, bert-base-dutch-cased, bert-base-finnish-uncased-v1, bert-base-multilingual-uncased, bert-large-uncased-whole-word-masking-finetuned-squad, bert-base-german-dbmdz-cased, bert-base-cased, bert-large-uncased, bert-base-cased-finetuned-mrpc, bert-large-cased-whole-word-masking, bert-base-german-cased). Assuming 'models/bert_out_model/en09' is a path, a model identifier, or url to a directory containing tokenizer files.
I0507 16:25:05.675888 139888124225280 tokenization_utils.py:446] Didn't find file models/bert_out_model/en09/added_tokens.json. We won't load it.
I0507 16:25:05.677664 139888124225280 tokenization_u

In [10]:
# Set max sentence length, must be the same as our training process
max_len  = 45

### Set test query

In [11]:
test_query = "I live in USA, this is my IBM laptop."

### Make query into embeddings

- token id embedding, need to tokenize first
- mask word embedding
- segmentation embedding

In [12]:
tokenized_texts = []

In [13]:
temp_token = []

In [14]:
# Add [CLS] at the front 
temp_token.append('[CLS]')

In [15]:
token_list = tokenizer.tokenize(test_query)

In [16]:
token_list

['I', 'live', 'in', 'USA', ',', 'this', 'is', 'my', 'IBM', 'laptop', '.']

In [17]:
for m,token in enumerate(token_list):
    temp_token.append(token)

In [18]:
# Trim the token to fit the length requirement
if len(temp_token) > max_len-1:
    temp_token= temp_token[:max_len-1]

In [19]:
# Add [SEP] at the end
temp_token.append('[SEP]')

In [20]:
temp_token

['[CLS]',
 'I',
 'live',
 'in',
 'USA',
 ',',
 'this',
 'is',
 'my',
 'IBM',
 'laptop',
 '.',
 '[SEP]']

In [21]:
tokenized_texts.append(temp_token)

**Make id embedding**

In [22]:
# Make text token into id
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=max_len, dtype="long", truncating="post", padding="post")
print(input_ids[0])

[  101   146  1686  1107  3066   117  1142  1110  1139  9768 12574   119
   102     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0]


**Make mask embedding**

In [23]:
# For fine tune of predict, with token mask is 1,pad token is 0
attention_masks = [[int(i>0) for i in ii] for ii in input_ids]
attention_masks[0];

**Make segmention type embedding**

In [24]:
segment_ids = [[0] * len(input_id) for input_id in input_ids]
segment_ids[0];

**Make embeddings into torch tensor**

In [25]:
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
segment_ids = torch.tensor(segment_ids)

### Predict with model

We only send input_ids embedding to model

In [26]:
# Set save model to Evalue loop
save_model.eval();

In [27]:
# Get model predict result
with torch.no_grad():
        outputs = save_model(input_ids, token_type_ids=None,
        attention_mask=None,)
        # For eval mode, the first result of outputs is logits
        logits = outputs[0] 

In [28]:
# Make logits into numpy type predict result
# The predict result contain each token's all tags predict result
predict_results = logits.detach().cpu().numpy()

In [29]:
predict_results.shape

(1, 45, 20)

**Make each token predict result into softmax mode**

In [30]:
from scipy.special import softmax

In [31]:
result_arrays_soft = softmax(predict_results[0])

In [32]:
result_arrays_soft[0]

array([5.9468857e-07, 6.0080066e-07, 4.0490704e-07, 4.3075502e-07,
       5.3710983e-07, 8.1970501e-07, 4.4107060e-07, 6.8387163e-07,
       1.4901701e-06, 4.3826424e-07, 4.2638061e-07, 1.4972500e-06,
       1.3224795e-06, 3.3777670e-07, 2.8167085e-06, 4.8201440e-07,
       1.5525582e-06, 6.1996315e-07, 8.1188858e-01, 1.1874760e-06],
      dtype=float32)

In [33]:
result_array = result_arrays_soft

In [34]:
len(result_array),len(result_array[0])

(45, 20)

### Parser result

Make the predict array into readable result 

In [35]:
# Get each token final predict tag index result
result_list = np.argmax(result_array,axis=-1)

In [36]:
result_list

array([18,  9,  9,  9,  0,  9,  9,  9,  9, 10,  9,  9, 19,  9, 17, 17, 17,
       17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
       17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17])

**Get token predict tag**

In [37]:
for i, mark in enumerate(attention_masks[0]):
    if mark>0:
        print("Token:%s"%(temp_token[i]))
#         print("Tag:%s"%(result_list[i]))
        print("Predict_Tag:%s"%(tag2name[result_list[i]]))
        #print("Posibility:%f"%(result_array[i][result_list[i]]))
        print()

Token:[CLS]
Predict_Tag:[CLS]

Token:I
Predict_Tag:O

Token:live
Predict_Tag:O

Token:in
Predict_Tag:O

Token:USA
Predict_Tag:B-geo

Token:,
Predict_Tag:O

Token:this
Predict_Tag:O

Token:is
Predict_Tag:O

Token:my
Predict_Tag:O

Token:IBM
Predict_Tag:B-org

Token:laptop
Predict_Tag:O

Token:.
Predict_Tag:O

Token:[SEP]
Predict_Tag:[SEP]

