In [1]:
import os

# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
# os.environ["CUDA_VISIBLE_DEVICES"]="2,1"  

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import pandas as pd
import math
import numpy as np
from seqeval.metrics import classification_report,accuracy_score,f1_score
import torch.nn.functional as F

import torch
from tqdm import tqdm, trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer, BertConfig, BertForTokenClassification

## Load data

In [4]:
data_path_train = "../processed/merged/train.tsv" 
train_data = pd.read_csv(data_path_train, sep="\t").astype(str)

data_path_dev= "../processed/merged/dev.tsv" 
dev_data = pd.read_csv(data_path_dev, sep="\t").astype(str)

data_path_test= "../processed/merged/test.tsv" 
test_data = pd.read_csv(data_path_test, sep="\t").astype(str)

## Parser data

In [5]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["word"].values.tolist(),
                                                        #    s["POS"].values.tolist(),
                                                           s["tag"].values.tolist())]
        self.grouped = self.data.groupby("sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    # def get_next(self):
    #     try:
    #         s = self.grouped["sentence: {}".format(self.n_sent)]
    #         self.n_sent += 1
    #         return s
    #     except:
    #         return None

In [6]:
# Get full document data struce
train_getter = SentenceGetter(train_data)
# Get sentence data
train_sentences = [[s[0] for s in sent] for sent in train_getter.sentences]
train_labels = [[s[1] for s in sent] for sent in train_getter.sentences]

In [7]:
# Get full document data struce
dev_getter = SentenceGetter(dev_data)
# Get sentence data
dev_sentences = [[s[0] for s in sent] for sent in dev_getter.sentences]
dev_labels = [[s[1] for s in sent] for sent in dev_getter.sentences]

In [8]:
# Get full document data struce
test_getter = SentenceGetter(test_data)
# Get sentence data
test_sentences = [[s[0] for s in sent] for sent in test_getter.sentences]
test_labels = [[s[1] for s in sent] for sent in test_getter.sentences]

In [9]:
train_labels[125]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-problem',
 'I-problem']

In [10]:
# Set a dict for mapping id to tag name
#tag2idx = {t: i for i, t in enumerate(tags_vals)}

# Recommend to set it by manual define, good for reusing
tag2idx={'B-problem': 0,
 'B-test': 1,
 'B-treatment': 2,
 'I-problem': 3,
 'I-test': 4,
 'I-treatment': 5,
 'O': 6,
 'X': 7,
 '[CLS]': 8,
 '[SEP]': 9
 }
# Mapping index to name
tag2name = {tag2idx[key]: key for key in tag2idx}

In [11]:
tag2idx

{'B-problem': 0,
 'B-test': 1,
 'B-treatment': 2,
 'I-problem': 3,
 'I-test': 4,
 'I-treatment': 5,
 'O': 6,
 'X': 7,
 '[CLS]': 8,
 '[SEP]': 9}

In [13]:
tag2name

{0: 'B-problem',
 1: 'B-test',
 2: 'B-treatment',
 3: 'I-problem',
 4: 'I-test',
 5: 'I-treatment',
 6: 'O',
 7: 'X',
 8: '[CLS]',
 9: '[SEP]'}

## Make training data

Make raw data into trainable data for BERT, including:

- Set gpu environment
- Load tokenizer and tokenize
- Set 3 embedding, token embedding, mask word embedding, segmentation embedding
- Split data set into train and validate, then send them to dataloader

**Set up gpu environment**

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

### Load tokenizer

You can download the tokenizer file into local folder first :
- [vocab.txt](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt)

In [15]:
# # Manual define vocabulary address, if you download the tokenzier file in local
# # vocab.txt, download from: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt
# vocabulary = "Bio_ClinicalBERT/vocab.txt"
# # load tokenizer, with manual file address or pretrained address
# tokenizer=BertTokenizer(vocab_file=vocabulary, do_lower_case=False)


In [16]:
# Len of the sentence must be not bigger than the training model
# See model's 'max_position_embeddings' = 512
max_len  = 128

In [17]:
tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

**Tokenizer text**

- In hunggieface for bert, when come across OOV, will word piece the word
- We need to adjust the labels base on the tokenize result, “##abc” need to set label "X" 
- Need to set "[CLS]" at front and "[SEP]" at the end, as what the paper do, [BERT indexer should add [CLS] and [SEP] tokens](https://github.com/allenai/allennlp/issues/2141)


In [18]:
def process_data(sentences, labels):
    tokenized_texts = []
    word_piece_labels = []
    i_inc = 0
    for word_list,label in (zip(sentences,labels)):
        temp_lable = []
        temp_token = []
        
        # # Add [CLS] at the front 
        # temp_lable.append('[CLS]')
        # temp_token.append('[CLS]')
        
        for word, lab in zip(word_list,label):
            token_list = tokenizer.tokenize(word)
            for m,token in enumerate(token_list):
                temp_token.append(token)
                if m==0:
                    temp_lable.append(lab)
                else:
                    temp_lable.append('X')  
                    
        # # Add [SEP] at the end
        # temp_lable.append('[SEP]')
        # temp_token.append('[SEP]')
        
        tokenized_texts.append(temp_token)
        word_piece_labels.append(temp_lable)
    
    id_list = []
    target_list = [] 
    attention_mask_list = []   
        
    # PADING        
    for text, label in zip(tokenized_texts, word_piece_labels):
        
        # Add [CLS] and [SEP], 
        # Truncate seq if it is too long
        text = ['[CLS]'] + text[:max_len-2] + ['[SEP]']
        label = ['[CLS]'] + label[:max_len-2] + ['[SEP]']
        
        # convert to ids
        ids = tokenizer.convert_tokens_to_ids(text)
        target_tag =[tag2idx.get(t) for t in label]
        
        # padding 
        # Label [PAD] with O (other)
        padding_len = max_len - len(ids)
        ids = ids + [0] * padding_len
        target_tag = target_tag + [tag2idx['O']] * padding_len
        
        # create masks
        attention_masks = [int(i>0) for i in ids]
        
        id_list.append(ids)
        target_list.append(target_tag)
        attention_mask_list.append(attention_masks)
    
    return id_list, target_list, attention_mask_list

    
    
    

In [19]:
train_input_ids, train_input_tags, train_attention_masks = process_data(train_sentences, train_labels)
dev_input_ids, dev_input_tags, dev_attention_masks = process_data(dev_sentences, dev_labels)
test_input_ids, test_input_tags, test_attention_masks = process_data(test_sentences, test_labels)

In [20]:
train_inputs = torch.tensor(train_input_ids)
dev_inputs = torch.tensor(dev_input_ids)
test_inputs = torch.tensor(test_input_ids)
train_tags = torch.tensor(train_input_tags)
dev_tags = torch.tensor(dev_input_tags)
test_tags = torch.tensor(test_input_tags)
train_masks = torch.tensor(train_attention_masks)
dev_masks = torch.tensor(dev_attention_masks)
test_masks = torch.tensor(test_attention_masks)

**Put data into data loader**

In [21]:
# Set batch num
batch_num = 32

In [22]:
# Only set token embedding, attention embedding, no segment embedding
train_dataset = TensorDataset(train_inputs, train_masks, train_tags)
train_sampler = RandomSampler(train_dataset)
# Drop last can make batch training better for the last one
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_num, drop_last=True)

dev_dataset = TensorDataset(dev_inputs, dev_masks, dev_tags)
dev_sampler = SequentialSampler(dev_dataset)
dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=batch_num)

test_data = TensorDataset(test_inputs, test_masks, test_tags)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_num)

## Train model

**Load BERT model**

In [23]:
# In this folder, contain model confg(json) and model weight(bin) files
# pytorch_model.bin, download from: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin
# config.json, downlaod from: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json
model_file_address = 'emilyalsentzer/Bio_ClinicalBERT'

In [24]:
# Will load config and weight with from_pretrained()
model = BertForTokenClassification.from_pretrained(model_file_address, num_labels=len(tag2idx))

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint 

In [25]:
# Set model to GPU,if you are using GPU machine
model.cuda();

In [26]:
# Add multi GPU support
if n_gpu >1:
    model = torch.nn.DataParallel(model)

In [27]:
# Set epoch and grad max num
epochs = 10
max_grad_norm = 1.0

In [28]:
# Cacluate train optimiazaion num
num_train_optimization_steps = int(math.ceil(len(train_inputs) / batch_num) / 1) * epochs

### Set fine tuning method

**Manual optimizer**

In [29]:
# True: fine tuning all the layers 
# False: only fine tuning the classifier layers
FULL_FINETUNING = True

In [30]:
if FULL_FINETUNING:
    # Fine tune model all layer parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    # Only fine tune classifier parameters
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
    
optimizer = Adam(optimizer_grouped_parameters, lr=5e-5)

### Fine-tuning model

In [31]:
# TRAIN loop
model.train()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [32]:
print("***** Running training *****")
print("  Num examples = %d"%(len(train_inputs)))
print("  Batch size = %d"%(batch_num))
print("  Num steps = %d"%(num_train_optimization_steps))
for _ in trange(epochs,desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for batch in train_dataloader:
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # forward pass
        outputs = model(b_input_ids, token_type_ids=None,
        attention_mask=b_input_mask, labels=b_labels)
        loss, scores = outputs[:2]
        if n_gpu>1:
            # When multi gpu, average it
            loss = loss.mean()

        # backward pass
        loss.backward()

        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)

        # update parameters
        optimizer.step()
        optimizer.zero_grad()

    # print train loss per epoch
    print(f"Train loss: {tr_loss / nb_tr_steps}")
        

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 12790
  Batch size = 32
  Num steps = 4000


Epoch:  10%|█         | 1/10 [00:52<07:50, 52.24s/it]

Train loss: 0.05536287849626614


Epoch:  20%|██        | 2/10 [01:45<06:59, 52.41s/it]

Train loss: 0.008693494840862374


Epoch:  30%|███       | 3/10 [02:38<06:08, 52.64s/it]

Train loss: 0.005215539501566804


Epoch:  40%|████      | 4/10 [03:31<05:16, 52.81s/it]

Train loss: 0.0035362142988102497


Epoch:  50%|█████     | 5/10 [04:25<04:25, 53.08s/it]

Train loss: 0.00272121988088832


Epoch:  60%|██████    | 6/10 [05:18<03:32, 53.18s/it]

Train loss: 0.0021927864580353336


Epoch:  70%|███████   | 7/10 [06:12<02:39, 53.29s/it]

Train loss: 0.00206140744129536


Epoch:  80%|████████  | 8/10 [07:05<01:46, 53.34s/it]

Train loss: 0.0016493910344733032


Epoch:  90%|█████████ | 9/10 [07:59<00:53, 53.38s/it]

Train loss: 0.0015490532072941815


Epoch: 100%|██████████| 10/10 [08:52<00:00, 53.27s/it]

Train loss: 0.0014289507929159545





## Save model 

In [33]:
bert_out_address = '../trained_models/NER/C-Bert-test'

In [34]:
# Make dir if not exits
if not os.path.exists(bert_out_address):
        os.makedirs(bert_out_address)

In [35]:
# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

In [36]:
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(bert_out_address, "pytorch_model.bin")
output_config_file = os.path.join(bert_out_address, "config.json")

In [37]:
# Save model into file
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(bert_out_address)

('./trained_models/C-Bert-test/vocab.txt',)

## Load model

In [38]:
model = BertForTokenClassification.from_pretrained(bert_out_address, num_labels=len(tag2idx))

In [39]:
# Set model to GPU
model.cuda();

In [40]:
if n_gpu >1:
    model = torch.nn.DataParallel(model)

## Eval model

In [41]:
# Evalue loop
model.eval();

In [42]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
y_true = []
y_pred = []

print("***** Running evaluation *****")
print(f"  Num examples = {len(test_inputs)}")
print(f"  Batch size = {batch_num}")
for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch

    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None,
        attention_mask=input_mask,)
        # For eval mode, the first result of outputs is logits
        logits = outputs[0] 

    # Get NER predict result
    logits = torch.argmax(F.log_softmax(logits, dim=2),dim=2)
    logits = logits.detach().cpu().numpy()
    # Get NER true result
    label_ids = label_ids.to('cpu').numpy()
    # Only predict the real word, mark=0, will not calculate
    input_mask = input_mask.to('cpu').numpy()
    # Compare the valuable predict result
    for i,mask in enumerate(input_mask):
        # Real one
        temp_1 = []
        # Predict one
        temp_2 = []

        for j, m in enumerate(mask):
            # Mark=0, meaning its a pad word, dont compare
            if m:
                if tag2name[label_ids[i][j]] not in ["X", "[CLS]", "[SEP]"]: # Exclude the X label
                    # print(tag2name[logits[i][j]])
                    temp_1.append(tag2name[label_ids[i][j]])
                    temp_2.append(tag2name[logits[i][j]])
            else:
                break

        y_true.append(temp_1)
        y_pred.append(temp_2)


print("f1 socre: %f"%(f1_score(y_true, y_pred)))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))



***** Running evaluation *****
  Num examples = 27625
  Batch size = 32
f1 socre: 0.897269
Accuracy score: 0.964981




In [1]:
classification_report(y_true, y_pred, digits=4)

NameError: name 'classification_report' is not defined

In [43]:
# Get acc , recall, F1 result report
report = classification_report(y_true, y_pred, digits=4)

# Save the report into file
output_eval_file = os.path.join(bert_out_address, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    print("\n%s"%(report))
    print("f1 socre: %f"%(f1_score(y_true, y_pred)))
    print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))
    
    writer.write("f1 socre:\n")
    writer.write(str(f1_score(y_true, y_pred)))
    writer.write("\n\nAccuracy score:\n")
    writer.write(str(accuracy_score(y_true, y_pred)))
    writer.write("\n\n")  
    writer.write(report)

***** Eval results *****

              precision    recall  f1-score   support

     problem     0.9119    0.9477    0.9295       153
        test     0.8298    0.8966    0.8619        87
   treatment     0.9389    0.8601    0.8978       143

   micro avg     0.9010    0.9034    0.9022       383
   macro avg     0.8936    0.9015    0.8964       383
weighted avg     0.9034    0.9034    0.9023       383

f1 socre: 0.902216
Accuracy score: 0.967076
