In [1]:
import os

# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
# os.environ["CUDA_VISIBLE_DEVICES"]="2,1"  

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import pandas as pd
import math
import numpy as np
from seqeval.metrics import classification_report,accuracy_score,f1_score, precision_score, recall_score
import torch.nn.functional as F

import torch
from tqdm import tqdm, trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import RobertaTokenizer, RobertaForTokenClassification, AdamW

  from .autonotebook import tqdm as notebook_tqdm


## Load data

In [4]:
data_path_train = "../Data/processed/merged/train.tsv" 
train_data = pd.read_csv(data_path_train, sep="\t").astype(str)

data_path_dev= "../Data/processed/merged/dev.tsv" 
dev_data = pd.read_csv(data_path_dev, sep="\t").astype(str)

data_path_test= "../Data/processed/merged/test.tsv" 
test_data = pd.read_csv(data_path_test, sep="\t").astype(str)

## Parser data

In [5]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["word"].values.tolist(),
                                                        #    s["POS"].values.tolist(),
                                                           s["tag"].values.tolist())]
        self.grouped = self.data.groupby("sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    # def get_next(self):
    #     try:
    #         s = self.grouped["sentence: {}".format(self.n_sent)]
    #         self.n_sent += 1
    #         return s
    #     except:
    #         return None

In [6]:
# Get full document data struce
train_getter = SentenceGetter(train_data)
# Get sentence data
train_sentences = [[s[0] for s in sent] for sent in train_getter.sentences]
train_labels = [[s[1] for s in sent] for sent in train_getter.sentences]

  self.grouped = self.data.groupby("sentence #").apply(agg_func)


In [7]:
# Get full document data struce
dev_getter = SentenceGetter(dev_data)
# Get sentence data
dev_sentences = [[s[0] for s in sent] for sent in dev_getter.sentences]
dev_labels = [[s[1] for s in sent] for sent in dev_getter.sentences]

  self.grouped = self.data.groupby("sentence #").apply(agg_func)


In [8]:
# Get full document data struce
test_getter = SentenceGetter(test_data)
# Get sentence data
test_sentences = [[s[0] for s in sent] for sent in test_getter.sentences]
test_labels = [[s[1] for s in sent] for sent in test_getter.sentences]

  self.grouped = self.data.groupby("sentence #").apply(agg_func)


In [9]:
train_labels[125]

['O', 'O']

In [10]:
# Set a dict for mapping id to tag name
#tag2idx = {t: i for i, t in enumerate(tags_vals)}

# Recommend to set it by manual define, good for reusing
tag2idx={'B-problem': 0,
 'B-test': 1,
 'B-treatment': 2,
 'I-problem': 3,
 'I-test': 4,
 'I-treatment': 5,
 'O': 6,
 'X': 7,
 '[CLS]': 8,
 '[SEP]': 9
 }
# Mapping index to name
tag2name = {tag2idx[key]: key for key in tag2idx}

In [11]:
tag2idx

{'B-problem': 0,
 'B-test': 1,
 'B-treatment': 2,
 'I-problem': 3,
 'I-test': 4,
 'I-treatment': 5,
 'O': 6,
 'X': 7,
 '[CLS]': 8,
 '[SEP]': 9}

In [12]:
tag2name

{0: 'B-problem',
 1: 'B-test',
 2: 'B-treatment',
 3: 'I-problem',
 4: 'I-test',
 5: 'I-treatment',
 6: 'O',
 7: 'X',
 8: '[CLS]',
 9: '[SEP]'}

## Make training data

Make raw data into trainable data for BERT, including:

- Set gpu environment
- Load tokenizer and tokenize
- Set 3 embedding, token embedding, mask word embedding, segmentation embedding
- Split data set into train and validate, then send them to dataloader

**Set up gpu environment**

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

print(device)

cuda


### Load tokenizer

You can download the tokenizer file into local folder first :
- [vocab.txt](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt)

In [14]:
# # Manual define vocabulary address, if you download the tokenzier file in local
# # vocab.txt, download from: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt
# vocabulary = "Bio_ClinicalBERT/vocab.txt"
# # load tokenizer, with manual file address or pretrained address
# tokenizer=BertTokenizer(vocab_file=vocabulary, do_lower_case=False)


In [15]:
# Len of the sentence must be not bigger than the training model
# See model's 'max_position_embeddings' = 512
max_len  = 128

In [16]:
model_name = "FacebookAI/roberta-large"  

In [17]:
tokenizer = RobertaTokenizer.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


**Tokenizer text**

- In hunggieface for bert, when come across OOV, will word piece the word
- We need to adjust the labels base on the tokenize result, “##abc” need to set label "X" 
- Need to set "[CLS]" at front and "[SEP]" at the end, as what the paper do, [BERT indexer should add [CLS] and [SEP] tokens](https://github.com/allenai/allennlp/issues/2141)


In [18]:
def process_data(sentences, labels):
    tokenized_texts = []
    word_piece_labels = []
    i_inc = 0
    for word_list,label in (zip(sentences,labels)):
        temp_lable = []
        temp_token = []
        
        # # Add [CLS] at the front 
        # temp_lable.append('[CLS]')
        # temp_token.append('[CLS]')
        
        for word, lab in zip(word_list,label):
            token_list = tokenizer.tokenize(word)
            for m,token in enumerate(token_list):
                temp_token.append(token)
                if m==0:
                    temp_lable.append(lab)
                else:
                    temp_lable.append('X')  
                    
        # # Add [SEP] at the end
        # temp_lable.append('[SEP]')
        # temp_token.append('[SEP]')
        
        tokenized_texts.append(temp_token)
        word_piece_labels.append(temp_lable)
    
    id_list = []
    target_list = [] 
    attention_mask_list = []   
        
    # PADING        
    for text, label in zip(tokenized_texts, word_piece_labels):
        
        # Add [CLS] and [SEP], 
        # Truncate seq if it is too long
        text = ['[CLS]'] + text[:max_len-2] + ['[SEP]']
        label = ['[CLS]'] + label[:max_len-2] + ['[SEP]']
        
        # convert to ids
        ids = tokenizer.convert_tokens_to_ids(text)
        target_tag =[tag2idx.get(t) for t in label]
        
        # padding 
        # Label [PAD] with O (other)
        padding_len = max_len - len(ids)
        ids = ids + [0] * padding_len
        target_tag = target_tag + [tag2idx['O']] * padding_len
        
        # create masks
        attention_masks = [int(i>0) for i in ids]
        
        id_list.append(ids)
        target_list.append(target_tag)
        attention_mask_list.append(attention_masks)
    
    return id_list, target_list, attention_mask_list

    
    
    

In [19]:
train_input_ids, train_input_tags, train_attention_masks = process_data(train_sentences, train_labels)
dev_input_ids, dev_input_tags, dev_attention_masks = process_data(dev_sentences, dev_labels)
test_input_ids, test_input_tags, test_attention_masks = process_data(test_sentences, test_labels)

In [20]:
train_inputs = torch.tensor(train_input_ids)
dev_inputs = torch.tensor(dev_input_ids)
test_inputs = torch.tensor(test_input_ids)
train_tags = torch.tensor(train_input_tags)
dev_tags = torch.tensor(dev_input_tags)
test_tags = torch.tensor(test_input_tags)
train_masks = torch.tensor(train_attention_masks)
dev_masks = torch.tensor(dev_attention_masks)
test_masks = torch.tensor(test_attention_masks)

**Put data into data loader**

In [21]:
# Set batch num
batch_num = 32

In [22]:
# Only set token embedding, attention embedding, no segment embedding
train_dataset = TensorDataset(train_inputs, train_masks, train_tags)
train_sampler = RandomSampler(train_dataset)
# Drop last can make batch training better for the last one
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_num, drop_last=True)

dev_dataset = TensorDataset(dev_inputs, dev_masks, dev_tags)
dev_sampler = SequentialSampler(dev_dataset)
dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=batch_num)

test_data = TensorDataset(test_inputs, test_masks, test_tags)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_num)

## Train model

**Load BERT model**

In [23]:
# In this folder, contain model confg(json) and model weight(bin) files
# pytorch_model.bin, download from: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin
# config.json, downlaod from: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json
# model_file_address = 'emilyalsentzer/Bio_ClinicalBERT'
model_file_address = model_name

In [24]:
# Will load config and weight with from_pretrained()
model = RobertaForTokenClassification.from_pretrained(model_file_address, num_labels=len(tag2idx))

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Set model to GPU,if you are using GPU machine
model.cuda();

In [26]:
# Add multi GPU support
if n_gpu >1:
    model = torch.nn.DataParallel(model)

In [27]:
# Set epoch and grad max num
epochs = 6
max_grad_norm = 1.0

In [28]:
# Cacluate train optimiazaion num
num_train_optimization_steps = int(math.ceil(len(train_inputs) / batch_num) / 1) * epochs

### Set fine tuning method

**Manual optimizer**

In [29]:
# True: fine tuning all the layers 
# False: only fine tuning the classifier layers
FULL_FINETUNING = True

In [30]:
if FULL_FINETUNING:
    # Fine tune model all layer parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    # Only fine tune classifier parameters
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
    
optimizer = Adam(optimizer_grouped_parameters, lr=5e-5)

### Fine-tuning model

In [31]:
# TRAIN loop
model.train()

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)


In [32]:
print("***** Running training *****")
print("  Num examples = %d"%(len(train_inputs)))
print("  Batch size = %d"%(batch_num))
print("  Num steps = %d"%(num_train_optimization_steps))
for _ in trange(epochs,desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for batch in train_dataloader:
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # forward pass
        outputs = model(b_input_ids, token_type_ids=None,
        attention_mask=b_input_mask, labels=b_labels)
        loss, scores = outputs[:2]
        if n_gpu>1:
            # When multi gpu, average it
            loss = loss.mean()

        # backward pass
        loss.backward()

        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)

        # update parameters
        optimizer.step()
        optimizer.zero_grad()

    # print train loss per epoch
    print(f"Train loss: {tr_loss / nb_tr_steps}")
        

***** Running training *****
  Num examples = 14526
  Batch size = 32
  Num steps = 2724


Epoch:  17%|█▋        | 1/6 [11:57<59:46, 717.21s/it]

Train loss: 0.05251172570939326


Epoch:  33%|███▎      | 2/6 [23:57<47:55, 718.82s/it]

Train loss: 0.016079183459224357


Epoch:  50%|█████     | 3/6 [35:56<35:57, 719.15s/it]

Train loss: 0.010948422773757634


Epoch:  67%|██████▋   | 4/6 [47:57<23:59, 719.62s/it]

Train loss: 0.009704814898209938


Epoch:  83%|████████▎ | 5/6 [59:57<11:59, 719.77s/it]

Train loss: 0.007938123337675717


Epoch: 100%|██████████| 6/6 [1:11:56<00:00, 719.42s/it]

Train loss: 0.00700002498785959





## Save model 

In [33]:
# Extract the base model name (remove any slashes or paths)
model_base_name = model_name.split("/")[-1]

In [34]:
# Define the output address based on the model name
bert_out_address = f"../trained_models/NER/{model_base_name}-test"

In [35]:
import shutil

# Check if the directory exists
if os.path.exists(bert_out_address):
    # Delete the directory and its contents
    shutil.rmtree(bert_out_address)

# Now create the directory
os.makedirs(bert_out_address)

In [36]:
# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

In [37]:
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(bert_out_address, "pytorch_model.bin")
output_config_file = os.path.join(bert_out_address, "config.json")

In [38]:
# Save model into file
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(bert_out_address)

('../trained_models/NER/roberta-large-test\\vocab.json',
 '../trained_models/NER/roberta-large-test\\merges.txt')

## Load model

In [39]:
model = RobertaForTokenClassification.from_pretrained(bert_out_address, num_labels=len(tag2idx))

In [40]:
# Set model to GPU
model.cuda();

In [41]:
if n_gpu >1:
    model = torch.nn.DataParallel(model)

## Eval model

In [42]:
# Evalue loop
model.eval();

In [43]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
y_true = []
y_pred = []

print("***** Running evaluation *****")
print(f"  Num examples = {len(test_inputs)}")
print(f"  Batch size = {batch_num}")
for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch

    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None,
        attention_mask=input_mask,)
        # For eval mode, the first result of outputs is logits
        logits = outputs[0] 

    # Get NER predict result
    logits = torch.argmax(F.log_softmax(logits, dim=2),dim=2)
    logits = logits.detach().cpu().numpy()
    # Get NER true result
    label_ids = label_ids.to('cpu').numpy()
    # Only predict the real word, mark=0, will not calculate
    input_mask = input_mask.to('cpu').numpy()
    # Compare the valuable predict result
    for i,mask in enumerate(input_mask):
        # Real one
        temp_1 = []
        # Predict one
        temp_2 = []

        for j, m in enumerate(mask):
            # Mark=0, meaning its a pad word, dont compare
            if m:
                if tag2name[label_ids[i][j]] not in ["X", "[CLS]", "[SEP]"]: # Exclude the X label
                    # print(tag2name[logits[i][j]])
                    temp_1.append(tag2name[label_ids[i][j]])
                    temp_2.append(tag2name[logits[i][j]])
            else:
                break

        y_true.append(temp_1)
        y_pred.append(temp_2)


print("f1 socre: %f"%(f1_score(y_true, y_pred)))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))



***** Running evaluation *****
  Num examples = 27625
  Batch size = 32




f1 socre: 0.827187
Accuracy score: 0.945368


In [44]:
classification_report(y_true, y_pred, digits=4)

  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n           _     0.0000    0.0000    0.0000         0\n     problem     0.8501    0.7957    0.8220     12572\n        test     0.8259    0.8780    0.8512      9207\n   treatment     0.8323    0.8157    0.8239      9270\n\n   micro avg     0.8283    0.8261    0.8272     31049\n   macro avg     0.6271    0.6224    0.6243     31049\nweighted avg     0.8376    0.8261    0.8312     31049\n'

In [45]:
# Get acc , recall, F1 result report
report = classification_report(y_true, y_pred, digits=4)

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)

# Save the report into file
output_eval_file = os.path.join(bert_out_address, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    print("\n%s"%(report))
    print("Precision: %f"%(precision))
    print("Recall: %f"%(recall))
    print("F1: %f"%(f1))
    print("Accuracy: %f"%(accuracy))
    

    writer.write("Precision: " + str(precision))
    writer.write("\n\nRecall: " + str(recall))
    writer.write("\n\nF1: " + str(f1))
    writer.write("\n\nAccuracy: " + str(accuracy))
    writer.write("\n\n")  
    writer.write(report)

***** Eval results *****

              precision    recall  f1-score   support

           _     0.0000    0.0000    0.0000         0
     problem     0.8501    0.7957    0.8220     12572
        test     0.8259    0.8780    0.8512      9207
   treatment     0.8323    0.8157    0.8239      9270

   micro avg     0.8283    0.8261    0.8272     31049
   macro avg     0.6271    0.6224    0.6243     31049
weighted avg     0.8376    0.8261    0.8312     31049

Precision: 0.828296
Recall: 0.826081
F1: 0.827187
Accuracy: 0.945368
