# Libraries installation

Install Transformers libaries, framework seqeval, some extensions: unidecode for formatting accent language, datasets for creating a Dataset,...

In [1]:
#Install required libraries
!pip install datasets transformers evaluate seqeval unidecode

Installing collected packages: tokenizers, xxhash, unidecode, multidict, frozenlist, dill, async-timeout, yarl, responses, multiprocess, huggingface-hub, aiosignal, transformers, seqeval, aiohttp, datasets, evaluate
Successfully installed aiohttp-3.8.4 aiosignal-1.3.1 async-timeout-4.0.2 datasets-2.12.0 dill-0.3.6 evaluate-0.4.0 frozenlist-1.3.3 huggingface-hub-0.14.1 multidict-6.0.4 multiprocess-0.70.14 responses-0.18.0 seqeval-1.2.2 tokenizers-0.13.3 transformers-4.28.1 unidecode-1.3.6 xxhash-3.2.0 yarl-1.9.2


Import required libraries

In [2]:
#Import libraries to project
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
import torch
from datasets import Dataset
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
import json
import pandas as pd
from unidecode import unidecode 

Mount Google Drive to Google Colab

In [3]:
#Mount Drive to Colab
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Data preparation

Read training set from dataset stored in Google Drive.
Create a list of input and output sequences

In [None]:
#Read training set 
trainWord = [json.loads(line) for line in open('gdrive/MyDrive/Ct550/word/train_word.json', 'r', encoding='utf-8')]
trainWordData = pd.DataFrame(trainWord)

In [None]:
#Create a list of texts
inputs = trainWordData['words']
input_sequences = []

for sentence in inputs:
  input_sequences.append(' '.join(sentence))

Convert problem into text-to-text format by formatting an output which is a sequence of class:entity pairs separated by a semicolon ';'

In [None]:
def format_targets(text, label):
  s = ""
  texts = []
  for i in range(len(text)-1):
    if label[i] != 'O':
      s += text[i] + " "
      if label[i+1] == 'O' or label[i+1][0:1] == 'B':
        s = label[i][2:] + ": " + s
        texts.append(s.strip())
        s = ""
  texts = "; ".join(texts)
  return texts

In [None]:
str1 = format_targets(text, label)
str1

'DATE: 12/8; DATE: 20/8'

 Convert them back to sentence format

In [None]:
output_sequences = []
for i in range(5027):
  text = trainWordData['words'][i]
  label = trainWordData['tags'][i]

  target = format_targets(text, label)
  output_sequences.append(target)

In [None]:
input_sequences[4089], output_sequences[4089]

('Bệnh_nhân vào viện ngày 31/7 , đến nay đã 4 lần xét_nghiệm âm_tính với nCoV , gồm lần một ngày 6/8 , lần hai ngày 8/8 , lần ba ngày 10/8 , lần bốn ngày 11/8 .',
 'DATE: 31/7; DATE: 6/8; DATE: 8/8; DATE: 10/8; DATE: 11/8')

# Training preparation

We first load a ViT5 tokenizer and a pretrained model powered by Transformers library

In [None]:
#Create a tokenizer and a model supported by ViT5
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")  
model = AutoModelForSeq2SeqLM.from_pretrained("VietAI/vit5-base")
# model.to("cuda")

Define a function which converts data to tokens

In [None]:
#Function for tokenizing texts and labels
def preprocess(examples):
  model_inputs = tokenizer(
      examples["inputs"], max_length=256, truncation=True, padding=True
  )
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(
        examples["labels"], max_length=256, truncation=True, padding=True
    )
  model_inputs["labels"] = labels['input_ids']
  model_inputs['input_ids'] = model_inputs['input_ids']
  return model_inputs

Define a dictionary that holds inputs and their corresponding labels

Create a dataset from that dictionary and apply the tokenization process to every data sample with map function

In [None]:
#Tokenise dataset
dict_obj = {'inputs':input_sequences, 'labels':output_sequences}
dataset = Dataset.from_dict(dict_obj)
tokenized_datasets = dataset.map(preprocess, batched=True, num_proc=8)

Set training arguments for training process

In [None]:
#Arguments for training model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")
training_args = Seq2SeqTrainingArguments('gdrive/MyDrive/checkpoint',
                                      do_train=True,
                                      do_eval=False,
                                      num_train_epochs=10,
                                      learning_rate=2e-5,
                                      warmup_ratio=0.00,
                                      weight_decay=0.01,
                                      per_device_train_batch_size=4,
                                      per_device_eval_batch_size=4,
                                      logging_dir='./log',
                                      group_by_length=True,
                                      save_strategy="epoch",
                                      save_total_limit=3,
                                      #eval_steps=1,
                                      #evaluation_strategy="steps",
                                      # evaluation_strategy="no",
                                      # fp16=True,
                                      )

Make sure to push the model to GPU for optimisation

In [None]:
model.to('cuda')

# Train Named Entity Recognition model

In [None]:
#Procession of training model 
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.6376
1000,0.0311
1500,0.0163
2000,0.0126
2500,0.0104
3000,0.0076
3500,0.0063
4000,0.0054
4500,0.0043
5000,0.0047


TrainOutput(global_step=12570, training_loss=0.030352786416002656, metrics={'train_runtime': 5754.3935, 'train_samples_per_second': 8.736, 'train_steps_per_second': 2.184, 'total_flos': 1.1147259635712e+16, 'train_loss': 0.030352786416002656, 'epoch': 10.0})

# Inference

Load fine-tuned model stored in Google Drive with a Tokenizer by ViT5

In [4]:
#Load model saved as a checkpoint
from transformers import AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained('VietAI/vit5-base')
model = AutoModelForSeq2SeqLM.from_pretrained("gdrive/MyDrive/checkpoint/checkpoint-12570")
# model.to("cuda")

In [6]:
#Total number of trainable parameters
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
pytorch_total_params

225950976

In [None]:
model.to('cpu')

To use the model at its best, make sure the data fed to the model is word-level,
 if it's not, simply use an RDR-Segmenter!

In [None]:
!pip install python-rdrsegmenter

In [None]:
from python_rdrsegmenter import load_segmenter
segmenter = load_segmenter()
text = "Bệnh nhân 651 , quê Duy Xuyên , Quảng Nam , có tiền sử suy thượng thận mạn tính ."
segmenter.tokenize(text)

'Bệnh_nhân 651 , quê Duy_Xuyên , Quảng_Nam , có tiền_sử suy thượng_thận mạn_tính .'

The cell below helps produce target text with 'generate' method 

In [None]:
sentence = input('Your sentence: ')
# sentence = eval_input_sequences[23]
encoding = tokenizer(sentence, return_tensors="pt", max_length=1024)
input_ids, attention_masks = encoding["input_ids"], encoding["attention_mask"]
outputs = model.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    max_length=1024,
)
labels = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
print ("Results: ")
labels

Your sentence: Bệnh_nhân 315 là một y_tá bệnh_viện Chợ_Rẫy
Results: 


'PATIENT_ID: 315'

In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

'word_labels' takes into account two parameters 'sentence' and 'labels' to calculate the correct BIO label and map them to a list 

In [None]:
def word_labels(sentence, labels):
  predictions = ["O" for i in range(len(sentence.split()))]
  if labels != '':
    list_labels = labels.split(";")
    sent = sentence.split()

    start = 0
    for i in range(len(list_labels)):
      sub_list = list_labels[i].split(":")
      class_entity = sub_list[0].strip() # location, organization, age,...
      named_entity = sub_list[1].strip().lower() # Ha Noi, London, 433,...
      named_entity_element = named_entity.split() # Ha, Noi, London, 433, Soc Trang,...

      flist = []
      for i in range(len(named_entity_element)):
        if named_entity_element[i][-1] == ',':
          entity1 = named_entity_element[i][0:len(named_entity_element[i])-1]
          entity2 = ","
          flist.append(entity1)
          flist.append(entity2)
        else:
          flist.append(named_entity_element[i])

      named_entity_element = flist
      for i in range(len(named_entity_element)): 
        try:
          findex = sent.index(named_entity_element[i], start)
          start = findex + 1
          f_class = ""
          if i == 0:
            f_class = "B-" + class_entity
          else:
            f_class = "I-" + class_entity
          predictions[findex] = f_class
        except:
          pass

  return predictions 

# reverse the subword tokens to their real words 

Load dataset for devaluation

In [None]:
#Read test set
evalWord = [json.loads(line) for line in open('gdrive/MyDrive/Ct550/word/test_word.json', 'r', encoding='utf-8')]
evalWordData = pd.DataFrame(evalWord)
evalWordData = evalWordData.rename(columns={'tags':'target_text', 'words':'source_text'})

Create lists of input and output sequences 

In [None]:
#Create a list of texts
eval_input_data = evalWordData['source_text']
eval_input_sequences = []

for sentence in eval_input_data:
  eval_input_sequences.append(" ".join(sentence))

#Create a list of targets
eval_output_data = evalWordData['target_text']
eval_output_sequences = []

for sentence in eval_output_data:
  eval_output_sequences.append(" ".join(sentence))

In [None]:
eval_input_sequences

['Từ 24 - 7 đến 31 - 7 , bệnh_nhân được mẹ là bà H.T.P ( 47 tuổi ) đón về nhà ở phường Phước_Hoà ( bằng xe_máy ) , không đi đâu chỉ ra Tạp_hoá Phượng , chợ Vườn_Lài , phường An_Sơn cùng mẹ bán tạp_hoá ở đây .',
 'Bác_sĩ Trần_Thanh_Linh , từ Bệnh_viện Chợ_Rẫy chi_viện phụ_trách đơn_nguyên hồi_sức tích_cực , cho biết " bệnh_nhân 416 " vẫn đang duy_trì ECMO , thở máy , hiện xơ phổi rất nhiều .',
 'Theo đó , Sở Y_tế Bình_Thuận cho biết sau khi xác_định bệnh_nhân số 34 ( nữ_giới 51 tuổi , từ Mỹ về Việt_Nam ngày 29 - 2 có quá_cảnh Qatar ) , Trung_tâm Kiểm_soát bệnh_tật Bình_Thuận đã điều_tra dịch_tễ , khoanh vùng , khử khuẩn , tiến_hành cách_ly người liên_quan đến ca bệnh số 34 .',
 'Bệnh_nhân 218 : nữ , 43 tuổi , quốc_tịch Việt_Nam , địa_chỉ tại Phú_Xá , Thái_Nguyên , về nước trên chuyến bay SU290 ( số ghế 46 G ) ngày 25 - 3 , sau nhập_cảnh được cách_ly tập_trung tại Đại_học FPT ở Láng - Hoà_Lạc ( Hà_Nội ) . Từ 31 - 3 bệnh_nhân được cách_ly , điều_trị tại Bệnh_viện Bệnh nhiệt_đới trung_ương

In [None]:
model.to('cuda')

The process below predicts targets for 2k data samples in the dev set

In [None]:
predictions = []
references = []

In [None]:
for i in range(2000):
  print (i)
  sentence = eval_input_sequences[i]
  tokenized_input = tokenizer(sentence, return_tensors='pt', max_length=1024)
  input_ids = tokenized_input['input_ids'].to('cuda')
  attention_mask = tokenized_input['attention_mask'].to('cuda')

  outputs = model.generate(
      input_ids=input_ids, attention_mask=attention_mask, max_length=1024
      )
  labels = tokenizer.decode(outputs[0], clean_up_tokenization_spaces=True, skip_special_tokens=True)

  sentence = unidecode(sentence).lower()
  labels = unidecode(labels)
  flabels = word_labels(sentence, labels)
  predictions.append(flabels)
  references.append(eval_output_sequences[i].split())

In [None]:
eval_input_sequences[121]

'Hai viện Bệnh Nhiệt_đới và Pasteur_TPHCM xét_nghiệm cho kết_quả có nCoV .'

# resultssss

Use seqeval framework introduced to calculate scores F1/precision/recall for every class entity and overall score for model performance 

In [None]:
result = seqeval.compute(predictions=predictions, references=references)
result

{'AGE': {'precision': 0.9506849315068493,
  'recall': 0.961218836565097,
  'f1': 0.9559228650137742,
  'number': 361},
 'DATE': {'precision': 0.9740143369175627,
  'recall': 0.985494106980961,
  'f1': 0.9797205948625506,
  'number': 1103},
 'GENDER': {'precision': 0.9490909090909091,
  'recall': 0.9422382671480144,
  'f1': 0.9456521739130435,
  'number': 277},
 'JOB': {'precision': 0.7153846153846154,
  'recall': 0.7045454545454546,
  'f1': 0.7099236641221374,
  'number': 132},
 'LOCATION': {'precision': 0.9458668617410387,
  'recall': 0.9448301059554256,
  'f1': 0.9453481995978796,
  'number': 2737},
 'NAME': {'precision': 0.9545454545454546,
  'recall': 0.8936170212765957,
  'f1': 0.9230769230769231,
  'number': 188},
 'ORGANIZATION': {'precision': 0.8785714285714286,
  'recall': 0.8929219600725953,
  'f1': 0.8856885688568857,
  'number': 551},
 'PATIENT_ID': {'precision': 0.9803613511390417,
  'recall': 0.9780564263322884,
  'f1': 0.9792075323656336,
  'number': 1276},
 'SYMPTOM_AND