In [35]:
# This notebook is used to fine-tune BioBERT v1.1 for the NER task on the NCBI disease dataset
# Load packages
import os

from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer

In [29]:
# Download the NCBI disease dataset and show the first training example
# (its word list followed by its per-word NER tag ids)

dataset = load_dataset("ncbi_disease")
first_example = dataset["train"][0]
for column in ("tokens", "ner_tags"):
    print(first_example[column])

# Note: save_to_disk() is used so that the data and its configuration files are written together
wd = os.getcwd()
dataset_dir = f"{wd}/../datasets/ncbi_disease/"
dataset.save_to_disk(dataset_dir)

['Identification', 'of', 'APC2', ',', 'a', 'homologue', 'of', 'the', 'adenomatous', 'polyposis', 'coli', 'tumour', 'suppressor', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0]


Saving the dataset (0/1 shards):   0%|          | 0/5433 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/924 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/941 [00:00<?, ? examples/s]

In [14]:
# Inspect the label scheme stored on the "ner_tags" feature.
# Note: it seems that some attributes are only kept in the data downloaded directly from Hugging Face.
# If you save the data and read it back from a local path, those attributes will be lost.
# Expected mapping -- 0: O, 1: B-Disease, 2: I-Disease

ner_feature = dataset["train"].features["ner_tags"]
print(ner_feature)
label_names = ner_feature.feature.names
print(label_names)

# Iterate over the names themselves instead of a hard-coded range(3),
# so the listing stays correct for any number of labels
for label_id, label_name in enumerate(label_names):
    print(f"Label: {label_id}, is:", label_name)

Sequence(feature=ClassLabel(names=['O', 'B-Disease', 'I-Disease'], id=None), length=-1, id=None)
['O', 'B-Disease', 'I-Disease']
Label: 0, is: O
Label: 1, is: B-Disease
Label: 2, is: I-Disease


In [15]:
# Tokenize the data
# Use one sentence as an example: print every word above its label name,
# with both padded to a shared column width so the two lines stay aligned

words = dataset["train"][0]["tokens"]
labels = dataset["train"][0]["ner_tags"]

word_cells = []
label_cells = []
for token, tag_id in zip(words, labels):
    tag_name = label_names[tag_id]
    # Column width = the longer of the two strings, plus one separating space
    width = max(len(token), len(tag_name)) + 1
    word_cells.append(token.ljust(width))
    label_cells.append(tag_name.ljust(width))

line1 = "".join(word_cells)
line2 = "".join(label_cells)

print(line1)
print(line2)

Identification of APC2 , a homologue of the adenomatous polyposis coli      tumour    suppressor . 
O              O  O    O O O         O  O   B-Disease   I-Disease I-Disease I-Disease O          O 


In [16]:
# Load the tokenizer from the BioBERT v1.1 checkpoint directory,
# located relative to the current working directory
wd = os.getcwd()
model_checkpoint = wd + "/../models/biobert-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [17]:
# Tokenize the first training sentence; the input is already a list of words,
# hence is_split_into_words=True
inputs = tokenizer(dataset["train"][0]["tokens"], is_split_into_words=True)
# Show the tokens the sentence was split into (including the special tokens)
print(inputs.tokens())

['[CLS]', 'I', '##dent', '##ification', 'of', 'AP', '##C', '##2', ',', 'a', 'ho', '##mo', '##logue', 'of', 'the', 'ad', '##eno', '##mat', '##ous', 'p', '##oly', '##po', '##sis', 'co', '##li', 't', '##umour', 'suppress', '##or', '.', '[SEP]']


In [18]:
# One word may be split into more than one token.
# word_ids() maps each token back to the index of the original word it came from.
# Example: [0, 0, 0] means these three tokens come from the first word
# (the first word in dataset["train"][0]["tokens"]).
# In the output below, None appears at the positions of the special tokens [CLS] and [SEP].
print(inputs.word_ids()) 

[None, 0, 0, 0, 1, 2, 2, 2, 3, 4, 5, 5, 5, 6, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 11, 11, 12, 12, 13, None]


In [19]:
# Here we assign a label to each token.
# The first token of a word takes the label of that word.
# Any further token derived from the same word takes that label too, except that B-XXX becomes I-XXX.

def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels into one label per token.

    Args:
        labels: Word-level label ids, indexed by word position.
        word_ids: For every token, the index of the word it was derived
            from, or ``None`` for a special token.

    Returns:
        A list with one entry per element of ``word_ids``:
        ``-100`` for special tokens, the word's own label for the first
        token of a word, and for each following token of the same word the
        word's label, bumped by one when it is odd (B-XXX -> I-XXX).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: no word behind it
            aligned.append(-100)
        elif word_id != previous_word:
            # First token of a new word keeps the word's label unchanged
            aligned.append(labels[word_id])
        else:
            # Continuation of the previous word: an odd id (B-XXX) becomes
            # the following id (I-XXX); every other id is kept as is
            tag = labels[word_id]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous_word = word_id

    return aligned

In [20]:
# Align the word-level labels of the first training sentence with its tokens.
# The example is fetched once; the earlier duplicate assignment of `labels` is gone.
first_example = dataset["train"][0]
words = first_example["tokens"]
labels = first_example["ner_tags"]
inputs = tokenizer(words, is_split_into_words=True)

# Assign the tokens to their original id (the original position index of this word in sentence)
word_ids = inputs.word_ids()
aligned_labels = align_labels_with_tokens(labels, word_ids)

# Sanity check: the alignment must yield exactly one label per token
assert len(aligned_labels) == len(word_ids), "expected one label per token"

print(labels)
print(word_ids)
print(aligned_labels)

[0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0]
[None, 0, 0, 0, 1, 2, 2, 2, 3, 4, 5, 5, 5, 6, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 11, 11, 12, 12, 13, None]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, -100]


In [21]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: A batch with a "tokens" entry (one word list per sentence)
            and an "ner_tags" entry (one word-level label list per sentence).

    Returns:
        The tokenizer output for the batch, with an added "labels" entry
        holding, per sentence, the labels aligned to its tokens by
        ``align_labels_with_tokens``.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(i) gives the token-to-word mapping of the i-th sentence in the batch
    encoded["labels"] = [
        align_labels_with_tokens(sentence_tags, encoded.word_ids(sentence_idx))
        for sentence_idx, sentence_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [23]:

# Run tokenize_and_align_labels over every split in batches.
# The original columns are removed, so only the tokenizer outputs and "labels" remain.
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
# Display the resulting splits and their columns
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5433
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 924
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 941
    })
})

In [36]:
# Save the tokenized splits to disk (path relative to the current working directory),
# then load them back and display them to check the round trip
wd = os.getcwd()
tokenized_dataset_dir = wd + "/../datasets/ncbi_disease_tokenized/"
tokenized_datasets.save_to_disk(tokenized_dataset_dir)
reloaded_tokenized_datasets = load_from_disk(tokenized_dataset_dir)
reloaded_tokenized_datasets

Saving the dataset (0/1 shards):   0%|          | 0/5433 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/924 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/941 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5433
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 924
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 941
    })
})

In [31]:
# For reference, see this related notebook:
# https://github.com/UMESH519/finetune-BioBERT-NCBI-Disease/blob/main/BERT_BioBERT_Finetuned_ncbi_dataset.ipynb