# GenoVarDis: NER in Genomic Variants and related Diseases

In [None]:
# Pick the project root: the mounted Google Drive folder when running on
# Colab, the current working directory otherwise.
if 'google.colab' not in str(get_ipython()):
    print('Not running on CoLab')
    root = './'
else:
    print('Running on CoLab')
    from google.colab import drive
    #drive.flush_and_unmount()
    drive.mount('/content/drive')
    root = '/content/drive/My Drive/Colab Notebooks'

print("Current directory: {}".format(root))

Running on CoLab
Mounted at /content/drive
Current directory: /content/drive/My Drive/Colab Notebooks


**1. Getting the resources**

In [None]:
%%capture
!pip install pytorch_transformers
!pip install transformers
!pip install seqeval
!pip install sklearn-crfsuite
!pip install spacy
!python -m spacy download es_core_news_sm


**2. Data loading and formatting**

Definiendo rutas de datos:

In [None]:
LANG = 'es'  # This system can also process English texts: set LANG='en'
# `sTypes` is a file-name suffix derived from `allTypes` ('_all' when True, '' otherwise).
allTypes = True
sTypes = ''
if allTypes:
    sTypes = '_all'
import spacy
# NOTE(review): `sentences` does not appear to be referenced in the visible notebook — confirm before removing.
from spacy.lang.es.examples import sentences

# Small Spanish spaCy pipeline, stored in `nlp` for later cells.
nlp = spacy.load("es_core_news_sm")


# Data paths (all relative to `root`)
path_data = root + '/ner/data/genovardis_train_dev/' # Folder containing the datasets
path_models = root + '/ner/models/{}/'.format(LANG) # Folder where models are saved
checkpoints = root + '/ner/checkpoints/' # Folder for checkpoints
path_scores = root + '/ner/scores/{}/'.format(LANG) # Folder where scores are saved
path_outputs = root + '/ner/outputs/{}/'.format(LANG) # Folder where outputs are saved

Carga y formateo de los datos:



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the training texts, the training annotations and the development texts.
# Every file is tab-separated and its first column is used as the index.
# (The file names contain no `{}` placeholder, so the `sTypes` suffix was never
# actually applied to them.)
train_data, df_annotation, df_dev = (
    pd.read_csv(path_data + file_name, sep='\t', index_col=0)
    for file_name in ('train_text.tsv', 'train_annotation.tsv', 'dev_text.tsv')
)


In [None]:
# General statistics on the number of whitespace-separated words per text.

# --- training set ---
train_data['word_count'] = train_data['text'].apply(lambda text: len(text.split()))
count_df_train = train_data.groupby('text')['word_count'].max()  # one value per distinct text
statistics_train = count_df_train.describe()
print('\nAlgunas estadísticas de las oraciones en el conjunto de entrenamiento:')
print(statistics_train)

# --- development set ---
df_dev['word_count'] = df_dev['text'].apply(lambda text: len(text.split()))
count_df_dev = df_dev.groupby('text')['word_count'].max()  # one value per distinct text
statistics_dev = count_df_dev.describe()
print('\nAlgunas estadísticas de las oraciones en el conjunto de desarrollo:')
print(statistics_dev)

# Length (in words) of the longest text in each split and overall.
MAX_LEN_TRAIN, MAX_LEN_DEV = (
    int(summary['max']) for summary in (statistics_train, statistics_dev)
)
MAX_LEN = MAX_LEN_DEV if MAX_LEN_DEV > MAX_LEN_TRAIN else MAX_LEN_TRAIN
print('\n')
print('La longitud máxima de las oraciones en ENTRENAMIENTO es: ', MAX_LEN_TRAIN)
print('La longitud máxima de las oraciones en DESARROLLO es: ', MAX_LEN_DEV)
print('La longitud máxima de las oraciones en TOTAL es:', MAX_LEN)


Algunas estadísticas de las oraciones en el conjunto de entrenamiento:
count    427.000000
mean     276.784543
std       65.877967
min       78.000000
25%      239.000000
50%      279.000000
75%      315.000000
max      469.000000
Name: word_count, dtype: float64

Algunas estadísticas de las oraciones en el conjunto de desarrollo:
count     70.000000
mean     287.457143
std       59.899599
min      114.000000
25%      257.000000
50%      289.500000
75%      314.500000
max      438.000000
Name: word_count, dtype: float64


La longitud máxima de las oraciones en ENTRENAMIENTO es:  469
La longitud máxima de las oraciones en DESARROLLO es:  438
La longitud máxima de las oraciones en TOTAL es: 469


# Formatting the data and adding POS tags

In [None]:
# Function to tag each character with its annotation
def tag_characters(text, annotations):
    """Return one label per character of ``text``.

    Args:
        text: The document text.
        annotations: DataFrame whose rows carry ``offset1`` (span start,
            inclusive), ``offset2`` (span end, exclusive) and ``label``.

    Returns:
        A list of ``len(text)`` labels; characters not covered by any
        annotation are tagged ``'O'``. When spans overlap, the annotation
        that comes later in ``annotations`` wins.
    """
    text_length = len(text)
    tags = ['O'] * text_length
    for _, ann in annotations.iterrows():
        # Clamp the span to the text: an end offset past the last character
        # used to raise IndexError, and a negative start silently wrapped
        # around to the end of the text.
        start = max(0, int(ann['offset1']))
        end = min(text_length, int(ann['offset2']))
        if start < end:
            tags[start:end] = [ann['label']] * (end - start)
    return tags

# Character-level tags for one row of the text DataFrame
def apply_char_tags(row, df_annotation):
    """Tag every character of ``row['text']`` with its annotation label.

    The annotations used are the rows of ``df_annotation`` whose index value
    equals ``row.name`` (the row's own index label).
    """
    belongs_to_doc = df_annotation.index == row.name
    return tag_characters(row['text'], df_annotation[belongs_to_doc])

# Add a 'char_tags' column to train_data: for every row, the list of per-character
# labels produced by apply_char_tags from that row's text and df_annotation.
train_data['char_tags'] = train_data.apply(lambda row: apply_char_tags(row, df_annotation), axis=1)

# Function to map character annotations to spaCy tokens
def map_char_annotations_to_tokens(doc, char_tags):
    """Assign one label to every token of ``doc`` by majority vote.

    Args:
        doc: Iterable of tokens exposing ``idx`` (character offset of the
            token start) and ``len()`` (token length in characters).
        char_tags: Per-character labels of the text ``doc`` was built from.

    Returns:
        A list with one label per token: the most frequent character label
        inside the token's span, or ``'O'`` when the span lies outside
        ``char_tags``. Ties are resolved deterministically in favour of the
        label that appears first in the span (the previous
        ``max(set(...))`` form depended on set iteration order, which can
        change between interpreter runs).
    """
    from collections import Counter

    token_annotations = []
    for token in doc:
        start = token.idx
        span_tags = char_tags[start:start + len(token)]
        if span_tags:
            # most_common keeps first-encountered order among equal counts.
            token_annotations.append(Counter(span_tags).most_common(1)[0][0])
        else:
            token_annotations.append('O')
    return token_annotations

# Turn per-word entity labels into IOB tags
def convert_to_iob_tags(word_tags):
    """Convert a sequence of word labels into IOB tags.

    ``'O'`` stays ``'O'``. Any other label becomes ``'B-<label>'`` when it
    differs from the previous word's label and ``'I-<label>'`` when it
    repeats it.
    """
    iob_tags = []
    previous = 'O'
    for current in word_tags:
        if current == 'O':
            encoded = 'O'
        else:
            prefix = 'I' if current == previous else 'B'
            encoded = f'{prefix}-{current}'
        iob_tags.append(encoded)
        previous = current
    return iob_tags

# Sentence counter shared by all documents; process_text advances it once per sentence
global_sentence_counter = 1

# Build one record per token (with POS and IOB tag) for a single document
def process_text(text, doc_id, char_tags):
    """Tokenize ``text`` with ``nlp`` and return one dict per token.

    Each dict holds the document id, the sentence number inside the document
    (starting at 1), the running global sentence number, the token text, its
    POS tag and its IOB tag. IOB tags are derived from ``char_tags`` and are
    consumed in token order; if they run out, ``'O'`` is used.

    Side effect: increments the module-level ``global_sentence_counter`` once
    for every sentence processed.
    """
    global global_sentence_counter
    doc = nlp(text)

    # Character labels -> token labels -> IOB tags, consumed one token at a time
    token_annotations = map_char_annotations_to_tokens(doc, char_tags)
    remaining_tags = iter(convert_to_iob_tags(token_annotations))

    records = []
    for local_sentence_counter, sent in enumerate(doc.sents, start=1):
        for token in sent:
            records.append({
                'doc_id': doc_id,
                'Sentence # in the paper': local_sentence_counter,
                'Global Sentence #': global_sentence_counter,
                'Word': token.text,
                'POS': token.pos_,
                'IOB': next(remaining_tags, 'O'),
            })
        # One sentence finished: advance the notebook-wide counter
        global_sentence_counter += 1

    return records

# Run process_text on every document and flatten the per-document token
# records into one list, in document order.
new_train = [
    word_record
    for doc_id, row in train_data.iterrows()
    for word_record in process_text(row['text'], doc_id, row['char_tags'])
]

# One row per token
new_train_df = pd.DataFrame(new_train)



In [None]:
# Display the token-level DataFrame (one row per token, with POS and IOB columns).
new_train_df

Unnamed: 0,doc_id,Sentence # in the paper,Global Sentence #,Word,POS,IOB
0,12672033,1,1,12672033|t|Análisis,NUM,O
1,12672033,1,1,de,ADP,O
2,12672033,1,1,mutaciones,NOUN,O
3,12672033,1,1,en,ADP,O
4,12672033,1,1,DMBT1,PROPN,B-Gene
...,...,...,...,...,...,...
135341,22106692,15,4462,partes,NOUN,O
135342,22106692,15,4462,del,ADP,O
135343,22106692,15,4462,mundo,NOUN,O
135344,22106692,15,4462,.,PUNCT,O


Cleaning: remove the leading `<doc id>|t|` / `<doc id>|a|` marker from the first token of each document.

In [None]:
import re
def clean_text(text):
    """Strip a leading ``<digits>|t|`` or ``<digits>|a|`` marker from ``text``.

    Only a marker at the very start of the string is removed; adjust the
    pattern if other leftovers need to be cleaned.
    """
    return re.sub(r'^\d+\|[ta]\|', '', text)

# Clean the first word of every document
def clean_first_word(df):
    """Apply ``clean_text`` to the 'Word' of the first row of each ``doc_id``.

    The DataFrame is modified in place and also returned. "First" means the
    first row, in the DataFrame's current order, carrying a given ``doc_id``.
    """
    seen_docs = set()
    # Single pass over the rows: only the first row of each document is touched.
    for row_label, doc_id in df['doc_id'].items():
        if doc_id in seen_docs:
            continue
        seen_docs.add(doc_id)
        df.at[row_label, 'Word'] = clean_text(df.at[row_label, 'Word'])
    return df

# Apply the cleaning to the token DataFrame
new_train_df = clean_first_word(new_train_df)

In [None]:
# Split the annotated data into training and test sets.
# The split is made on whole documents (doc_id), not on individual token rows.
# Splitting the token rows directly scatters the tokens of every sentence over
# both sets, which leaks test sentences into training and, because the rows
# are shuffled, destroys the word order inside each sentence.
all_doc_ids = new_train_df['doc_id'].unique()
train_doc_ids, test_doc_ids = train_test_split(all_doc_ids, test_size=0.2, random_state=42)
# Boolean filtering keeps the original token order inside every document.
df_train = new_train_df[new_train_df['doc_id'].isin(train_doc_ids)].copy()
df_test = new_train_df[new_train_df['doc_id'].isin(test_doc_ids)].copy()
print('Tamaño del conjunto de entrenamiento: {}'.format(len(df_train)))
print('Tamaño del conjunto de prueba: {}'.format(len(df_test)))

Tamaño del conjunto de entrenamiento: 108276
Tamaño del conjunto de prueba: 27070


# Next, we look at the distribution of entity labels in the training annotations.

In [None]:
separator = '*'*60
# Label statistics of the training annotations: absolute counts first,
# then the same distribution expressed as percentages.
label_counts = df_annotation['label'].value_counts()
print('TRAIN:\n', label_counts, sep='\n', end='\n')
print()
label_percentages = df_annotation['label'].value_counts(normalize=True)*100
print('Percentages:', label_percentages, sep='\n', end='\n')

TRAIN:

label
Disease                        4028
Gene                           3093
DNAMutation                     496
OtherMutation                   271
DNAAllele                       139
SNP                             120
NucleotideChange-BaseChange      51
Transcript                        1
Name: count, dtype: int64

Percentages:
label
Disease                        49.127942
Gene                           37.724113
DNAMutation                     6.049518
OtherMutation                   3.305281
DNAAllele                       1.695329
SNP                             1.463593
NucleotideChange-BaseChange     0.622027
Transcript                      0.012197
Name: proportion, dtype: float64


**3. Imports and pre-processing of the data**

First, we import the packages needed to use the model. After that, the data must be preprocessed, because the BERT model expects its input in a specific format.


In [None]:
%%capture
#importing a few necessary packages and setting the DATA directory
DATA_DIR="."
import os
import numpy as np
import pickle
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# install BERT
!pip install pytorch_pretrained_bert pytorch-nlp

# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
# from pytorch_pretrained_bert import BertTokenizer, BertConfig, BertForSequenceClassification
from pytorch_pretrained_bert import BertAdam
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt


# Select the compute device: the first CUDA GPU when available, the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only query the GPU name when a GPU exists; an unconditional
# torch.cuda.get_device_name(0) raises on CPU-only machines even though
# `device` has already fallen back to the CPU.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

We need to transform all the pre-defined entities (labels) to a language that can be interpreted by the model, i.e., numbers. To do that, we create a dictionary with the desired labels following the IOB scheme.

In [None]:
# Label dictionary: IOB label -> integer index.
# Labels are numbered from 1 in the order in which they first appear in the
# data; index 0 is reserved for the padding label, which is added last.
tags = new_train_df["IOB"].unique()
tag_index = dict(zip(tags, range(1, len(tags) + 1)))
tag_index["PAD"] = 0
num_tags = len(tag_index)
print('Dictionary for labels:', tag_index)
print('Number of tags added the tag for pad tokens:', num_tags)

Dictionary for labels: {'O': 1, 'B-Gene': 2, 'B-Disease': 3, 'I-Disease': 4, 'B-NucleotideChange-BaseChange': 5, 'I-NucleotideChange-BaseChange': 6, 'B-DNAMutation': 7, 'I-DNAMutation': 8, 'I-Gene': 9, 'B-OtherMutation': 10, 'I-OtherMutation': 11, 'B-DNAAllele': 12, 'I-DNAAllele': 13, 'B-SNP': 14, 'B-Transcript': 15, 'I-SNP': 16, 'PAD': 0}
Number of tags added the tag for pad tokens: 17


In [None]:
# Show the label names in insertion order ('PAD' was inserted last).
tag_index.keys()

dict_keys(['O', 'B-Gene', 'B-Disease', 'I-Disease', 'B-NucleotideChange-BaseChange', 'I-NucleotideChange-BaseChange', 'B-DNAMutation', 'I-DNAMutation', 'I-Gene', 'B-OtherMutation', 'I-OtherMutation', 'B-DNAAllele', 'I-DNAAllele', 'B-SNP', 'B-Transcript', 'I-SNP', 'PAD'])

Next, we perform the first preprocessing step. A class `SentenceGetter` and a function `vectorization` are defined to extract the words and labels of each sentence from the input data.

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

class SentenceGetter(object):
    """Group a token-level DataFrame into sentences.

    Each sentence is a list of ``(word, pos, tag)`` tuples built from the
    'Word', 'POS' and 'IOB' columns; rows are grouped by the
    'Global Sentence #' column.

    Attributes set by ``__init__``:
        grouped: Series indexed by sentence number whose values are the
            sentences.
        sentences: The same sentences as a plain list, in ``grouped`` order.
        n_sent: Sentence number that the next ``get_text`` call looks up
            (starts at 1).
    """
    def __init__(self, df):

        self.n_sent = 1
        self.df = df
        self.empty = False
        # Build, for one group of rows (one sentence), the list of
        # (word, POS tag, NER tag) tuples.
        agg = lambda s : [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                       s['POS'].values.tolist(),
                                                       s['IOB'].values.tolist())]
        self.grouped = self.df.groupby("Global Sentence #").apply(agg)
        self.sentences = [s for s in self.grouped]

    def get_text(self):
        """Return the sentence numbered ``n_sent`` and advance the counter.

        Returns ``None`` (without advancing) when no sentence carries that
        number. The lookup uses the integer sentence number itself; the
        former ``'Sentence: N'`` string key never matched the index, so the
        method always returned ``None`` behind a bare ``except``.
        """
        try:
            s = self.grouped[self.n_sent]
        except KeyError:
            return None
        self.n_sent += 1
        return s

def vectorization(df, tag_index):
    """Turn a token-level DataFrame into per-sentence word and label lists.

    Sentences are obtained with ``SentenceGetter`` from the
    'Global Sentence #', 'Word', 'POS' and 'IOB' columns.

    Returns:
        A tuple ``(X, y)`` where ``X[i]`` is the list of words of sentence
        ``i`` and ``y[i]`` the list of label indices (looked up in
        ``tag_index``) of the same sentence.
    """
    needed_columns = ['Global Sentence #','Word','POS','IOB']
    sentences = SentenceGetter(df[needed_columns]).sentences

    X, y = [], []
    for sentence in sentences:
        # every element of a sentence is a (word, pos, tag) tuple
        X.append([word for word, _pos, _tag in sentence])
        y.append([tag_index[tag] for _word, _pos, tag in sentence])

    return (X, y)

# Vectorize the train and test splits: per-sentence word lists and label-index lists.
sentences_train, labels_train = vectorization(df_train, tag_index)
# sentences_dev, labels_dev = vectorization(df_dev, tag_index)
sentences_test, labels_test = vectorization(df_test, tag_index)
print('Datasets loaded!')

Datasets loaded!


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForMaskedLM

# Tokenizer and model are loaded from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# NER is a token-classification task, so the model needs a classification head
# with one output per label (`num_tags` includes the PAD label). A masked-LM
# head produces one logit per vocabulary entry for every token instead, which
# makes the loss meaningless for tag indices and greatly inflates memory use.
model = AutoModelForTokenClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=num_tags,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.18M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/523k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
def align_labels(original_sentences, original_labels, MAX_LEN, tag_to_index=None):
    """Expand word-level labels to sub-word level after tokenization.

    Every sentence is tokenized with the module-level ``tokenizer`` (without
    special tokens). The first sub-word of a word keeps the word's label.
    Each following sub-word of the same word gets the matching ``I-`` label
    when the word's label is a ``B-`` label, and the word's own label
    otherwise.

    The B->I correspondence is looked up by label *name* in ``tag_to_index``.
    The earlier hard-coded rule "indices 2, 4, 6, 8 plus one" only holds when
    labels happen to be numbered as consecutive B/I pairs, which is not
    guaranteed because indices follow the order of appearance in the data.

    Word/sub-word alignment relies on ``BatchEncoding.word_ids``, which
    requires a fast tokenizer.

    Args:
        original_sentences: List of sentences, each a list of words.
        original_labels: List of label-index lists, parallel to
            ``original_sentences``.
        MAX_LEN: Current maximum sequence length.
        tag_to_index: Mapping from label name to index. Defaults to the
            module-level ``tag_index``.

    Returns:
        ``(max_len, aligned)`` where ``max_len`` is the larger of ``MAX_LEN``
        and the longest tokenized sentence, and ``aligned`` holds one list of
        label indices per sentence, one entry per sub-word token.
    """
    if tag_to_index is None:
        tag_to_index = tag_index

    # Index of every B-<type> label -> index of its I-<type> label. When no
    # I- label exists for a type, the B- index is kept.
    continuation_label = {}
    for label_name, label_idx in tag_to_index.items():
        if label_name.startswith('B-'):
            continuation_label[label_idx] = tag_to_index.get('I-' + label_name[2:], label_idx)

    if len(original_sentences) == 0:
        return MAX_LEN, []

    tokenized_input = tokenizer(original_sentences, is_split_into_words=True, add_special_tokens=False)
    pad_label = tag_to_index.get('PAD', 0)

    aligned_global = []
    longest = 0
    for nr_sentence in range(len(original_sentences)):
        word_labels = original_labels[nr_sentence]
        # word_ids maps each token to the index of the word it comes from.
        word_ids = tokenized_input.word_ids(batch_index=nr_sentence)
        longest = max(longest, len(word_ids))

        aligned_labels = []
        previous_word = None
        for word_id in word_ids:
            if word_id is None:
                # Special token (not expected with add_special_tokens=False).
                aligned_labels.append(pad_label)
            else:
                label = word_labels[word_id]
                if word_id == previous_word:
                    # Continuation sub-word: B-X becomes I-X, others unchanged.
                    label = continuation_label.get(label, label)
                aligned_labels.append(label)
            previous_word = word_id
        aligned_global.append(aligned_labels)

    return max(longest, MAX_LEN), aligned_global

# Align the word-level labels of both splits with the tokenizer's sub-word tokens.
print('Aligning labels...')
train_max_len, aligned_labels_train = align_labels(sentences_train, labels_train, MAX_LEN)
test_max_len, aligned_labels_test = align_labels(sentences_test, labels_test, MAX_LEN)
print('Labels aligned!')
# Sequence length used for padding below: the larger of the two values returned above.
CORRECTED_MAX_LEN = max(train_max_len, test_max_len)
print('New defined MAX_LEN after tokenization is: ', CORRECTED_MAX_LEN)


Aligning labels...
Labels aligned!
New defined MAX_LEN after tokenization is:  469


The preprocessing of the data finishes by padding the aligned labels to the maximum length and tokenizing the words.



In [None]:
# Pad the aligned label sequences to CORRECTED_MAX_LEN, appending the PAD index at the end ("post").
final_train_labels = pad_sequences(maxlen=CORRECTED_MAX_LEN, sequences=aligned_labels_train, padding="post", value=tag_index["PAD"]).tolist()
# final_dev_labels = pad_sequences(maxlen=CORRECTED_MAX_LEN, sequences=aligned_labels_dev, padding="post", value=tag_index["PAD"]).tolist()
final_test_labels = pad_sequences(maxlen=CORRECTED_MAX_LEN, sequences=aligned_labels_test, padding="post", value=tag_index["PAD"]).tolist()
print('Labels padded')

# Tokenize the inputs, padding/truncating them to CORRECTED_MAX_LEN as well.
# Special tokens are not added, matching the settings used when aligning the labels.
# train dataset
tokenized_input_train = tokenizer(sentences_train, truncation=True, max_length=CORRECTED_MAX_LEN, padding='max_length', is_split_into_words=True, add_special_tokens=False)
print('\nTrain dataset:\n\tsentences lenght: {}.\n\tlabels lenght: {}.\n\tinput_ids length: {}.'.format(len(sentences_train), len(labels_train), len(tokenized_input_train['input_ids'])))

# # dev dataset
# tokenized_input_dev = tokenizer(sentences_dev, truncation=True, max_length=CORRECTED_MAX_LEN, padding='max_length', is_split_into_words=True, add_special_tokens=False)
# print('\nDev dataset:\n\tsentences lenght: {}.\n\tlabels lenght: {}.\n\tinput_ids length: {}.'.format(len(sentences_dev), len(labels_dev), len(tokenized_input_dev['input_ids'])))

# test dataset
tokenized_input_test = tokenizer(sentences_test, truncation=True, max_length=CORRECTED_MAX_LEN, padding='max_length', is_split_into_words=True, add_special_tokens=False)
print('\nTest dataset:\n\tsentences lenght: {}.\n\tlabels lenght: {}.\n\tinput_ids length: {}.'.format(len(sentences_test), len(labels_test), len(tokenized_input_test['input_ids'])))

Labels padded

Train dataset:
	sentences lenght: 4461.
	labels lenght: 4461.
	input_ids length: 4461.

Test dataset:
	sentences lenght: 4394.
	labels lenght: 4394.
	input_ids length: 4394.


**4. Definition of the model**

For reproducibility reasons, we fix a seed for PyTorch and convert all the data into tensors as the model requires.

In [None]:
import torch
# Fix the random seeds (PyTorch CPU, PyTorch CUDA, NumPy) for reproducibility.
torch.manual_seed(42)
torch.cuda.manual_seed(42)
np.random.seed(42)
torch.backends.cudnn.deterministic=True
batch_size = 32 # 64

# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(tokenized_input_train["input_ids"])
train_masks = torch.tensor(tokenized_input_train["attention_mask"])
train_labels = torch.tensor(final_train_labels)

# validation_inputs = torch.tensor(tokenized_input_dev["input_ids"])
# validation_masks = torch.tensor(tokenized_input_dev["attention_mask"])
# validation_labels = torch.tensor(final_dev_labels)

test_inputs = torch.tensor(tokenized_input_test["input_ids"])
test_masks = torch.tensor(tokenized_input_test["attention_mask"])
test_labels = torch.tensor(final_test_labels)

# Checking outputs
print('Train tensor shapes:')
print('Inputs: ', train_inputs.shape)
print('Masks: ', train_masks.shape)
print('Labels: ', train_labels.shape)
# print('\nValidation tensor shapes:')
# print('Inputs: ', validation_inputs.shape)
# print('Masks: ', validation_masks.shape)
# print('Labels: ', validation_labels.shape)
print('\nTest tensor shapes:')
print('Inputs: ', test_inputs.shape)
print('Masks: ', test_masks.shape)
print('Labels: ', test_labels.shape)

# Create an iterator of our data with torch DataLoader.
# NOTE(review): this rebinds `train_data`, which earlier cells use for the
# training-text DataFrame; re-running those cells after this one will fail.
# The dataset is permuted once here and the loader uses shuffle=False, so every
# epoch iterates the batches in the same order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
shuffled_train_data = torch.utils.data.Subset(train_data, torch.randperm(len(train_data)).tolist())
train_dataloader = DataLoader(shuffled_train_data, batch_size=batch_size, shuffle=False)

# validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
# shuffled_validation_data = torch.utils.data.Subset(validation_data, torch.randperm(len(validation_data)).tolist())
# validation_dataloader = DataLoader(shuffled_validation_data, batch_size=len(labels_dev), shuffle=False)

test_batch_size = batch_size  # Adjust based on available memory

# The test set is wrapped the same way (one fixed random permutation, shuffle=False).
test_data = TensorDataset(test_inputs, test_masks, test_labels)
shuffled_test_data = torch.utils.data.Subset(test_data, torch.randperm(len(test_data)).tolist())
test_dataloader = DataLoader(shuffled_test_data, batch_size=test_batch_size, shuffle=False)

Train tensor shapes:
Inputs:  torch.Size([4461, 469])
Masks:  torch.Size([4461, 469])
Labels:  torch.Size([4461, 469])

Test tensor shapes:
Inputs:  torch.Size([4394, 469])
Masks:  torch.Size([4394, 469])
Labels:  torch.Size([4394, 469])


Here, an example of the evaluation report is provided. For this, we use the seqeval library along with the labels or tags we desire to compute.

In [None]:
# from sklearn_crfsuite.metrics import flat_classification_report
from seqeval.metrics.sequence_labeling import get_entities
from seqeval.metrics import classification_report, accuracy_score
from seqeval.scheme import IOB2



# Label names as an array, in tag_index insertion order. Because 'PAD' was
# inserted last, label index l (1..N) is found at position l-1.
tags_metrics = np.array(list(tag_index.keys()))

# Example of metrics
_labels = final_test_labels[-10:]  # Select the last 10 sentences from the true labels
_labels_output = [
    [l for l in sentence if l != 0]  # Filter out padding tokens (the PAD index is 0)
    for sentence in _labels
]
_converted = [
    [tags_metrics[l-1] for l in sentence if l != 0]  # Convert numeric labels to tag names
    for sentence in _labels
]

# Generate the classification report using seqeval; the true labels are also
# passed as predictions, so this only demonstrates the report format.
_report = classification_report(y_true=_converted, y_pred=_converted, scheme=IOB2)


# print('Tags: {}\nEntities in tag: {}'.format(tags_metrics, get_entities(tags_metrics)))
# print()
# print('Nr of sentences with labels: {}\nExample of labels: {}\nConverted labels: {}'.format(len(_labels), _labels_output, _converted))
# print()
# print(_report)

**5. Fine tuning the model and training**

In [None]:
# BERT fine-tuning parameters
param_optimizer = list(model.named_parameters())
# Parameters excluded from weight decay: biases and LayerNorm weights.
# 'gamma'/'beta' are kept for checkpoints that still use the old LayerNorm
# parameter names; 'LayerNorm.weight' covers the current naming.
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
# The per-group key must be 'weight_decay': that is the option name BertAdam
# defines. With 'weight_decay_rate' the setting is not picked up and both
# groups fall back to the optimizer's default decay.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# NOTE(review): `warmup` is given without `t_total`; presumably the warmup
# schedule is inactive in that case — confirm against the BertAdam documentation.
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

# Token-level accuracy of the predictions, ignoring padded positions
def flat_accuracy(predictions, labels):
    """Return the accuracy of ``predictions`` against ``labels`` as a float.

    ``predictions`` holds per-class scores; the predicted class is the argmax
    over axis 2. Positions whose true label is 0 (padding) are left out. The
    remaining indices are converted to tag names through ``tags_metrics``
    before being scored.
    """
    predicted_ids = np.argmax(predictions, axis=2).flatten()
    true_ids = labels.flatten()
    # keep only the (prediction, label) pairs that are not padding
    kept_pairs = [(p, l) for p, l in zip(predicted_ids, true_ids) if l != 0]
    valid_predictions = [tags_metrics[p - 1] for p, _ in kept_pairs]
    valid_flags = [tags_metrics[l - 1] for _, l in kept_pairs]
    return accuracy_score(y_true=valid_flags, y_pred=valid_predictions)

# Function to compute the seqeval report of our predictions vs labels
def compute_nn_metrics(predictions, labels, tags=tags_metrics, entity_level=False, imbalanced=False):
    """Compute a seqeval classification report for a batch of predictions.

    Args:
        predictions: Per-class scores; the predicted class is the argmax over
            axis 2.
        labels: True label indices; positions equal to 0 (padding) are ignored.
        tags: Sequence of tag names in which label index ``l`` is found at
            position ``l - 1``. Defaults to ``tags_metrics``. The argument is
            now honoured; previously the global was always used.
        entity_level: When true, return the text report requested with
            ``scheme=IOB2``; otherwise return the report as a dict.
        imbalanced: Accepted for backward compatibility; currently unused.

    Returns:
        The value returned by ``classification_report`` (str or dict).
    """
    # Flatten predictions and labels
    pred_flat = np.argmax(predictions, axis=2).flatten()
    labels_flat = labels.flatten()

    # Remove padding and convert indices to tag names
    kept_pairs = [(p, l) for (p, l) in zip(pred_flat, labels_flat) if l != 0]
    # All remaining tokens are passed to seqeval as one single sequence
    valid_predictions = [[tags[p-1] for (p, _) in kept_pairs]]
    valid_labels = [[tags[l-1] for (_, l) in kept_pairs]]

    if entity_level:
        return classification_report(valid_labels, valid_predictions, scheme=IOB2, zero_division=0)
    else:
        return classification_report(valid_labels, valid_predictions, zero_division=0, output_dict=True)
# Release cached GPU memory before training starts.
torch.cuda.empty_cache()
# Store our loss and accuracy for plotting
train_loss_set = []
eval_accuracy_set = []
# Number of training epochs
epochs = 9

# NOTE(review): no training has run at this point; the message below is printed before the training loop.
print('Model finetuned!\nNr of epochs to use: {}'.format(epochs))



Model finetuned!
Nr of epochs to use: 9


In [None]:
# Move the model to the selected device (GPU when available)
model.to(device)

# BERT training loop: one pass over train_dataloader per epoch
for _ in trange(epochs, desc="Epoch"):

    ## TRAINING

    # Set our model to training mode
    model.train()
    # Tracking variables (accumulated over the batches of this epoch)
    tr_loss, train_accuracy = 0, 0
    nb_train_steps = 0
    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Move the batch tensors to the selected device
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass; passing `labels` makes the model return a loss as well
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits
        # Per-batch loss, kept for the loss plot
        train_loss_set.append(loss.item())
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        # Update tracking variables (accuracy is computed on CPU copies)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_train_accuracy = flat_accuracy(logits, label_ids)
        train_accuracy += tmp_train_accuracy
        tr_loss += loss.item()
        nb_train_steps += 1
    # Epoch summary: mean loss and mean per-batch accuracy
    print("\nTrain loss: {}".format(tr_loss/nb_train_steps))
    print("Total Train Accuracy: {}".format(train_accuracy/nb_train_steps))


Epoch:   0%|          | 0/9 [00:02<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 324.00 MiB. GPU 

In [None]:
# Plot the training loss recorded for every batch (train_loss_set) on a single axes.
results_figure = plt.figure(figsize=(18,9))
ax_train = results_figure.add_subplot(1,1, 1)
results_figure.suptitle('BioBERT Results')
ax_train.set_title("Training loss evolution")
ax_train.set_xlabel("Batch")
ax_train.set_ylabel("Loss")
ax_train.plot(train_loss_set)


In [None]:
## TEST

# Put model in evaluation mode
model.eval()

# Collect the outputs of every batch so that the metrics cover the whole test
# set. Computing the metrics inside the loop overwrote them on each iteration,
# so the printed reports only described the last batch.
all_logits = []
all_label_ids = []
for batch in test_dataloader:
    # Move the batch tensors to the selected device
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up evaluation
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    # Keep CPU copies of the logits and labels of this batch
    all_logits.append(outputs.logits.detach().cpu().numpy())
    all_label_ids.append(b_labels.to('cpu').numpy())

if not all_logits:
    raise ValueError('test_dataloader produced no batches; nothing to evaluate.')

# Stack the batches along the sentence axis
logits = np.concatenate(all_logits, axis=0)
label_ids = np.concatenate(all_label_ids, axis=0)

test_metrics = compute_nn_metrics(logits, label_ids)
test_metrics_imbalanced = compute_nn_metrics(logits, label_ids, imbalanced=True)
test_metrics_per_entity = compute_nn_metrics(logits, label_ids, entity_level=True)
print('Metrics report in Test (with "O" class):\n{}'.format(test_metrics))
print('Metrics report in Test (w/o "O" class):\n{}'.format(test_metrics_imbalanced))
print('Metrics report in Test per entity:\n{}'.format(test_metrics_per_entity))