# GenoVarDis: NER in Genomic Variants and related Diseases

In [1]:
# Choose the project root: a Google Drive folder when running inside Colab, the current directory otherwise.
if 'google.colab' in str(get_ipython()):
    print('Running on CoLab')
    from google.colab import drive
    #drive.flush_and_unmount()
    # Mount Google Drive; `root` below points inside the mounted drive
    drive.mount('/content/drive')
    root = '/content/drive/My Drive/Colab Notebooks'
else:
    print('Not running on CoLab')
    # Local run: project paths are built relative to the working directory
    root = './'

# NOTE: this prints the selected project root, not the process working directory
print("Current directory: {}".format(root))

Running on CoLab
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Current directory: /content/drive/My Drive/Colab Notebooks


**1. Getting the resources**

In [2]:
%%capture
# Install the third-party packages used by the notebook (%%capture hides the installer output).
# NOTE(review): no versions are pinned, so a fresh run may resolve to different releases.
!pip install pytorch_transformers
!pip install transformers
!pip install seqeval
!pip install sklearn-crfsuite
!pip install spacy
# Spanish spaCy pipeline, loaded later with spacy.load("es_core_news_sm")
!python -m spacy download es_core_news_sm

**2. Data loading and formatting**

Definiendo rutas de datos:

In [3]:
# Notebook configuration: language, entity-type switch, spaCy pipeline and project folders.
LANG = 'es'  # This system can also process texts written in English (LANG='en')
allTypes = True
sTypes = ''
if allTypes:
    sTypes = '_all'  # suffix selected when all entity types are used
import spacy
from spacy.lang.es.examples import sentences

# NOTE(review): the Spanish pipeline is loaded regardless of the value of LANG
nlp = spacy.load("es_core_news_sm")


# Data paths
path_data = root + '/ner/data/genovardis_train_dev/' # Folder that holds the datasets
path_models = root + '/ner/models/{}/'.format(LANG) # Folder where models are saved
checkpoints = root + '/ner/checkpoints/' # Folder for checkpoints
path_scores = root + '/ner/scores/{}/'.format(LANG) # Folder where scores are saved
path_outputs = root + '/ner/outputs/{}/'.format(LANG) # Folder where outputs are saved

Carga y formateo de los datos:

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
import transformers
from transformers import BertForTokenClassification, AdamW

# Load the train / dev / test tables
def _read_tsv(file_name):
    """Read one tab-separated file from `path_data`, using its first column as the index."""
    return pd.read_csv(path_data + file_name, sep='\t', index_col=0)

train_data = _read_tsv('train_text.tsv')
train_annotation = _read_tsv('train_annotation.tsv')
dev_data = _read_tsv('dev_text.tsv')
dev_annotation = _read_tsv('dev_annotation.tsv')
test_data = _read_tsv('test_text.tsv')

In [5]:
# Label statistics of the development annotations: absolute counts first, then percentages
print('DEV:\n', dev_annotation['label'].value_counts(), sep='\n', end='\n')
print()
print('Percentages:', dev_annotation['label'].value_counts(normalize=True)*100, sep='\n', end='\n')

DEV:

label
Disease                        588
Gene                           550
DNAMutation                    103
OtherMutation                   53
SNP                             15
DNAAllele                       12
NucleotideChange-BaseChange     11
Transcript                       1
Name: count, dtype: int64

Percentages:
label
Disease                        44.111028
Gene                           41.260315
DNAMutation                     7.726932
OtherMutation                   3.975994
SNP                             1.125281
DNAAllele                       0.900225
NucleotideChange-BaseChange     0.825206
Transcript                      0.075019
Name: proportion, dtype: float64


In [6]:
# General statistics on the number of words in each text
train_data['word_count'] = train_data['text'].apply(lambda x: len(x.split())) # Split the text on whitespace and count the words
count_df_train = train_data.groupby('text')['word_count'].max() # Largest word count per distinct text (rows are grouped by the text itself)
statistics_train = count_df_train.describe() # Descriptive statistics of the word counts of the training texts
print('\nAlgunas estadísticas de las oraciones en el conjunto de entrenamiento:')
print(statistics_train)

dev_data['word_count'] = dev_data['text'].apply(lambda x: len(x.split())) # Split the text on whitespace and count the words
count_df_dev = dev_data.groupby('text')['word_count'].max() # Largest word count per distinct text (rows are grouped by the text itself)
statistics_dev = count_df_dev.describe() # Descriptive statistics of the word counts of the development texts
print('\nAlgunas estadísticas de las oraciones en el conjunto de desarrollo:')
print(statistics_dev)

# Length of the longest text. The length is the number of whitespace-separated words.
MAX_LEN_TRAIN = int(statistics_train['max']) # Maximum length in the training set
MAX_LEN_DEV = int(statistics_dev['max']) # Maximum length in the development set
MAX_LEN = max(MAX_LEN_TRAIN, MAX_LEN_DEV) # Overall maximum length
print('\n')
print('La longitud máxima de las oraciones en ENTRENAMIENTO es: ', MAX_LEN_TRAIN)
print('La longitud máxima de las oraciones en DESARROLLO es: ', MAX_LEN_DEV)
print('La longitud máxima de las oraciones en TOTAL es:', MAX_LEN)


Algunas estadísticas de las oraciones en el conjunto de entrenamiento:
count    427.000000
mean     276.784543
std       65.877967
min       78.000000
25%      239.000000
50%      279.000000
75%      315.000000
max      469.000000
Name: word_count, dtype: float64

Algunas estadísticas de las oraciones en el conjunto de desarrollo:
count     70.000000
mean     287.457143
std       59.899599
min      114.000000
25%      257.000000
50%      289.500000
75%      314.500000
max      438.000000
Name: word_count, dtype: float64


La longitud máxima de las oraciones en ENTRENAMIENTO es:  469
La longitud máxima de las oraciones en DESARROLLO es:  438
La longitud máxima de las oraciones en TOTAL es: 469


## Formatting the data and adding POS tags


First, we tag each word of the training and development texts with its IOB tag.


Once the words are tagged, we remove the PMID prefix that appears at the beginning of each paper's text.

In [7]:
# Function to tag each character with its annotation
def tag_characters(text, annotations):
    """Label every character of `text` with the annotation that covers it.

    Args:
        text: The document text.
        annotations: DataFrame with the columns `offset1` (start, inclusive),
            `offset2` (end, exclusive) and `label`.

    Returns:
        A list with one entry per character of `text`: the annotation label,
        or 'O' where no annotation applies. When annotations overlap, the
        later row wins.
    """
    n_chars = len(text)
    tags = ['O'] * n_chars
    for start, end, label in zip(annotations['offset1'], annotations['offset2'], annotations['label']):
        # Clamp to the text: an end offset past the text used to raise IndexError
        # and a negative start silently wrapped around to the end of the list.
        start = max(int(start), 0)
        end = min(int(end), n_chars)
        if start < end:
            tags[start:end] = [label] * (end - start)
    return tags

# Apply function to tag characters in each text
def apply_char_tags(row, train_annotation):
    """Compute the per-character tags of one document row.

    `row.name` is used as the document id: the rows of `train_annotation`
    whose index equals it are the annotations applied to `row['text']`.
    """
    belongs_to_doc = train_annotation.index == row.name
    return tag_characters(row['text'], train_annotation[belongs_to_doc])

# Attach the per-character tags to every training and development document
train_data['char_tags'] = train_data.apply(apply_char_tags, axis=1, train_annotation=train_annotation)
dev_data['char_tags'] = dev_data.apply(apply_char_tags, axis=1, train_annotation=dev_annotation)

# (A verbatim second copy of tag_characters / apply_char_tags and of the two
# `char_tags` assignments used to live here; it recomputed the same columns
# with the same code, so it was removed.)

# Function to map character annotations to spaCy tokens
def map_char_annotations_to_tokens(doc, char_tags):
    """Give each token the tag carried by most of its characters.

    Args:
        doc: Iterable of tokens exposing `idx` (character offset of the token
            start) and `len(token)`; a spaCy Doc in this notebook.
        char_tags: List with one label per character of the tokenized text.

    Returns:
        A list with one label per token. A token whose character range lies
        outside `char_tags` gets 'O'. When several labels are equally frequent
        inside a token, the one that appears first in the token is chosen.
    """
    from collections import Counter

    token_annotations = []
    for token in doc:
        start = token.idx
        span_tags = char_tags[start:start + len(token)]
        if span_tags:
            # Counter keeps first-seen order among equal counts, so ties are
            # resolved deterministically (max(set(...)) depended on set order).
            token_annotations.append(Counter(span_tags).most_common(1)[0][0])
        else:
            token_annotations.append('O')
    return token_annotations

# Function to convert word tags to IOB tags
def convert_to_iob_tags(word_tags):
    """Turn a sequence of plain word labels into IOB tags.

    'O' stays 'O'. Any other label becomes 'B-<label>' when it differs from
    the label of the previous word and 'I-<label>' when it repeats it.
    """
    iob_tags = []
    last_seen = 'O'
    for current in word_tags:
        if current == 'O':
            encoded = 'O'
        else:
            prefix = 'I' if current == last_seen else 'B'
            encoded = f'{prefix}-{current}'
        iob_tags.append(encoded)
        last_seen = current
    return iob_tags

global_sentence_counter = 1
# Function to process text to include IOB tags
def process_text_with_iob(text, doc_id, char_tags):
    """Tokenize one document and return one record per token with its IOB tag.

    The text is parsed with the notebook-level `nlp` pipeline. Every sentence
    found by the pipeline receives the current value of the module-level
    `global_sentence_counter`, which is then advanced by one.

    Returns:
        A list of dicts with the keys 'doc_id', 'Global Sentence #', 'Word'
        and 'IOB', in document order.
    """
    global global_sentence_counter
    parsed = nlp(text)
    iob_tags = convert_to_iob_tags(map_char_annotations_to_tokens(parsed, char_tags))
    n_tags = len(iob_tags)

    records = []
    for sentence in parsed.sents:
        for token in sentence:
            # One record is appended per token, so the number of records so far
            # is the running index of this token within the document.
            position = len(records)
            records.append({
                'doc_id': doc_id,
                'Global Sentence #': global_sentence_counter,
                'Word': token.text,
                'IOB': iob_tags[position] if position < n_tags else 'O',
            })
        global_sentence_counter += 1
    return records

# Build the token-level records (with IOB tags) of every training document
train_iob_data = []
for doc_id, text, char_tags in zip(train_data.index, train_data['text'], train_data['char_tags']):
    train_iob_data.extend(process_text_with_iob(text, doc_id, char_tags))

# Same for the development documents (the global sentence counter keeps counting)
dev_iob_data = []
for doc_id, text, char_tags in zip(dev_data.index, dev_data['text'], dev_data['char_tags']):
    dev_iob_data.extend(process_text_with_iob(text, doc_id, char_tags))


# One DataFrame row per token
train_iob_df = pd.DataFrame(train_iob_data)
dev_iob_df = pd.DataFrame(dev_iob_data)



Remove the document-id prefix from the first word of each document.

In [8]:
import re

def clean_text(text):
    """Strip a leading '<digits>|t|' or '<digits>|a|' marker from `text`."""
    return re.sub(r'^\d+\|[ta]\|', '', text)

# Apply the cleaning to the first word of each document
def clean_first_word(df):
    """Run `clean_text` on the 'Word' of the first row of every 'doc_id'.

    The frame is modified in place and also returned.
    """
    doc_ids = df['doc_id']
    for current_id in doc_ids.unique():
        first_label = df.index[doc_ids == current_id][0]
        df.at[first_label, 'Word'] = clean_text(df.at[first_label, 'Word'])
    return df

def clean_abstract(word):
    """Strip a leading '<digits>|a|' marker from `word`."""
    return re.sub(r'^\d+\|a\|', '', word)


# Apply the function to the DataFrames
train_iob_df = clean_first_word(train_iob_df)
# NOTE(review): the result is bound to `dev_iob_data` (until now the list of token records),
# not to `dev_iob_df`. clean_first_word edits and returns the frame it receives, so from here
# on both names refer to the same cleaned development DataFrame.
dev_iob_data = clean_first_word(dev_iob_df)
# # Apply the cleaning function to the 'Word' column in iob_df
# iob_df['Word'] = iob_df['Word'].apply(clean_abstract)

POS Tagging

In [9]:
# Function to add POS tags to the words
def add_pos_tags(df):
    """Add a 'POS' column with the spaCy coarse part-of-speech tag of every word.

    The words of each 'doc_id' are handed to the notebook-level `nlp` pipeline
    as an already tokenized document, so exactly one tag comes back per row.
    (Joining the words with spaces and tokenizing again can produce a different
    number of tokens, which shifts the tags or makes the assignment fail.)

    Args:
        df: DataFrame with the columns 'doc_id' and 'Word'. Modified in place.

    Returns:
        The same DataFrame, with 'POS' filled in. Rows whose word is an empty
        string keep an empty tag.
    """
    from spacy.tokens import Doc  # local import: only needed by this function

    df['POS'] = ''
    for doc_id in df['doc_id'].unique():
        doc_rows = df[df['doc_id'] == doc_id]
        # An empty string is not a token; skip it and leave its POS as ''
        non_empty = doc_rows[doc_rows['Word'] != '']
        if non_empty.empty:
            continue
        # Build the Doc from the existing tokens and run every pipeline
        # component on it, bypassing the tokenizer.
        doc = Doc(nlp.vocab, words=non_empty['Word'].tolist())
        for _, component in nlp.pipeline:
            doc = component(doc)
        df.loc[non_empty.index, 'POS'] = [token.pos_ for token in doc]
    return df

import string
# Apply the function to add POS tags
new_train_df = add_pos_tags(train_iob_df)
# Use the development DataFrame under its own name. `dev_iob_data` only refers to that same
# frame because the cleaning cell rebinds the former list name to it.
new_dev_df = add_pos_tags(dev_iob_df)

def clean_word(word):
    """Return `word` without ASCII punctuation and without surrounding whitespace."""
    punctuation_free = ''.join(ch for ch in word if ch not in string.punctuation)
    return punctuation_free.strip()

# Strip punctuation from every word of the training and development frames
new_train_df.loc[:, 'Word'] = new_train_df['Word'].map(clean_word)
new_dev_df.loc[:, 'Word'] = new_dev_df['Word'].map(clean_word)

# Discard the rows whose word became empty after cleaning
new_train_df = new_train_df[new_train_df['Word'].ne('')]
new_dev_df = new_dev_df[new_dev_df['Word'].ne('')]

Process **test data**

In [10]:
# Initialize a global sentence counter
global_sentence_counter = 1

def process_test_text_offsets(text, doc_id):
    """Tokenize one test document and return one record per token with its character offsets.

    The leading document marker is removed with `clean_text` before parsing
    with the notebook-level `nlp` pipeline. The number of removed characters is
    added back to every token offset, so 'offset1'/'offset2' refer to positions
    in the original `text`. Each sentence gets a per-document number (starting
    at 1) and the current value of the module-level `global_sentence_counter`,
    which is advanced once per sentence.

    Returns:
        A list of dicts with the keys 'doc_id', 'Sentence # in the paper',
        'Global Sentence #', 'Word', 'offset1' and 'offset2'.
    """
    global global_sentence_counter

    cleaned_text = clean_text(text)
    # Characters stripped from the front of the text
    shift = len(text) - len(cleaned_text)

    records = []
    for sentence_number, sent in enumerate(nlp(cleaned_text).sents, start=1):
        for token in sent:
            begin = token.idx + shift
            records.append({
                'doc_id': doc_id,
                'Sentence # in the paper': sentence_number,
                'Global Sentence #': global_sentence_counter,
                'Word': token.text,
                'offset1': begin,
                'offset2': begin + len(token),
            })
        global_sentence_counter += 1

    return records

# Build the token-level records (with character offsets) of every test document
test_data_processed = []
for doc_id, text in zip(test_data.index, test_data['text']):
    test_data_processed.extend(process_test_text_offsets(text, doc_id))

# One DataFrame row per token, then add the POS tags
test_df = add_pos_tags(pd.DataFrame(test_data_processed))

# Same word cleaning as for train/dev: strip punctuation, drop rows left empty
test_df.loc[:, 'Word'] = test_df['Word'].map(clean_word)
test_df = test_df[test_df['Word'].ne('')]

**3. Imports and pre-processing of the data**

First, we import the packages needed to use the model. After that, the data must be pre-processed, because the BERT model has specific input requirements.

In [11]:
%%capture
# Import a few necessary packages and set the DATA directory
DATA_DIR="."
import os
import numpy as np
import pickle
import tensorflow as tf
import warnings
# Silence DeprecationWarning messages from here on
warnings.filterwarnings("ignore", category=DeprecationWarning)

# install BERT
# NOTE(review): the packages are installed without version pins
!pip install pytorch_pretrained_bert pytorch-nlp

# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
# from pytorch_pretrained_bert import BertTokenizer, BertConfig, BertForSequenceClassification
from pytorch_pretrained_bert import BertAdam
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt


# specify GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Query the GPU name only when a GPU exists: get_device_name(0) raises on CPU-only machines
if torch.cuda.is_available():
    torch.cuda.get_device_name(0)

We need to transform all the pre-defined entities (labels) to a language that can be interpreted by the model, i.e., numbers. To do that, we create a dictionary with the desired labels following the IOB scheme.

In [12]:
# Label dictionary for the IOB tags: tag -> integer id.
# Ids start at 1, in order of first appearance in the training data;
# id 0 is reserved for the padding label.
tags = new_train_df["IOB"].unique()
tag_index = dict(zip(tags, range(1, len(tags) + 1)))
tag_index["PAD"] = 0
num_tags = len(tag_index)
print('Dictionary for labels:', tag_index)
print('Number of tags added the tag for pad tokens:', num_tags)

Dictionary for labels: {'O': 1, 'B-Gene': 2, 'B-Disease': 3, 'I-Disease': 4, 'B-NucleotideChange-BaseChange': 5, 'I-NucleotideChange-BaseChange': 6, 'B-DNAMutation': 7, 'I-DNAMutation': 8, 'I-Gene': 9, 'B-OtherMutation': 10, 'I-OtherMutation': 11, 'B-DNAAllele': 12, 'I-DNAAllele': 13, 'B-SNP': 14, 'B-Transcript': 15, 'I-SNP': 16, 'PAD': 0}
Number of tags added the tag for pad tokens: 17


Next, we perform the first preprocessing step. A `SentenceGetter` class and a `vectorization` function are defined to extract the sentences (and, when available, their labels) from the input data.

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

class SentenceGetter(object):
    """Group a token-level DataFrame into sentences.

    Rows are grouped by the 'Global Sentence #' column. Each sentence is a list
    with one tuple per token: (word, pos, tag) for labelled data, or
    (word, pos) when `is_test` is True. The sentences are available in
    group-key order through `self.sentences`, or one at a time via `get_text`.
    """
    def __init__(self, df, is_test=False):
        self.n_sent = 1  # 1-based number of the sentence get_text() returns next
        self.df = df
        self.is_test = is_test
        self.empty = False
        if not is_test:
            agg = lambda s : [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                           s['POS'].values.tolist(),
                                                           s['IOB'].values.tolist())]
        else:
            agg = lambda s : [(w, p) for w, p in zip(s['Word'].values.tolist(),
                                                     s['POS'].values.tolist())]
        self.grouped = self.df.groupby("Global Sentence #").apply(agg)
        self.sentences = [s for s in self.grouped]

    def get_text(self):
        """Return the next sentence, or None once every sentence has been returned.

        Sentence ids of the form 'Sentence: N' are looked up by that key. For
        any other ids (the numeric ids used in this notebook) the sentences are
        returned by position, in the order of `self.sentences`; the keyed lookup
        alone never matched numeric ids and always produced None.
        """
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
        except KeyError:
            if self.n_sent > len(self.sentences):
                return None
            s = self.sentences[self.n_sent - 1]
        self.n_sent += 1
        return s

def vectorization(df, tag_index, is_test=False):
    """Split a token-level DataFrame into per-sentence word lists and label-id lists.

    Args:
        df: DataFrame with 'Global Sentence #', 'Word' and 'POS' columns, plus
            'IOB' unless `is_test` is True.
        tag_index: Mapping from IOB tag to integer id.
        is_test: True for unlabelled data.

    Returns:
        A tuple (X, y): X holds one list of words per sentence; y holds the
        matching lists of label ids, or None when `is_test` is True.
    """
    wanted_columns = ['Global Sentence #', 'Word', 'POS']
    if not is_test:
        wanted_columns.append('IOB')

    # Each sentence is a list of (word, pos[, tag]) tuples
    sentences = SentenceGetter(df[wanted_columns], is_test=is_test).sentences

    X = [[entry[0] for entry in sentence] for sentence in sentences]
    y = None if is_test else [[tag_index[entry[2]] for entry in sentence] for sentence in sentences]

    return (X, y)

# Turn the three token tables into per-sentence word lists (plus label ids for train/dev)
sentences_train, labels_train = vectorization(new_train_df, tag_index)
sentences_dev, labels_dev = vectorization(new_dev_df, tag_index)
sentences_test, labels_test = vectorization(test_df, tag_index, is_test=True)


In [38]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, AutoModelForMaskedLM

# Alternative checkpoints tried during development (kept for reference):
# tokenizer = AutoTokenizer.from_pretrained("StivenLancheros/roberta-base-biomedical-clinical-es-finetuned-ner-CRAFT_AugmentedTransfer_ES")
# model = AutoModelForTokenClassification.from_pretrained("StivenLancheros/roberta-base-biomedical-clinical-es-finetuned-ner-CRAFT_AugmentedTransfer_ES")

#tokenizer = AutoTokenizer.from_pretrained("tner/xlm-roberta-base-uncased-bionlp2004")
#model = AutoModelForTokenClassification.from_pretrained("tner/xlm-roberta-base-uncased-bionlp2004")

# tokenizer = AutoTokenizer.from_pretrained("PlanTL-GOB-ES/roberta-base-biomedical-clinical-es")
# model = AutoModelForMaskedLM.from_pretrained("PlanTL-GOB-ES/roberta-base-biomedical-clinical-es", num_labels=17)

# Multilingual BERT with a token-classification head. The head has one output per
# entry of the label dictionary (padding label included) instead of a hard-coded 17.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertForTokenClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=num_tags,
    output_attentions=False,
    output_hidden_states=False,
)

tokenizer_config.json:   0%|          | 0.00/211 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [39]:
def align_labels(original_sentences, original_labels, MAX_LEN, label_map=None):
    """
    Expand word-level label ids to one label id per sub-word token.

    The first piece of every word keeps the word's label. The remaining pieces
    of a word labelled "B-X" receive the id of "I-X", looked up by name in
    `label_map` (the "B-X" id is kept when no "I-X" entry exists); the remaining
    pieces of any other label repeat that label. The continuation ids used to
    be derived with hard-coded arithmetic (`id + 1` for ids 2, 4, 6, 8), which
    does not match a dictionary built in order of first appearance.

    Args:
        original_sentences: List of sentences, each a list of words.
        original_labels: List of label-id lists, parallel to `original_sentences`.
        MAX_LEN: Current maximum sentence length.
        label_map: Mapping from IOB tag to id. Defaults to the notebook-level
            `tag_index`.

    Returns:
        A tuple (max_len, aligned): `max_len` is the larger of `MAX_LEN` and the
        longest tokenized sentence (kept for padding purposes); `aligned` holds
        one list of label ids per sentence, one id per sub-word token.
    """
    if label_map is None:
        label_map = tag_index
    if not original_sentences:
        return MAX_LEN, []

    # id of every "B-X" -> id of the matching "I-X"
    continuation = {
        idx: label_map.get('I-' + tag[2:], idx)
        for tag, idx in label_map.items()
        if tag.startswith('B-')
    }

    if getattr(tokenizer, 'is_fast', False):
        # Fast tokenizers report the word each token came from
        encoded = tokenizer(original_sentences, is_split_into_words=True, add_special_tokens=False)
        word_ids_global = [encoded.word_ids(batch_index=i) for i in range(len(original_sentences))]
    else:
        # Slow tokenizers encode pre-split input word by word, so the number of
        # pieces of each word gives the token -> word mapping directly.
        word_ids_global = [
            [word_idx for word_idx, word in enumerate(words) for _ in tokenizer.tokenize(word)]
            for words in original_sentences
        ]

    aligned_global = []
    for word_ids, labels in zip(word_ids_global, original_labels):
        aligned_labels = []
        previous_word = None
        for word_idx in word_ids:
            label = labels[word_idx]
            if word_idx == previous_word:
                # Not the first piece of the word: continue the entity
                label = continuation.get(label, label)
            aligned_labels.append(label)
            previous_word = word_idx
        aligned_global.append(aligned_labels)

    longest = max((len(labels) for labels in aligned_global), default=0)
    return max(longest, MAX_LEN), aligned_global

# Align the word-level labels of train and dev with the sub-word tokens
print('Aligning labels...')
train_max_len, aligned_labels_train = align_labels(sentences_train, labels_train, MAX_LEN)
dev_max_len, aligned_labels_dev = align_labels(sentences_dev, labels_dev, MAX_LEN)
print('Labels aligned!')
# The padded length is fixed here rather than taken from train_max_len / dev_max_len.
# NOTE(review): presumably 512 because of BERT's maximum input length — confirm
CORRECTED_MAX_LEN = 512
print('New defined MAX_LEN after tokenization is: ', CORRECTED_MAX_LEN)


Aligning labels...
Labels aligned!
New defined MAX_LEN after tokenization is:  512


The pre-processing of the data finishes by padding the aligned labels to the maximum length and tokenizing the words.

In [40]:
# Padding labels according to corrected MAX_LEN.
# truncating="post" drops the END of over-long label sequences, matching the tokenizer below,
# which also truncates at the end. The pad_sequences default ("pre") would drop the beginning
# and leave the labels shifted relative to the input ids.
final_train_labels = pad_sequences(maxlen=CORRECTED_MAX_LEN, sequences=aligned_labels_train, padding="post", truncating="post", value=tag_index["PAD"]).tolist()
final_dev_labels = pad_sequences(maxlen=CORRECTED_MAX_LEN, sequences=aligned_labels_dev, padding="post", truncating="post", value=tag_index["PAD"]).tolist()

# Tokenize inputs according to corrected MAX_LEN
# train dataset
tokenized_input_train = tokenizer(sentences_train, truncation=True, max_length=CORRECTED_MAX_LEN, padding='max_length', is_split_into_words=True, add_special_tokens=False)
# dev dataset
tokenized_input_dev = tokenizer(sentences_dev, truncation=True, max_length=CORRECTED_MAX_LEN, padding='max_length', is_split_into_words=True, add_special_tokens=False)
# test dataset
tokenized_input_test = tokenizer(sentences_test, truncation=True, max_length=CORRECTED_MAX_LEN, padding='max_length', is_split_into_words=True, add_special_tokens=False)

**4. Definition of the model**

For reproducibility reasons, we fix a seed for PyTorch and convert all the data into tensors as the model requires.

In [43]:
import torch
# Fix the seeds of the PyTorch (CPU and CUDA) and NumPy random generators
torch.manual_seed(42)
torch.cuda.manual_seed(42)
np.random.seed(42)
# Ask cuDNN to use deterministic algorithms
torch.backends.cudnn.deterministic=True
# Number of sentences per batch (64 was an earlier setting)
batch_size = 16 # 64

# Convert all of our data into torch tensors, the required datatype for our model
def _ids_and_masks(encoding):
    """Return the input ids and the attention mask of a tokenizer output as tensors."""
    return torch.tensor(encoding["input_ids"]), torch.tensor(encoding["attention_mask"])

train_inputs, train_masks = _ids_and_masks(tokenized_input_train)
train_labels = torch.tensor(final_train_labels)

dev_inputs, dev_masks = _ids_and_masks(tokenized_input_dev)
dev_labels = torch.tensor(final_dev_labels)

test_inputs, test_masks = _ids_and_masks(tokenized_input_test)


# Create an iterator of our data with torch DataLoader.
# Train and dev are shuffled once with a fixed permutation (the loaders themselves do not reshuffle).
train_data = TensorDataset(train_inputs, train_masks, train_labels)
shuffled_train_data = torch.utils.data.Subset(train_data, torch.randperm(len(train_data)).tolist())
train_dataloader = DataLoader(shuffled_train_data, batch_size=batch_size, shuffle=False)


dev_data = TensorDataset(dev_inputs, dev_masks, dev_labels)
shuffled_dev_data = torch.utils.data.Subset(dev_data, torch.randperm(len(dev_data)).tolist())
dev_dataloader = DataLoader(shuffled_dev_data, batch_size=batch_size, shuffle=False)

test_batch_size = batch_size  # Adjust based on available memory

test_data = TensorDataset(test_inputs, test_masks)
# The test set must be served in its original order: it has no labels, so predictions can only
# be matched back to the rows of test_df by position. The Subset wrapper (identity order) is kept
# so that the `shuffled_test_data` name and its `.indices` attribute remain available.
shuffled_test_data = torch.utils.data.Subset(test_data, list(range(len(test_data))))
test_dataloader = DataLoader(shuffled_test_data, batch_size=test_batch_size, shuffle=False)

In [44]:
# from sklearn_crfsuite.metrics import flat_classification_report
from seqeval.metrics.sequence_labeling import get_entities
from seqeval.metrics import classification_report, accuracy_score
from seqeval.scheme import IOB2

**5. Fine tuning the model and training**

In [46]:
# Tag names in dictionary order: position i holds the tag whose id is i + 1
# ('PAD', id 0, is last and is therefore reached through index -1).
tags_metrics = np.array(list(tag_index.keys()))
# BERT fine-tuning parameters
param_optimizer = list(model.named_parameters())
# Parameters excluded from weight decay: biases and LayerNorm parameters.
# 'gamma' / 'beta' are the names used by old checkpoints; 'LayerNorm.weight' covers current ones.
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
# BertAdam reads the per-group option 'weight_decay'; the key 'weight_decay_rate' is ignored,
# which left every parameter on the optimizer's default decay.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# NOTE(review): `warmup` only has an effect when `t_total` (total number of optimizer steps)
# is also given — confirm whether a warm-up schedule is intended.
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

# Function to compute the accuracy of our predictions vs labels
def flat_accuracy(predictions, labels):
    """
    Return the token-level accuracy of the network as a float.

    `predictions` are scores whose last axis (axis 2) runs over the label ids;
    the highest-scoring id is the predicted label. Positions whose gold label
    id is 0 are ignored.
    """
    predicted_ids = np.argmax(predictions, axis=2).flatten()
    gold_ids = labels.flatten()
    # Keep only the positions with a non-zero gold label
    kept_pairs = [(p, l) for p, l in zip(predicted_ids, gold_ids) if l != 0]
    valid_predictions = [tags_metrics[p-1] for p, _ in kept_pairs]
    valid_flags = [tags_metrics[l-1] for _, l in kept_pairs]
    return accuracy_score(y_true=valid_flags, y_pred=valid_predictions)

# Function to compute the seqeval metrics of our predictions vs labels
def compute_nn_metrics(predictions, labels, tags=tags_metrics, entity_level=False, imbalanced=False):
    """
    Compute the seqeval classification report of the network.

    All non-padding positions of the batch are evaluated as one single
    sequence. With `entity_level=True` the report is requested with the IOB2
    scheme and returned in seqeval's default (text) form; otherwise it is
    returned as a dict. `tags` and `imbalanced` are accepted but not used.
    """
    predicted_ids = np.argmax(predictions, axis=2).flatten()
    gold_ids = labels.flatten()

    # Drop the positions whose gold label id is 0 (padding)
    kept_pairs = [(p, l) for p, l in zip(predicted_ids, gold_ids) if l != 0]

    # seqeval expects a list of sequences: wrap everything into one sequence
    y_pred = [[tags_metrics[p-1] for p, _ in kept_pairs]]
    y_true = [[tags_metrics[l-1] for _, l in kept_pairs]]

    if not entity_level:
        return classification_report(y_true, y_pred, zero_division=0, output_dict=True)
    return classification_report(y_true, y_pred, scheme=IOB2, zero_division=0)
# Free cached GPU memory before training starts
torch.cuda.empty_cache()

# Number of training epochs
epochs = 8

# Per-step training loss and per-batch validation accuracy, kept for plotting
train_loss_set, eval_accuracy_set = [], []

print('Model finetuned!\nNr of epochs to use: {}'.format(epochs))



Model finetuned!
Nr of epochs to use: 8


In [47]:
# Move the model to the GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# NOTE(review): the recorded run of this cell stopped with
# "CUDA error: device-side assert triggered". That error usually means an
# index was out of range on the GPU (for instance a label id or token id larger
# than the model supports) — presumably the case here; re-run on CPU or with
# CUDA_LAUNCH_BLOCKING=1 to get the exact failing operation.

# BERT training loop
for _ in trange(epochs, desc="Epoch"):

    ## TRAINING

    # Set our model to training mode
    model.train()
    # Tracking variables
    tr_loss, train_accuracy = 0, 0
    nb_train_steps = 0
    # Train the data for one epoch
    for batch in train_dataloader:
        # Add batch to the selected device
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits
        train_loss_set.append(loss.item())
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        # Update tracking variables
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_train_accuracy = flat_accuracy(logits, label_ids)
        train_accuracy += tmp_train_accuracy
        tr_loss += loss.item()
        nb_train_steps += 1
    print("\nTrain loss: {}".format(tr_loss/nb_train_steps))
    print("Total Train Accuracy: {}".format(train_accuracy/nb_train_steps))

    ## VALIDATION

    # Put model in evaluation mode
    model.eval()
    # Tracking variables
    eval_accuracy = 0
    nb_eval_steps = 0
    # Collected per-batch outputs so the report covers the WHOLE validation
    # set rather than just the last batch seen
    epoch_logits, epoch_label_ids = [], []
    # Evaluate data for one epoch
    for batch in dev_dataloader:
        # Add batch to the selected device
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs.logits
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        epoch_logits.append(logits)
        epoch_label_ids.append(label_ids)
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        eval_accuracy_set.append(tmp_eval_accuracy)
        nb_eval_steps += 1
    # One report over every validation example of this epoch
    eval_metrics = compute_nn_metrics(np.concatenate(epoch_logits, axis=0),
                                      np.concatenate(epoch_label_ids, axis=0))
    print('\nMetrics report in Validation:\n{}'.format(eval_metrics))
    print("Total Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
## Make predictions on Validation SET.

# Put model in evaluation mode
model.eval()
# Per-batch outputs, gathered so the reports describe the whole validation set
dev_logits, dev_label_ids = [], []
# Evaluate data for one epoch
for batch in dev_dataloader:
    # Add batch to the selected device
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    # Move logits and labels to CPU
    dev_logits.append(outputs.logits.detach().cpu().numpy())
    dev_label_ids.append(b_labels.to('cpu').numpy())

# Build each report once, over all batches (previously they were rebuilt on
# every batch and only the last batch's reports were printed)
logits = np.concatenate(dev_logits, axis=0)
label_ids = np.concatenate(dev_label_ids, axis=0)
test_metrics = compute_nn_metrics(logits, label_ids)
test_metrics_imbalanced = compute_nn_metrics(logits, label_ids, imbalanced=True)
test_metrics_per_entity = compute_nn_metrics(logits, label_ids, entity_level=True)
# The data evaluated here is the validation split, so the headings say so
print('Metrics report in Validation (with "O" class):\n{}'.format(test_metrics))
print('Metrics report in Validation (w/o "O" class):\n{}'.format(test_metrics_imbalanced))
print('Metrics report in Validation per entity:\n{}'.format(test_metrics_per_entity))

We tried converting the model's results to the format required by IberLEF, but we were not able to do so without errors.

In [None]:
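# NOTE(review): this whole cell is commented out — the export to the IberLEF
# submission format is unfinished. Obstacles visible in the draft below:
#   * `labels_flat[:len(test_df)]` only lines up with the rows of `test_df`
#     if `test_dataloader` yields the examples in their original order and
#     every kept prediction corresponds to exactly one row.
#   * `tags_metrics[p]` is used here, whereas the metric helpers look tags up
#     as `tags_metrics[p-1]`; the two index conventions cannot both be right.
# Initialize list to store predictions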
# predictions = []

# # Evaluate data for one epoch
# for batch in test_dataloader:
#     # Add batch to GPU
#     batch = tuple(t.to(device) for t in batch)
#     # Unpack the inputs from our dataloader
#     b_input_ids, b_input_mask = batch  # Adjusting since we don't have labels for the test set
#     # Telling the model not to compute or store gradients, saving memory and speeding up validation
#     with torch.no_grad():
#         # Forward pass, calculate logit predictions
#         outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
#         logits = outputs.logits
#     # Move logits to CPU
#     logits = logits.detach().cpu().numpy()
#     predictions.extend(logits)

# # Flatten predictions and convert to label names
# pred_flat = np.argmax(predictions, axis=2).flatten()
# labels_flat = [tags_metrics[p] for p in pred_flat if p != tag_index["PAD"]]  # Exclude padding tokens

# # Add predicted labels to the test_df
# test_df['Predicted_Label'] = labels_flat[:len(test_df)]
# # Function to simplify labels
# def simplify_labels(label):
#     if label.startswith('B-') or label.startswith('I-'):
#         return label[2:]
#     return label
# # Apply the function to the 'Predicted_Label' column
# test_df['Predicted_Label'] = test_df['Predicted_Label'].apply(simplify_labels)

# # Initialize lists to store the results
# results = []

# # Group consecutive words with the same label into single entities
# current_label = None
# current_span = []
# current_offset1 = None
# current_offset2 = None

# for i, row in test_df.iterrows():
#     if row['Predicted_Label'] != 'O':
#         if current_label is None:
#             current_label = row['Predicted_Label']
#             current_span = [row['Word']]
#             current_offset1 = row['offset1']
#             current_offset2 = row['offset2']
#         elif row['Predicted_Label'] == current_label:
#             current_span.append(row['Word'])
#             current_offset2 = row['offset2']
#         else:
#             results.append({
#                 'pmid': row['doc_id'],
#                 'filename': f"pmid-{row['doc_id']}.ann",
#                 'label': current_label,
#                 'offset1': current_offset1,
#                 'offset2': current_offset2,
#                 'span': ' '.join(current_span)
#             })
#             current_label = row['Predicted_Label']
#             current_span = [row['Word']]
#             current_offset1 = row['offset1']
#             current_offset2 = row['offset2']
#     else:
#         if current_label is not None:
#             results.append({
#                 'pmid': row['doc_id'],
#                 'filename': f"pmid-{row['doc_id']}.ann",
#                 'label': current_label,
#                 'offset1': current_offset1,
#                 'offset2': current_offset2,
#                 'span': ' '.join(current_span)
#             })
#             current_label = None
#             current_span = []
#             current_offset1 = None
#             current_offset2 = None

# # If there's any remaining entity, add it to the results
# if current_label is not None:
#     results.append({
#         'pmid': test_df.iloc[-1]['doc_id'],
#         'filename': f"pmid-{test_df.iloc[-1]['doc_id']}.ann",
#         'label': current_label,
#         'offset1': current_offset1,
#         'offset2': current_offset2,
#         'span': ' '.join(current_span)
#     })

# # Convert results to DataFrame
# results_df = pd.DataFrame(results)


# # # If there's any remaining entity, add it to the results
# # if current_label is not None:
# #     results.append({
# #         'pmid': test_df.iloc[-1]['doc_id'],
# #         'filename': test_df.iloc[-1]['doc_id'],
# #         'label': current_label,
# #         'offset1': current_offset1,
# #         'offset2': current_offset2,
# #         'span': ' '.join(current_span)
# #     })

# # Convert results to DataFrame
# results_df = pd.DataFrame(results)

# # Save to TSV file
# output_path = '/content/drive/MyDrive/Colab Notebooks/ner/outputs/test_predictions.tsv'
# results_df.to_csv(output_path, sep='\t', index=False)
# print(f"Predictions saved to {output_path}")