In [1]:
####################################
# IMPORTS:
####################################

# External dependencies:
from nltk.stem import WordNetLemmatizer, PorterStemmer
from scipy.spatial.distance import cosine, euclidean, cdist
import matplotlib.pyplot as plt

from transformers import AutoTokenizer, AutoModel, get_scheduler
import torch
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm

# Internal libraries:
import numpy as np
import copy
import os
from os import listdir, write
from os.path import isfile, join, splitext
from data_loader import file_list_loader, norm_list_loader, mention2concept, encoder, Dataset
from preprocess import id_combination, lowercaser_mentions
from normalization import NeuralNetwork, cos_dist
from inference import tokenize, inference
import my_global
 
# Set up the shared `my_global` registry before the my_global.set_value() calls in later cells.
# NOTE(review): `_init` lives in my_global.py (not shown here); presumably it creates the
# backing store that set_value() writes to — confirm against that module.
my_global._init()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
####################################
# LOADING DATA
####################################

# Training split: file list, normalisation list, and mention -> concept annotations.
train_file = file_list_loader('./dataset/train/train_file_list.txt')
norm_list = norm_list_loader('./dataset/train/train_norm.txt')
train_norm, train_span_split = mention2concept(
    './dataset/train/train_note',
    './dataset/train/train_norm',
    train_file,
    with_text=False,
)

# Test split: same loaders, pointed at the test files (the norm directory name
# indicates the gold CUIs have been replaced with "unk").
test_file = file_list_loader('./dataset/test/test_file_list.txt')
test_norm, test_span_split = mention2concept(
    './dataset/test/test_note',
    './dataset/test/test_norm_cui_replaced_with_unk',
    test_file,
    with_text=False,
)

####################################
# PRE-PROCESSING
####################################

# Each split goes through id_combination() and then lowercaser_mentions().
train_dict = lowercaser_mentions(id_combination(train_norm))
test_dict = lowercaser_mentions(id_combination(test_norm))

In [3]:
################################################
# INITIALIZING
################################################

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Publish the device through my_global so the imported project modules can read it.
my_global.set_value('device', device)
print(f"Using {device} device")

Using cuda device


In [4]:
################################################
# LOADING EMBEDDING MODEL
################################################

# Settings for the pretrained encoder.
model_name = 'dmis-lab/biobert-base-cased-v1.1'
embbed_size = 768
max_length = 20

# Pretrained encoder (moved to the selected device) and its matching tokenizer.
model = AutoModel.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Register the shared objects in my_global, in the same order as before.
for global_name, global_value in (
    ('model', model),
    ('tokenizer', tokenizer),
    ('max_length', max_length),
    ('embbed_size', embbed_size),
):
    my_global.set_value(global_name, global_value)

# Encode the training dictionary into model inputs (X) and targets (y).
X_train, y_train = encoder(train_dict, tokenizer)

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Number of mentions: 6684


In [5]:
################################################
# PREPARING DATA
################################################

# Wrap the encoded training pairs in the project Dataset class.
train_set = Dataset(X_train, y_train)

# Shuffled mini-batches of 64 pairs, consumed by the training loop.
train_dataloader = DataLoader(
    train_set,
    batch_size=64,
    shuffle=True,
)

In [6]:
################################################
# TRAINING
################################################

# Hyper-parameters.
learning_rate = 1e-5
epochs = 50
num_training_steps = epochs * len(train_dataloader)  # one scheduler step per batch

# Loss: cos_dist, imported from the project's normalization module.
loss_fn = cos_dist

# Fine-tuning layer applied on top of the encoder output.
basenorm = NeuralNetwork(embbed_size).to(device)

# The optimizer only receives basenorm's parameters.
optimizer = torch.optim.NAdam(basenorm.parameters(), lr=learning_rate)

# Linear learning-rate schedule over the whole run, without warm-up.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [7]:
# Put the encoder and the fine-tuning layer in training mode ahead of the training loop.
for network in (model, basenorm):
    network.train()

def checkpoint_loader(checkpoint):
    """Restore training state, in place, from a checkpoint dict.

    The notebook-level ``model`` and ``optimizer`` are always restored. The
    notebook-level ``basenorm`` and ``lr_scheduler`` are restored when the
    checkpoint carries their entries, so checkpoints that only contain the
    four original keys still load.

    Args:
        checkpoint: dict with the keys 'epoch', 'model_state_dict',
            'optimizer_state_dict' and 'loss', and optionally
            'basenorm_state_dict' and 'lr_scheduler_state_dict'.

    Returns:
        Tuple ``(epoch, model, optimizer, batch_loss)``, where ``model`` and
        ``optimizer`` are the (now updated) notebook-level objects.
    """
    epoch = checkpoint['epoch']
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # basenorm is the module whose parameters the optimizer updates, so it must be
    # restored for a checkpoint to be of any use; the key is optional for
    # backward compatibility with checkpoints that lack it.
    if 'basenorm_state_dict' in checkpoint:
        basenorm.load_state_dict(checkpoint['basenorm_state_dict'])
    if 'lr_scheduler_state_dict' in checkpoint:
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler_state_dict'])
    batch_loss = checkpoint['loss']
    return epoch, model, optimizer, batch_loss

def train(from_checkpoint = False):
    """Fine-tune ``basenorm`` on ``train_dataloader`` and write periodic checkpoints.

    Uses the notebook-level ``model``, ``basenorm``, ``optimizer``,
    ``lr_scheduler``, ``loss_fn``, ``epochs``, ``train_dataloader`` and ``device``.
    A checkpoint is written to ``./checkpoint/epoch{N}_checkpoint.pt`` every
    10th epoch and after the final epoch.

    Args:
        from_checkpoint: when True, prompt for an epoch number, restore that
            checkpoint and continue with the epoch that follows it.
    """
    if from_checkpoint:
        requested_epoch = input("Please enter a number of epoch to load model: ")
        # map_location keeps a checkpoint saved on one device loadable on another.
        checkpoint = torch.load(f'./checkpoint/epoch{requested_epoch}_checkpoint.pt', map_location=device)
        # checkpoint_loader restores the notebook-level objects in place. Its return
        # values must NOT be unpacked into names `model` / `optimizer` here: that
        # would make them local to train() and raise UnboundLocalError on the
        # from_checkpoint=False path.
        saved_epoch = checkpoint_loader(checkpoint)[0]
        # The checkpoint is written after its epoch completed, so resume with the next one.
        start_epoch = saved_epoch + 1
    else:
        start_epoch = 0

    # Make the function independent of which cells were run before it.
    model.train()
    basenorm.train()

    # NOTE(review): the optimizer is built from basenorm.parameters() only, so the
    # encoder's weights are not updated here even though gradients flow through it.
    for epoch in range(start_epoch, epochs):
        for X, y in train_dataloader: # both X and y contains n=batch_size tokenized mentions and labels respectively
            sample_losses = []
            for tokenized_mention, tokenized_label in zip(X, y):
                tokenized_mention = tokenized_mention.to(device)
                tokenized_label = tokenized_label.to(device)
                # First output of the embedding model, position 0 along the second axis, piped into basenorm.
                pred = basenorm(model(tokenized_mention)[0][:,0])
                ground_truth = basenorm(model(tokenized_label)[0][:,0])
                # loss_fn compares the mention embedding with the embedding of its label.
                sample_losses.append(loss_fn(pred, ground_truth).reshape(1))

            # Backpropagation on the loss averaged over the whole batch.
            batch_loss = torch.cat(sample_losses).mean()
            batch_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        # Checkpoint every 10th epoch, plus the last one so the final weights are kept.
        if epoch % 10 == 0 or epoch == epochs - 1:
            os.makedirs('./checkpoint', exist_ok=True)
            checkpoint = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'basenorm_state_dict': basenorm.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'lr_scheduler_state_dict': lr_scheduler.state_dict(),
                'loss': batch_loss.detach(),
            }
            # Save the checkpoint
            torch.save(checkpoint, f'./checkpoint/epoch{epoch}_checkpoint.pt')
            print(f"Fine-tuning: Epoch n° {epoch}, loss = {batch_loss.item()}")

In [8]:
# Start training: uncomment the call below to launch fine-tuning.
# train()

In [9]:
################################################
# TEST
################################################

# load the model
epoch = input("Please enter a number of epoch to load model: ")
# map_location=device lets a checkpoint written on a CUDA machine load when the
# notebook has fallen back to the CPU (see the device selection cell).
checkpoint = torch.load(f'./checkpoint/epoch{epoch}_checkpoint.pt', map_location=device)
start_epoch, model, optimizer, batch_loss = checkpoint_loader(checkpoint)

# Switch both networks to evaluation mode once the weights are restored.
model.eval()
basenorm.eval()


In [10]:
# Run inference on the test mentions with the normalisation list and the current basenorm layer.
dd_predictions = inference(
    dd_test=test_dict,
    basenorm=basenorm,
    norm_list=norm_list,
)

Embedding ontology concept labels...
Number of concepts in ontology: 6684
Done.



  tokenized_mention = torch.tensor(tokenize(dd_test[id]['mention']).to(device))
Building embeddings from cui list: 6925it [00:51, 134.93it/s]


ValueError: could not broadcast input array from shape (20,) into shape (768,)

768