In [65]:
####################################
# IMPORTS:
####################################

# External dependencies:
from nltk.stem import WordNetLemmatizer, PorterStemmer
from scipy.spatial.distance import cosine, euclidean, cdist
import matplotlib.pyplot as plt
from sklearn import metrics

from transformers import AutoTokenizer, AutoModel, get_scheduler
import torch
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm

# Internal libraries:
import numpy as np
import time
import random
import os
from os import listdir, write
from os.path import isfile, join, splitext
from data_loader import file_list_loader, norm_list_loader, mention2concept, encoder, Dataset
from preprocess import id_combination, lowercaser_mentions
from normalization import NeuralNetwork, cos_dist
from inference import tokenize, inference
import my_global
 
my_global._init()
random.seed(999)

In [66]:
####################################
# LOADING DATA
####################################

# cui lists in training set
norm_list = norm_list_loader('./dataset/train/train_norm.txt')
# file names
train_file = file_list_loader('./dataset/train/train_file_list.txt')
test_file = file_list_loader('./dataset/test/test_file_list.txt')

####################
# 1/4 of training set
####################
# Select 1/4 of the keys randomly
selected_keys = random.sample(list(train_file.keys()), len(train_file)//4)

# Create a new dictionary with only the selected keys
train_small_file = {k: train_file[k] for k in selected_keys}

####################
# 1/4 of test set
####################
# Select 1/4 of the keys randomly
selected_keys = random.sample(list(test_file.keys()), len(test_file)//4)

# Create a new dictionary with only the selected keys
test_small_file = {k: test_file[k] for k in selected_keys}

train_norm, train_cui_less_dict, train_span_split = mention2concept('./dataset/train/train_note', './dataset/train/train_norm', train_small_file, with_text = False)
test_norm, test_cui_less_dict, test_span_split = mention2concept('./dataset/test/test_note', './dataset/test/test_norm_cui_replaced_with_unk', test_small_file, with_text = False)
####################################
# PRE-PROCESSING
####################################

train_dict = id_combination(train_norm)
train_dict = lowercaser_mentions(train_dict)

test_dict = id_combination(test_norm)
test_dict = lowercaser_mentions(test_dict)

In [60]:
train_span_split 

{'2': 0, '1': 39, '0': 1621}

In [33]:
################################################
# INITIALIZING
################################################

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
my_global.set_value('device', device)
print(f"Using {device} device")

Using cuda device


In [34]:
################################################
# LOADING EMBEDDING MODEL
################################################

model_name = 'dmis-lab/biobert-base-cased-v1.1'
model = AutoModel.from_pretrained(model_name).to(device)
embbed_size = 768
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = 20

my_global.set_value('model', model)
my_global.set_value('tokenizer', tokenizer)
my_global.set_value('max_length', max_length)
my_global.set_value('embbed_size', embbed_size)

X_train, y_train = encoder(train_dict, tokenizer)

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Number of mentions: 1660


In [47]:
################################################
# TRAINING
################################################

# training parameters
learning_rate = 1e-5
epochs = 10
batch_size = 32
optimizer = torch.optim.NAdam(model.parameters(), lr=learning_rate)
num_training_steps = epochs * len(train_dataloader)

# loss function
loss_fn = cos_dist
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [48]:
################################################
# PREPARING DATA
################################################

train_set = Dataset(X_train, y_train)

train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

In [49]:
# training loop
model.train()

def checkpoint_loader(checkpoint):
    epoch = checkpoint['epoch']
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    batch_loss = checkpoint['loss']
    return epoch, model, optimizer, batch_loss

def train(model, optimizer,from_checkpoint = False):
    if from_checkpoint:
        epoch = input("Please enter a number of epoch to load model: ")
        checkpoint = torch.load(f'./checkpoint/epoch{epoch}_checkpoint.pt')
        start_epoch, model, optimizer, batch_loss = checkpoint_loader(checkpoint)
    else:
        start_epoch = 0

    start_time = time.time()
    for epoch in range(start_epoch, epochs):
        for X, y in train_dataloader: # both X and y contains n=batch_size tokenized mentions and labels respectively
            batch_loss = None
            for tokenized_mention, tokenized_label in zip(X, y):
                tokenized_mention = tokenized_mention.to(device)
                tokenized_label = tokenized_label.to(device)
                pred = model(tokenized_mention)[0][:,0] # Taking last hidden state of the embedding model and piping it into a linear layer.
                ground_truth = model(tokenized_label)[0][:,0]
                loss = loss_fn(pred, ground_truth) # Cosine similarity between embedding of mention and associated label.
                if batch_loss == None:
                    batch_loss = loss.reshape(1,1)
                else:
                    batch_loss = torch.cat((batch_loss, loss.reshape(1,1)), dim=1) # Appends current loss to all losses in batch

            # Backpropagation
            batch_loss = torch.mean(batch_loss) # Averages loss over the whole batch.
            batch_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
        end_time = time.time()
        print("Epoch [{}/{}], Elapsed Time: {:.2f}mins".format(epoch+1, epochs, (end_time - start_time) / 60))
        print(f"loss = {batch_loss.item()}")

        if (epoch+1) % 10 == 0:
            # Check if the directory already exists
            if not os.path.exists('./checkpoint_3.31'):
            # If the directory does not exist, create it
                os.makedirs('./checkpoint_3.31')
            checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': batch_loss
                        }
            # Save the checkpoint
            torch.save(checkpoint, f'./checkpoint_3.31/epoch{epoch+1}_checkpoint.pt')

In [50]:
# starting training
train(model, optimizer, from_checkpoint=False)

Epoch [1/10], Elapsed Time: 1.93mins
loss = -0.9970153570175171
Epoch [2/10], Elapsed Time: 3.86mins
loss = -0.9981711506843567
Epoch [3/10], Elapsed Time: 5.76mins
loss = -0.9985478520393372
Epoch [4/10], Elapsed Time: 7.68mins
loss = -0.9986712336540222
Epoch [5/10], Elapsed Time: 9.59mins
loss = -0.9986713528633118
Epoch [6/10], Elapsed Time: 11.52mins
loss = -0.9986199140548706
Epoch [7/10], Elapsed Time: 13.43mins
loss = -0.9987155795097351
Epoch [8/10], Elapsed Time: 15.38mins
loss = -0.9986706376075745
Epoch [9/10], Elapsed Time: 17.30mins
loss = -0.9987432956695557
Epoch [10/10], Elapsed Time: 19.25mins
loss = -0.998694658279419


In [53]:
################################################
# TEST
################################################

# load the model
epoch = input("Please enter a number of epoch to load model: ")
checkpoint = torch.load(f'./checkpoint_3.31//epoch{epoch}_checkpoint.pt')
start_epoch, model, optimizer, batch_loss = checkpoint_loader(checkpoint)

model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
norm_list.remove('CUI-less')
dd_predictions = inference(norm_list=norm_list, dd_test=test_dict)

Embedding ontology concept labels...
Number of concepts in ontology: 6683
Done.



  tokenized_mention = torch.tensor(tokenize(dd_test[id]['mention']).to(device))
Building embeddings from cui list: 6925it [00:53, 129.86it/s]


	Distance matrix calculation...
	Done.


In [72]:
def tokenize(sentence):
    return tokenizer.encode(sentence, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True, return_tensors="pt").to(device) # Tokenize input into ids.

print("Embedding ontology concept labels...")

######
# Build labels/tags embeddings from ontology:
######
# norm_list.remove('CUI-less')
norm_list = set(norm_list)
cui_encode = dict()
N = 0
with torch.no_grad():
    for cui in norm_list:
        cui_encode[cui] = model(tokenize(cui))[0][:,0].cpu().detach().numpy()
        N += 1
        if cui == 'CUI-less':
            print('yes')
        if embbed_size == None:
            embbed_size = len(cui_encode[cui][0])
print("Number of concepts in ontology:", len(norm_list))
print("Done.\n")

Embedding ontology concept labels...
Number of concepts in ontology: 2330
Done.



In [73]:
len(cui_encode.keys())

2330

In [74]:
######
# Build mention embeddings from testing set:
######

X_pred = np.zeros((len(test_dict.keys()), embbed_size)) # (6925, 768)
with torch.no_grad():
    for i, id in tqdm(enumerate(test_dict.keys()), desc ='Building embeddings from cui list'):
        tokenized_mention = torch.tensor(tokenize(test_dict[id]['mention']).to(device))
        X_pred[i] = model(tokenized_mention)[0][:,0].cpu().detach().numpy()

  tokenized_mention = torch.tensor(tokenize(test_dict[id]['mention']).to(device))
Building embeddings from cui list: 1802it [00:17, 101.86it/s]


In [76]:
######
# Nearest neighbours calculation:
######
dd_predictions = dict()
for id in test_dict.keys():
    dd_predictions[id] = dict()
    dd_predictions[id]["pred_cui"] = [] # {'id': {'pred_cui': [] }}

# dd_predictions can be {'id': {'first candidate': [], 'top 5 candidates': [], ...}} later

CUIVectorMatrix = np.zeros((len(norm_list), embbed_size)) # len(norm_list) x embbed_size
i = 0
for cui in cui_encode.keys():
    CUIVectorMatrix[i] = cui_encode[cui]
    i += 1

In [77]:
print('\tDistance matrix calculation...')
scoreMatrix = cdist(X_pred, CUIVectorMatrix, 'cosine')  # cdist() is an optimized algo to distance calculation.
# (doc: https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html)

print("\tDone.")

	Distance matrix calculation...
	Done.


In [None]:
# For each mention, find back the nearest cui vector, then attribute the associated cui:
i=0
for i, id in enumerate(test_dict.keys()):
    minScore = min(scoreMatrix[i])
    minScore_5 = sorted(scoreMatrix[i])[:5]
    j = -1
    stopSearch = False
    for cui in cui_encode.keys():
        if stopSearch == True:
            break
        j += 1
        if scoreMatrix[i][j] == minScore:
            dd_predictions[id]["pred_cui"] = [cui]
            stopSearch = True
            break
# del cui_encode

In [137]:
for i, id in enumerate(test_dict.keys()):
    dd_predictions[id]["pred_cui_5"] = []
    minScore_5 = sorted(scoreMatrix[i])[:5] # top 5 candidates
    min_indices = scoreMatrix[i].argmin() # top 1 candidates
    min_indices_5 = [i for i, x in enumerate(scoreMatrix[i]) if x in minScore_5]
    k = 0
    stopSearch = False
    for idx, cui in enumerate(cui_encode.keys()):
        if stopSearch == True:
            break
        if idx == min_indices:
            dd_predictions[id]["pred_cui"] = [cui]
            stopSearch = True
            break
    stopSearch = False
    for idx, cui in enumerate(cui_encode.keys()):
        if stopSearch == True:
            break
        if idx in min_indices_5:
            dd_predictions[id]["pred_cui_5"].append(cui)
            k += 1
            if len(dd_predictions[id]["pred_cui_5"]) == 5:
                stopSearch = True
                break

In [138]:
dd_predictions

{'0150_N000': {'pred_cui': ['C0038002'],
  'pred_cui_5': ['C0032320', 'C0021641', 'C0038002', 'C0035110', 'C0032371']},
 '0150_N001': {'pred_cui': ['C1961006'],
  'pred_cui_5': ['C0020258', 'C1691002', 'C1961006', 'C0043250', 'C0020437']},
 '0150_N002': {'pred_cui': ['C0038002'],
  'pred_cui_5': ['C0032320', 'C0021641', 'C0038002', 'C0035110', 'C0032371']},
 '0150_N003': {'pred_cui': ['C0205161'],
  'pred_cui_5': ['C0205161', 'C0006681', 'C0003862', 'C0205168', 'C0205156']},
 '0150_N005': {'pred_cui': ['C4510046'],
  'pred_cui_5': ['C0020541', 'C0205548', 'C0455060', 'C0202865', 'C4510046']},
 '0150_N006': {'pred_cui': ['C4510046'],
  'pred_cui_5': ['C0205548', 'C0455060', 'C4510046', 'C1280500', 'C0205042']},
 '0150_N007': {'pred_cui': ['C4510046'],
  'pred_cui_5': ['C1704243', 'C0020502', 'C4510046', 'C0010055', 'C3538423']},
 '0150_N008': {'pred_cui': ['C4510046'],
  'pred_cui_5': ['C0412519', 'C1995000', 'C4510046', 'C2919541', 'C3538423']},
 '0150_N009': {'pred_cui': ['C4510046'],

In [98]:
# Load the true cui
test_norm_results, test_cui_less_dict, test_span_split = mention2concept('./dataset/test/test_note', './dataset/gold/test_norm', test_small_file, with_text = False)
test_dict_results = id_combination(test_norm_results)
test_dict_results = lowercaser_mentions(test_dict_results)

true_cui = []
pred_cui = []
for key in test_dict_results.keys():
    true_cui.append(test_dict_results[key]['cui'])
    pred_cui.append(dd_predictions[key]['pred_cui'])

f1_score_macro = metrics.f1_score(true_cui, pred_cui, average='macro')