In [None]:
#Reading in the necessary files
import pickle
import os
import pandas as pd
from transformers import AutoTokenizer

# Tokenizer of the checkpoint whose vocabulary the later cells map triples onto.
BERT_variant = 'navteca/roberta-base-squad2'
tokenizer = AutoTokenizer.from_pretrained(BERT_variant)

UMLS_KG_path = os.path.abspath('../../UMLS_KG')


def _load_pickle(path):
    """Return the object unpickled from `path`.

    SECURITY: pickle.load can execute arbitrary code while deserialising, so
    only point this at artifacts produced by this project, never at files
    from an untrusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# KGT holds the knowledge-graph triples; entity2id / relation2id map entity and
# relation names to the row positions used to index the embedding tables below.
KGT = _load_pickle(os.path.join(UMLS_KG_path, 'KGT.pkl'))
entity2id = _load_pickle(os.path.join(UMLS_KG_path, 'entity2idx.pkl'))
relation2id = _load_pickle(os.path.join(UMLS_KG_path, 'relation2idx.pkl'))

# Pass the path components straight to os.path.join: wrapping the literal in
# os.path.relpath() only re-derived the same relative path from the current
# working directory.
KGE_path = os.path.join(UMLS_KG_path, 'embeddings', 'distmult')

# Header-less TSV tables: one embedding vector per row.
ent_embeddings = pd.read_csv(os.path.join(KGE_path, 'ent_embedding.tsv'), sep='\t', header=None)
rel_embeddings = pd.read_csv(os.path.join(KGE_path, 'rel_embedding.tsv'), sep='\t', header=None)

metamap_rel2desc = pd.read_csv(os.path.abspath('../../metamap_rel2desc.csv'))

print('Loaded all necessary files...')

In [None]:
# Build the semantic-network rel2desc dataframe and concatenate it with the MetaMap rel2desc to obtain the complete collection
import numpy as np

# Relation names of the semantic network, one per entry.
# ndmin=1 keeps the result one-dimensional even when the file contains a single
# relation; without it np.loadtxt returns a 0-d array, which cannot be iterated.
sem_nw_rel_names = np.loadtxt(os.path.abspath('../../sem_nw_rels.txt'), dtype=str, ndmin=1)

sem_nw_rel = sem_nw_rel_names.tolist()
# A relation's description is its name with underscores replaced by spaces.
sem_new_rel_desc = [rel_name.replace('_', ' ') for rel_name in sem_nw_rel]
sem_nw_rel_df = pd.DataFrame(zip(sem_nw_rel, sem_new_rel_desc), columns=['REL', 'Description'])

# Even though certain relations (such as 'isa') appear in both the metathesaurus
# and the semantic network, both rows are kept since each has a different vector.
total_rel2desc = pd.concat([metamap_rel2desc, sem_nw_rel_df], ignore_index=True)

print('Complete rel2desc created...')

In [None]:
#Creating training dataset
import torch
from tqdm import tqdm

vocab_size = tokenizer.vocab_size

# Accumulators filled by gen_sample(); entry i of both lists describes the same triple.
mean_embeddings = []
multiple_hot_targets = []

# Relation name -> description, built once instead of running a DataFrame query
# for every triple. keep='first' matches the previous `.values[0]` behaviour
# for relation names that occur more than once in total_rel2desc.
rel2desc_lookup = (
    total_rel2desc
    .drop_duplicates(subset='REL', keep='first')
    .set_index('REL')['Description']
    .to_dict()
)

# Embedding tables converted to float32 tensors once, so each triple only needs
# three row look-ups instead of three pandas -> numpy -> torch conversions.
ent_embedding_tensor = torch.from_numpy(ent_embeddings.to_numpy()).float()
rel_embedding_tensor = torch.from_numpy(rel_embeddings.to_numpy()).float()


def gen_sample(triples_collection):
    """Append one training sample per triple to the module-level accumulators.

    Parameters
    ----------
    triples_collection : pandas.DataFrame
        Rows exposing the columns ``E1``, ``Rel`` and ``E2``.

    Side effects
    ------------
    For every row, the mean of the three embeddings is appended to
    ``mean_embeddings`` and the multiple-hot vocabulary vector is appended to
    ``multiple_hot_targets``. Nothing is returned.

    Raises
    ------
    KeyError
        If a relation has no description, or an entity/relation is missing
        from ``entity2id`` / ``relation2id``.
    """

    def gen_target_vector(triple):
        '''
        Build the multiple-hot vector of the triple's natural-language form.

        The text is always expanded as E1 - REL - E2, since the correct
        direction was taken care of during KGT construction.
        '''
        try:
            description = rel2desc_lookup[triple.Rel]
        except KeyError:
            raise KeyError(f'No description found for relation {triple.Rel!r}') from None

        natural_text = triple.E1 + ' ' + description + ' ' + triple.E2

        # Start from an all-zero vector over the vocabulary and switch on every
        # token id produced for this sample.
        # NOTE(review): the tokenizer is called with its defaults, so any special
        # tokens it adds are marked in the target as well - confirm this is intended.
        target = np.zeros(vocab_size)
        np.put(target, tokenizer(natural_text)['input_ids'], 1)

        return target

    def gen_mean_embeddings(triple):
        '''Return the element-wise mean of the E1, Rel and E2 embedding vectors.'''
        triple_vectors = torch.stack([
            ent_embedding_tensor[entity2id[triple.E1]],
            rel_embedding_tensor[relation2id[triple.Rel]],
            ent_embedding_tensor[entity2id[triple.E2]],
        ])
        return torch.mean(triple_vectors, dim=0)

    for triple in triples_collection.itertuples():
        mean_embeddings.append(gen_mean_embeddings(triple))
        multiple_hot_targets.append(gen_target_vector(triple))

print('Creating training samples according to the conversion scheme...')

# Row positions of KGT grouped by E1 and by E2, computed in one pass each.
# This replaces a full `KGT.query(...)` scan per entity, which made the loop
# cost grow with (number of entities) x (number of triples).
rows_by_e1 = KGT.groupby('E1', sort=False).indices
rows_by_e2 = KGT.groupby('E2', sort=False).indices
no_rows = np.empty(0, dtype=np.intp)

for current_entity in tqdm(entity2id.keys()):
    # np.union1d yields sorted, de-duplicated positions, i.e. exactly the rows
    # where E1 or E2 equals the entity, in KGT order, each taken once.
    # NOTE(review): a triple whose E1 and E2 differ is selected once per endpoint,
    # so it can contribute two samples - confirm this duplication is intended.
    row_positions = np.union1d(
        rows_by_e1.get(current_entity, no_rows),
        rows_by_e2.get(current_entity, no_rows),
    )
    gen_sample(KGT.iloc[row_positions])

# Saving the dataset as pandas dataframe
training_frame = pd.DataFrame(
    zip(mean_embeddings, multiple_hot_targets),
    columns=['mean_embedding', 'vocab_mapping'],
)
training_frame.to_pickle('Homogenization_data.pkl')

print('FFN training dataset created...')