In [None]:
def create_rel2desc(results, file_name):
    """Build a relation -> description table and pickle it to disk.

    The description of a relation is its label with underscores replaced by
    spaces (e.g. ``has_part`` -> ``has part``); both columns are stripped of
    surrounding whitespace.

    Args:
        results: Iterable of row tuples (e.g. the output of a DB-API
            ``cursor.fetchall()``). Only the first column of each row is read.
            Rows whose first column is ``None`` (SQL NULL) are skipped.
        file_name: Prefix of the output file; the table is written to
            ``f'{file_name}_rel2desc.pkl'`` with columns ``REL`` and
            ``Description``.
    """
    # SELECT DISTINCT yields NULL as a row of its own; calling .strip() on it
    # would raise AttributeError, so drop those rows up front.
    labels = [row[0] for row in results if row[0] is not None]

    rel2desc = pd.DataFrame({
        'REL': [label.strip() for label in labels],
        'Description': [label.replace('_', ' ').strip() for label in labels],
    })
    rel2desc.to_pickle(f'{file_name}_rel2desc.pkl')
    print(f'Created {file_name}_rel2desc...')

In [None]:
'''
Not all relations listed in metathesaurus_rel2desc are present in MRREL: MRREL has 950 relations, 
while metathesaurus_rel2desc has 976. So we don't use metathesaurus_rel2desc and instead create a custom 
mapping (REL -> DESC) for all relations present in MRREL.
'''
import sqlite3
import os
import pandas as pd

# Collect every distinct relation label (RELA) from MRREL in the local SQLite
# copy of UMLS and persist the REL -> description table for it.
conn = sqlite3.connect(os.path.join(os.path.abspath('../..'), 'umls.db'))
try:
    cursor = conn.cursor()
    cursor.execute("SELECT DISTINCT RELA FROM MRREL")
    results = cursor.fetchall()
finally:
    # Release the database handle even if the query fails (e.g. MRREL is missing).
    conn.close()

create_rel2desc(results, 'MRREL')

In [None]:
#Creating semantic n/w rel2desc dataframe
import os
from getpass import getpass

import mysql.connector

# Credentials must never live in the notebook: read them from the environment
# (UMLS_DB_HOST / UMLS_DB_USER / UMLS_DB_PASSWORD / UMLS_DB_NAME) and prompt for
# the password interactively when it is not set.
mydb = mysql.connector.connect(
    host=os.environ.get("UMLS_DB_HOST", "localhost"),
    user=os.environ.get("UMLS_DB_USER", "root"),
    password=os.environ.get("UMLS_DB_PASSWORD") or getpass("MySQL password: "),
    database=os.environ.get("UMLS_DB_NAME", "umls"),
)
try:
    mycursor = mydb.cursor()
    try:
        # Distinct relation labels (RL) of the semantic network structure table.
        mycursor.execute("SELECT DISTINCT RL FROM SRSTR")
        results = mycursor.fetchall()
    finally:
        mycursor.close()
finally:
    # Close the connection itself as well, not just the cursor.
    mydb.close()

create_rel2desc(results, 'SEM_NW')

In [None]:
#Reading in the necessary files
try:
    # pickle5 backports pickle protocol 5 to Python < 3.8; newer interpreters
    # support that protocol in the standard library, so fall back to it.
    import pickle5 as pickle
except ImportError:
    import pickle
import os
import pandas as pd
from transformers import AutoTokenizer

BERT_variant = 'phiyodr/bert-base-finetuned-squad2'
tokenizer = AutoTokenizer.from_pretrained(BERT_variant)

UMLS_KG_path = os.path.abspath('../../../UMLS_KG')


def _load_kg_pickle(name):
    """Unpickle the file called ``name`` from the UMLS_KG directory."""
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # locally generated, trusted artifacts.
    with open(os.path.join(UMLS_KG_path, name), 'rb') as f:
        return pickle.load(f)


KGT = _load_kg_pickle('KGT.pkl')
entity2id = _load_kg_pickle('entity2idx.pkl')
relation2id = _load_kg_pickle('relation2idx.pkl')

# Knowledge-graph embeddings, stored as headerless TSV files.
KGE_path = os.path.join(UMLS_KG_path, 'embeddings', 'distmult')

ent_embeddings = pd.read_csv(os.path.join(KGE_path, 'ent_embedding.tsv'), sep='\t', header=None)
rel_embeddings = pd.read_csv(os.path.join(KGE_path, 'rel_embedding.tsv'), sep='\t', header=None)

# Relation -> description tables, read from the two '*_rel2desc.pkl' files in the working directory.
MRREL_rel2desc = pd.read_pickle('MRREL_rel2desc.pkl')
SEM_NW_rel2desc = pd.read_pickle('SEM_NW_rel2desc.pkl')

total_rel2desc = pd.concat([MRREL_rel2desc, SEM_NW_rel2desc], ignore_index=True)

print('Loaded all necessary files...')

In [None]:
#Creating training dataset
import torch
from tqdm import tqdm
import numpy as np

'''
Instead of storing the full target vector, we store only the token ids (indices) of the word pieces of the
natural-language text. This lets us build the target representation at runtime.
'''
from functools import lru_cache

# Accumulators filled by the dataset-building loop that follows.
mean_embeddings = []
multiple_hot_targets_indices = []


@lru_cache(maxsize=None)
def _relation_description(rel):
    """Return the description of ``rel`` (first matching row of ``total_rel2desc``).

    Memoized: DataFrame.query parses its expression and scans the table on every
    call, which is wasteful to repeat for every triple that shares a relation.
    Re-running this cell redefines the function and therefore clears the cache.

    Raises:
        IndexError: if ``rel`` has no row in ``total_rel2desc``.
    """
    return total_rel2desc.query('REL==@rel').Description.values[0]


def gen_sample(triple):
    """Turn one knowledge-graph triple into an (input, target) training pair.

    The triple is verbalized as ``"<E1> <relation description> <E2>"``. This
    expansion relies on the direction having been fixed during KGT
    construction, i.e. a triple always reads E1 - REL - E2.

    Args:
        triple: Object exposing ``E1``, ``Rel`` and ``E2`` attributes (e.g. a
            row produced by ``KGT.itertuples()``).

    Returns:
        A 2-tuple ``(mean_embedding, input_ids)`` where ``mean_embedding`` is a
        float tensor holding the element-wise mean of the E1, Rel and E2
        embedding rows, and ``input_ids`` is ``tokenizer(text)['input_ids']``
        for the verbalized triple. Only the token ids are returned; the
        multi-hot target vector is meant to be built from them at runtime.
    """
    natural_text = triple.E1 + ' ' + _relation_description(triple.Rel) + ' ' + triple.E2

    #Creating the mean embedding for the triple
    E1_tensor = torch.from_numpy(ent_embeddings.iloc[entity2id[triple.E1]].to_numpy()).float()
    Rel_tensor = torch.from_numpy(rel_embeddings.iloc[relation2id[triple.Rel]].to_numpy()).float()
    E2_tensor = torch.from_numpy(ent_embeddings.iloc[entity2id[triple.E2]].to_numpy()).float()

    mean_embedding = torch.mean(torch.stack([E1_tensor, Rel_tensor, E2_tensor]), dim=0)
    return mean_embedding, tokenizer(natural_text)['input_ids']
    
print('Creating training samples according to the conversion scheme...')
# itertuples() is a generator without a length; pass the row count explicitly
# so the progress bar can show a percentage and an ETA.
for trple in tqdm(KGT.itertuples(), total=len(KGT)):
    # Despite the names, `train` is the model input (mean embedding) and
    # `test` is the target (token ids of the verbalized triple).
    train, test = gen_sample(trple)
    mean_embeddings.append(train)
    multiple_hot_targets_indices.append(test)

# Column names are kept as-is because downstream readers of the pickle rely on them.
pd.DataFrame(zip(mean_embeddings, multiple_hot_targets_indices), columns=['train', 'test']).to_pickle('Homogenization_data.pkl')

print('FFN training dataset created...')