In [None]:
#Generate tuples (Question, Token, SemType)
%cd ~/Desktop/CDQA-project

from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import re
# Load the MetaMap JSON export into a DataFrame.
s = pd.read_json('metamap_output.json')

# Take the first row of the frame, then entry 0 of that row, and read the
# 'Utterances' list of its 'Document'; each utterance is iterated as one `doc` below.
# NOTE(review): assumes the export contains a single document at that position - confirm.
documents = s.iloc[0][0]['Document']['Utterances']
# Accumulates one (UttText, tokens, mappings) tuple per utterance.
Metamap_Tokenizations = [] #To be used for final QA

def retrieve_tokens(SyntaxUnits):
    """Return the 'InputMatch' value of every syntax unit, in order.

    SyntaxUnits is the list of syntax-unit dictionaries of one phrase; an empty
    list yields an empty result.
    """
    return [unit['InputMatch'] for unit in SyntaxUnits]

def retrieve_mappings(Mappings):
    """Describe the candidates of the first mapping of a phrase.

    Only ``Mappings[0]`` is consulted. Each entry of its 'MappingCandidates' becomes
    ``[matched words joined by spaces, CandidateCUI, CandidatePreferred, first SemType]``
    and the candidate's preferred name is added to the module-level ``entities`` set.

    Returns an empty list when the phrase has no mappings (these words will get
    their embeddings from BERT).
    """
    if len(Mappings) == 0:
        return []

    #Choosing only the top mapping
    top_candidates = Mappings[0]['MappingCandidates']
    collected = []
    for candidate in top_candidates:
        matched_text = ' '.join(candidate['MatchedWords'])
        collected.append([matched_text, candidate['CandidateCUI'],
                          candidate['CandidatePreferred'], candidate['SemTypes'][0]])
        entities.add(candidate['CandidatePreferred'])
    return collected

#Preferred names of every mapped concept; filled as a side effect of retrieve_mappings
entities = set()
for doc in tqdm(documents):
    #Tokens and mappings of all phrases of this utterance, collected as flat lists
    Phrase_Tokenizations, Mappings = [], []
    for ph in doc['Phrases']:
        Phrase_Tokenizations.extend(retrieve_tokens(ph['SyntaxUnits']))
        Mappings.extend(retrieve_mappings(ph['Mappings']))
    #One (question text, tokens, mappings) entry per utterance
    Metamap_Tokenizations.append((doc['UttText'], Phrase_Tokenizations, Mappings))
    
#Stripping surrounding whitespace from every question, replacing entries in place
for position, record in enumerate(Metamap_Tokenizations):
    fields = tuple(record)
    if type(fields[0]) is str:
        cleaned_question = fields[0].strip()
    else:
        #The only question which was causing an issue (its text is not a string)
        cleaned_question = 'What were the impact of event scale–revised scores?'
    Metamap_Tokenizations[position] = (cleaned_question,) + fields[1:]

print(f"Number of entities discovered: {len(entities)}")

In [None]:
#Replacing each shorthand mapping with KG concept
import mysql.connector

# NOTE(review): the database credentials are hard-coded here; prefer reading them from the
# environment (or a secrets store) before this notebook is shared or committed.
mydb = mysql.connector.connect(host="localhost", user="root", password="Saptarshi123!", database="umls")
mycursor = mydb.cursor()

try:
    #Each distinct abbreviation is looked up once and then reused
    semantic_type_names = {}
    #Values with no SRDEF row (e.g. already expanded by a previous run of this cell)
    unresolved = set()
    for entry in Metamap_Tokenizations:
        for mapping in entry[2]:
            abbreviation = mapping[3]
            if abbreviation not in semantic_type_names:
                #Bound parameter instead of string formatting: the driver does the quoting
                mycursor.execute("select STY_RL from SRDEF where ABR = %s", (abbreviation,))
                rows = mycursor.fetchall()
                if rows:
                    semantic_type_names[abbreviation] = rows[0][0]
                else:
                    #Keep the value as it is instead of failing with an IndexError
                    semantic_type_names[abbreviation] = abbreviation
                    unresolved.add(abbreviation)
            mapping[3] = semantic_type_names[abbreviation]
finally:
    #Release the cursor and the connection even when a lookup fails
    mycursor.close()
    mydb.close()

if unresolved:
    print(f"{len(unresolved)} semantic type value(s) had no SRDEF abbreviation match and were left unchanged")

#Saving Metamap_Tokenizations for use during question embedding creation
pd.DataFrame(
    Metamap_Tokenizations, columns=['Question', 'Tokenization', 'Mappings']
).to_pickle('Metamap_Tokenizations.pkl')

In [None]:
#Creating the CUI_Preferred_Concept_Semantic_Type_Lookup_Table
#Every mapping of every question, flattened once; positions 1, 2 and 3 of a mapping hold
#the CUI, the preferred concept and the semantic type
all_mappings = [mapping for entry in Metamap_Tokenizations for mapping in entry[2]]
cuis = [mapping[1] for mapping in all_mappings]
pc = [mapping[2] for mapping in all_mappings]
st = [mapping[3] for mapping in all_mappings]

lookup_rows = list(zip(cuis, pc, st))
CUI_Preferred_Concept_Semantic_Type_Lookup_Table = pd.DataFrame(
    lookup_rows, columns=['CUI','Preferred_Concept', 'Semantic_Type']
).drop_duplicates()

In [None]:
#New version of creating KGT (Metathesaurus + Semantic Network)
import sqlite3
from collections import namedtuple

# NOTE(review): the database credentials are hard-coded here; prefer reading them from the
# environment (or a secrets store) before this notebook is shared or committed.
mydb = mysql.connector.connect(host="localhost", user="root", password="Saptarshi123!", database="umls")
mycursor = mydb.cursor()

#For sqlite (Metathesaurus)
conn = sqlite3.connect('umls.db')
cursor = conn.cursor()

#All Knowledge Graph Triples
KGT = set()

#To make the code more readable (pythonic)
CUIREL = namedtuple('CUIREL', ['CUI', 'REL'])
STREL = namedtuple('STREL', ['ST', 'REL'])

#Index the lookup table once, so that checking whether a returned CUI / semantic type is
#part of our table is a dictionary lookup instead of a DataFrame.query scan per result
concepts_by_cui = {}
concepts_by_semantic_type = {}
for entry in CUI_Preferred_Concept_Semantic_Type_Lookup_Table.itertuples(index=False):
    concepts_by_cui.setdefault(entry.CUI, []).append(entry.Preferred_Concept)
    concepts_by_semantic_type.setdefault(entry.Semantic_Type, []).append(entry.Preferred_Concept)

try:
    for row in tqdm(CUI_Preferred_Concept_Semantic_Type_Lookup_Table.itertuples(index=False),
                    total=len(CUI_Preferred_Concept_Semantic_Type_Lookup_Table)):
        #--->Extracting relations from the Metathesaurus<---
        #This gives the "incoming" relations i.e. CUI2 - RELA - CUI1
        #(values are bound as query parameters instead of being formatted into the SQL text)
        cursor.execute('''SELECT DISTINCT CUI2, RELA
                          FROM MRREL
                          WHERE CUI1 = ? AND RELA <> '';''', (row.CUI,))
        #Keeping only the returned CUIs which are present in our lookup table
        for record in cursor.fetchall():
            related = CUIREL(*record)
            for concept in concepts_by_cui.get(related.CUI, ()):
                KGT.add((concept, related.REL, row.Preferred_Concept))

        #This gives the "outgoing" relations i.e. CUI1 - RELA - CUI2
        cursor.execute('''SELECT DISTINCT CUI1, RELA
                          FROM MRREL
                          WHERE CUI2 = ? AND STYPE2 = 'CUI' AND RELA <> '';''', (row.CUI,))
        #Keeping only the returned CUIs which are present in our lookup table
        for record in cursor.fetchall():
            related = CUIREL(*record)
            for concept in concepts_by_cui.get(related.CUI, ()):
                KGT.add((row.Preferred_Concept, related.REL, concept))

        #--->Extracting relations from the Semantic Network<---
        #Generating possible semantic types connected to the current one
        mycursor.execute('''SELECT STY_RL2, RL
                            FROM SRSTR
                            WHERE STY_RL1 = %s;''', (row.Semantic_Type,))
        for record in mycursor.fetchall():
            linked_type = STREL(*record)
            for concept in concepts_by_semantic_type.get(linked_type.ST, ()):
                KGT.add((row.Preferred_Concept, linked_type.REL, concept))
finally:
    #Release both databases even when a query fails part-way through
    conn.close()
    mycursor.close()
    mydb.close()

print(f"Number of triples extracted: {len(KGT)}")

In [None]:
#Creating Train/Validation/Test splits for training KGE's
import pandas as pd
import numpy as np
import os

#Converting set to pandas dataframe for easy splitting, with meaningful column names
#(a re-run of this cell finds KGT already converted and keeps it)
if not isinstance(KGT, pd.DataFrame):
    KGT = pd.DataFrame(list(KGT), columns=["E1", "Rel", "E2"])

#A set has no stable iteration order between kernel sessions, so the rows are sorted first;
#otherwise the fixed random_state below would not give a reproducible split
KGT = KGT.sort_values(["E1", "Rel", "E2"]).reset_index(drop=True)

#80/10/10 split (plain positional slicing of the shuffled frame)
shuffled = KGT.sample(frac=1, random_state=42)
train_end = int(.8*len(KGT))
validation_end = int(.9*len(KGT))
train = shuffled.iloc[:train_end]
validation = shuffled.iloc[train_end:validation_end]
test = shuffled.iloc[validation_end:]

#Creating folder where dataset files will be saved
dataset_path = os.path.abspath("UMLS_KG")
try:
    os.mkdir(dataset_path)
    print("KG Directory Created")
except FileExistsError:
    #Only an existing directory is expected here; any other error (e.g. permissions) propagates
    print("KG Directory Already Exists")

#Saving datasets as .txt files to be used for training the KGE
np.savetxt(os.path.join(dataset_path, 'UMLS_KG-train.txt'), train.values, delimiter="\t", fmt="%s")
np.savetxt(os.path.join(dataset_path, 'UMLS_KG-valid.txt'), validation.values, delimiter="\t", fmt="%s")
np.savetxt(os.path.join(dataset_path, 'UMLS_KG-test.txt'), test.values, delimiter="\t", fmt="%s")

print('KG dataset saved...')

KGT.to_pickle(os.path.join(dataset_path, "KGT.pkl"))
print('KGT saved as dataframe')