In [1]:
#Generate tuples (Question, Token, SemType)
%cd ~/Desktop/CDQA-project

from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
s = pd.read_json('metamap_output.json')

documents = s.iloc[0][0]['Document']['Utterances']
Metamap_Tokenizations = [] #To be used for final QA

def retrieve_tokens(SyntaxUnits):
    """Return the 'InputMatch' value of every syntax unit, in input order.

    Parameters
    ----------
    SyntaxUnits : sequence of dict
        Each element must contain an 'InputMatch' key.

    Returns
    -------
    list
        One entry per syntax unit; an empty list when there are no units.
    """
    return [unit['InputMatch'] for unit in SyntaxUnits]

def retrieve_mappings(Mappings):
    """Build [matched text, semantic type] pairs from the first mapping.

    Parameters
    ----------
    Mappings : sequence of dict
        Mapping entries for one phrase. Only ``Mappings[0]`` is read
        (the original author treats it as the top mapping); it must
        contain a 'MappingCandidates' list whose items carry
        'MatchedWords' and a non-empty 'SemTypes'.

    Returns
    -------
    list of list
        One mutable ``[words, sem_type]`` pair per candidate, where
        ``words`` is the candidate's matched words joined by single spaces
        and ``sem_type`` is the first entry of its 'SemTypes'. Empty when
        the phrase has no mappings.
    """
    # No mappings found: these words will get their embeddings from BERT.
    if len(Mappings) == 0:
        return []

    first_mapping_candidates = Mappings[0]['MappingCandidates']
    return [
        [' '.join(candidate['MatchedWords']), candidate['SemTypes'][0]]
        for candidate in first_mapping_candidates
    ]
    
# For every utterance, gather the tokens and the [text, semantic type]
# mappings of all its phrases into two flat lists, then record the triple
# (utterance text, tokens, mappings).
for doc in tqdm(documents):
    Phrases = doc['Phrases']
    Phrase_Tokenizations = []
    Mappings = []
    for ph in Phrases:
        # extend() keeps both lists flat from the start, so no separate
        # flattening pass is needed afterwards.
        Phrase_Tokenizations.extend(retrieve_tokens(ph['SyntaxUnits']))
        Mappings.extend(retrieve_mappings(ph['Mappings']))
    Metamap_Tokenizations.append((doc['UttText'], Phrase_Tokenizations, Mappings))

/home/ubuntu/Desktop/CDQA-project


HBox(children=(FloatProgress(value=0.0, max=2007.0), HTML(value='')))




In [2]:
#Replacing each shorthand mapping with KG concept
import os
from getpass import getpass

import mysql.connector
from numba import jit

# The database password must not live in the notebook (it would be shared
# with every copy of the file). It is read from the UMLS_DB_PASSWORD
# environment variable, falling back to an interactive prompt.
umls_db_password = os.environ.get("UMLS_DB_PASSWORD") or getpass("MySQL password for user 'root': ")

mydb = mysql.connector.connect(host="localhost", user="root", password=umls_db_password, database="umls")
mycursor = mydb.cursor()

def shrthand_to_mapped_concept(Metamap_Tokenizations, cursor=None):
    """Replace each mapping's abbreviation with the STY_RL value from SRDEF.

    The function is intentionally NOT numba-jitted: nopython mode cannot
    compile calls on a database cursor or mutation of nested Python lists.

    Parameters
    ----------
    Metamap_Tokenizations : sequence
        Entries whose element at index 2 is a list of mutable
        ``[text, abbreviation]`` pairs. The pairs are updated IN PLACE.
    cursor : DB-API cursor, optional
        Cursor used for the lookups. Defaults to the notebook-level
        ``mycursor``, which must still be open when this is called.

    Returns
    -------
    The same ``Metamap_Tokenizations`` object, after in-place replacement.

    Raises
    ------
    IndexError
        If an abbreviation has no matching row in SRDEF.
    """
    if cursor is None:
        cursor = mycursor  # notebook-level cursor created alongside `mydb`

    # Each distinct abbreviation is looked up once; repeats reuse the result.
    resolved = {}
    for entry in Metamap_Tokenizations:
        for mapping in entry[2]:
            abbreviation = mapping[1]
            if abbreviation not in resolved:
                # Bound parameter instead of string interpolation, so quoting
                # in the value cannot break or alter the statement.
                cursor.execute("select STY_RL from SRDEF where ABR = %s", (abbreviation,))
                rows = cursor.fetchall()
                if not rows:
                    raise IndexError("no SRDEF row found for abbreviation %r" % (abbreviation,))
                resolved[abbreviation] = rows[0][0]
            mapping[1] = resolved[abbreviation]
    return Metamap_Tokenizations

# The lookups run through `mycursor`, so it has to stay open until they are
# done. It is closed afterwards, together with its connection, even when a
# lookup fails (closing it first makes every query fail).
try:
    Metamap_Tokenizations = shrthand_to_mapped_concept(Metamap_Tokenizations)
finally:
    mycursor.close()
    mydb.close()

#Saving Metamap_Tokenizations for use during question embedding creation
pd.DataFrame(
    Metamap_Tokenizations,
    columns=['Question', 'Tokenization', 'Mappings'],
).to_pickle('Metamap_Tokenizations.pkl')

In [3]:
#Generating the KG triples (KGT)
import os
from collections import Counter
from getpass import getpass
from itertools import permutations, product

All_Mappings = [y for x in Metamap_Tokenizations for y in x[2]]

# Only the semantic type (index 1 of each mapping) is used in the query, so
# pairing every mapping OCCURRENCE repeats the same query many times and
# grows quadratically with the number of mappings. Pair the DISTINCT types
# instead. A (t, t) self-pair is kept only when t occurs at least twice,
# which is exactly when the occurrence-level permutations would produce it,
# so the resulting set of triples is unchanged.
semantic_type_counts = Counter(mapping[1] for mapping in All_Mappings)
All_Concept_Pairs = [
    (type1, type2)
    for type1, type2 in product(semantic_type_counts, repeat=2)
    if type1 != type2 or semantic_type_counts[type1] > 1
]

# The database password is not stored in the notebook: it comes from the
# UMLS_DB_PASSWORD environment variable or an interactive prompt.
umls_db_password = os.environ.get("UMLS_DB_PASSWORD") or getpass("MySQL password for user 'root': ")
mydb = mysql.connector.connect(host="localhost", user="root", password=umls_db_password, database="umls")
mycursor = mydb.cursor()

KGT = set() #I'm using a set to avoid repeated triples.

try:
    # All_Concept_Pairs is a list, so tqdm can show a real total.
    for semantic_type1, semantic_type2 in tqdm(All_Concept_Pairs):
        # Bound parameters instead of string interpolation.
        mycursor.execute(
            "select RL from SRSTR where STY_RL1 = %s and STY_RL2 = %s",
            (semantic_type1, semantic_type2),
        )
        relation = mycursor.fetchall()
        # NOTE(review): only the first returned row is kept; if a pair has
        # several relations the others are dropped, and without ORDER BY the
        # "first" row is up to the database. Confirm this is intended.
        if relation:
            KGT.add((semantic_type1, relation[0][0], semantic_type2))
finally:
    # Release the cursor and the connection even if the loop is interrupted.
    mycursor.close()
    mydb.close()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

KeyboardInterrupt: 

In [None]:
#Creating Train/Validation/Test splits for training KGE's
import pandas as pd
import numpy as np
import os

#Converting the set to a pandas dataframe so it is easy to split.
# A set's iteration order depends on string hashing and can change between
# kernels, which would make the seeded shuffle below give a different split
# on every run. Sorting first fixes the row order. Elements are compared as
# strings so a missing (None) value cannot break the sort.
if isinstance(KGT, pd.DataFrame):
    # Cell re-run: KGT was already converted; just make sure the names are set.
    KGT = KGT.rename(columns={0: "E1", 1: "Rel", 2: "E2"})
else:
    ordered_triples = sorted(KGT, key=lambda triple: tuple(str(part) for part in triple))
    #Giving the KGT dataframe meaningful column names
    KGT = pd.DataFrame(ordered_triples, columns=["E1", "Rel", "E2"])

#80/10/10 split
# Plain positional slicing of the shuffled frame; cut points are the same
# as before (80% and 90% of the row count, truncated to int).
shuffled_triples = KGT.sample(frac=1, random_state=42)
train_end = int(.8*len(KGT))
validation_end = int(.9*len(KGT))
train = shuffled_triples.iloc[:train_end]
validation = shuffled_triples.iloc[train_end:validation_end]
test = shuffled_triples.iloc[validation_end:]
assert len(train) + len(validation) + len(test) == len(KGT)

#Creating folder where dataset files will be saved
dataset_path = os.path.abspath("UMLS_KG")
try:
    os.mkdir(dataset_path)
    print("KG Directory Created")
except FileExistsError:
    # Only "already exists" is expected here; any other failure (for
    # example a permission error) is allowed to surface.
    print("KG Directory Already Exists")

#Saving datasets as .txt files to be used for training the KGE
np.savetxt(os.path.join(dataset_path, 'UMLS_KG-train.txt'), train.values, delimiter="\t", fmt="%s")
np.savetxt(os.path.join(dataset_path, 'UMLS_KG-valid.txt'), validation.values, delimiter="\t", fmt="%s")
np.savetxt(os.path.join(dataset_path, 'UMLS_KG-test.txt'), test.values, delimiter="\t", fmt="%s")

print('KG dataset saved...')

KGT.to_pickle(os.path.join(dataset_path, "KGT.pkl"))
print('KGT saved as dataframe')