In [None]:
#Run this cell once to install pykg2vec
!git clone https://github.com/Sujit-O/pykg2vec.git
%cd pykg2vec/
!python setup.py install

In [286]:
#Generate tuples (Question, Token, SemType)
%cd ~/Desktop/covid-project

import pandas as pd
s = pd.read_json('sample.json')

documents = s.iloc[0][0]['Document']['Utterances']
Metamap_Tokenizations = [] #To be used for final QA

def retrieve_tokens(SyntaxUnits):
    tokens = []
    for i in range(len(SyntaxUnits)):
        tokens.append(SyntaxUnits[i]['InputMatch'])
    return tokens

def retrieve_mappings(Mappings):
    mapped_semantic_types = []
    #No mappings found
    if len(Mappings) == 0:
        return [] #These words will get their embeddings from BERT
    else:
        candidates = Mappings[0]['MappingCandidates'] #Choosing Only top mappings
        for cnd in candidates:
            mapped_semantic_types.append([' '.join(cnd['MatchedWords']),cnd['SemTypes'][0]])
    return mapped_semantic_types
    
for doc in documents:
    Phrases = doc['Phrases']
    Phrase_Tokenizations = []
    Mappings = []
    for ph in Phrases:
        Phrase_Tokenizations.append(retrieve_tokens(ph['SyntaxUnits']))
        Mappings.append(retrieve_mappings(ph['Mappings']))
    #Flattening the Lists
    Phrase_Tokenizations = [item for sublist in Phrase_Tokenizations for item in sublist]
    Mappings = [item for sublist in Mappings for item in sublist]
    #Creating the final list
    Metamap_Tokenizations.append((doc['UttText'], Phrase_Tokenizations, Mappings))

/home/ubuntu/Desktop/covid-project


In [287]:
#Replacing each shorthand mapping with KG concept
import mysql.connector

mydb = mysql.connector.connect(host="localhost", user="root", password="Saptarshi123!", database="umls")
mycursor = mydb.cursor()

for i in range(len(Metamap_Tokenizations)):
    for j in range(len(Metamap_Tokenizations[i][2])):
        mycursor.execute("select STY_RL from SRDEF where ABR = '%s' " % Metamap_Tokenizations[i][2][j][1])
        Metamap_Tokenizations[i][2][j][1] = mycursor.fetchall()[0][0]

mycursor.close()

True

In [288]:
#Generating the KG triples (KGT)
from itertools import permutations
from tqdm.notebook import tqdm

All_Mappings = [y for x in Metamap_Tokenizations for y in x[2]]
All_Concept_Pairs = permutations(All_Mappings, 2)

mydb = mysql.connector.connect(host="localhost", user="root", password="Saptarshi123!", database="umls")
mycursor = mydb.cursor()

KGT = set()

for term_pair in tqdm(All_Concept_Pairs):
    semantic_type1 = term_pair[0][1]
    semantic_type2 = term_pair[1][1]
    mycursor.execute("select RL from SRSTR where STY_RL1 = '%s' and STY_RL2 = '%s' " % (semantic_type1, semantic_type2))
    relation = mycursor.fetchall()
    if relation != []:
        KGT.add((semantic_type1, relation[0][0], semantic_type2))

mycursor.close()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




True

In [289]:
#Creating Train/Validation/Test splits for training KGE's
import pandas as pd
import numpy as np
import os

#Converting set to pandas dataframe for easily split 
KGT = pd.DataFrame(KGT)

#Giving the KGT dataframe meaningful column names
KGT.rename(columns={0: "E1", 1: "Rel", 2: "E2"}, inplace=True)

KGT.loc[len(KGT.index)] = ['Age Group', 'isa', 'Group']


#80/10/10 split
train, validation, test = np.split(KGT.sample(frac=1, random_state=42), [int(.8*len(KGT)), int(.9*len(KGT))])

#Saving datasets as .txt files
dataset_path = os.path.abspath('UMLS_KG')
np.savetxt(os.path.join(dataset_path, 'UMLS_KG-train.txt'), train.values, delimiter="\t", fmt="%s")
np.savetxt(os.path.join(dataset_path, 'UMLS_KG-valid.txt'), validation.values, delimiter="\t", fmt="%s")
np.savetxt(os.path.join(dataset_path, 'UMLS_KG-test.txt'), test.values, delimiter="\t", fmt="%s")

In [291]:
#Run this cell to execute pykg2vec programs
%cd ~/Desktop/covid-project/pykg2vec/scripts

/home/ubuntu/Desktop/covid-project/pykg2vec/scripts


In [None]:
#Tune KGE model
!python pykg2vec_tune.py -mn DistMult -ds UMLS_KG -dsp ~/Desktop/covid-project/UMLS_KG \
-hpf ~/Desktop/covid-project/UMLS_KG/hyperparams.yaml

In [292]:
#Train KGE
!python pykg2vec_train.py -mn DistMult -ds UMLS_KG -dsp ~/Desktop/covid-project/UMLS_KG \
-lr 0.01 -l1 True -k 768 -b 128 -l 1000 -mg 1.00 -opt "sgd" -s "bern" -ngr 1

2021-05-24 20:50:51,891 - pykg2vec.config - INFO - 
------------------Global Setting--------------------
                  lmbda : 0.1
             batch_size : 128
                 margin : 1.0
              optimizer : sgd
               sampling : bern
               neg_rate : 1
                 epochs : 1000
          learning_rate : 0.01
            hidden_size : 768
        ent_hidden_size : 50
        rel_hidden_size : 50
          hidden_size_1 : 10
                l1_flag : True
                  alpha : 0.1
           filter_sizes : [1, 2, 3]
            num_filters : 50
    feature_map_dropout : 0.2
          input_dropout : 0.3
         hidden_dropout : 0.3
        hidden_dropout1 : 0.4
        hidden_dropout2 : 0.5
        label_smoothing : 0.1
                   cmax : 0.05
                   cmin : 5.0
    feature_permutation : 1
         reshape_height : 20
          reshape_width : 10
            kernel_size : 9
            in_channels : 9
                    way : pa

2021-05-24 20:50:52,020 - pykg2vec.utils.trainer - INFO - Epoch[54/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,021 - pykg2vec.utils.trainer - INFO - Epoch[55/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,021 - pykg2vec.utils.trainer - INFO - Epoch[56/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,022 - pykg2vec.utils.trainer - INFO - Epoch[57/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,022 - pykg2vec.utils.trainer - INFO - Epoch[58/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,023 - pykg2vec.utils.trainer - INFO - Epoch[59/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,024 - pykg2vec.utils.trainer - INFO - Epoch[60/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,024 - pykg2vec.utils.evaluator - INFO - Mini-Testing on [1/1] Triples in the valid set.
  0%|                                                     | 0/1 [00:00<?, ?it/s]100%|██████

2021-05-24 20:50:52,071 - pykg2vec.utils.evaluator - INFO - Mini-Testing on [1/1] Triples in the valid set.
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 261.44it/s]
2021-05-24 20:50:52,078 - pykg2vec.utils.evaluator - INFO - 
------Test Results for UMLS_KG: Epoch: 110 --- time: 0.01------------
--# of entities, # of relations: 7, 4
--mr,  filtered mr             : 6.5000, 6.5000
--mrr, filtered mrr            : 0.1548, 0.1548
--hits1                        : 0.0000 
--filtered hits1               : 0.0000 
--hits3                        : 0.0000 
--filtered hits3               : 0.0000 
--hits5                        : 0.0000 
--filtered hits5               : 0.0000 
--hits10                        : 1.0000 
--filtered hits10               : 1.0000 
---------------------------------------------------------

2021-05-24 20:50:52,078 - pykg2vec.utils.trainer - INFO - Reset the patience count to 3
2021-05-24 20:50:52,078 - pykg2vec.utils.trainer - INFO - Epoch[111/1

0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,169 - pykg2vec.utils.evaluator - INFO - Mini-Testing on [1/1] Triples in the valid set.
  0%|                                                     | 0/1 [00:00<?, ?it/s]100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1475.83it/s]
2021-05-24 20:50:52,171 - pykg2vec.utils.evaluator - INFO - 
------Test Results for UMLS_KG: Epoch: 180 --- time: 0.00------------
--# of entities, # of relations: 7, 4
--mr,  filtered mr             : 6.5000, 6.5000
--mrr, filtered mrr            : 0.1548, 0.1548
--hits1                        : 0.0000 
--filtered hits1               : 0.0000 
--hits3                        : 0.0000 
--filtered hits3               : 0.0000 
--hits5                        : 0.0000 
--filtered hits5               : 0.0000 
--hits10                        : 1.0000 
--filtered hits10               : 1.0000 
---------------------------------------------------------

2021-05-24 20:50:52

0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,228 - pykg2vec.utils.evaluator - INFO - Mini-Testing on [1/1] Triples in the valid set.
  0%|                                                     | 0/1 [00:00<?, ?it/s]100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 215.06it/s]
2021-05-24 20:50:52,234 - pykg2vec.utils.evaluator - INFO - 
------Test Results for UMLS_KG: Epoch: 240 --- time: 0.01------------
--# of entities, # of relations: 7, 4
--mr,  filtered mr             : 6.5000, 6.5000
--mrr, filtered mrr            : 0.1548, 0.1548
--hits1                        : 0.0000 
--filtered hits1               : 0.0000 
--hits3                        : 0.0000 
--filtered hits3               : 0.0000 
--hits5                        : 0.0000 
--filtered hits5               : 0.0000 
--hits10                        : 1.0000 
--filtered hits10               : 1.0000 
---------------------------------------------------------

2021-05-24 20:50:5

0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,280 - pykg2vec.utils.trainer - INFO - Epoch[289/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,281 - pykg2vec.utils.trainer - INFO - Epoch[290/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,281 - pykg2vec.utils.evaluator - INFO - Mini-Testing on [1/1] Triples in the valid set.
  0%|                                                     | 0/1 [00:00<?, ?it/s]100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1013.12it/s]
2021-05-24 20:50:52,284 - pykg2vec.utils.evaluator - INFO - 
------Test Results for UMLS_KG: Epoch: 290 --- time: 0.00------------
--# of entities, # of relations: 7, 4
--mr,  filtered mr             : 6.5000, 6.5000
--mrr, filtered mrr            : 0.1548, 0.1548
--hits1                        : 0.0000 
--filtered hits1               : 0.0000 
--hits3                        : 0.0000 
--filtered hits3               : 0.0000 
--hits5            

0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,330 - pykg2vec.utils.trainer - INFO - Epoch[345/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,332 - pykg2vec.utils.trainer - INFO - Epoch[346/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,333 - pykg2vec.utils.trainer - INFO - Epoch[347/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,333 - pykg2vec.utils.trainer - INFO - Epoch[348/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,334 - pykg2vec.utils.trainer - INFO - Epoch[349/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,335 - pykg2vec.utils.trainer - INFO - Epoch[350/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,336 - pykg2vec.utils.evaluator - INFO - Mini-Testing on [1/1] Triples in the valid set.
  0%|                                                     | 0/1 [00:00<?, ?it/s]100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 752.07it/s

0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,382 - pykg2vec.utils.trainer - INFO - Epoch[404/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,382 - pykg2vec.utils.trainer - INFO - Epoch[405/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,383 - pykg2vec.utils.trainer - INFO - Epoch[406/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,386 - pykg2vec.utils.trainer - INFO - Epoch[407/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,388 - pykg2vec.utils.trainer - INFO - Epoch[408/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,389 - pykg2vec.utils.trainer - INFO - Epoch[409/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,389 - pykg2vec.utils.trainer - INFO - Epoch[410/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,390 - pykg2vec.utils.evaluator - INFO - Mini-Testing on [1/1] Triples in the valid set.
  0%|                                        

0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,437 - pykg2vec.utils.evaluator - INFO - Mini-Testing on [1/1] Triples in the valid set.
  0%|                                                     | 0/1 [00:00<?, ?it/s]100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 858.61it/s]
2021-05-24 20:50:52,439 - pykg2vec.utils.evaluator - INFO - 
------Test Results for UMLS_KG: Epoch: 460 --- time: 0.00------------
--# of entities, # of relations: 7, 4
--mr,  filtered mr             : 6.5000, 6.5000
--mrr, filtered mrr            : 0.1548, 0.1548
--hits1                        : 0.0000 
--filtered hits1               : 0.0000 
--hits3                        : 0.0000 
--filtered hits3               : 0.0000 
--hits5                        : 0.0000 
--filtered hits5               : 0.0000 
--hits10                        : 1.0000 
--filtered hits10               : 1.0000 
---------------------------------------------------------

2021-05-24 20:50:5

0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,484 - pykg2vec.utils.trainer - INFO - Epoch[518/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,485 - pykg2vec.utils.trainer - INFO - Epoch[519/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,488 - pykg2vec.utils.trainer - INFO - Epoch[520/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,489 - pykg2vec.utils.evaluator - INFO - Mini-Testing on [1/1] Triples in the valid set.
  0%|                                                     | 0/1 [00:00<?, ?it/s]100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1473.24it/s]
2021-05-24 20:50:52,491 - pykg2vec.utils.evaluator - INFO - 
------Test Results for UMLS_KG: Epoch: 520 --- time: 0.00------------
--# of entities, # of relations: 7, 4
--mr,  filtered mr             : 6.5000, 6.5000
--mrr, filtered mrr            : 0.1548, 0.1548
--hits1                        : 0.0000 
--filtered hits1              

2021-05-24 20:50:52,536 - pykg2vec.utils.trainer - INFO - Reset the patience count to 3
2021-05-24 20:50:52,536 - pykg2vec.utils.trainer - INFO - Epoch[571/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,536 - pykg2vec.utils.trainer - INFO - Epoch[572/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,537 - pykg2vec.utils.trainer - INFO - Epoch[573/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,539 - pykg2vec.utils.trainer - INFO - Epoch[574/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,540 - pykg2vec.utils.trainer - INFO - Epoch[575/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,541 - pykg2vec.utils.trainer - INFO - Epoch[576/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,541 - pykg2vec.utils.trainer - INFO - Epoch[577/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,542 - pykg2vec.utils.trainer - INFO - Epoch[578/1000]
0it [00:00, ?it/s]0it [00:00,

2021-05-24 20:50:52,587 - pykg2vec.utils.evaluator - INFO - 
------Test Results for UMLS_KG: Epoch: 630 --- time: 0.00------------
--# of entities, # of relations: 7, 4
--mr,  filtered mr             : 6.5000, 6.5000
--mrr, filtered mrr            : 0.1548, 0.1548
--hits1                        : 0.0000 
--filtered hits1               : 0.0000 
--hits3                        : 0.0000 
--filtered hits3               : 0.0000 
--hits5                        : 0.0000 
--filtered hits5               : 0.0000 
--hits10                        : 1.0000 
--filtered hits10               : 1.0000 
---------------------------------------------------------

2021-05-24 20:50:52,588 - pykg2vec.utils.trainer - INFO - Reset the patience count to 3
2021-05-24 20:50:52,588 - pykg2vec.utils.trainer - INFO - Epoch[631/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,589 - pykg2vec.utils.trainer - INFO - Epoch[632/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 2

2021-05-24 20:50:52,639 - pykg2vec.utils.evaluator - INFO - 
------Test Results for UMLS_KG: Epoch: 690 --- time: 0.00------------
--# of entities, # of relations: 7, 4
--mr,  filtered mr             : 6.5000, 6.5000
--mrr, filtered mrr            : 0.1548, 0.1548
--hits1                        : 0.0000 
--filtered hits1               : 0.0000 
--hits3                        : 0.0000 
--filtered hits3               : 0.0000 
--hits5                        : 0.0000 
--filtered hits5               : 0.0000 
--hits10                        : 1.0000 
--filtered hits10               : 1.0000 
---------------------------------------------------------

2021-05-24 20:50:52,639 - pykg2vec.utils.trainer - INFO - Reset the patience count to 3
2021-05-24 20:50:52,639 - pykg2vec.utils.trainer - INFO - Epoch[691/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,639 - pykg2vec.utils.trainer - INFO - Epoch[692/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 2

0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,689 - pykg2vec.utils.trainer - INFO - Epoch[744/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,690 - pykg2vec.utils.trainer - INFO - Epoch[745/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,691 - pykg2vec.utils.trainer - INFO - Epoch[746/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,691 - pykg2vec.utils.trainer - INFO - Epoch[747/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,692 - pykg2vec.utils.trainer - INFO - Epoch[748/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,692 - pykg2vec.utils.trainer - INFO - Epoch[749/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,693 - pykg2vec.utils.trainer - INFO - Epoch[750/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,694 - pykg2vec.utils.evaluator - INFO - Mini-Testing on [1/1] Triples in the valid set.
  0%|                                        

0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,741 - pykg2vec.utils.trainer - INFO - Epoch[802/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,741 - pykg2vec.utils.trainer - INFO - Epoch[803/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,744 - pykg2vec.utils.trainer - INFO - Epoch[804/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,745 - pykg2vec.utils.trainer - INFO - Epoch[805/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,745 - pykg2vec.utils.trainer - INFO - Epoch[806/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,746 - pykg2vec.utils.trainer - INFO - Epoch[807/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,747 - pykg2vec.utils.trainer - INFO - Epoch[808/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,747 - pykg2vec.utils.trainer - INFO - Epoch[809/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,748 - pykg2vec.utils

0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,792 - pykg2vec.utils.trainer - INFO - Epoch[857/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,792 - pykg2vec.utils.trainer - INFO - Epoch[858/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,793 - pykg2vec.utils.trainer - INFO - Epoch[859/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,795 - pykg2vec.utils.trainer - INFO - Epoch[860/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,796 - pykg2vec.utils.evaluator - INFO - Mini-Testing on [1/1] Triples in the valid set.
  0%|                                                     | 0/1 [00:00<?, ?it/s]100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 816.97it/s]
2021-05-24 20:50:52,799 - pykg2vec.utils.evaluator - INFO - 
------Test Results for UMLS_KG: Epoch: 860 --- time: 0.00------------
--# of entities, # of relations: 7, 4
--mr,  filtered mr             : 6.5000, 6.5000
--mrr,

0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,843 - pykg2vec.utils.trainer - INFO - Epoch[913/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,843 - pykg2vec.utils.trainer - INFO - Epoch[914/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,844 - pykg2vec.utils.trainer - INFO - Epoch[915/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,844 - pykg2vec.utils.trainer - INFO - Epoch[916/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,845 - pykg2vec.utils.trainer - INFO - Epoch[917/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,847 - pykg2vec.utils.trainer - INFO - Epoch[918/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,847 - pykg2vec.utils.trainer - INFO - Epoch[919/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,849 - pykg2vec.utils.trainer - INFO - Epoch[920/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,850 - pykg2vec.utils

0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,894 - pykg2vec.utils.trainer - INFO - Epoch[965/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,894 - pykg2vec.utils.trainer - INFO - Epoch[966/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,895 - pykg2vec.utils.trainer - INFO - Epoch[967/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,898 - pykg2vec.utils.trainer - INFO - Epoch[968/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,899 - pykg2vec.utils.trainer - INFO - Epoch[969/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,900 - pykg2vec.utils.trainer - INFO - Epoch[970/1000]
0it [00:00, ?it/s]0it [00:00, ?it/s]
2021-05-24 20:50:52,901 - pykg2vec.utils.evaluator - INFO - Mini-Testing on [1/1] Triples in the valid set.
  0%|                                                     | 0/1 [00:00<?, ?it/s]100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1116.99it/s

In [293]:
#Converting KGE to BERT embeddings (Domain Term Encoding (DTE)) - part1 (generating associated triples)
#[Entity Expansion]
%cd ~/Desktop/covid-project/UMLS_KG/

import numpy as np
import pandas as pd
import pickle

#Mapping b/w entity and corresponding ID
with open('entity2idx.pkl', 'rb') as f:
    entity2id = pickle.load(f)

#Mapping b/w relation and corresponding ID
with open('relation2idx.pkl', 'rb') as f:
    relation2id = pickle.load(f)

def triple_gen(current_entity):
    results = KGT.query("E1==@current_entity")
    connected_entities = results.E2.to_list()
    outgoing_relations = results.Rel.to_list()
    a = []
    for i in range(len(results)):
        a.append([current_entity, outgoing_relations[i], connected_entities[i]])
    a = [y for x in a for y in x]
    return a

triple_list = []
for entity in tqdm(entity2id.keys()):
    triple_list.append(triple_gen(entity))

/home/ubuntu/Desktop/covid-project/UMLS_KG


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




In [274]:
#Converting KGE to BERT embeddings (Domain Term Encoding (DTE)) - part2 (each KG item -> (KG item, KGE))
#KGE located here
%cd ~/Desktop/covid-project/UMLS_KG/embeddings/distmult

ent_embeddings = pd.read_csv('ent_embedding.tsv', sep='\t', header=None)
rel_embeddings = pd.read_csv('rel_embedding.tsv', sep='\t', header=None)

'''
Associating each item in the triple list with respective embeddings. 
This is done to create an easy Domain Term BERT embedding matrix.
'''
for TL in tqdm(triple_list):
    if TL == []:
        continue
    i = 0
    for index, item in enumerate(TL):
        if (i%3 == 0) or (i%3 == 2): #This item is an entity
            TL[index] = (item, ent_embeddings.iloc[entity2id[item]].to_numpy())
        else: #This item is a relation
            TL[index] = (item, rel_embeddings.iloc[relation2id[item]].to_numpy())
        i += 1

/home/ubuntu/Desktop/covid-project/UMLS_KG/embeddings/distmult


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




In [275]:
#Converting KGE to BERT embeddings (Domain Term Encoding (DTE)) - part3 (Passing KGE's through BERT) 
#[Creating DTE Lookup Table]
from transformers import BertModel
import torch

model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

matrix = model.get_input_embeddings() #BERT embeddings

CLS_embedding = matrix(torch.LongTensor([101]))
SEP_embedding = matrix(torch.LongTensor([102]))

DTE_BERT_Matrix = {}

with torch.no_grad():
    for seq in tqdm(triple_list):
        if seq == []:
            continue
        
        outputs = model(inputs_embeds = torch.unsqueeze(\
                                        torch.cat(\
                                        (CLS_embedding,\
                                         torch.FloatTensor([x[1] for x in seq]), SEP_embedding)), dim=1))
        
        #Collecting all the embeddings for the current domain term in e[]
        e = []
        
        '''
        Starting at 1 & ending at (len -1) to a/c for [CLS] & [SEP].
        Step size is 3 since the required entity occurs in spaces of 3, according to the expansion scheme.
        '''
        for i in range(1, (len(seq) - 1), 3): 
            e.append(outputs[0][i])
        
        '''
        The BERT embedding for each entity will be the average of all its occurrences.
        *e provides all the elements of e (unpacking).
        '''
        DTE_BERT_Matrix[seq[0][0]] = torch.mean(torch.stack([*e], dim=0), dim=0)

'''
Saving DTE_BERT embeddings to a lookup table (dataframe) & clearing DTE_BERT_Matrix.
Dataframes allow quicker lookups
'''
s = pd.DataFrame(list(DTE_BERT_Matrix.items()),columns = ['Term','Embedding'])
s.to_csv('DTE_BERT_Matrix.csv')
DTE_BERT_Matrix.clear()

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




In [278]:
#Extracting embeddings for non-domain terms. I'm simply using BERT's tokenizer for the nDT's.
#Creating question representations in this block.
from transformers import BertTokenizer, BertModel
import torch
import re

#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
question_representations = []

for tup in Metamap_Tokenizations:
    metamap_tokenized_question = tup[1]
    
    #Removing punctuations/spaces from domain-terms for easy comparison
    domain_terms = [re.sub(r'[\W\s]','',x[0]).lower() for x in tup[2]] 

    '''
    Note: is_split_into_words is not the same as pre-tokenized. BERT uses subwords tokenization.
    Thus, when the above is set to True, it simply tells the tokenizer to run BERT's scheme on the resp. words.
    '''
    encoded_input = tokenizer(metamap_tokenized_question, is_split_into_words=True, return_tensors="pt")
    
    model.eval()
    with torch.no_grad():
        outputs = model(**encoded_input)
    
    question_embeddings = []
    start_index = 1
    for word in metamap_tokenized_question:
        filtered_word = re.sub(r'\W','',word).lower()
        number_of_subwords = len(tokenizer(word)['input_ids']) - 2 #1 for CLS & 1 for SEP
        end_index = start_index + number_of_subwords
        
        if filtered_word in domain_terms: #Use DTE_BERT_Matrix
            mapped_concept = tup[2][domain_terms.index(filtered_word)][1]
            DTE = s.query("Term==@mapped_concept")['Embedding'].values[0]
            question_embeddings.append(DTE)
        
        else: #Use Regular BERT subword embeddings
            question_embeddings.append(outputs.last_hidden_state[0][start_index:end_index])
        
        start_index = end_index
    
    #In this way, I don't have to add the CLS & SEP embeddings during fine-tuning.
    final_representation = torch.unsqueeze(torch.cat(\
                                                     (CLS_embedding, \
                                                      torch.FloatTensor([x for x in question_embeddings]),\
                                                      SEP_embedding)), dim=1)
    
    question_representations.append(final_representation)

#Saving the question vectors to disk
with open('question_representation.data', 'wb') as filehandle:
    pickle.dump(question_representations, filehandle)

IndexError: index 0 is out of bounds for axis 0 with size 0

In [294]:
entity2id

{'Daily or Recreational Activity': 0,
 'Disease or Syndrome': 1,
 'Functional Concept': 2,
 'Idea or Concept': 3,
 'Pathologic Function': 4,
 'Qualitative Concept': 5,
 'Virus': 6}