In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from tqdm import tqdm
from biobert_embedding.embedding import BiobertEmbedding

data = pd.read_csv('complete_knowledge_base_with_bi_relations_temp_stable_icd.txt', sep = "\t", header = None)
data.head()
nodes_dic = {}

for i in tqdm(range(0,data.shape[0])):
    if data[0][i] in nodes_dic and data[2][i] in nodes_dic:
        continue
    if data[1][i] == '_associated_to':
        nodes_dic[str(data[0][i])+" homo sapiens disease"] = 'disease'
        nodes_dic[str(data[2][i])+" homo sapiens disease"] = 'disease'
    elif data[1][i] == '_is_treated_by':
        nodes_dic[str(data[0][i])+" homo sapiens disease"] = 'disease'
        nodes_dic[str(data[2][i])+" homo sapiens drug"] = 'drug'
    elif data[1][i] == '_treats':
        nodes_dic[str(data[0][i])+" homo sapiens drug"] = 'drug'
        nodes_dic[str(data[2][i])+" homo sapiens disease"] = 'disease'
    elif data[1][i] == '_is_caused_by':
        nodes_dic[str(data[0][i])+" homo sapiens disease"] = 'disease'
        nodes_dic[str(data[2][i])+" homo sapiens gene"] = 'gene'
    elif data[1][i] == '_causes':
        nodes_dic[str(data[0][i])+" homo sapiens gene"] = 'gene'
        nodes_dic[str(data[2][i])+" homo sapiens disease"] = 'disease'
    elif data[1][i] == '_has_disease_phenotype':
        nodes_dic[str(data[0][i])+" homo sapiens disease"] = 'disease'
        nodes_dic[str(data[2][i])+" homo sapiens phenotype"] = 'phenotype'
    elif data[1][i] == '_disrupts':
        nodes_dic[str(data[0][i])+" homo sapiens disease"] = 'disease'
        nodes_dic[str(data[2][i])+" homo sapiens pathway"] = 'pathway'
    elif data[1][i] == '_has_side_effects':
        nodes_dic[str(data[0][i])+" homo sapiens drug"] = 'drug'
        nodes_dic[str(data[2][i])+" homo sapiens phenotype"] = 'phenotype'
    elif data[1][i] == '_side_effect_of':
        nodes_dic[str(data[0][i])+" homo sapiens phenotype"] = 'phenotype'
        nodes_dic[str(data[2][i])+" homo sapiens drug"] = 'drug'
    elif data[1][i] == '_is_gene_phenotype_of':
        nodes_dic[str(data[0][i])+" homo sapiens phenotype"] = 'phenotype'
        nodes_dic[str(data[2][i])+" homo sapiens gene"] = 'gene'
    elif data[1][i] == '_has_gene_phenotype':
        nodes_dic[str(data[0][i])+" homo sapiens gene"] = 'gene'
        nodes_dic[str(data[2][i])+" homo sapiens phenotype"] = 'phenotype'
    elif data[1][i] == '_involves':
        nodes_dic[str(data[0][i])+" homo sapiens pathway"] = 'pathway'
        nodes_dic[str(data[2][i])+" homo sapiens gene"] = 'gene'
    elif data[1][i] == '_has_interaction':
        nodes_dic[str(data[0][i])+" homo sapiens gene"] = 'gene'
        nodes_dic[str(data[2][i])+" homo sapiens gene"] = 'gene'
    elif data[1][i] == '_has_function':
        nodes_dic[str(data[0][i])+" homo sapiens gene"] = 'gene'
        nodes_dic[str(data[2][i])+" homo sapiens GO"] = 'GO'
    elif data[1][i] == '_is_function_of':
        nodes_dic[str(data[0][i])+" homo sapiens GO"] = 'GO'
        nodes_dic[str(data[2][i])+" homo sapiens gene"] = 'gene'
        
node0 = {}
node1 = {}
node2 = {}
node3 = {}
node4 = {}
node5 = {}

for key in nodes_dic:
    key = str(key)
    if nodes_dic[key] == 'disease':
        node0[key] = 'disease'
    elif nodes_dic[key] == 'drug':
        node1[key] = 'drug'
    elif nodes_dic[key] == 'pathway':
        node2[key] = 'pathway'
    elif nodes_dic[key] == 'gene':
        node3[key] = 'gene'
    elif nodes_dic[key] == 'phenotype':
        node4[key] = 'phenotype'
    elif nodes_dic[key] == 'GO':
        node5[key] = 'GO'

100%|██████████████████████████████████████████████████████████████████████| 2087736/2087736 [05:08<00:00, 6765.98it/s]


In [2]:
values = list(nodes_dic.keys()) 
#print(values)

biobert = BiobertEmbedding(model_path = "./biobert_v1.1_pubmed_pytorch_model")

sentence_embedding = []
for i in tqdm(range(0,len(values))):
    sentence_embedding.append(biobert.sentence_vector(values[i]))

100%|████████████████████████████████████████████████████████████████████████████| 52382/52382 [50:21<00:00, 17.33it/s]


In [3]:
print(sentence_embedding[0].shape)

with open('feature_embeddings.txt', 'w') as f:
    for line in sentence_embedding:
        f.write(f"{line}\n")

torch.Size([768])


In [4]:
values = list(nodes_dic.keys()) 
#print(values)
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
#print(integer_encoded)
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
#print(onehot_encoded)

lab = {}
feats = {}
i = 0
for key in nodes_dic:
    lab[key] = integer_encoded[i]
    feats[key] = sentence_embedding[i]
    i = i+1
    
#print(lab)

In [5]:
values = list(node0.keys()) 
newvalues = []

for i in values:
    newvalues.append(i.split(" ")[0])

k = len(values)
df = pd.DataFrame(list(range(0,len(values))), columns = ['node_id'])
labels = []
features = []
for key in node0:
    labels.append(lab[key][0])
    features.append(feats[key])
    
df['label'] = values
feat = []
for i in features:
    st = ','.join([str(x) for x in i.numpy()])
    feat.append(st)

node0dict = pd.Series(df.node_id.values, index = df.label).to_dict()
df['feat'] = feat
df['label'] = labels
df['entity'] = newvalues
df.to_csv('nodes_0.csv', index = False)
df.head()

Unnamed: 0,node_id,label,feat,entity
0,0,5112,"0.05508331,-0.33484593,0.10192123,-0.26511663,...",201.2
1,1,16175,"0.060138892,-0.34264126,0.006986717,-0.3092374...",577.8
2,2,19271,"0.0026293222,-0.36949572,0.13884567,-0.2916490...",759.3
3,3,9807,"-0.058700405,-0.36050797,0.043681093,-0.369164...",345.9
4,4,13740,"0.07480559,-0.30327848,0.00028393665,-0.302976...",521.0


In [6]:
values = list(node1.keys()) 
newvalues = []

for i in values:
    newvalues.append(i.split(" ")[0])
    
df = pd.DataFrame(list(range(k,k+len(values))), columns = ['node_id'])
k = k+len(values)
labels = []
features = []
for key in node1:
    labels.append(lab[key][0])
    features.append(feats[key])
    
df['label'] = values
feat = []

for i in features:
    st = ','.join([str(x) for x in i.numpy()])
    feat.append(st)

node1dict = pd.Series(df.node_id.values, index = df.label).to_dict()
df['feat'] = feat
df['label'] = labels
df['entity'] = newvalues
df.to_csv('nodes_1.csv', index = False)
df.head()

Unnamed: 0,node_id,label,feat,entity
0,2464,24311,"0.06522106,-0.2211061,0.11408888,0.040378723,0...",CID100060864
1,2465,24000,"0.15641543,-0.1384391,0.061920777,0.02426165,0...",CID100005212
2,2466,23988,"0.088535234,-0.11837815,0.19366908,-0.01153388...",CID100005095
3,2467,23729,"6.449209e-06,-0.20198692,0.0611973,0.010203489...",CID100003715
4,2468,23434,"0.118697315,-0.13195342,0.047936473,0.04756776...",CID100002182


In [7]:
values = list(node2.keys()) 
newvalues = []

for i in values:
    newvalues.append(i.split(" ")[0])

df = pd.DataFrame(list(range(k,k+len(values))), columns = ['node_id'])
k = k+len(values)
labels = []
features = []
for key in node2:
    labels.append(lab[key][0])
    features.append(feats[key])
    
df['label'] = values
feat = []

for i in features:
    st = ','.join([str(x) for x in i.numpy()])
    feat.append(st)

node2dict = pd.Series(df.node_id.values, index = df.label).to_dict()
df['feat'] = feat
df['label'] = labels
df['entity'] = newvalues
df.to_csv('nodes_2.csv', index = False)
df.head()

Unnamed: 0,node_id,label,feat,entity
0,3924,50489,"0.13868761,-0.3368274,0.04618159,-0.020018913,...",R-HSA-1643685
1,3925,52069,"0.23453543,-0.3526016,-0.0079329135,-0.1296043...",R-HSA-74217
2,3926,52022,"0.27988893,-0.3734045,-0.028199142,-0.17374587...",R-HSA-72613
3,3927,50432,"0.2299288,-0.33930093,0.011758104,-0.11767156,...",R-HSA-157118
4,3928,51457,"0.17196472,-0.24499781,0.013350292,0.002207587...",R-HSA-5218920


In [8]:
values = list(node3.keys()) 
newvalues = []

for i in values:
    newvalues.append(i.split(" ")[0])

df = pd.DataFrame(list(range(k, k+len(values))), columns = ['node_id'])
k = k + len(values)

labels = []
features = []
for key in node3:
    labels.append(lab[key][0])
    features.append(feats[key])
    
df['label'] = values
feat = []

for i in features:
    st = ','.join([str(x) for x in i.numpy()])
    feat.append(st)

node3dict = pd.Series(df.node_id.values, index = df.label).to_dict()
df['feat'] = feat
df['label'] = labels
df['entity'] = newvalues
df.to_csv('nodes_3.csv', index = False)

In [9]:
values = list(node4.keys()) 
newvalues = []

for i in values:
    newvalues.append(i.split(" ")[0])

df = pd.DataFrame(list(range(k,k+len(values))), columns = ['node_id'])
k = k+len(values)
labels = []
features = []
for key in node4:
    labels.append(lab[key][0])
    features.append(feats[key])
    
df['label'] = values
feat = []

for i in features:
    st = ','.join([str(x) for x in i.numpy()])
    feat.append(st)

node4dict = pd.Series(df.node_id.values, index = df.label).to_dict()
df['feat'] = feat
df['label'] = labels
df['entity'] = newvalues
df.to_csv('nodes_4.csv', index = False)

In [10]:
values = list(node5.keys()) 
newvalues = []

for i in values:
    newvalues.append(i.split(" ")[0])

df = pd.DataFrame(list(range(k,k+len(values))), columns = ['node_id'])
k = k + len(values)
labels = []
features = []
for key in node5:
    labels.append(lab[key][0])
    features.append(feats[key])
    
df['label'] = values
feat = []

for i in features:
    st = ','.join([str(x) for x in i.numpy()])
    feat.append(st)

node5dict = pd.Series(df.node_id.values, index = df.label).to_dict()
df['feat'] = feat
df['label'] = labels
df['entity'] = newvalues
df.to_csv('nodes_5.csv', index = False)

In [11]:
src = []
dst = []
label = []
feat = []
etype = []

for i in tqdm(range(0,data.shape[0])):
    if data[1][i] == '_associated_to':
        src.append(node0dict[str(data[0][i])+" homo sapiens disease"])
        dst.append(node0dict[str(data[2][i])+" homo sapiens disease"])
        label.append(0)
        feat.append(0)
        etype.append(0)
        
        
    elif data[1][i] == '_is_treated_by':
        src.append(node0dict[str(data[0][i])+" homo sapiens disease"])
        dst.append(node1dict[str(data[2][i])+" homo sapiens drug"])
        label.append(1)
        feat.append(1)
        etype.append(1)
        
        
    elif data[1][i] == '_treats':
        src.append(node1dict[str(data[0][i])+" homo sapiens drug"])
        dst.append(node0dict[str(data[2][i])+" homo sapiens disease"])
        label.append(2)
        feat.append(2)
        etype.append(2)
        
    elif data[1][i] == '_is_caused_by':
        src.append(node0dict[str(data[0][i])+" homo sapiens disease"])
        dst.append(node3dict[str(data[2][i])+" homo sapiens gene"])
        label.append(4)
        feat.append(4)
        etype.append(4)
        
       
    elif data[1][i] == '_causes':
        src.append(node3dict[str(data[0][i])+" homo sapiens gene"])
        dst.append(node0dict[str(data[2][i])+" homo sapiens disease"])
        label.append(5)
        feat.append(5)
        etype.append(5)
       
        
    elif data[1][i] == '_has_disease_phenotype':
        src.append(node0dict[str(data[0][i])+" homo sapiens disease"])
        dst.append(node4dict[str(data[2][i])+" homo sapiens phenotype"])
        label.append(6)
        feat.append(6)
        etype.append(6)
        
        
    elif data[1][i] == '_disrupts':
        src.append(node0dict[str(data[0][i])+" homo sapiens disease"])
        dst.append(node2dict[str(data[2][i])+" homo sapiens pathway"])
        label.append(3)
        feat.append(3)
        etype.append(3)
        
        
    elif data[1][i] == '_has_side_effects':
        src.append(node1dict[str(data[0][i])+" homo sapiens drug"])
        dst.append(node4dict[str(data[2][i])+" homo sapiens phenotype"])
        label.append(7)
        feat.append(7)
        etype.append(7)
        
        
    elif data[1][i] == '_side_effect_of':
        src.append(node4dict[str(data[0][i])+" homo sapiens phenotype"])
        dst.append(node1dict[str(data[2][i])+" homo sapiens drug"])
        label.append(8)
        feat.append(8)
        etype.append(8)
       
        
    elif data[1][i] == '_is_gene_phenotype_of':
        src.append(node4dict[str(data[0][i])+" homo sapiens phenotype"])
        dst.append(node3dict[str(data[2][i])+" homo sapiens gene"])
        label.append(10)
        feat.append(10)
        etype.append(10)
        
        
    elif data[1][i] == '_has_gene_phenotype':
        src.append(node3dict[str(data[0][i])+" homo sapiens gene"])
        dst.append(node4dict[str(data[2][i])+" homo sapiens phenotype"])
        label.append(11)
        feat.append(11)
        etype.append(11)
        
        
    elif data[1][i] == '_involves':
        src.append(node2dict[str(data[0][i])+" homo sapiens pathway"])
        dst.append(node3dict[str(data[2][i])+" homo sapiens gene"])
        label.append(9)
        feat.append(9)
        etype.append(9)
        
    elif data[1][i] == '_has_interaction':
        src.append(node3dict[str(data[0][i])+" homo sapiens gene"])
        dst.append(node3dict[str(data[2][i])+" homo sapiens gene"])
        label.append(14)
        feat.append(14)
        etype.append(14)
       
    elif data[1][i] == '_has_function':
        src.append(node3dict[str(data[0][i])+" homo sapiens gene"])
        dst.append(node5dict[str(data[2][i])+" homo sapiens GO"])
        label.append(13)
        feat.append(13)
        etype.append(13)
      
    elif data[1][i] == '_is_function_of':
        src.append(node5dict[str(data[0][i])+" homo sapiens GO"])
        dst.append(node3dict[str(data[2][i])+" homo sapiens gene"])
        label.append(12)
        feat.append(12)
        etype.append(12)
    
edges = pd.DataFrame(list(zip(src,dst,label,feat,etype)), columns = ['src_id','dst_id','label','feat','type'])
edges.head()

100%|██████████████████████████████████████████████████████████████████████| 2087736/2087736 [05:26<00:00, 6394.28it/s]


Unnamed: 0,src_id,dst_id,label,feat,type
0,6096,34800,13,13,13
1,0,6097,4,4,4
2,34801,6098,12,12,12
3,1,2,0,0,0
4,2464,26945,7,7,7


In [12]:
df = edges[edges.type == 0].drop(['type'], axis = 1)
df.to_csv('edges_0.csv', index = False)

df = edges[edges.type == 1].drop(['type'], axis = 1)
df.to_csv('edges_1.csv', index = False)

df = edges[edges.type == 2].drop(['type'], axis = 1)
df.to_csv('edges_2.csv', index = False)

df = edges[edges.type == 3].drop(['type'], axis = 1)
df.to_csv('edges_3.csv', index = False)

df = edges[edges.type == 4].drop(['type'], axis = 1)
df.to_csv('edges_4.csv', index = False)

df = edges[edges.type == 5].drop(['type'], axis = 1)
df.to_csv('edges_5.csv', index = False)

df = edges[edges.type == 6].drop(['type'], axis = 1)
df.to_csv('edges_6.csv', index = False)

df = edges[edges.type == 7].drop(['type'], axis = 1)
df.to_csv('edges_7.csv', index = False)

df = edges[edges.type == 8].drop(['type'], axis = 1)
df.to_csv('edges_8.csv', index = False)

df = edges[edges.type == 9].drop(['type'], axis = 1)
df.to_csv('edges_9.csv', index = False)

df = edges[edges.type == 10].drop(['type'], axis = 1)
df.to_csv('edges_10.csv', index = False)

df = edges[edges.type == 11].drop(['type'], axis = 1)
df.to_csv('edges_11.csv', index = False)

df = edges[edges.type == 12].drop(['type'], axis = 1)
df.to_csv('edges_12.csv', index = False)

df = edges[edges.type == 13].drop(['type'], axis = 1)
df.to_csv('edges_13.csv', index = False)

df = edges[edges.type == 14].drop(['type'], axis = 1)
df.to_csv('edges_14.csv', index = False)

In [13]:
import pandas as pd

node0 = pd.read_csv("nodes_0.csv")
node1 = pd.read_csv("nodes_1.csv")
node2 = pd.read_csv("nodes_2.csv")
node3 = pd.read_csv("nodes_3.csv")
node4 = pd.read_csv("nodes_4.csv")
node5 = pd.read_csv("nodes_5.csv")

In [14]:
node0

Unnamed: 0,node_id,label,feat,entity
0,0,5112,"0.05508331,-0.33484593,0.10192123,-0.26511663,...",201.20
1,1,16175,"0.060138892,-0.34264126,0.006986717,-0.3092374...",577.80
2,2,19271,"0.0026293222,-0.36949572,0.13884567,-0.2916490...",759.30
3,3,9807,"-0.058700405,-0.36050797,0.043681093,-0.369164...",345.90
4,4,13740,"0.07480559,-0.30327848,0.00028393665,-0.302976...",521.00
...,...,...,...,...
2459,2459,10187,"0.03987622,-0.35188746,0.0136106135,-0.3479922...",362.53
2460,2460,18448,"0.046785254,-0.31343025,0.06520327,-0.2893391,...",708.40
2461,2461,3586,"-0.0036240383,-0.44269314,-0.013500067,-0.2713...",143.10
2462,2462,5881,"0.066017166,-0.36556908,0.062845275,-0.2936905...",230.30


In [15]:
node0_new = node0.drop(columns = {'label','feat'})
node1_new = node1.drop(columns = {'label','feat'})
node2_new = node2.drop(columns = {'label','feat'})
node3_new = node3.drop(columns = {'label','feat'})
node4_new = node4.drop(columns = {'label','feat'})
node5_new = node5.drop(columns = {'label','feat'})

In [16]:
df = pd.concat([node0_new, node1_new, node2_new, node3_new,node4_new, node5_new], axis=0)

In [17]:
df.to_csv('old_id_to_entity.csv', index = False)