In [1]:
# Rewrite the node and edge CSV headers into the format the Neo4j import tool expects:
# https://neo4j.com/docs/operations-manual/current/tools/import/file-header-format/

In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
import shelve
import re
from collections import defaultdict, Counter
from tqdm import tqdm
import requests
from pyquery import PyQuery as pq

In [2]:
# Read the node table produced upstream. The bl_type column presumably holds
# Biolink NamedThing categories (confirm against the model docs):
# https://biolink.github.io/biolink-model/docs/NamedThing.html
nodes = pd.read_csv(filepath_or_buffer="nodes_biolink.csv")
# Show the first rows to check the columns that were read.
nodes.head(n=5)

Unnamed: 0,ID,label,TYPE,bl_type
0,C0038195,State Medicine,Activities & Behaviors,activity
1,C1290952,Taking medication,Activities & Behaviors,activity
2,C0085092,Parenting behavior,Activities & Behaviors,activity
3,C1096771,Murderer,Activities & Behaviors,activity
4,C1299651,Serious reportable event,Activities & Behaviors,activity


In [3]:
# Remove the TYPE column in place; it is not referenced again in this notebook.
nodes.drop(columns=["TYPE"], inplace=True)

In [4]:
# Turn bare identifiers into "UMLS:"-prefixed ones (e.g. "UMLS:C0038195").
# Only rows that do not already start with the prefix are modified, so running
# this cell more than once cannot produce "UMLS:UMLS:..." identifiers.
# na=False treats missing IDs as "not prefixed"; "UMLS:" + NaN is still NaN,
# so missing values stay missing exactly as before.
needs_prefix = ~nodes["ID"].str.startswith("UMLS:", na=False)
nodes.loc[needs_prefix, "ID"] = "UMLS:" + nodes.loc[needs_prefix, "ID"]

# Add copies of the category and the (now prefixed) identifier under
# ":STRING"-typed headers. The source columns are kept; a later cell renames
# them to :LABEL and :ID.
nodes["category:STRING"] = nodes["bl_type"]
nodes["id:STRING"] = nodes["ID"]

In [5]:
# Map the working column names onto the header fields used in the import file:
# node identifier, typed name property, and node label.
node_headers = {
    'ID': ':ID',
    'label': 'name:STRING',
    'bl_type': ':LABEL',
}
nodes = nodes.rename(columns=node_headers)

In [6]:
# Inspect the first rows to check the renamed headers.
nodes.head(n=5)

Unnamed: 0,:ID,name:STRING,:LABEL,category:STRING,id:STRING
0,UMLS:C0038195,State Medicine,activity,activity,UMLS:C0038195
1,UMLS:C1290952,Taking medication,activity,activity,UMLS:C1290952
2,UMLS:C0085092,Parenting behavior,activity,activity,UMLS:C0085092
3,UMLS:C1096771,Murderer,activity,activity,UMLS:C1096771
4,UMLS:C1299651,Serious reportable event,activity,activity,UMLS:C1299651


In [7]:
# Write the node table; index=False keeps the pandas row index out of the file.
# NOTE(review): a later cell writes this same path again after dropping nodes
# that no edge references, so this first write is an unfiltered intermediate.
nodes.to_csv(path_or_buf="nodes_neo4j.csv", index=False)

In [None]:
# EDGES: build the relationship CSV (edges_neo4j.csv) for the Neo4j import

In [15]:
# Read the edge table produced upstream. The bl_pred column presumably holds
# Biolink association predicates (confirm against the model docs):
# https://biolink.github.io/biolink-model/docs/Association.html
edges = pd.read_csv(filepath_or_buffer='edges_biolink.csv')

In [16]:
# Prefix both endpoint columns with "UMLS:", the same prefix the node
# identifiers receive earlier in this notebook.
# Only values that do not already start with the prefix are modified, so
# re-running this cell cannot yield "UMLS:UMLS:..." endpoints, which would no
# longer match any node ID. Missing values stay missing ("UMLS:" + NaN is NaN).
for endpoint in ("START_ID", "END_ID"):
    needs_prefix = ~edges[endpoint].str.startswith("UMLS:", na=False)
    edges.loc[needs_prefix, endpoint] = "UMLS:" + edges.loc[needs_prefix, endpoint]

In [17]:
# Inspect the first rows to check the prefixed endpoint columns.
edges.head(n=5)

Unnamed: 0,START_ID,END_ID,pmids,n_pmids,NEG,bl_pred
0,UMLS:C0021769,UMLS:C1182654,1597294,1,False,ADMINISTERED_TO
1,UMLS:C0001271,UMLS:C0007635,6684662;11037792,2,False,ADMINISTERED_TO
2,UMLS:C0003339,UMLS:C0014467,11857592,1,False,ADMINISTERED_TO
3,UMLS:C0028612,UMLS:C0085080,1985199,1,False,ADMINISTERED_TO
4,UMLS:C0059249,UMLS:C0431085,12739069,1,False,ADMINISTERED_TO


In [18]:
# Lower-case the predicate before it becomes the relationship type.
edges["bl_pred"] = edges["bl_pred"].str.lower()

# Map the working column names onto the header fields used in the import file;
# NEG becomes the plain property name "negated".
edge_headers = {
    'START_ID': ':START_ID',
    'END_ID': ':END_ID',
    'bl_pred': ':TYPE',
    'NEG': 'negated',
}
edges = edges.rename(columns=edge_headers)

In [19]:
# Constant provenance value written to every edge.
edges["is_defined_by"] = "semmeddb"

# Relation string: the lower-cased relationship type behind a "semmeddb:" prefix.
predicate_lower = edges[":TYPE"].str.lower()
edges["relation"] = "semmeddb:" + predicate_lower

# Second constant provenance value, added last to keep the column order.
edges["provided_by"] = "semmeddb_sulab"

In [20]:
# Inspect the first rows to check the final edge columns before export.
edges.head(n=5)

Unnamed: 0,:START_ID,:END_ID,pmids,n_pmids,negated,:TYPE,is_defined_by,relation,provided_by
0,UMLS:C0021769,UMLS:C1182654,1597294,1,False,administered_to,semmeddb,semmeddb:administered_to,semmeddb_sulab
1,UMLS:C0001271,UMLS:C0007635,6684662;11037792,2,False,administered_to,semmeddb,semmeddb:administered_to,semmeddb_sulab
2,UMLS:C0003339,UMLS:C0014467,11857592,1,False,administered_to,semmeddb,semmeddb:administered_to,semmeddb_sulab
3,UMLS:C0028612,UMLS:C0085080,1985199,1,False,administered_to,semmeddb,semmeddb:administered_to,semmeddb_sulab
4,UMLS:C0059249,UMLS:C0431085,12739069,1,False,administered_to,semmeddb,semmeddb:administered_to,semmeddb_sulab


In [21]:
# Write the edge table; index=False keeps the pandas row index out of the file.
edges.to_csv(path_or_buf="edges_neo4j.csv", index=False)

In [23]:
# Node count before filtering.
print(len(nodes))

# Keep only nodes that appear as the start or end of at least one edge.
connected_ids = set(edges[':START_ID']).union(edges[':END_ID'])
is_connected = nodes[':ID'].isin(connected_ids)
nodes = nodes[is_connected]

# Node count after filtering.
print(len(nodes))

254259
242644


In [24]:
# Write nodes_neo4j.csv again, replacing the earlier export with the subset of
# nodes that are referenced by at least one edge.
nodes.to_csv(path_or_buf="nodes_neo4j.csv", index=False)