In [1]:
# Rewrite the node and edge CSV headers into the format expected by the Neo4j import tool:
# https://neo4j.com/docs/operations-manual/current/tools/import/file-header-format/

In [2]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
import shelve
import re
from collections import defaultdict, Counter
from tqdm import tqdm
import requests
from pyquery import PyQuery as pq

In [3]:
# Load the node table. The bl_type column holds the node category; see the
# Biolink model reference: https://biolink.github.io/biolink-model/docs/NamedThing.html
nodes = pd.read_csv(filepath_or_buffer="nodes_biolink.csv")
nodes.head(n=5)

Unnamed: 0,ID,label,TYPE,bl_type
0,C0038195,State Medicine,Activities & Behaviors,activity_and_behavior
1,C1290952,Taking medication,Activities & Behaviors,activity_and_behavior
2,C0085092,Parenting behavior,Activities & Behaviors,activity_and_behavior
3,C1096771,Murderer,Activities & Behaviors,activity_and_behavior
4,C1299651,Serious reportable event,Activities & Behaviors,activity_and_behavior


In [4]:
# Number of nodes per bl_type value, most frequent first.
nodes["bl_type"].value_counts()

chemical_substance       60964
individual_organism      41618
disease                  38245
protein                  28504
genomic_entity           22456
anatomical_entity        15181
biological_process        9163
named_thing               1342
activity_and_behavior     1011
Name: bl_type, dtype: int64

In [5]:
# The raw TYPE column is not exported; only bl_type is used from here on.
# errors="ignore" makes the cell safe to re-run: `del nodes['TYPE']` raised a
# KeyError on a second execution because the column was already gone.
nodes = nodes.drop(columns=["TYPE"], errors="ignore")

In [6]:
# Turn the bare identifiers into "UMLS:"-prefixed IDs.
# The prefix is only added where it is missing, so running this cell twice
# no longer produces "UMLS:UMLS:..." identifiers. Missing IDs stay missing
# (na=False sends them down the concatenation branch, which yields NaN).
_has_prefix = nodes["ID"].str.startswith("UMLS:", na=False)
nodes["ID"] = nodes["ID"].where(_has_prefix, "UMLS:" + nodes["ID"])

# Also expose category and id as explicitly typed string properties.
nodes["category:STRING"] = nodes["bl_type"]
nodes["id:STRING"] = nodes["ID"]

In [7]:
# Map the working column names onto Neo4j node header fields.
neo4j_node_headers = {
    'ID': ':ID',
    'label': 'name:STRING',
    'bl_type': ':LABEL',
}
nodes = nodes.rename(columns=neo4j_node_headers)

In [8]:
# Preview the first rows with the new header names.
nodes.head(n=5)

Unnamed: 0,:ID,name:STRING,:LABEL,category:STRING,id:STRING
0,UMLS:C0038195,State Medicine,activity_and_behavior,activity_and_behavior,UMLS:C0038195
1,UMLS:C1290952,Taking medication,activity_and_behavior,activity_and_behavior,UMLS:C1290952
2,UMLS:C0085092,Parenting behavior,activity_and_behavior,activity_and_behavior,UMLS:C0085092
3,UMLS:C1096771,Murderer,activity_and_behavior,activity_and_behavior,UMLS:C1096771
4,UMLS:C1299651,Serious reportable event,activity_and_behavior,activity_and_behavior,UMLS:C1299651


In [9]:
# Write the node import file for Neo4j.
# "Unnamed: 0" is an auto-named column picked up by read_csv (apparently the
# saved row index of nodes_biolink.csv). It is not node data, and the colon in
# its name would be read as a `name:type` separator by the import header
# format, so it is left out of the export. errors="ignore" keeps the cell
# working when the column is absent; `nodes` itself is not modified.
nodes.drop(columns=["Unnamed: 0"], errors="ignore").to_csv("nodes_neo4j.csv", index=False)

In [10]:
###### EDGES

In [11]:
# Load the edge table. Biolink association reference:
# https://biolink.github.io/biolink-model/docs/Association.html
edges = pd.read_csv(filepath_or_buffer='edges_biolink.csv')

In [12]:
# Prefix both edge endpoints with "UMLS:", the same prefix given to the node IDs.
# The prefix is only added where it is missing, so re-running the cell does
# not produce "UMLS:UMLS:..." endpoints that no longer match any node.
for _endpoint in ("START_ID", "END_ID"):
    _has_prefix = edges[_endpoint].str.startswith("UMLS:", na=False)
    edges[_endpoint] = edges[_endpoint].where(_has_prefix, "UMLS:" + edges[_endpoint])

In [13]:
# Preview the edges after prefixing the endpoint IDs.
edges.head(n=5)

Unnamed: 0,START_ID,END_ID,pmids,n_pmids,NEG,bl_pred,bl_type
0,UMLS:C0086931,UMLS:C0037369,26656404;8816000;19745413;17710714;21412223;19...,6,False,AFFECTS,activity_and_behavior.AFFECTS.activity_and_beh...
1,UMLS:C0950156,UMLS:C0009671,12584106,1,False,AFFECTS,activity_and_behavior.AFFECTS.activity_and_beh...
2,UMLS:C0024810,UMLS:C0035651,15775958;11173168,2,False,AFFECTS,activity_and_behavior.AFFECTS.activity_and_beh...
3,UMLS:C0556297,UMLS:C0037369,12164677;9373700,2,False,AFFECTS,activity_and_behavior.AFFECTS.activity_and_beh...
4,UMLS:C1154333,UMLS:C0015745,23872405,1,False,AFFECTS,activity_and_behavior.AFFECTS.activity_and_beh...


In [14]:
# Lower-case the predicate, then map the working column names onto Neo4j
# relationship header fields; NEG is exported under the name "negated".
edges["bl_pred"] = edges["bl_pred"].str.lower()
neo4j_edge_headers = {
    'START_ID': ':START_ID',
    'END_ID': ':END_ID',
    'bl_pred': ':TYPE',
    'NEG': 'negated',
}
edges = edges.rename(columns=neo4j_edge_headers)

In [15]:
# Attach provenance columns to every edge: two constant source tags plus a
# "semmeddb:<predicate>" relation built from the lower-cased :TYPE value.
edges = edges.assign(
    is_defined_by="semmeddb",
    relation="semmeddb:" + edges[":TYPE"].str.lower(),
    provided_by="semmeddb_sulab",
)

In [16]:
# Preview the edges in their final export layout.
edges.head(n=5)

Unnamed: 0,:START_ID,:END_ID,pmids,n_pmids,negated,:TYPE,bl_type,is_defined_by,relation,provided_by
0,UMLS:C0086931,UMLS:C0037369,26656404;8816000;19745413;17710714;21412223;19...,6,False,affects,activity_and_behavior.AFFECTS.activity_and_beh...,semmeddb,semmeddb:affects,semmeddb_sulab
1,UMLS:C0950156,UMLS:C0009671,12584106,1,False,affects,activity_and_behavior.AFFECTS.activity_and_beh...,semmeddb,semmeddb:affects,semmeddb_sulab
2,UMLS:C0024810,UMLS:C0035651,15775958;11173168,2,False,affects,activity_and_behavior.AFFECTS.activity_and_beh...,semmeddb,semmeddb:affects,semmeddb_sulab
3,UMLS:C0556297,UMLS:C0037369,12164677;9373700,2,False,affects,activity_and_behavior.AFFECTS.activity_and_beh...,semmeddb,semmeddb:affects,semmeddb_sulab
4,UMLS:C1154333,UMLS:C0015745,23872405,1,False,affects,activity_and_behavior.AFFECTS.activity_and_beh...,semmeddb,semmeddb:affects,semmeddb_sulab


In [17]:
# Write the relationship import file for Neo4j.
# "Unnamed: 0" is an auto-named column picked up by read_csv (apparently the
# saved row index of edges_biolink.csv). It is not edge data, and the colon in
# its name would be read as a `name:type` separator by the import header
# format, so it is left out of the export. errors="ignore" keeps the cell
# working when the column is absent; `edges` itself is not modified.
edges.drop(columns=["Unnamed: 0"], errors="ignore").to_csv("edges_neo4j.csv", index=False)