In [1]:
# Rewrite the node and edge CSV headers into the neo4j import file-header format:
# https://neo4j.com/docs/operations-manual/current/tools/import/file-header-format/

In [2]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
import shelve
import re
from collections import defaultdict, Counter
from tqdm import tqdm
import requests
from pyquery import PyQuery as pq

In [8]:
# Load nodes_biolink.csv and discard its `umls_type` column
# (`umls_type_label` is kept and used further down).
# Biolink NamedThing reference:
# https://biolink.github.io/biolink-model/docs/NamedThing.html
nodes = pd.read_csv("nodes_biolink.csv").drop('umls_type', axis=1)
nodes.head()

Unnamed: 0,ID,label,umls_type_label,blm_type
0,C1290952,Taking medication,Daily or Recreational Activity,activity_and_behavior
1,C0085092,Parenting behavior,Social Behavior,activity_and_behavior
2,C1096771,Murderer,Individual Behavior,activity_and_behavior
3,C0006875,Cannibalism,Social Behavior,activity_and_behavior
4,C0871454,Study Habits,Individual Behavior,activity_and_behavior


In [9]:
# Number of nodes per `blm_type` value, most frequent first.
nodes['blm_type'].value_counts()

chemical_substance                59088
disease_or_phenotypic_feature     36967
gene                              18548
biological_entity                 15105
protein                           12883
gross_anatomical_structure         8948
biological_process_or_activity     6887
anatomical_entity                  2831
cell_component                     1636
cell                               1173
activity_and_behavior               916
phenotypic_feature                  416
genomic_entity                      179
Name: blm_type, dtype: int64

In [10]:
# Prepare the node columns that end up in the neo4j import file.
# NOTE: this cell prepends to `ID` in place, so run it exactly once per load
# of `nodes`; re-running it would produce "UMLS:UMLS:..." identifiers.
nodes['ID'] = "UMLS:" + nodes['ID']
# Extra property columns carrying the type and the prefixed id.
nodes['category:STRING'] = nodes['blm_type']
nodes['id:STRING'] = nodes['ID']
# "|" is a regex metacharacter and the default of `regex` differs between
# pandas versions, so request a literal replacement explicitly.
# The column is later renamed to `umls_type:STRING[]`; ";" is presumably used
# because it is the default array delimiter of the neo4j import tool
# (see the header-format docs linked at the top of the notebook).
nodes['umls_type_label'] = nodes['umls_type_label'].str.replace("|", ";", regex=False)

In [11]:
# Working column name -> neo4j import header field.
neo4j_node_header = {
    'ID': ':ID',
    'label': 'name:STRING',
    'blm_type': ':LABEL',
    'umls_type_label': 'umls_type:STRING[]',
}
nodes.rename(columns=neo4j_node_header, inplace=True)

In [12]:
# Preview the first rows to check the renamed header.
nodes.head(5)

Unnamed: 0,:ID,name:STRING,umls_type:STRING[],:LABEL,category:STRING,id:STRING
0,UMLS:C1290952,Taking medication,Daily or Recreational Activity,activity_and_behavior,activity_and_behavior,UMLS:C1290952
1,UMLS:C0085092,Parenting behavior,Social Behavior,activity_and_behavior,activity_and_behavior,UMLS:C0085092
2,UMLS:C1096771,Murderer,Individual Behavior,activity_and_behavior,activity_and_behavior,UMLS:C1096771
3,UMLS:C0006875,Cannibalism,Social Behavior,activity_and_behavior,activity_and_behavior,UMLS:C0006875
4,UMLS:C0871454,Study Habits,Individual Behavior,activity_and_behavior,activity_and_behavior,UMLS:C0871454


In [13]:
# Write the node table without the DataFrame index column.
nodes.to_csv(path_or_buf="nodes_neo4j.csv", index=False)

In [10]:
###### EDGES

In [14]:
# Load the edge table.
# Biolink Association reference:
# https://biolink.github.io/biolink-model/docs/Association.html
edges = pd.read_csv(filepath_or_buffer='edges_biolink.csv')

In [15]:
# Prepend "UMLS:" to both endpoint columns (the same prefix the node IDs get).
# Not idempotent: run once per load of `edges`.
for endpoint_col in ("START_ID", "END_ID"):
    edges[endpoint_col] = "UMLS:" + edges[endpoint_col]

In [16]:
# Preview the first rows after prefixing the endpoint IDs.
edges.head(5)

Unnamed: 0,START_ID,END_ID,pmids,n_pmids,NEG,bl_pred
0,UMLS:C0086931,UMLS:C0037369,26656404;8816000;19745413;17710714;21412223;19...,6,False,AFFECTS
1,UMLS:C0950156,UMLS:C0009671,12584106,1,False,AFFECTS
2,UMLS:C0024810,UMLS:C0035651,15775958;11173168,2,False,AFFECTS
3,UMLS:C0556297,UMLS:C0037369,12164677;9373700,2,False,AFFECTS
4,UMLS:C1154333,UMLS:C0015745,23872405,1,False,AFFECTS


In [17]:
# Lower-case the predicate values, then map the working column names onto
# neo4j relationship header fields.
edges['bl_pred'] = edges['bl_pred'].str.lower()
neo4j_edge_header = {
    'START_ID': ':START_ID',
    'END_ID': ':END_ID',
    'bl_pred': ':TYPE',
    'NEG': 'negated',
}
edges.rename(columns=neo4j_edge_header, inplace=True)

In [18]:
# Provenance columns appended to every edge, in this order.
# `relation` is the lower-cased `:TYPE` value with a "semmeddb:" prefix.
edge_provenance = [
    ('is_defined_by', "semmeddb"),
    ('relation', "semmeddb:" + edges[":TYPE"].str.lower()),
    ('provided_by', "semmeddb_sulab"),
]
for column_name, column_value in edge_provenance:
    edges[column_name] = column_value

In [19]:
# Preview the first rows of the finished edge table.
edges.head(5)

Unnamed: 0,:START_ID,:END_ID,pmids,n_pmids,negated,:TYPE,is_defined_by,relation,provided_by
0,UMLS:C0086931,UMLS:C0037369,26656404;8816000;19745413;17710714;21412223;19...,6,False,affects,semmeddb,semmeddb:affects,semmeddb_sulab
1,UMLS:C0950156,UMLS:C0009671,12584106,1,False,affects,semmeddb,semmeddb:affects,semmeddb_sulab
2,UMLS:C0024810,UMLS:C0035651,15775958;11173168,2,False,affects,semmeddb,semmeddb:affects,semmeddb_sulab
3,UMLS:C0556297,UMLS:C0037369,12164677;9373700,2,False,affects,semmeddb,semmeddb:affects,semmeddb_sulab
4,UMLS:C1154333,UMLS:C0015745,23872405,1,False,affects,semmeddb,semmeddb:affects,semmeddb_sulab


In [None]:
# Write the edge table without the DataFrame index column.
edges.to_csv(path_or_buf="edges_neo4j.csv", index=False)