In [10]:
import pandas as pd
# import numpy as np
import getpass
from neo4j import GraphDatabase

# import matplotlib 
# import matplotlib.pyplot as plt
# plt.style.use('fivethirtyeight')

# Pandas display configuration for this notebook.
# Option names are fully qualified on purpose: abbreviated patterns such as
# "max_rows" are resolved by regex and become ambiguous (OptionError) on pandas
# versions that also define `styler.render.max_rows`.
pd.set_option('display.float_format', lambda x: '%.3f' % x)  # show floats with 3 decimals
pd.set_option("display.max_colwidth", 999)  # effectively do not truncate wide text cells
pd.set_option("display.max_rows", 999)  # show up to 999 rows before pandas truncates output

In [11]:
# Connection settings for the official neo4j Python driver: target URI and username.
# The password is requested interactively via getpass so it is not hardcoded in the notebook.
uri = "neo4j://0.0.0.0:7687"
user = "neo4j"
password = getpass.getpass(prompt="Please enter password to Neo4j Graph: ")

········


##### Create Unique Property Constraints

In [13]:
# Build the neo4j driver from the uri / user / password defined earlier in the
# notebook and print its repr so the cell output shows the driver object.
driver = GraphDatabase.driver(uri, auth=(user, password))
print(driver)

<neo4j.Neo4jDriver object at 0x12ad5a250>


In [15]:
# Apply all unique property constraints & indexes to the graph.
#
# Each (label, property) pair below becomes one
#   CREATE CONSTRAINT ON (n:`label`) ASSERT n.property IS UNIQUE
# statement, executed in the listed order. The summary counters of every
# statement are displayed so the cell output shows what was actually added.
#
# NOTE(review): re-running this cell against a graph that already holds these
# constraints/indexes is expected to raise; if the server version supports it,
# `IF NOT EXISTS` would make the cell re-runnable -- confirm before changing.
UNIQUE_CONSTRAINTS = [
    ("Atom", "AtomId"),
    ("Attribute", "AtuiId"),
    ("Concept", "ConceptId"),
    ("Code", "CodeId"),
    ("ATC", "CodeId"),
    ("DRUGBANK", "CodeId"),
    ("ENSEMBLGENE_ID", "AtuiId"),
    ("ENTREZGENE_ID", "AtuiId"),
    ("GO", "CodeId"),
    ("HGNC", "CodeId"),
    ("HPO", "CodeId"),
    ("ICDO3Code", "AtuiId"),
    ("ICD9CM", "CodeId"),
    ("ICD10CM", "CodeId"),
    # Previously asserted uniqueness on a property named `ICD10PCS` (the label
    # itself); every other code vocabulary is keyed by `CodeId`, so use that.
    ("ICD10PCS", "CodeId"),
    ("NDC", "AtuiId"),
    ("IS_DRUG_CLASS", "AtuiId"),
    ("NCI", "CodeId"),
    ("MSH", "CodeId"),
    ("MED-RT", "CodeId"),  # hyphenated label: relies on the backtick quoting below
    ("RXNORM", "CodeId"),
    ("SNOMEDCT_US", "CodeId"),
    ("SemanticType", "SemanticTypeId"),
    ("SemanticType", "sty"),
    ("SemanticType", "stn"),
]

# Properties of `Code` nodes that get a plain (non-unique) index.
CODE_INDEX_PROPERTIES = ["ontology", "tty", "code"]

with driver.session(database="neo4j") as session:
    for label, prop in UNIQUE_CONSTRAINTS:
        constraint_query = f"CREATE CONSTRAINT ON (n:`{label}`) ASSERT n.{prop} IS UNIQUE"
        display(session.run(constraint_query).consume().counters)
    for prop in CODE_INDEX_PROPERTIES:
        index_query = f"CREATE INDEX FOR (code:Code) ON (code.{prop})"
        display(session.run(index_query).consume().counters)

{'constraints_added': 1}

{'constraints_added': 1}

{'constraints_added': 1}

{'constraints_added': 1}

{'constraints_added': 1}

{'constraints_added': 1}

{'constraints_added': 1}

{'constraints_added': 1}

{'constraints_added': 1}

{'indexes_added': 1}

{'indexes_added': 1}

{'indexes_added': 1}

# All Labels/Nodes in Graph

- Note: Labels & nodes have been created in a redundant way in order to handle individual ontologies/vocabularies (e.g. SNOMEDCT_US, NCI, HGNC) as labels.
  - This design implementation allows use of `Code` label (which encompasses all discrete ontologies/vocabularies in the graph - in other words all the other labels except a handful). This enables a broader and more granular approach to querying the graph.
    - For example, if the goal is to create a SNOMEDCT_US -> ICD10CM crosswalk... It is more intuitive, precise & less time intensive to traverse the graph explicitly based on those labels.
    - If we flip the scenario, where it is uncertain what crosswalk(s) may exist between any of the vocabularies, then the `Code` label serves as a "catch-all" label for all vocabularies/ontologies within the graph. This prevents tedious trial & error to establish what relationships may exist between certain vocabularies.

- **In the next .ipynb (part2) we will explore this in more detail**


In [16]:
# Count how many nodes carry each label reported by `CALL db.labels()`.
with driver.session(database="neo4j") as session:
    # Collect every label name first, then run one count query per label.
    labels = [record["label"] for record in session.run("CALL db.labels()")]
    counts = [
        session.run(f"MATCH (:`{name}`) RETURN count(*) as count").single()["count"]
        for name in labels
    ]

# Column-oriented dict -> DataFrame with one row per label.
result = {"label": labels, "count": counts}
nodes_df = pd.DataFrame(data=result)

# Last expression of the cell: labels ordered from most to fewest nodes.
nodes_df.sort_values(by="count", ascending=False)

Unnamed: 0,label,count
1,Atom,3347400
2,Code,1399051
0,Concept,1206002
15,SNOMEDCT_US,361461
17,Attribute,360096
11,MSH,347565
18,NDC,247545
9,ICD10PCS,190673
13,NCI,156922
14,RXNORM,104437


# All Relationships/Edges in the Graph

- Should investigate the latter half of the list (extremely low counts)
  - If these cannot be resolved, it is possibly best to trim the relationship types that provide no value


In [17]:
# Count how many relationships exist for each type reported by
# `CALL db.relationshipTypes()`.
with driver.session(database="neo4j") as session:
    # Collect every relationship type first, then run one count query per type.
    rel_types = [
        record["relationshipType"]
        for record in session.run(
            "CALL db.relationshipTypes() YIELD relationshipType RETURN relationshipType"
        )
    ]
    rel_counts = [
        session.run(f"MATCH ()-[:`{rel_type}`]->() RETURN count(*) as count").single()["count"]
        for rel_type in rel_types
    ]

# Column-oriented dict -> DataFrame with one row per relationship type.
result = {"relType": rel_types, "count": rel_counts}
rels_df = pd.DataFrame(data=result)

# Last expression of the cell: types ordered from most to fewest relationships.
rels_df.sort_values(by="count", ascending=False)

Unnamed: 0,relType,count
2,HAS_UMLS_AUI,3347400
3,HAS_CUI,2468118
0,HAS_STY,1440022
1,IS_STY_OF,1440022
4,HAS_CHILD,1203510
11,ISA,576174
20,INVERSE_ISA,576174
42,PAR,270933
39,CHD,270933
6,NDC,247545


In [18]:
# Close the driver opened earlier in this notebook now that the label and
# relationship count queries are done.
driver.close()

##### Establishing a connection using py2neo
To be covered later

In [None]:
## py2neo connection (community supported python driver)

# from py2neo import Graph, Relationship, Subgraph, database

# g = Graph(uri=uri, auth=(user, password))


In [None]:
# result = {"label": [], "count": []}
# for label in g.run("CALL db.labels()").to_series():
#     query = f"MATCH (:`{label}`) RETURN count(*) AS count"
#     count = g.run(query).to_data_frame().iloc[0]['count']
#     result["label"].append(label)
#     result["count"].append(count)
# nodes_df = pd.DataFrame(data=result)
# nodes_df.sort_values(by="count", ascending=False)
