In [None]:
import pandas as pd
import getpass
from neo4j import GraphDatabase

# Show long Cypher/label strings and long result tables without truncation.
# Fully qualified option names are used on purpose: pandas resolves abbreviated
# keys by regex, and "max_rows" is ambiguous on newer pandas releases
# (it matches both display.max_rows and styler.render.max_rows).
pd.set_option("display.max_colwidth", 999)
pd.set_option("display.max_rows", 999)


In [None]:
# Class for opening/closing & querying db via neo4j driver (not currently used in this .ipynb)

class Neo4jConnection:
    """Small convenience wrapper around a neo4j driver.

    The wrapper creates a driver on construction, runs queries in short-lived
    sessions and returns the records as a plain list. It can also be used as a
    context manager, in which case the driver is closed on exit::

        with Neo4jConnection(uri, username, password) as conn:
            rows = conn.query("RETURN 1 AS one")
    """

    def __init__(self, uri, username, password):
        """Store the connection settings and try to create the driver.

        Args:
            uri: Connection URI passed to ``GraphDatabase.driver``.
            username: User name used for authentication.
            password: Password used for authentication.

        A failure to create the driver is printed rather than raised; the
        instance is then left without a driver and ``query`` will raise.
        """
        self.__uri = uri
        self.__username = username
        self.__password = password
        self.__driver = None

        try:
            self.__driver = GraphDatabase.driver(
                self.__uri, auth=(self.__username, self.__password))
        except Exception as e:
            # Best-effort by design: report the problem and keep the object
            # usable for inspection; query() guards against the missing driver.
            print("Failed to create driver", e)

    def __enter__(self):
        """Return the connection itself so it can be bound with ``as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never suppress errors."""
        self.close()
        return False

    def close(self):
        """Close the underlying driver if one was created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, parameters=None, db=None):
        """Run a query and return its records as a list.

        Args:
            query: Query text handed to ``session.run``.
            parameters: Optional query parameters handed to ``session.run``.
            db: Optional database name; when omitted the session is opened
                without a ``database`` argument.

        Returns:
            A list with the records produced by the query, or ``None`` when
            the query failed (the error is printed).

        Raises:
            AssertionError: If the driver was never created.
        """
        # Raised explicitly (instead of using an ``assert`` statement) so the
        # guard is still enforced when Python runs with optimizations (-O).
        if self.__driver is None:
            raise AssertionError("Driver not initialized.")

        session = None
        response = None

        try:
            if db is not None:
                session = self.__driver.session(database=db)
            else:
                session = self.__driver.session()
            # Materialize inside the try block: records must be read before
            # the session is closed in ``finally``.
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query has failed:", e)
        finally:
            if session is not None:
                session.close()
        return response


In [None]:
# Connection settings for the official neo4j driver: target URI and user name.
uri, user = "neo4j://localhost:7687", "neo4j"

# The password is prompted for interactively so it is never stored in the notebook.
password = getpass.getpass("Please enter password: ")


In [None]:
# Create the driver shared by every cell below.
driver = GraphDatabase.driver(uri=uri, auth=(user, password))

# Fail fast: check the connection now so that a wrong URI or password is
# reported in this cell instead of in the first cell that runs a query.
driver.verify_connectivity()
print(driver)

### Delete Orphan Nodes Prior to Indexing

In [None]:
# Delete every node that has no relationship in either direction and show the
# update counters reported for the statement.
# `NOT (n)--()` is used rather than `size((n)--()) = 0`: it selects the same
# nodes, while passing a pattern expression to size() is deprecated Cypher.
with driver.session(database="neo4j") as session:
    orphan_summary = session.run(
        "MATCH (orphans) WHERE NOT (orphans)--() DELETE orphans").consume()
    display(orphan_summary.counters)


### Create Unique Property Constraints & Indexes

Note: Constraints must be applied after the data has been imported when using the `neo4j-admin import` tool.

In [None]:
# Apply all unique property constraints & indexes to the graph.
# Each entry is (node label, property); one statement is generated per entry and
# the update counters of every statement are displayed in order.

# (label, property) pairs that must be unique per label.
UNIQUE_CONSTRAINTS = [
    ("Atom", "AtomId"),
    ("Concept", "ConceptId"),
    ("Code", "CodeId"),
    ("ATC", "CodeId"),
    ("CVX", "CodeId"),
    ("GO", "CodeId"),
    ("HGNC", "CodeId"),
    # NOTE(review): the original statement bound this label to a variable named
    # `icd9cm` -- confirm whether a separate ICD9CM label also needs a constraint.
    ("ICDO3", "CodeId"),
    ("ICD10CM", "CodeId"),
    ("ICD10PCS", "CodeId"),
    ("LNC", "CodeId"),
    ("MED-RT", "CodeId"),
    ("MVX", "CodeId"),
    ("NCI", "CodeId"),
    ("RXNORM", "CodeId"),
    ("SNOMEDCT_US", "CodeId"),
    ("SemanticType", "SemanticTypeId"),
    ("SemanticType", "sty"),
    ("SemanticType", "stn"),
]

# (label, property) pairs that get a plain (non-unique) index.
PROPERTY_INDEXES = [
    ("Atom", "vocab"),
    ("Atom", "tty"),
    ("Atom", "code"),
    ("Code", "code"),
]

with driver.session(database="neo4j") as session:
    for node_label, prop in UNIQUE_CONSTRAINTS:
        # Labels are always backtick-quoted so names such as `MED-RT` stay valid.
        statement = (
            f"CREATE CONSTRAINT ON (n:`{node_label}`) ASSERT n.{prop} IS UNIQUE")
        display(session.run(statement).consume().counters)

    for node_label, prop in PROPERTY_INDEXES:
        statement = f"CREATE INDEX FOR (n:`{node_label}`) ON (n.{prop})"
        display(session.run(statement).consume().counters)


# All Labels/Nodes in Graph

- Note: Labels & nodes have been created in a redundant way in order to handle individual ontologies/vocabularies (e.g. SNOMEDCT_US, NCI, RXNORM, etc.) as labels.
  - This design implementation allows use of `Code` label (which encompasses all discrete ontologies/vocabularies in the graph - in other words all the other labels except a handful). This enables a broader and more granular approach to querying the graph.
    - For example, if the goal is to create a SNOMEDCT_US -> ICD10CM crosswalk... It is more intuitive, precise & less time intensive to traverse the graph explicitly based on those labels.
    - If we flip the scenario, where it is uncertain what crosswalk(s) may exist between any of the vocabularies, then the `Code` label serves as a "catch-all" label for all vocabularies/ontologies within the graph. This avoids tedious trial & error to establish what relationships may exist between certain vocabularies.

- **In the next .ipynb (part2) we will explore this in more detail**


In [None]:
# Count the nodes carrying each label: list the labels first, then run one
# count query per label, and finally tabulate the two columns.
with driver.session(database="neo4j") as session:
    label_names = [record["label"] for record in session.run("CALL db.labels()")]
    label_counts = [
        session.run(f"MATCH (:`{name}`) RETURN count(*) as count").single()["count"]
        for name in label_names
    ]

result = {"label": label_names, "count": label_counts}
nodes_df = pd.DataFrame(data=result)
# Largest labels first; the sorted frame is the cell's displayed output.
nodes_df.sort_values(by="count", ascending=False)

# All Relationships/Edges in the Graph

- The latter half of the list should be investigated (extremely low counts).
  - If these cannot be resolved, it is possibly best to trim the relationship types that do not provide any value.


In [None]:
# Count the relationships of each type: list the types first, then run one
# count query per type, and finally tabulate the two columns.
with driver.session(database="neo4j") as session:
    type_names = [
        record["relationshipType"]
        for record in session.run(
            "CALL db.relationshipTypes() YIELD relationshipType RETURN relationshipType")
    ]
    type_counts = [
        session.run(
            f"MATCH ()-[:`{name}`]->() RETURN count(*) as count").single()["count"]
        for name in type_names
    ]

result = {"relType": type_names, "count": type_counts}
rels_df = pd.DataFrame(data=result)
# Most frequent relationship types first; the sorted frame is the cell's output.
rels_df.sort_values(by="count", ascending=False)

In [None]:
# Close the driver now that all queries above have run.
# Re-run the driver-creation cell before executing any query cell again.
driver.close()
