In [1]:
import neo4j

import csv

import math
import numpy as np
import pandas as pd
import seaborn as sns

import psycopg2

In [2]:
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))
session = driver.session(database="neo4j")

In [3]:
def my_neo4j_wipe_out_database():
    "wipe out database by deleting all nodes and relationships"
    
    query = "match (node)-[relationship]->() delete node, relationship"
    session.run(query)
    
    query = "match (node) delete node"
    session.run(query)

### Clean DB to see if it works

In [4]:
my_neo4j_wipe_out_database()

### Support code that makes things run

In [5]:
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [6]:
cursor = connection.cursor()

In [7]:
def my_neo4j_run_query_pandas(query, **kwargs):
    "run a query and return the results in a pandas dataframe"
    
    result = session.run(query, **kwargs)
    
    df = pd.DataFrame([r.values() for r in result], columns=result.keys())
    
    return df

In [8]:
def my_neo4j_nodes_relationships():
    "print all the nodes and relationships"
   
    print("-------------------------")
    print("  Nodes:")
    print("-------------------------")
    
    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_nodes = df.shape[0]
    
    display(df)
    
    print("-------------------------")
    print("  Relationships:")
    print("-------------------------")
    
    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_relationships = df.shape[0]
    
    display(df)
    
    density = (2 * number_relationships) / (number_nodes * (number_nodes - 1))
    
    print("-------------------------")
    print("  Density:", f'{density:.1f}')
    print("-------------------------")
    

In [9]:
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

<neo4j._sync.work.result.Result at 0x7f7aee127e20>

### Building code that builds provider notes

In [10]:
def my_neo4j_create_provider_node(provider_npi):
    "create a node with label Provider"
    
    query = """
    
    MERGE (:Provider {name: $provider_npi})
    
    """
    
    session.run(query, provider_npi=provider_npi)

### Building code for drug nodes

In [11]:
def my_neo4j_create_drug_node(drug):
    "create a node with label Drug"
    
    query = """
    
    MERGE (:Drug {code: $drug_code})
    
    """
    
    session.run(query, drug_code=drug_code)

### Building code for procedures

In [12]:
def my_neo4j_create_procedure_node(procedure):
    "create a node with label Procedure"
    
    query = """
    
    MERGE (:Procedure {code: $procedure_code})
    
    """
    
    session.run(query, procedure_code=procedure_code)

### Building 1 way relationship between provides and drug prescriptions with a weight

In [13]:
def my_neo4j_create_relationship_one_way(provider_npi, drug_code, contrib_payment_percent):
    """
    Create a one-way relationship from a provider to a drug they have prescribed with a weight
    """
    query = """
    MATCH (p:Provider {name: $provider_npi}),
          (d:Drug {code: $drug_code})
    MERGE (p)-[r:Prescribes {weight:$contrib_payment_percent}]->(d)
    RETURN p, r, d

    """
    session.run(query, provider_npi=provider_npi, drug_code=drug_code,contrib_payment_percent=contrib_payment_percent)

### Building 1 way relationship between provides and procedures with a weight

In [14]:
def my_neo4j_create_relationship_one_way_procedure(provider_npi, procedure_code, contrib_payment_percent):
    """
    Create a one-way relationship from a provider to a drug they have prescribed with a weight
    """
    query = """
    MATCH (p:Provider {name: $provider_npi}),
          (d:Procedure {code: $procedure_code})
    MERGE (p)-[r:Conducts {weight:$contrib_payment_percent}]->(d)
    RETURN p, r, d

    """
    session.run(query, provider_npi=provider_npi, procedure_code=procedure_code,contrib_payment_percent=contrib_payment_percent)

### Building Nodes

In [15]:
# Query to fetch provider NPI and create provider nodes
connection.rollback()
provider_query = """
SELECT provider_npi
FROM providers
WHERE provider_state = 'WY'
"""
cursor.execute(provider_query)
connection.rollback()
providers = cursor.fetchall()

for row in providers:
    provider_npi = row[0]
    my_neo4j_create_provider_node(provider_npi)
    

# Query to fetch distinct drugs and create drug nodes
drug_query = """
SELECT DISTINCT r.code AS code
FROM services_rendered r
     INNER JOIN (
         SELECT provider_npi
         FROM providers
         WHERE provider_state = 'WY') k
     ON k.provider_npi = r.provider_npi
WHERE r.drug_indicator = 'Y'
"""
cursor.execute(drug_query)
connection.rollback()
drugs = cursor.fetchall()

for row in drugs:
    drug_code = row[0]
    my_neo4j_create_drug_node(drug_code)
    

# Query to fetch distinct procedure and create procedure nodes
    
procedure_query = """
SELECT DISTINCT r.code AS code
FROM services_rendered r
     INNER JOIN (
         SELECT provider_npi
         FROM providers
         WHERE provider_state = 'WY') k
     ON k.provider_npi = r.provider_npi
WHERE r.drug_indicator = 'N'
"""
cursor.execute(procedure_query)
connection.rollback()
procedures = cursor.fetchall()

for row in procedures:
    procedure_code = row[0]
    my_neo4j_create_procedure_node(procedure_code)



## Relationship building drug relationship


In [16]:
# Query to create drug relationships
relationship_query = """
select 
r.provider_npi,
r.code,
r.total_amount_paid/t.total_medicare_payments::float as contrib_payment_percent,
r.total_beneficiaries/t.total_beneficiaries::float as beneficiary_frequency
from services_rendered r
     inner join (
            select provider_npi
            from providers
            where provider_state = 'WY'
            ) k on k.provider_npi = r.provider_npi
        left join (
            select 
            provider_npi,
            sum(total_amount_paid) as total_medicare_payments,
            sum(total_beneficiaries) as total_beneficiaries
            from services_rendered 
             group by 1
            ) t on t.provider_npi = r.provider_npi
    where  r.drug_indicator = 'Y'
"""
cursor.execute(relationship_query)
connection.rollback()
relationships = cursor.fetchall()

for row in relationships:
    provider_npi = row[0]
    drug_code = row[1]
    weight = row[2]

    
    my_neo4j_create_relationship_one_way(provider_npi, drug_code, weight)

### Building procedure relationships

In [17]:
# Query to create procedure relationships
relationship_query = """
select 
r.provider_npi,
r.code,
r.total_amount_paid/t.total_medicare_payments::float as contrib_payment_percent,
r.total_beneficiaries/t.total_beneficiaries::float as beneficiary_frequency
from services_rendered r
     inner join (
            select provider_npi
            from providers
            where provider_state = 'WY'
            ) k on k.provider_npi = r.provider_npi
        left join (
            select 
            provider_npi,
            sum(total_amount_paid) as total_medicare_payments,
            sum(total_beneficiaries) as total_beneficiaries
            from services_rendered 
             group by 1
            ) t on t.provider_npi = r.provider_npi
    where  r.drug_indicator = 'N'
"""
cursor.execute(relationship_query)
connection.rollback()
relationships = cursor.fetchall()

for row in relationships:
    provider_npi = row[0]
    procedure_code = row[1]
    weight = row[2]

    
    my_neo4j_create_relationship_one_way_procedure(provider_npi, procedure_code, weight)

 ### Jaccard Similarity
 

In [18]:
query = "CALL gds.graph.drop('myGraph', false) yield graphName"
session.run(query)

query = """
CALL gds.graph.project(
    'myGraph',
    ['Provider', 'Drug', 'Procedure'],
    {
      Conducts: {
        type: 'Conducts',
        properties: {
          weight: { property: 'weight', defaultValue: 0.0 }
        }
      },
      Prescribes: {
        type: 'Prescribes',
        properties: {
          weight: { property: 'weight', defaultValue: 0.0 }
        }
      }
    }
                  
)
"""
session.run(query)

<neo4j._sync.work.result.Result at 0x7f7aee07b5b0>

In [19]:
node_similarity_query = """
call gds.nodeSimilarity.stream('myGraph', { relationshipWeightProperty: 'weight' })
yield node1, node2, similarity
return
    gds.util.asNode(node1).name AS Node1,
    gds.util.asNode(node2).name AS Node2,
    similarity
order by  3 asc
"""

#putting result in dataframe

result = session.run(node_similarity_query)
data = [record.data() for record in result]
jaccard_df = pd.DataFrame(data)
jaccard_df

Unnamed: 0,Node1,Node2,similarity
0,1033555230,1043630619,0.249561
1,1033555230,1962676353,0.248686
2,1033555230,1699742957,0.208875
3,1033555230,1447233754,0.169624
4,1033555230,1821437971,0.158550
...,...,...,...
105,1962676353,1043630619,0.195770
106,1962676353,1770670325,0.143585
107,1962676353,1952317505,0.128354
108,1962676353,1447233754,0.111009


In [20]:
# identifying those with low similarity 

jaccard_df = jaccard_df.sort_values(by='similarity', ascending=True)

jaccard_df.head(20)

Unnamed: 0,Node1,Node2,similarity
29,1265653745,1629210992,0.032763
49,1629210992,1265653745,0.032763
28,1265653745,1942662788,0.050519
89,1942662788,1265653745,0.050519
48,1629210992,1699742957,0.064289
59,1699742957,1629210992,0.064289
27,1265653745,1821437971,0.064901
79,1821437971,1265653745,0.064901
39,1447233754,1629210992,0.064915
47,1629210992,1447233754,0.064915


## PageRank

This is going to get step more complicated - here we are going to make a second graph - where put relationships between providers that have a similiarty score higher than 0.30

In [21]:
my_neo4j_wipe_out_database()

In [22]:
#going to make nodes for each provider
for column in ['Node1', 'Node2']:
    for provider_npi in jaccard_df[column]:
        my_neo4j_create_provider_node(provider_npi)

In [23]:
#provider relationship

def my_neo4j_create_relationship_one_way_providers(node_1, node_2, similarity):

    query = """
    MATCH (p1:Provider {name: $node_1}),
          (p2:Provider {name: $node_2})
    MERGE (p1)-[r:Connected {weight: $similarity}]->(p2)
    RETURN p1, r, p2
    """
    session.run(query, node_1=node_1, node_2=node_2,similarity=similarity)

In [24]:
# build relationship

for _, row in jaccard_df.iterrows():
    node_1 = row['Node1']
    node_2 = row['Node2']
    similarity = row['similarity']
    
    my_neo4j_create_relationship_one_way_providers(node_1, node_2, similarity)

In [25]:
query = "CALL gds.graph.drop('myGraph', false) yield graphName"
session.run(query)

query = "CALL gds.graph.project('myGraph', 'Provider', 'Connected', {relationshipProperties: 'weight'})"
session.run(query)


<neo4j._sync.work.result.Result at 0x7f7aee06efd0>

In [26]:
# PageRank query
pagerank_query = """
CALL gds.pageRank.stream(
'myGraph'
)

yield nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score
ORDER BY score DESC
"""

# gonna put results in dataframe
result = session.run(pagerank_query)
pagerank_data = [record.data() for record in result]
pagerank_df = pd.DataFrame(pagerank_data)


pagerank_df

Unnamed: 0,name,score
0,1265653745,0.96124
1,1629210992,0.96124
2,1942662788,0.96124
3,1699742957,0.96124
4,1821437971,0.96124
5,1447233754,0.96124
6,1043630619,0.96124
7,1770670325,0.96124
8,1033555230,0.96124
9,1952317505,0.96124
