## Web server interface at https://xxxx:7473

#### Update - since the videos were filmed, neo4j requires a longer, more complex password, so the newest password is here:

**Username: neo4j**

**Password: ucb_mids_w205**

**In the web server interface, run the same query from last week to return all nodes and all relationships:**

```
match (n) return n
```

In [1]:
import neo4j
import csv
import math
import numpy as np
import pandas as pd
import psycopg2

### Define variables for data path, state and zipcode.

In [2]:
# Where the medicare data directory is.
DATA_DIR = "/user/projects/project-3-ss3382/medicare_data_csv"
ZIPCODE = ''
STATE = 'FL'

In [3]:
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))

In [4]:
session = driver.session(database="neo4j")

In [5]:
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [6]:
cursor = connection.cursor()

In [7]:
def my_select_query_pandas(query, rollback_before_flag=True, rollback_after_flag=True):
    """Run a select query on the module-level ``connection`` and return a DataFrame.

    Parameters
    ----------
    query : str
        SQL text handed unchanged to ``pd.read_sql_query``.
    rollback_before_flag : bool
        When true, roll the connection back before running the query.
    rollback_after_flag : bool
        When true, roll the connection back after the query, even if the
        query raised, so the shared connection is not left mid-transaction.

    Returns
    -------
    pandas.DataFrame
        The result set. Every ``float64`` column whose non-null values are all
        finite whole numbers is converted to the nullable ``Int64`` dtype.
    """
    if rollback_before_flag:
        connection.rollback()

    try:
        df = pd.read_sql_query(query, connection)
    finally:
        if rollback_after_flag:
            connection.rollback()

    # Fix the float columns that really should be integers.
    for column in df:
        series = df[column]
        if series.dtype != "float64":
            continue

        present = series.dropna().to_numpy()

        # Vectorised check: one pass in numpy instead of a Python loop over
        # every value. Infinite values can never be stored as integers, so a
        # column containing them is left as float.
        is_integral = np.isfinite(present).all() and (present == np.floor(present)).all()

        if is_integral:
            df[column] = series.astype("Int64")

    return df

In [8]:
# Select a table or view by name and display the rows.
def display_table(table_name):
    """Return every row of the table or view ``table_name`` as a DataFrame.

    The connection is rolled back both before and after the select.
    """
    select_all = f"""
    select * from {table_name};
    """
    return my_select_query_pandas(
        select_all,
        rollback_before_flag=True,
        rollback_after_flag=True,
    )

In [9]:
def my_read_csv_file(file_name, limit):
    """Print the first ``limit`` rows of a CSV file, then a line-count summary.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read.
    limit : int
        Maximum number of parsed rows to print; the remaining rows are only
        counted.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    total_rows = 0

    # The context manager closes the file even if parsing fails part-way;
    # newline="" lets the csv module handle line endings inside quoted fields.
    with open(file_name, "r", newline="") as csv_file:
        for total_rows, row in enumerate(csv.reader(csv_file), start=1):
            if total_rows <= limit:
                print(row)

    print("\nPrinted ", min(limit, total_rows), "lines of ", total_rows, "total lines.")

In [10]:
def my_select_where_condition():
    """Build the SQL predicate that restricts alias ``p`` to the chosen area.

    Reads the module-level ``STATE`` and ``ZIPCODE`` settings. The state filter
    is always present; the zip-code filter is appended only when ``ZIPCODE`` is
    non-empty.

    Returns
    -------
    str
        A predicate such as ``p.provider_state = 'FL'``, ready to be placed
        after ``where`` in a query that aliases the provider rows as ``p``.
    """
    # Build the text in one formatting step: feeding an f-string result back
    # through %-formatting would misread any '%' contained in the settings.
    condition = f"p.provider_state = '{STATE}'"
    if ZIPCODE:
        condition += f" and p.provider_zip = '{ZIPCODE}'"
    return condition

### Postgres Part - Create DB and Import data from csv files to postgres

In [11]:
# create providers
query = f"""
drop table if exists providers CASCADE;
create table providers (
    provider_npi varchar(32),
    provider_last_name varchar(32),
    provider_first_name varchar(32),
    provider_credentials varchar(32),
    provider_gender varchar(2),
    provider_state varchar(2),
    provider_zip varchar(32),
    primary key (provider_npi)
);
    
copy providers
from '{DATA_DIR}/providers.csv' delimiter ',' NULL '' csv header;
"""

cursor.execute(query)
connection.commit()
display_table("providers")

Unnamed: 0,provider_npi,provider_last_name,provider_first_name,provider_credentials,provider_gender,provider_state,provider_zip
0,1003003153,Morrison,Laura,MD,F,WA,98104
1,1003010687,Perri,Anthony,MD,M,TX,77304
2,1003011404,Mccoppin,Holly,M.D.,F,CO,80537
3,1003013384,Osleber,Michael,MD,M,AR,72205
4,1003014762,Wharton,Joshua,M.D.,M,AL,35976
...,...,...,...,...,...,...,...
11902,1992965461,Hossani-Madani,Ahmad,M.D.,M,MD,20774
11903,1992969414,Peng,Lisan,MD,F,AZ,85712
11904,1992970354,Hall,Rebecca,"M.D., C.M.",F,CT,6905
11905,1992996771,Flynn,Valerie,,F,SD,57108


In [12]:
# Create table and import data for procedures.

query = f"""
drop table if exists procedures;
create table procedures (
  procedure_code varchar(32),
  procedure_description text,
  if_drug boolean,
  primary key (procedure_code)
);
    
copy procedures
from '{DATA_DIR}/procedures.csv' delimiter ',' NULL '' csv header;
"""

cursor.execute(query)
connection.commit()
display_table("procedures")

Unnamed: 0,procedure_code,procedure_description,if_drug
0,11102,"Biopsy of related skin growth, first growth",False
1,11103,"Biopsy of related skin growth, each additional...",False
2,17000,"Destruction of precancer skin growth, 1 growth",False
3,17003,"Destruction of precancer skin growth, 2-14 gro...",False
4,17110,"Destruction of skin growth, 1-14 growths",False
...,...,...,...
501,88333,Pathology cytologic examination of specimen du...,False
502,88334,Pathology cytologic examination of specimen du...,False
503,Q4188,"Amnioarmor, per square centimeter",False
504,87118,Identification of mycobacteria (tb or tb like ...,False


In [13]:
# Create table and import data for drugs.
query = f"""
drop table if exists drugs;
create table drugs (
  drug_code varchar(32),
  drug_description text,
  if_drugg boolean,
  primary key (drug_code)
);
    
copy drugs
from '{DATA_DIR}/drugs.csv' delimiter ',' NULL '' csv header;
"""

cursor.execute(query)
connection.commit()
display_table("drugs")

Unnamed: 0,drug_code,drug_description,if_drugg
0,J3301,"Injection, triamcinolone acetonide, not otherw...",True
1,J7345,Aminolevulinic acid hcl for topical administra...,True
2,Q4186,"Epifix, per square centimeter",True
3,J7308,Aminolevulinic acid hcl for topical administra...,True
4,J9190,"Injection, fluorouracil, 500 mg",True
5,90662,"Influenza vaccine split virus, preservative free",True
6,J3300,"Injection, triamcinolone acetonide, preservati...",True
7,J1100,"Injection, dexamethasone sodium phosphate, 1 mg",True
8,J0702,"Injection, betamethasone acetate 3 mg and beta...",True
9,J3245,"Injection, tildrakizumab, 1 mg",True


In [14]:
# Create table and import data for services_rendered.

# CASCADE matches the providers cell: once the provider_services_rendered view
# (and the views built on it) exist, a plain "drop table" fails on re-run
# because those views depend on this table. Dropping with CASCADE removes the
# dependent views as well, so the view-creation cell must be run again afterwards.
query = f"""
drop table if exists services_rendered cascade;
create table services_rendered (
  provider_npi varchar(32),
  code varchar(32),
  drug_indicator varchar(32),
  total_beneficiaries float,
  total_services float,
  total_amount_paid float,
  primary key (provider_npi,code)
);
    
copy services_rendered
from '{DATA_DIR}/services_rendered.csv' delimiter ',' NULL '' csv header;
"""

cursor.execute(query)
connection.commit()
display_table("services_rendered")

Unnamed: 0,provider_npi,code,drug_indicator,total_beneficiaries,total_services,total_amount_paid
0,1003003153,11102,N,98,112.0,93.90
1,1003003153,11103,N,27,33.0,52.13
2,1003003153,17000,N,127,165.0,48.59
3,1003003153,17003,N,104,572.0,6.77
4,1003003153,17110,N,72,78.0,113.72
...,...,...,...,...,...,...
202904,1992997548,17261,N,20,24.0,128.12
202905,1992997548,99203,N,36,36.0,109.95
202906,1992997548,99212,N,56,62.0,54.99
202907,1992997548,99213,N,530,624.0,88.40


### EDA for the imported data.

In [15]:
# Get number of providers for each zip code in given state.
query = f"""
select provider_zip, count(*) as num_providers from providers 
where provider_state = '{STATE}'
group by provider_zip
having count(*) >= 10
order by num_providers desc
"""
my_select_query_pandas(query, True, True)

Unnamed: 0,provider_zip,num_providers
0,33136,17
1,33410,16
2,32308,14
3,32224,14
4,33437,14
5,32174,11
6,32610,11
7,33484,11
8,34239,11
9,33143,10


In [16]:
# Create 3 views:
# provider_services_rendered: every services_rendered row joined with the provider's state and zip code.
# provider_drugs: the rows of provider_services_rendered with drug_indicator = 'Y'.
# provider_procedures: the rows of provider_services_rendered with drug_indicator = 'N'.
#
# The dependent views are dropped before the base view; dropping
# provider_services_rendered first fails on a re-run because the other two
# views are defined on top of it.
query = f"""

drop view if exists provider_drugs;
drop view if exists provider_procedures;
drop view if exists provider_services_rendered;
create view provider_services_rendered as
select p.provider_npi, 
       p.provider_state, 
       p.provider_zip, 
       s.code, 
       s.drug_indicator, 
       s.total_beneficiaries, 
       s.total_services, 
       s.total_amount_paid,
       s.total_amount_paid / nullif(s.total_services, 0) as avg_amount_paid
from providers as p 
join services_rendered as s 
on p.provider_npi = s.provider_npi;

create view provider_drugs as
select * from provider_services_rendered
where drug_indicator = 'Y';

create view provider_procedures as 
select * from provider_services_rendered
where drug_indicator = 'N';

"""

cursor.execute(query)
connection.commit()

In [17]:
display_table("provider_services_rendered")

Unnamed: 0,provider_npi,provider_state,provider_zip,code,drug_indicator,total_beneficiaries,total_services,total_amount_paid,avg_amount_paid
0,1003003153,WA,98104,11102,N,98,112.0,93.90,0.838393
1,1003003153,WA,98104,11103,N,27,33.0,52.13,1.579697
2,1003003153,WA,98104,17000,N,127,165.0,48.59,0.294485
3,1003003153,WA,98104,17003,N,104,572.0,6.77,0.011836
4,1003003153,WA,98104,17110,N,72,78.0,113.72,1.457949
...,...,...,...,...,...,...,...,...,...
202904,1992997548,OH,43147,17261,N,20,24.0,128.12,5.338333
202905,1992997548,OH,43147,99203,N,36,36.0,109.95,3.054167
202906,1992997548,OH,43147,99212,N,56,62.0,54.99,0.886935
202907,1992997548,OH,43147,99213,N,530,624.0,88.40,0.141667


In [18]:
display_table("provider_procedures")

Unnamed: 0,provider_npi,provider_state,provider_zip,code,drug_indicator,total_beneficiaries,total_services,total_amount_paid,avg_amount_paid
0,1003003153,WA,98104,11102,N,98,112.0,93.90,0.838393
1,1003003153,WA,98104,11103,N,27,33.0,52.13,1.579697
2,1003003153,WA,98104,17000,N,127,165.0,48.59,0.294485
3,1003003153,WA,98104,17003,N,104,572.0,6.77,0.011836
4,1003003153,WA,98104,17110,N,72,78.0,113.72,1.457949
...,...,...,...,...,...,...,...,...,...
196913,1992997548,OH,43147,17261,N,20,24.0,128.12,5.338333
196914,1992997548,OH,43147,99203,N,36,36.0,109.95,3.054167
196915,1992997548,OH,43147,99212,N,56,62.0,54.99,0.886935
196916,1992997548,OH,43147,99213,N,530,624.0,88.40,0.141667


In [19]:
display_table("provider_drugs")

Unnamed: 0,provider_npi,provider_state,provider_zip,code,drug_indicator,total_beneficiaries,total_services,total_amount_paid,avg_amount_paid
0,1003011404,CO,80537,J3301,Y,22,42.0,1.13,0.026905
1,1003014762,AL,35976,J7345,Y,43,13800.0,1.55,0.000112
2,1003021973,OH,44320,J3301,Y,11,19.0,1.12,0.058947
3,1003021973,OH,44320,Q4186,Y,12,361.0,151.03,0.418366
4,1003043340,CA,94301,J7308,Y,16,27.0,388.60,14.392593
...,...,...,...,...,...,...,...,...,...
5986,1992894414,FL,33919,J7308,Y,24,28.0,388.91,13.889643
5987,1992927214,TN,38104,J3301,Y,33,81.0,1.13,0.013951
5988,1992927305,NC,28226,J3301,Y,21,47.0,1.15,0.024468
5989,1992948939,NY,10994,J3301,Y,11,29.0,1.13,0.038966


In [20]:
# Get the summarized data for each (provider, drug) pair
query = """
select s.provider_npi,
       s.code as drug_code,
       count(*) as num_types, 
       sum(total_services) as total_prescriptions, 
       sum(total_beneficiaries) as total_beneficiaries,
       sum(total_amount_paid) as total_amount_paid
from provider_drugs as s
group by s.provider_npi, s.code
"""
my_select_query_pandas(query)

Unnamed: 0,provider_npi,drug_code,num_types,total_prescriptions,total_beneficiaries,total_amount_paid
0,1003011404,J3301,1,42.0,22,1.13
1,1003014762,J7345,1,13800.0,43,1.55
2,1003021973,J3301,1,19.0,11,1.12
3,1003021973,Q4186,1,361.0,12,151.03
4,1003043340,J7308,1,27.0,16,388.60
...,...,...,...,...,...,...
5986,1992894414,J7308,1,28.0,24,388.91
5987,1992927214,J3301,1,81.0,33,1.13
5988,1992927305,J3301,1,47.0,21,1.15
5989,1992948939,J3301,1,29.0,11,1.13


In [21]:
# Get the number of drug types prescribed by each provider and total amount data for it.
query = """
select s.provider_npi,
       count(*) as num_drug_types, 
       sum(total_services) as total_prescriptions, 
       sum(total_beneficiaries) as total_beneficiaries,
       sum(total_amount_paid) as total_amount_paid
from provider_drugs as s
group by s.provider_npi
"""
my_select_query_pandas(query)

Unnamed: 0,provider_npi,num_drug_types,total_prescriptions,total_beneficiaries,total_amount_paid
0,1003011404,1,42.0,22,1.13
1,1003014762,1,13800.0,43,1.55
2,1003021973,2,380.0,23,152.15
3,1003043340,1,27.0,16,388.60
4,1003073958,2,9814.0,48,2.67
...,...,...,...,...,...
4575,1992894414,1,28.0,24,388.91
4576,1992927214,1,81.0,33,1.13
4577,1992927305,1,47.0,21,1.15
4578,1992948939,1,29.0,11,1.13


In [22]:
# Get number of providers and total prescriptions, beneficiaries and amount paid for each drug.
query = """
select code as drug_code, 
       count(*) as num_providers,
       sum(total_services) as total_prescriptions, 
       sum(total_beneficiaries) as total_beneficiaries,
       sum(total_amount_paid) as total_amount_paid       
from provider_drugs
group by drug_code
order by total_amount_paid
"""
my_select_query_pandas(query)

Unnamed: 0,drug_code,num_providers,total_prescriptions,total_beneficiaries,total_amount_paid
0,J2001,2,164.0,42,0.06
1,J2540,1,94.0,14,0.74
2,J0690,2,555.0,89,1.71
3,J9250,13,1428.0,321,2.89
4,J0696,6,637.0,148,2.94
5,J1100,31,5411.0,1430,3.72
6,J3489,1,75.0,15,7.24
7,J2920,2,655.0,185,8.39
8,J3111,1,10710.0,25,9.48
9,J1020,4,459.0,198,12.27


In [23]:
# Get number of providers and total services (reported in the total_prescriptions column), beneficiaries and amount paid for each procedure.
query = """
select code as procedure_code, 
       count(*) as num_providers,
       sum(total_services) as total_prescriptions, 
       sum(total_beneficiaries) as total_beneficiaries,
       sum(total_amount_paid) as total_amount_paid       
from provider_procedures as s
group by procedure_code
order by total_amount_paid
"""
my_select_query_pandas(query)

Unnamed: 0,procedure_code,num_providers,total_prescriptions,total_beneficiaries,total_amount_paid
0,J3490,1,477.0,21,0.00
1,Q2028,2,263131.0,52,1.52
2,85652,1,12.0,12,2.66
3,84520,1,12.0,11,3.35
4,85651,1,16.0,16,4.21
...,...,...,...,...,...
501,17110,9716,1696084.0,1363436,1082024.50
502,99203,9841,828895.0,828848,1124771.95
503,12032,4461,212000.0,191506,1213059.55
504,99214,9655,1942619.0,1418048,1264437.96


In [24]:
# Get the summarized procedure data for each provider
query = """
select s.provider_npi,
       count(*) as num_types, 
       sum(total_services) as total_procedures, 
       sum(total_beneficiaries) as total_beneficiaries,
       sum(total_amount_paid) as total_amount_paid
from provider_procedures as s
group by s.provider_npi
order by total_procedures desc, total_amount_paid desc
"""
my_select_query_pandas(query)

Unnamed: 0,provider_npi,num_types,total_procedures,total_beneficiaries,total_amount_paid
0,1083748461,55,252446.0,9010,12140.58
1,1932287653,62,133381.0,18077,10826.97
2,1740399963,36,49108.0,13849,5818.66
3,1164614004,40,49099.0,22488,9336.35
4,1386696904,71,39791.0,12663,11292.23
...,...,...,...,...,...
11902,1265637995,1,11.0,11,72.31
11903,1922399807,1,11.0,11,72.18
11904,1407845712,1,11.0,11,65.27
11905,1093201162,1,11.0,11,63.63


In [25]:
# Get the summarized provider data for each procedure
query = """
select s.code as procedure_code,
       count(*) as num_providers, 
       sum(total_services) as total_procedures, 
       sum(total_beneficiaries) as total_beneficiaries,
       sum(total_amount_paid) as total_amount_paid
from provider_procedures as s
group by s.code
order by num_providers desc, total_procedures desc, total_amount_paid desc
"""
my_select_query_pandas(query)

Unnamed: 0,procedure_code,num_providers,total_procedures,total_beneficiaries,total_amount_paid
0,99213,11414,5722051.1,4018746,1062786.56
1,17000,11005,3870015.0,2806802,608642.91
2,11102,10648,2174030.0,1785488,966893.80
3,17003,10616,12613617.0,2101912,70811.47
4,99203,9841,828895.0,828848,1124771.95
...,...,...,...,...,...
501,84403,1,11.0,11,25.50
502,94010,1,11.0,11,24.99
503,87389,1,11.0,11,23.77
504,73552,1,11.0,11,9.35


In [26]:
# Get distinct drugs in a given area.
query = """
select distinct code as drug from provider_services_rendered as p
where %s and p.drug_indicator = 'Y'
""" % my_select_where_condition()
my_select_query_pandas(query)

Unnamed: 0,drug
0,J0696
1,J0702
2,J1100
3,J3245
4,J3301
5,J7308
6,J7345
7,J9190
8,J9250
9,J9260


In [27]:
# Get distinct procedures in given area.
query = """
select distinct code as drug from provider_services_rendered as p
where %s and p.drug_indicator = 'N'
""" % my_select_where_condition()
my_select_query_pandas(query)

Unnamed: 0,drug
0,0394T
1,0598T
2,0658T
3,10040
4,10060
...,...
246,Q4197
247,Q4205
248,Q4217
249,Q4234


In [28]:
# Get distinct providers for drugs in given area.
query = """
select distinct provider_npi as drug_provider from provider_services_rendered as p
where %s and p.drug_indicator = 'Y'
""" % my_select_where_condition()
my_select_query_pandas(query)

Unnamed: 0,drug_provider
0,1003142084
1,1003811548
2,1003844937
3,1003870734
4,1003876053
...,...
476,1982711925
477,1992063267
478,1992728802
479,1992894414


In [29]:
# Get distinct providers for procedures in a state
query = """
select distinct provider_npi as procedure_provider from provider_services_rendered as p
where %s and p.drug_indicator = 'N'
""" % my_select_where_condition()
my_select_query_pandas(query)

Unnamed: 0,procedure_provider
0,1003038555
1,1003077652
2,1003083247
3,1003142084
4,1003811548
...,...
989,1992728802
990,1992837900
991,1992894414
992,1992897938


In [30]:
# Get distinct provider/procedure pairs in a state
query = """
select provider_npi as provider, code as procedure from provider_services_rendered as p
where %s and p.drug_indicator = 'N'
""" % my_select_where_condition()
my_select_query_pandas(query)

Unnamed: 0,provider,procedure
0,1003038555,11102
1,1003038555,11103
2,1003038555,11104
3,1003038555,11301
4,1003038555,12032
...,...,...
19901,1992956247,99203
19902,1992956247,99204
19903,1992956247,99212
19904,1992956247,99213


In [31]:
### Neo4j Part - Define functions for creating and querying graph DB.

In [32]:
def my_neo4j_wipe_out_database():
    """Empty the graph by deleting all relationships and then all nodes.

    Two statements are run in order on the module-level ``session``: the first
    deletes every relationship together with its start node, the second
    deletes whatever nodes are left.
    """
    delete_statements = (
        "match (node)-[relationship]->() delete node, relationship",
        "match (node) delete node",
    )
    for statement in delete_statements:
        session.run(statement)

In [33]:
# Select a table or view by name and display the rows.
def display_table(table_name):
    """Return every row of the table or view ``table_name`` as a DataFrame.

    The connection is rolled back both before and after the select.
    """
    select_all = f"""
    select * from {table_name};
    """
    return my_select_query_pandas(
        select_all,
        rollback_before_flag=True,
        rollback_after_flag=True,
    )

In [34]:
def my_neo4j_run_query_pandas(query, **kwargs):
    """Run a Cypher query on the module-level ``session`` and return a DataFrame.

    Keyword arguments are forwarded to ``session.run`` as query parameters.
    Each result record becomes one row; the result keys become the columns.
    """
    result = session.run(query, **kwargs)
    rows = [record.values() for record in result]
    column_names = result.keys()
    return pd.DataFrame(rows, columns=column_names)

In [35]:
def my_neo4j_nodes_relationships():
    """Display all nodes and all relationships, then print the graph density.

    Nodes are listed with their ``node_name`` and labels; relationships with
    the names and labels of both end nodes plus the relationship type. Both
    listings are shown with the notebook's ``display`` function. The density
    is printed as ``2 * relationships / (nodes * (nodes - 1))`` and is reported
    as 0.0 when the graph has fewer than two nodes.
    """
    banner = "-------------------------"

    print(banner)
    print("  Nodes:")
    print(banner)

    # Order by node_name, the property the nodes actually carry and the one
    # returned here; ordering by n.name sorted on a missing (null) property.
    query = """
        match (n) 
        return n.node_name as node_name, labels(n) as labels
        order by n.node_name
    """

    df = my_neo4j_run_query_pandas(query)
    number_nodes = df.shape[0]
    display(df)

    print(banner)
    print("  Relationships:")
    print(banner)

    query = """
        match (n1)-[r]->(n2) 
        return n1.node_name as node_1_name, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.node_name as node_2_name, labels(n2) as node_2_labels
        order by node_1_name, node_2_name
    """

    df = my_neo4j_run_query_pandas(query)
    number_relationships = df.shape[0]
    display(df)

    # With zero or one node the denominator is 0; report a density of 0.0
    # instead of raising ZeroDivisionError.
    possible_pairs = number_nodes * (number_nodes - 1)
    density = (2 * number_relationships) / possible_pairs if possible_pairs else 0.0

    print(banner)
    print("  Density:", f'{density:.1f}')
    print(banner)
    

In [36]:
def my_neo4j_number_nodes(label=""):
    """Return the number of nodes in the graph.

    Parameters
    ----------
    label : str
        Node label to count. With the default empty string every node is
        counted, regardless of label.

    Returns
    -------
    The ``numberOfNodes`` value of the single result record.
    """
    # An empty label must not be rendered as "(n:)", which is invalid Cypher.
    node_pattern = f"(n:{label})" if label else "(n)"
    query = f"""
    MATCH {node_pattern}
    RETURN count(n) AS numberOfNodes
    """
    record = session.run(query).single()
    return record["numberOfNodes"]

def my_neo4j_number_relationships(from_label="", to_label=""):
    """Return the number of directed relationships between two kinds of node.

    Parameters
    ----------
    from_label : str
        Label required on the start node; empty means any node.
    to_label : str
        Label required on the end node; empty means any node.

    Returns
    -------
    The ``numberOfRelationships`` value of the single result record.
    """
    # Empty labels must not be rendered as "(n:)" / "(m:)", which is invalid Cypher.
    start_pattern = f"(n:{from_label})" if from_label else "(n)"
    end_pattern = f"(m:{to_label})" if to_label else "(m)"
    query = f"""
    MATCH {start_pattern}-[r]->{end_pattern}
    RETURN count(r) AS numberOfRelationships
    """
    record = session.run(query).single()
    return record["numberOfRelationships"]

def my_neo4j_number_nodes_relationships():
    """Print node counts per label and relationship counts per label pair.

    All counts are gathered first (Provider, Drug and Procedure nodes, then
    Provider->Drug and Provider->Procedure relationships) and printed
    afterwards between two separator lines.
    """
    separator = "-------------------------"

    # Counts are collected before anything is printed, in this order.
    summary = [
        ("Providers:", my_neo4j_number_nodes("Provider")),
        ("Drugs:", my_neo4j_number_nodes("Drug")),
        ("Procedures:", my_neo4j_number_nodes("Procedure")),
        ("(Provider)->(Drug) Relationships:",
         my_neo4j_number_relationships("Provider", "Drug")),
        ("(Provider)->(Procedure) Relationships:",
         my_neo4j_number_relationships("Provider", "Procedure")),
    ]

    print(separator)
    for caption, count in summary:
        print(caption, count)
    print(separator)
    

In [37]:
my_neo4j_number_nodes_relationships()

-------------------------
Providers: 17
Drugs: 1
Procedures: 53
(Provider)->(Drug) Relationships: 6
(Provider)->(Procedure) Relationships: 197
-------------------------


In [38]:
def my_neo4j_create_nodes(label, attr_name, node_data):
    """Create one ``label`` node per entry of ``node_data``.

    ``node_data`` is passed to the query as the ``$nodes`` parameter; each
    entry's ``attr_name`` value is stored on the new node under the same
    property name. The statement runs in a fresh session opened from the
    module-level ``driver``.
    """
    create_statement = f"""
    UNWIND $nodes AS node
    CREATE (n:{label} {{{attr_name}: node.{attr_name}}})
    """
    with driver.session() as neo4j_session:
        neo4j_session.run(create_statement, nodes=node_data)

In [39]:
def my_neo4j_create_relationships(is_drug):
    """Create Provider->Drug or Provider->Procedure relationships in the graph.

    When ``is_drug`` is true, rows with drug_indicator 'Y' are linked to Drug
    nodes with a ``Prescribes`` relationship; otherwise rows with
    drug_indicator 'N' are linked to Procedure nodes with ``Conducts``.
    The rows come from the provider_services_rendered view, restricted by
    ``my_select_where_condition()``, and their totals are stored as
    relationship properties.
    """
    # Pick the indicator value, target label and relationship type together.
    if is_drug:
        drug_indicator, to_label, relation_type = "Y", "Drug", "Prescribes"
    else:
        drug_indicator, to_label, relation_type = "N", "Procedure", "Conducts"

    select_sql = f"""
    select provider_npi, 
           code, 
           total_beneficiaries, 
           total_services, 
           total_amount_paid, 
           avg_amount_paid
    from provider_services_rendered as p
    where %s and drug_indicator = '{drug_indicator}' 
    """ % my_select_where_condition()
    services = my_select_query_pandas(select_sql)

    # One parameter dict per row, with the keys the Cypher statement reads.
    fields = (
        "provider_npi",
        "code",
        "total_beneficiaries",
        "total_services",
        "total_amount_paid",
        "avg_amount_paid",
    )
    data = [
        {field: getattr(row, field) for field in fields}
        for row in services.itertuples()
    ]

    cypher_template = """
    UNWIND $relationships AS rel
    MATCH (a:Provider {node_name: rel.provider_npi}), (b:%s {node_name: rel.code})
    CREATE (a)-[:%s {total_beneficiaries: rel.total_beneficiaries, total_services:rel.total_services, total_amount_paid: rel.total_amount_paid, avg_amount_paid: rel.avg_amount_paid}]->(b);
    """
    create_statement = cypher_template % (to_label, relation_type)

    with driver.session() as neo4j_session:
        neo4j_session.run(create_statement, relationships=data)
    

In [40]:
def get_distinct_values(col_name, table_name):
    """Return the distinct ``col_name`` values of ``table_name`` as a DataFrame.

    The table is aliased as ``p`` and restricted by the predicate from
    ``my_select_where_condition()``.
    """
    select_template = f"""
    select distinct {col_name} from {table_name} as p
    where %s
    """
    distinct_sql = select_template % my_select_where_condition()
    return my_select_query_pandas(distinct_sql)


def set_provider_ytd_data():
    """Store per-provider totals as properties on the Provider nodes.

    The totals (amount paid, beneficiaries, services) are summed from the
    provider_procedures view, restricted by ``my_select_where_condition()``.
    One Cypher update is issued per provider on the module-level ``session``.
    """
    totals_sql = """
    select provider_npi, 
           sum(total_amount_paid) as total_amount_paid,
           sum(total_beneficiaries) as total_beneficiaries,
           sum(total_services) as total_services
    from provider_procedures as p
    where %s
    group by provider_npi
    order by total_amount_paid desc, total_beneficiaries desc, total_services desc
    """ % my_select_where_condition()

    provider_totals = my_select_query_pandas(
        totals_sql,
        rollback_before_flag=True,
        rollback_after_flag=True,
    )

    for row in provider_totals.itertuples():
        # The row's values are written straight into the statement text.
        update_statement = f"""
        match (n:Provider) where n.node_name = '{row.provider_npi}' 
        set n.total_amount_paid = {row.total_amount_paid}
        set n.total_beneficiaries = {row.total_beneficiaries}
        set n.total_services = {row.total_services}
        """
        session.run(update_statement)

    
def my_neo4j_create_db():
    """Rebuild the graph from the Postgres views and print its size.

    Steps, in order: wipe the graph; create Provider nodes; create Drug nodes
    and their relationships; create Procedure nodes and their relationships;
    store the aggregated totals on each provider; print node and relationship
    counts.
    """

    def load_nodes(label, column, view):
        # One node per distinct value of `column` in `view`, stored as node_name.
        frame = get_distinct_values(column, view)
        node_data = [{"node_name": getattr(row, column)} for row in frame.itertuples()]
        my_neo4j_create_nodes(label, "node_name", node_data)

    # Start from an empty graph.
    my_neo4j_wipe_out_database()

    load_nodes("Provider", "provider_npi", "provider_services_rendered")

    load_nodes("Drug", "code", "provider_drugs")
    my_neo4j_create_relationships(is_drug=True)

    load_nodes("Procedure", "code", "provider_procedures")
    my_neo4j_create_relationships(is_drug=False)

    # Aggregated totals for each provider.
    set_provider_ytd_data()

    # Report how many nodes and relationships were created.
    my_neo4j_number_nodes_relationships()


    

In [41]:
# create db
my_neo4j_create_db()

-------------------------
Providers: 994
Drugs: 15
Procedures: 251
(Provider)->(Drug) Relationships: 682
(Provider)->(Procedure) Relationships: 19906
-------------------------


### Define a function to calculate degree centrality for a target node type and weight property

In [42]:
def my_neo4j_project_graph(to_node, relationship, weight_property):
    """(Re)create the in-memory graph projection named 'ds_graph'.

    Any existing 'ds_graph' is dropped first. The new projection contains the
    Provider nodes and the ``to_node`` nodes, connected by ``relationship``
    relationships in their natural direction and carrying ``weight_property``.
    Both statements run on the module-level ``session``.
    """
    # NOTE(review): the `false` argument presumably keeps the drop from failing
    # when no projection exists yet - confirm against the GDS documentation.
    session.run("CALL gds.graph.drop('ds_graph', false) yield graphName")

    projection_template = """
        CALL gds.graph.project(
          'ds_graph', 
          ['Provider', '%s'], 
          {
            %s: {
              properties: '%s',
              orientation: 'NATURAL'
            }
          }
        )
    """
    session.run(projection_template % (to_node, relationship, weight_property))

def my_neo4j_degree_yield_query(weight_property):
    """Return the opening part of a weighted degree-centrality query.

    The text streams degree scores from the 'ds_graph' projection using
    ``weight_property`` as the relationship weight and keeps only Provider
    nodes with a score above zero. It ends after the WHERE clause, so the
    caller appends its own WITH/RETURN part.
    """
    stream_template = """
    CALL gds.degree.stream(
      'ds_graph',
      {
        relationshipWeightProperty: '%s'
      }
    )
    YIELD nodeId, score
    WHERE gds.util.asNode(nodeId):Provider and score > 0
    """
    return stream_template % weight_property
    

def my_neo4j_degree_centrality(to_node, relationship, weight_property, score_name):
    """Rank providers by weighted degree centrality.

    Parameters
    ----------
    to_node : str
        Label of the nodes the providers point to (for example "Drug").
    relationship : str
        Relationship type to project (for example "Prescribes").
    weight_property : str
        Relationship property used as the weight.
    score_name : str
        Column name given to the score in the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row per provider with a positive score, ordered by score
        descending, with the ratios of the score to the maximum, average and
        median score. Empty when no provider has a positive score.

    The average, maximum and median scores are also printed.
    """
    # Project once: both queries below only read the projection, so dropping
    # and rebuilding an identical graph between them is wasted work.
    my_neo4j_project_graph(to_node, relationship, weight_property)
    stream_query = my_neo4j_degree_yield_query(weight_property)

    summary_query = stream_query + """
    WITH round(avg(score), 2) AS AvgDegree, max(score) AS MaxDegree, percentileCont(score, 0.5) as MedianDegree
    RETURN AvgDegree, MaxDegree, MedianDegree
    """
    summary_rows = session.run(summary_query).data()
    summary = summary_rows[0] if summary_rows else {}
    avg_degree = summary.get("AvgDegree")
    max_degree = summary.get("MaxDegree")
    median_degree = summary.get("MedianDegree")

    # With no matching provider the aggregates come back empty; formatting
    # them or dividing by them would fail, so return an empty ranking instead.
    if avg_degree is None or max_degree is None or median_degree is None:
        print("No Provider nodes with a positive score were found.")
        return pd.DataFrame(
            columns=["ProviderID", score_name, "RatioToMax", "RatioToAvg", "RatioToMedian"]
        )

    print(f"avg_degree={avg_degree:.2f}, {max_degree=}, {median_degree=}")

    ranking_query = stream_query + f"""
    RETURN gds.util.asNode(nodeId).node_name AS ProviderID, 
           score AS {score_name}, 
           round(score/{max_degree}, 2) as RatioToMax, 
           round(score/{avg_degree}, 2) as RatioToAvg,
           round(score/{median_degree}, 2) as RatioToMedian
    ORDER BY {score_name} DESC
    """
    return my_neo4j_run_query_pandas(ranking_query)

### Calculate degree centrality for Drugs with property total_amount_paid. 

In [43]:
my_neo4j_degree_centrality("Drug", "Prescribes", "total_amount_paid", "TotalPaidCentrality")

avg_degree=151.08, max_degree=903.31, median_degree=1.58


Unnamed: 0,ProviderID,TotalPaidCentrality,RatioToMax,RatioToAvg,RatioToMedian
0,1144215898,903.31,1.00,5.98,571.72
1,1245399237,620.96,0.69,4.11,393.01
2,1720039829,544.65,0.60,3.61,344.72
3,1427342203,544.02,0.60,3.60,344.32
4,1780802595,543.76,0.60,3.60,344.15
...,...,...,...,...,...
476,1164565321,1.06,0.00,0.01,0.67
477,1740609544,1.05,0.00,0.01,0.66
478,1043207293,1.04,0.00,0.01,0.66
479,1033438742,1.00,0.00,0.01,0.63


### Calculate degree centrality for Drugs with property total_beneficiaries

In [44]:
my_neo4j_degree_centrality("Drug", "Prescribes", "total_beneficiaries", "TotalBeneficiaryCentrality")

avg_degree=46.23, max_degree=354.0, median_degree=30.0


Unnamed: 0,ProviderID,TotalBeneficiaryCentrality,RatioToMax,RatioToAvg,RatioToMedian
0,1144215898,354.0,1.00,7.66,11.80
1,1780802595,316.0,0.89,6.84,10.53
2,1700861382,295.0,0.83,6.38,9.83
3,1912968694,294.0,0.83,6.36,9.80
4,1952369779,279.0,0.79,6.04,9.30
...,...,...,...,...,...
476,1902291784,11.0,0.03,0.24,0.37
477,1902803760,11.0,0.03,0.24,0.37
478,1902810401,11.0,0.03,0.24,0.37
479,1972708329,11.0,0.03,0.24,0.37


### Calculate degree centrality for Procedures with property total_amount_paid.

In [45]:
my_neo4j_degree_centrality("Procedure", "Conducts", "total_amount_paid", "TotalPaidCentrality")

avg_degree=3315.26, max_degree=15035.030000000002, median_degree=2395.635


Unnamed: 0,ProviderID,TotalPaidCentrality,RatioToMax,RatioToAvg,RatioToMedian
0,1043204365,15035.03,1.00,4.54,6.28
1,1699115014,14973.52,1.00,4.52,6.25
2,1104810563,14239.54,0.95,4.30,5.94
3,1952369779,13775.73,0.92,4.16,5.75
4,1528385507,13472.22,0.90,4.06,5.62
...,...,...,...,...,...
989,1942290499,100.83,0.01,0.03,0.04
990,1689746356,95.07,0.01,0.03,0.04
991,1063670412,90.66,0.01,0.03,0.04
992,1013366582,89.48,0.01,0.03,0.04


### Calculate degree centrality for Procedures with property total_beneficiaries.

In [46]:
my_neo4j_degree_centrality("Procedure", "Conducts", "total_beneficiaries", "TotalBeneficiaryCentrality")

avg_degree=2525.80, max_degree=22488.0, median_degree=1766.0


Unnamed: 0,ProviderID,TotalBeneficiaryCentrality,RatioToMax,RatioToAvg,RatioToMedian
0,1164614004,22488.0,1.00,8.90,12.73
1,1902811219,14651.0,0.65,5.80,8.30
2,1912968694,14139.0,0.63,5.60,8.01
3,1740399963,13849.0,0.62,5.48,7.84
4,1417947961,13512.0,0.60,5.35,7.65
...,...,...,...,...,...
989,1407236227,16.0,0.00,0.01,0.01
990,1689746356,16.0,0.00,0.01,0.01
991,1225092539,13.0,0.00,0.01,0.01
992,1457882052,12.0,0.00,0.00,0.01


### Calculate degree centrality for Procedures with property total_services.

In [47]:
my_neo4j_degree_centrality("Procedure", "Conducts", "total_services", "TotalServicesCentrality")

avg_degree=5071.35, max_degree=49108.0, median_degree=3337.0


Unnamed: 0,ProviderID,TotalServicesCentrality,RatioToMax,RatioToAvg,RatioToMedian
0,1740399963,49108.0,1.00,9.68,14.72
1,1164614004,49099.0,1.00,9.68,14.71
2,1902811219,35044.0,0.71,6.91,10.50
3,1417947961,33591.0,0.68,6.62,10.07
4,1952369779,32690.0,0.67,6.45,9.80
...,...,...,...,...,...
989,1407236227,16.0,0.00,0.00,0.00
990,1689746356,16.0,0.00,0.00,0.00
991,1225092539,14.0,0.00,0.00,0.00
992,1457882052,13.0,0.00,0.00,0.00


In [50]:
def print_provider(provider_npis):
    """Return the provider records for the given NPIs as a DataFrame.

    Parameters
    ----------
    provider_npis : iterable
        NPIs to look up in the providers table. An empty iterable yields an
        empty result instead of a SQL syntax error.

    Returns
    -------
    pandas.DataFrame
        Columns provider_npi, last_name, first_name, credentials, gender,
        state and zipcode for every matching provider.
    """
    # Double embedded single quotes so every NPI stays a valid SQL string literal.
    quoted_npis = ", ".join(
        "'" + str(provider_npi).replace("'", "''") + "'"
        for provider_npi in provider_npis
    )

    # "where" followed by nothing is invalid SQL, so match no rows when the
    # list is empty.
    condition = f"provider_npi in ({quoted_npis})" if quoted_npis else "false"

    query = f"""
    select provider_npi,
           provider_last_name as last_name,
           provider_first_name as first_name,
           provider_credentials as credentials,
           provider_gender as gender,
           provider_state as state,
           provider_zip as zipcode
    from providers where {condition} 
    """
    return my_select_query_pandas(query, True, True)

print_provider(['1043204365', '1164614004', '1740399963', '1699115014'])

Unnamed: 0,provider_npi,last_name,first_name,credentials,gender,state,zipcode
0,1043204365,Welton,William,MD,M,FL,34461
1,1164614004,Casper,David,M.D.,M,FL,32162
2,1699115014,Hyder,Luke,MD,M,FL,32503
3,1740399963,Cohen,Eliahou,MD,M,FL,33484
