In [1]:
import math
import os

import numpy as np
import pandas as pd

import neo4j
import psycopg2

# Data loading and processing

In [3]:
# Read the golf-course/school distance table, keeping every column except the
# first one (an extra column in the CSV), then preview the first rows.
distance_df = pd.read_csv("../data/distance_data.csv").iloc[:, 1:]
distance_df.head()

Unnamed: 0,golf_course,school,golf_course_geo,school_geo,school_address,crow_distance,distance,duration
0,Pebble Beach Golf Links,Academy of Art University - San Francisco,"(36.5696553, -121.9497555)","(37.7879584, -122.4004678)","79 New Montgomery San Francisco, CA",87.756983,124 mi,2 hours 5 mins
1,Pebble Beach Golf Links,Cinta Aveda,"(36.5696553, -121.9497555)","(37.7909576, -122.4042735)",305 Kearny Street,88.014954,124 mi,2 hours 11 mins
2,Pebble Beach Golf Links,Contra Costa Medical Career College - Antioch,"(36.5696553, -121.9497555)","(37.97679979999999, -121.8015862)","4041 Lone Tree Way Antioch, CA",97.565091,143 mi,2 hours 22 mins
3,Pebble Beach Golf Links,Five Branches University - Santa Cruz,"(36.5696553, -121.9497555)","(36.9644346, -121.9980494)","200 7th Avenue Santa Cruz, CA",27.407312,45.8 mi,55 mins
4,Pebble Beach Golf Links,National Holistic Institute - Emeryville,"(36.5696553, -121.9497555)","(37.8418091, -122.2878972)","5900 Doyle Street Emeryville, CA",89.845364,120 mi,2 hours 4 mins


In [3]:
# Convert duration to minutes and distance to float for future filtering

def convert_time_to_minutes(time):
    '''Convert a textual duration to a whole number of minutes.

    The text is read as whitespace-separated (number, unit) pairs, where the
    unit is day(s), hour(s) or min(s). Examples:

        "2 hours 5 mins" -> 125
        "55 mins"        -> 55
        "2 hours"        -> 120
        "1 day 2 hours"  -> 1560

    Parameters
    ----------
    time : str or float
        Duration text, or NaN for a missing value.

    Returns
    -------
    int, or the NaN input unchanged when the value is missing.

    Raises
    ------
    ValueError
        If the text is empty, has a dangling token, a non-integer amount,
        or a unit other than day/hour/min.
    '''
    # NaN is the only value that is not equal to itself: pass it through so
    # missing durations stay missing in the resulting column.
    if time != time:
        return time

    minutes_per_unit = {'day': 24 * 60, 'hour': 60, 'min': 1}
    tokens = time.split()

    # A bare number without a unit keeps its previous meaning: minutes.
    if len(tokens) == 1:
        return int(tokens[0])
    if not tokens or len(tokens) % 2:
        raise ValueError(f"Unrecognised duration: {time!r}")

    total_minutes = 0
    for amount, unit in zip(tokens[0::2], tokens[1::2]):
        # "hours" -> "hour", "mins" -> "min", "days" -> "day"
        unit_key = unit.lower().rstrip('s')
        if unit_key not in minutes_per_unit:
            raise ValueError(f"Unrecognised duration unit {unit!r} in {time!r}")
        total_minutes += int(amount) * minutes_per_unit[unit_key]
    return total_minutes

# Numeric distance: the leading token of strings such as "45.8 mi"
distance_df['distance_mi'] = distance_df['distance'].map(lambda text: float(str(text).split(' ')[0]))
# Duration in minutes, via convert_time_to_minutes
distance_df['duration_min'] = distance_df['duration'].map(convert_time_to_minutes)

distance_df.head()

Unnamed: 0,golf_course,school,golf_course_geo,school_geo,school_address,crow_distance,distance,duration,distance_mi,duration_min
0,Pebble Beach Golf Links,Academy of Art University - San Francisco,"(36.5696553, -121.9497555)","(37.7879584, -122.4004678)","79 New Montgomery San Francisco, CA",87.756983,124 mi,2 hours 5 mins,124.0,125.0
1,Pebble Beach Golf Links,Cinta Aveda,"(36.5696553, -121.9497555)","(37.7909576, -122.4042735)",305 Kearny Street,88.014954,124 mi,2 hours 11 mins,124.0,131.0
2,Pebble Beach Golf Links,Contra Costa Medical Career College - Antioch,"(36.5696553, -121.9497555)","(37.97679979999999, -121.8015862)","4041 Lone Tree Way Antioch, CA",97.565091,143 mi,2 hours 22 mins,143.0,142.0
3,Pebble Beach Golf Links,Five Branches University - Santa Cruz,"(36.5696553, -121.9497555)","(36.9644346, -121.9980494)","200 7th Avenue Santa Cruz, CA",27.407312,45.8 mi,55 mins,45.8,55.0
4,Pebble Beach Golf Links,National Holistic Institute - Emeryville,"(36.5696553, -121.9497555)","(37.8418091, -122.2878972)","5900 Doyle Street Emeryville, CA",89.845364,120 mi,2 hours 4 mins,120.0,124.0


In [8]:
# Keep only rows whose duration is strictly under 120 minutes
# (rows with a missing duration compare False and are dropped too)

within_two_hours = distance_df['duration_min'].lt(120)
distance_df = distance_df.loc[within_two_hours]
distance_df.head()

Unnamed: 0,golf_course,school,golf_course_geo,school_geo,school_address,crow_distance,distance,duration,distance_mi,duration_min
3,Pebble Beach Golf Links,Five Branches University - Santa Cruz,"(36.5696553, -121.9497555)","(36.9644346, -121.9980494)","200 7th Avenue Santa Cruz, CA",27.407312,45.8 mi,55 mins,45.8,55.0
5,Pebble Beach Golf Links,National Tractor Trailer School Inc - Corporat...,"(36.5696553, -121.9497555)","(36.503236, -121.451229)",175 Katherine St,28.053599,38.5 mi,53 mins,38.5,53.0
7,Pebble Beach Golf Links,The Salon Professional Academy - Rochester,"(36.5696553, -121.9497555)","(37.2920045, -121.9894762)","1600 Saratoga Ave, Ste 103",49.957768,72.5 mi,1 hour 24 mins,72.5,84.0
8,Pebble Beach Golf Links,The Salon Professional Academy - San Jose,"(36.5696553, -121.9497555)","(37.2920045, -121.9894762)","1600 Saratoga Ave Ste. 103 San Jose, CA 95129",49.957768,72.5 mi,1 hour 24 mins,72.5,84.0
10,Pebble Beach Golf Links,Toni & Guy Hairdressing Academy - San Jose,"(36.5696553, -121.9497555)","(37.3256289, -121.8139115)","2200 Eastridge Loop - San Jose, CA 95122",52.768645,72.9 mi,1 hour 18 mins,72.9,78.0


In [9]:
# Load the per-school similarity scores and rename the columns to lower case,
# so that 'school' matches the column name used in the distance table.
scores_df = (
    pd.read_csv("../data/school_similarity_scores.csv")
    .rename(columns={'School': 'school', 'Score': 'score'})
)
scores_df.head()

Unnamed: 0,school,score
0,Eastwick College - Corporate Office,0.25
1,American Training Center,1.0
2,Academy of Art University - San Francisco,0.25
3,Toni&Guy Hairdressing Academy at Penn Commercial,0.833333
4,Aveda Institute - Rochester,1.0


In [10]:
# Attach each school's score to its distance rows. The left join keeps every
# distance row; a school missing from scores_df ends up with a NaN score.
distance_with_scores_df = distance_df.merge(scores_df, how='left', on='school')
distance_with_scores_df.head()

Unnamed: 0,golf_course,school,golf_course_geo,school_geo,school_address,crow_distance,distance,duration,distance_mi,duration_min,score
0,Pebble Beach Golf Links,Five Branches University - Santa Cruz,"(36.5696553, -121.9497555)","(36.9644346, -121.9980494)","200 7th Avenue Santa Cruz, CA",27.407312,45.8 mi,55 mins,45.8,55.0,1.0
1,Pebble Beach Golf Links,National Tractor Trailer School Inc - Corporat...,"(36.5696553, -121.9497555)","(36.503236, -121.451229)",175 Katherine St,28.053599,38.5 mi,53 mins,38.5,53.0,0.25
2,Pebble Beach Golf Links,The Salon Professional Academy - Rochester,"(36.5696553, -121.9497555)","(37.2920045, -121.9894762)","1600 Saratoga Ave, Ste 103",49.957768,72.5 mi,1 hour 24 mins,72.5,84.0,0.25
3,Pebble Beach Golf Links,The Salon Professional Academy - San Jose,"(36.5696553, -121.9497555)","(37.2920045, -121.9894762)","1600 Saratoga Ave Ste. 103 San Jose, CA 95129",49.957768,72.5 mi,1 hour 24 mins,72.5,84.0,0.25
4,Pebble Beach Golf Links,Toni & Guy Hairdressing Academy - San Jose,"(36.5696553, -121.9497555)","(37.3256289, -121.8139115)","2200 Eastridge Loop - San Jose, CA 95122",52.768645,72.9 mi,1 hour 18 mins,72.9,78.0,0.8


# Neo4j functions from labs

In [11]:
def my_neo4j_wipe_out_database():
    """Empty the database: delete all nodes and relationships.

    Runs two statements on the module-level `session`, in order: the first
    deletes every relationship together with its start node, the second
    deletes whatever nodes are left.
    """
    statements = (
        "match (node)-[relationship]->() delete node, relationship",
        "match (node) delete node",
    )
    for statement in statements:
        session.run(statement)

In [12]:
def my_neo4j_run_query_pandas(query, **kwargs):
    """Run a query on the module-level `session` and return a DataFrame.

    `kwargs` are forwarded to `session.run` as query parameters. Each record
    of the result becomes one row; the result's keys become the column names.
    """
    cursor = session.run(query, **kwargs)
    rows = [record.values() for record in cursor]
    return pd.DataFrame(rows, columns=cursor.keys())

In [13]:
def my_neo4j_number_nodes_relationships():
    """Print the node and relationship totals, then display the relationships.

    Both totals are row counts of query results: one row per node for the
    first query, one row per directed relationship for the second. The
    relationship rows are also shown with `display`.
    """

    node_query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """
    relationship_query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """

    number_nodes = my_neo4j_run_query_pandas(node_query).shape[0]

    relationship_df = my_neo4j_run_query_pandas(relationship_query)
    number_relationships = relationship_df.shape[0]

    divider = "-------------------------"
    print(divider)
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print(divider)

    display(relationship_df)

In [14]:
def my_neo4j_create_school_node(school_name):
    """Create one node labelled School whose `name` property is `school_name`.

    Uses CREATE, so every call adds a new node; avoiding duplicates is left
    to the caller.
    """

    cypher = """
    
    CREATE (:School {name: $school_name})
    
    """

    bindings = {"school_name": school_name}
    session.run(cypher, **bindings)
    

In [15]:
def my_neo4j_create_golf_course_node(golf_course_name):
    """Create one node labelled Golf_Course whose `name` is `golf_course_name`.

    Uses CREATE, so every call adds a new node; avoiding duplicates is left
    to the caller.
    """

    cypher = """
    
    CREATE (:Golf_Course {name: $golf_course_name})
    
    """

    bindings = {"golf_course_name": golf_course_name}
    session.run(cypher, **bindings)

In [16]:
def my_neo4j_create_relationship_one_way(from_school, to_golf_course, weight):
    """Create a weighted LINK relationship from a School to a Golf_Course.

    Both nodes are looked up by their `name` property. The relationship
    points school -> golf course and stores `weight` as its `weight` property.
    """

    cypher = """
    
    MATCH (from:School), 
          (to:Golf_Course)
    WHERE from.name = $from_school and to.name = $to_golf_course
    CREATE (from)-[:LINK {weight: $weight}]->(to)
    
    """

    bindings = {
        "from_school": from_school,
        "to_golf_course": to_golf_course,
        "weight": weight,
    }
    session.run(cypher, **bindings)

In [17]:
def my_neo4j_create_relationship_two_way(from_school, to_golf_course, weight):
    """Create weighted LINK relationships in both directions.

    Both nodes are looked up by their `name` property. One relationship goes
    school -> golf course and one goes golf course -> school; each stores the
    same `weight` property.
    """

    cypher = """
    
    MATCH (from:School), 
          (to:Golf_Course)
    WHERE from.name = $from_school and to.name = $to_golf_course
    CREATE (from)-[:LINK {weight: $weight}]->(to),
           (to)-[:LINK {weight: $weight}]->(from)
    
    """

    bindings = {
        "from_school": from_school,
        "to_golf_course": to_golf_course,
        "weight": weight,
    }
    session.run(cypher, **bindings)

In [18]:
# PostgreSQL connection. Each setting can be overridden with the standard
# libpq environment variable (PGUSER, PGPASSWORD, PGHOST, PGPORT, PGDATABASE),
# so credentials need not be edited into the notebook; the fallbacks are the
# values this notebook was originally written with.
connection = psycopg2.connect(
    user = os.environ.get("PGUSER", "postgres"),
    password = os.environ.get("PGPASSWORD", "ucb"),
    host = os.environ.get("PGHOST", "postgres"),
    port = os.environ.get("PGPORT", "5432"),
    database = os.environ.get("PGDATABASE", "postgres")
)

In [19]:
# Open a cursor on the PostgreSQL connection.
# NOTE(review): `cursor` is not referenced by any other cell visible in this
# export - confirm whether the PostgreSQL connection is still needed.
cursor = connection.cursor()

# Creating graph of golf courses with nearest schools

In [20]:
# neo4j setup
# The URI and credentials can be overridden with NEO4J_URI, NEO4J_USERNAME and
# NEO4J_PASSWORD; the fallbacks are the values this notebook was originally
# written with.
neo4j_uri = os.environ.get("NEO4J_URI", "neo4j://neo4j:7687")
neo4j_auth = (
    os.environ.get("NEO4J_USERNAME", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "ucb_mids_w205"),
)
driver = neo4j.GraphDatabase.driver(uri=neo4j_uri, auth=neo4j_auth)
# `session` is the module-level handle used by every my_neo4j_* helper
session = driver.session(database="neo4j")

# Start from an empty graph so that re-running the notebook does not add
# duplicate nodes and relationships
my_neo4j_wipe_out_database()

In [21]:
# Check that the graph is empty: immediately after the wipe, both the node
# count and the relationship count printed below should be 0

my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 0
  Relationships: 0
-------------------------


Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels


In [22]:
### Populate the graph: one node per distinct name, two LINK edges per data row

# Names that already have a node; checked before each create to avoid duplicates
school_nodes = []
golf_course_nodes = []

for index, row in distance_with_scores_df.iterrows():
    course_name = row['golf_course']
    school_name = row['school']

    # First time a name is seen: create its node and remember the name
    if course_name not in golf_course_nodes:
        my_neo4j_create_golf_course_node(course_name)
        golf_course_nodes.append(course_name)
    if school_name not in school_nodes:
        my_neo4j_create_school_node(school_name)
        school_nodes.append(school_name)

    # The similarity score is the edge weight; links are created in both
    # directions between the school and the golf course
    weight = row['score']
    my_neo4j_create_relationship_two_way(school_name, course_name, weight)

In [23]:
## Sanity check: distinct names in the data vs. names tracked while creating nodes

num_golf_courses = len(distance_df['golf_course'].unique())
num_schools = len(distance_df['school'].unique())

print("Number of golf courses:", num_golf_courses)
print("Number of golf course nodes: ", len(golf_course_nodes))
print("Number of schools:", num_schools)
print("Number of school nodes: ", len(school_nodes))

Number of golf courses: 75
Number of golf course nodes:  75
Number of schools: 361
Number of school nodes:  361


In [24]:
## View nodes and relationships: prints the node and relationship totals and
## displays one row per directed relationship

my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 436
  Relationships: 1610
-------------------------


Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels
0,AMG School of Licensed Practical Nursing - Bro...,[School],LINK,Bethpage Black Course,[Golf_Course]
1,AMG School of Licensed Practical Nursing - Bro...,[School],LINK,Trump Golf Links At Ferry Point,[Golf_Course]
2,ATA Career Education - Spring Hill,[School],LINK,Arnold Palmer’S Bay Hill Club (Champion),[Golf_Course]
3,ATA Career Education - Spring Hill,[School],LINK,Innisbrook Resort (Copperhead),[Golf_Course]
4,ATA Career Education - Spring Hill,[School],LINK,Streamsong Resort (Blue),[Golf_Course]
...,...,...,...,...,...
1605,Woodruff Medical Training and Testing - Corpor...,[School],LINK,Reynolds Lake Oconee (Great Waters),[Golf_Course]
1606,Woodruff Medical Training and Testing - Corpor...,[School],LINK,Reynolds Lake Oconee (Oconee),[Golf_Course]
1607,Wynn Las Vegas,[Golf_Course],LINK,Aveda Institute - Las Vegas,[School]
1608,Wynn Las Vegas,[Golf_Course],LINK,Casal Institute of Nevada - Las Vegas,[School]


# Degree centrality based on school scores

In [25]:
# Set up degree centrality

# (Re)build the in-memory GDS projection 'ds_graph'. Each result is consumed
# right away so that a failure is raised in this cell rather than by a later
# query, and so that the cell does not end on a stray Result repr.
setup_queries = [
    # drop a projection left over from a previous run; the `false` argument
    # means "do not fail if it does not exist"
    "CALL gds.graph.drop('ds_graph', false)",
    # list of nodes to include both types in calculation
    "CALL gds.graph.project('ds_graph', ['Golf_Course', 'School'], 'LINK', {relationshipProperties: 'weight'})",
]
for query in setup_queries:
    session.run(query).consume()

<neo4j._sync.work.result.Result at 0x7fe18ce561c0>

In [26]:
# Weighted degree centrality over 'ds_graph'; preview the 20 highest-ranked
# golf courses (highest degree first, ties broken by name)

# The WHERE clause keeps only nodes carrying the Golf_Course label
query = """

CALL gds.degree.stream('ds_graph', {relationshipWeightProperty: 'weight'})
YIELD nodeId, score
WHERE gds.util.asNode(nodeId):Golf_Course 
RETURN gds.util.asNode(nodeId).name AS name, score as degree
ORDER BY degree DESC, name

"""

centrality_df = my_neo4j_run_query_pandas(query)
display(centrality_df.head(20))

Unnamed: 0,name,degree
0,Trump Golf Links At Ferry Point,22.883333
1,Bulle Rock,18.083333
2,Cog Hill (No. 4),15.25
3,Atlantic City Country Club,15.166667
4,The Glen Club,14.25
5,Bethpage Black Course,14.216667
6,Pelican Hill Golf Club (Ocean South),12.466667
7,Arnold Palmer’S Bay Hill Club (Champion),12.264286
8,Reynolds Lake Oconee (Great Waters),12.083333
9,Reynolds Lake Oconee (Oconee),12.083333


In [30]:
# Export all golf courses and degrees to CSV (the full centrality_df, not only
# the 20 rows previewed); index=False leaves the row index out of the file

centrality_df.to_csv('../data/degree_centrality.csv', index=False)