# Project 3 - Graph Algorithm Implementation

University of California, Berkeley

Master of Information and Data Science (MIDS) program

w205 - Fundamentals of Data Engineering


# Implementation of Page Rank Graph Algorithm

PageRank computed from a single source node shows what is important from the perspective of a specific user, which makes it useful for targeting recommendations to that user.

Use the PageRank graph algorithm to determine which other stations are most influential from the perspective of a given station.



In [1]:
import neo4j

import csv

import math
import numpy as np
import pandas as pd

import psycopg2

In [2]:
# Create the Neo4j driver for the server at neo4j://neo4j:7687, authenticating
# with a (user, password) tuple.
# NOTE(review): the credentials are hard-coded here; consider loading them from
# the environment or a config file if this notebook is shared.
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","w205"))

In [3]:
# Open a session on the "neo4j" database. This module-level session is the one
# my_neo4j_run_query_pandas and the query cells below call .run() on.
session = driver.session(database="neo4j")

In [4]:
def my_neo4j_run_query_pandas(query, **kwargs):
    """Run a query on the module-level session and return a pandas DataFrame.

    Args:
        query: Query text passed to ``session.run``.
        **kwargs: Forwarded unchanged to ``session.run`` (the cells in this
            notebook use them to supply query parameters).

    Returns:
        A DataFrame with one row per result record; the column names are
        taken from ``result.keys()``.
    """
    result = session.run(query, **kwargs)

    # Materialize the records first, then read the column names from the
    # same result object.
    rows = [record.values() for record in result]
    column_names = result.keys()

    return pd.DataFrame(rows, columns=column_names)

In [5]:
def my_neo4j_nodes_relationships():
    """Print all the nodes and relationships, followed by the graph density.

    Runs one query for every node (name and labels) and one for every
    directed relationship (endpoint names and labels plus the relationship
    type), displays each result as a DataFrame, and prints the density
    computed as ``2 * relationships / (nodes * (nodes - 1))``.

    When the graph has fewer than two nodes the denominator is zero, so the
    density is reported as 0.0 instead of raising ``ZeroDivisionError``.

    Returns:
        None. All output goes to ``print`` / ``display``.
    """

    print("-------------------------")
    print("  Nodes:")
    print("-------------------------")

    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """

    df = my_neo4j_run_query_pandas(query)

    # One row per node, so the row count is the node count.
    number_nodes = df.shape[0]

    display(df)

    print("-------------------------")
    print("  Relationships:")
    print("-------------------------")

    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """

    df = my_neo4j_run_query_pandas(query)

    # One row per matched relationship.
    number_relationships = df.shape[0]

    display(df)

    # Number of ordered node pairs; zero when there are 0 or 1 nodes.
    possible_pairs = number_nodes * (number_nodes - 1)

    # NOTE(review): the factor of 2 is the undirected-graph density formula,
    # while the query above matches directed relationships - confirm which
    # definition is intended.
    if possible_pairs > 0:
        density = (2 * number_relationships) / possible_pairs
    else:
        # An empty or single-node graph has no possible relationships;
        # report 0.0 rather than dividing by zero.
        density = 0.0

    print("-------------------------")
    print("  Density:", f'{density:.1f}')
    print("-------------------------")
    

In [6]:
# The BART system looks connected, so the GDS PageRank algorithm is used below
# to rank the stations and show the result in tabular form.
#
# First drop any existing 'ds_graph' projection (the `false` argument is
# presumably failIfMissing, so a missing graph is not an error - confirm against
# the GDS docs), then project 'Station' nodes and 'LINK' relationships into a
# new 'ds_graph', carrying the 'weight' relationship property.
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
session.run(query)

<neo4j.work.result.Result at 0x7f368c58fdc0>

In [10]:
# PageRank query seeded from a single source station ($source). It streams a
# score for every node in the 'ds_graph' projection and returns station name and
# score, highest score first (ties broken by name).
# NOTE(review): the projection carries a 'weight' property, but this call does
# not pass relationshipWeightProperty, so the ranking is presumably unweighted -
# confirm that this is intended.
query = """

    MATCH (source:Station {name: $source})
    CALL gds.pageRank.stream(
        'ds_graph', 
        {
            maxIterations: $max_iterations,
            dampingFactor: $damping_factor,
            sourceNodes: [source]
        }
    )
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).name AS name, score as page_rank
    ORDER BY score DESC, name ASC

"""

source = "depart Downtown Berkeley"
max_iterations = 20
damping_factor = 0.80

df = my_neo4j_run_query_pandas(query, source=source, max_iterations=max_iterations, damping_factor=damping_factor)

# Keep only stations with a positive score. The explicit .copy() makes the
# filtered frame independent, so the column assignment below does not write
# into a slice of the query result (avoids pandas' SettingWithCopyWarning).
df = df[df['page_rank'] > 0].copy()
df['page_rank'] = df['page_rank'].round(3)
df.head(10)

Unnamed: 0,name,page_rank
0,depart Downtown Berkeley,0.2
1,orange Downtown Berkeley,0.115
2,red Downtown Berkeley,0.115
3,arrive Downtown Berkeley,0.046
4,orange North Berkeley,0.031
5,red North Berkeley,0.031
6,orange Ashby,0.03
7,red Ashby,0.03
8,arrive North Berkeley,0.012
9,arrive Ashby,0.012


In [9]:
# Re-run the PageRank query (the `query` string defined in the PageRank cell)
# with Richmond as the source station.
source = "depart Richmond"
max_iterations = 20
damping_factor = 0.80

df = my_neo4j_run_query_pandas(query, source=source, max_iterations=max_iterations, damping_factor=damping_factor)

# Keep only stations with a positive score. The explicit .copy() makes the
# filtered frame independent, so the column assignment below does not write
# into a slice of the query result (avoids pandas' SettingWithCopyWarning).
df = df[df['page_rank'] > 0].copy()
df['page_rank'] = df['page_rank'].round(3)
df.head(10)

Unnamed: 0,name,page_rank
0,depart Richmond,0.2
1,orange Richmond,0.121
2,red Richmond,0.121
3,arrive Richmond,0.064
4,orange El Cerrito del Norte,0.043
5,red El Cerrito del Norte,0.043
6,arrive El Cerrito del Norte,0.017
7,orange El Cerrito Plaza,0.012
8,red El Cerrito Plaza,0.012
9,arrive El Cerrito Plaza,0.005
