# Chapter 4 | Pathfinding and Graph Search Algorithms

In [1]:
import os.path as op
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from graphframes import *

# Obtain the notebook's Spark session via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [11]:
## Pull
!wget -P ../../data/ https://resources.oreilly.com/examples/0636920233145/raw/master/data/transport-nodes.csv
!wget -P ../../data/ https://resources.oreilly.com/examples/0636920233145/raw/master/data/transport-relationships.csv

--2021-05-23 10:11:30--  https://resources.oreilly.com/examples/0636920233145/raw/master/data/transport-nodes.csv
Resolving resources.oreilly.com (resources.oreilly.com)... 199.27.144.213, 199.27.144.212
Connecting to resources.oreilly.com (resources.oreilly.com)|199.27.144.213|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 465 [text/plain]
Saving to: ‘../../data/transport-nodes.csv’


2021-05-23 10:11:31 (26.3 MB/s) - ‘../../data/transport-nodes.csv’ saved [465/465]

--2021-05-23 10:11:31--  https://resources.oreilly.com/examples/0636920233145/raw/master/data/transport-relationships.csv
Resolving resources.oreilly.com (resources.oreilly.com)... 199.27.144.212, 199.27.144.213
Connecting to resources.oreilly.com (resources.oreilly.com)|199.27.144.212|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 550 [text/plain]
Saving to: ‘../../data/transport-relationships.csv’


2021-05-23 10:11:31 (27.1 MB/s) - ‘../../data/transport-relationships.csv’ saved [550/550]

In [2]:
# Directory holding the transport CSVs, relative to this notebook, and the
# full paths of the two files read by create_transport_graph().
data_path = "../../data/"
node_fname = op.join(data_path, "transport-nodes.csv")
rels_fname = op.join(data_path, "transport-relationships.csv")

In [3]:
def create_transport_graph():
    """Load the transport CSVs and return them as a GraphFrame.

    Vertices are read from ``node_fname`` with an explicit schema (string
    ``id``, float ``latitude``/``longitude``, integer ``population``).
    Edges are read from ``rels_fname`` using the header row for column
    names; no schema is supplied for the edge file. Each edge is then
    duplicated with ``src`` and ``dst`` swapped, so every relationship is
    present in both directions.

    Returns:
        GraphFrame: built from the node DataFrame and the union of the
        original and the reversed relationship DataFrames.
    """
    node_schema = StructType([
        StructField("id", StringType(), True),
        StructField("latitude", FloatType(), True),
        StructField("longitude", FloatType(), True),
        StructField("population", IntegerType(), True),
    ])
    nodes = spark.read.csv(node_fname, header=True, schema=node_schema)

    forward = spark.read.csv(rels_fname, header=True)
    # Swap the endpoints in a single projection. union() matches columns by
    # position, so the projection keeps the src, dst, relationship, cost order.
    backward = forward.select(
        forward["dst"].alias("src"),
        forward["src"].alias("dst"),
        forward["relationship"],
        forward["cost"],
    )
    return GraphFrame(nodes, forward.union(backward))


In [4]:
# Build the transport GraphFrame (edges in both directions) that the
# following cells query.
g = create_transport_graph()

# Breadth First Search with Apache Spark

In [6]:
# List the vertices whose population is strictly between 100,000 and
# 300,000, ordered by ascending population.
(g.vertices
    .where("population > 100000 and population < 300000")
    .orderBy("population")
    .show())

+----------+--------+---------+----------+
|        id|latitude|longitude|population|
+----------+--------+---------+----------+
|Colchester|51.88921|  0.90421|    104390|
|   Ipswich|52.05917|  1.15545|    133384|
+----------+--------+---------+----------+



In [7]:
# Start vertex: Den Haag.
from_expr = "id='Den Haag'"
# Target vertices: any other place with a population strictly between
# 100,000 and 300,000.
to_expr = "population > 100000 and population < 300000 and id <> 'Den Haag'"

# Run the breadth-first search from the start expression to the target expression.
result = g.bfs(fromExpr=from_expr, toExpr=to_expr)

# Print the column names of the BFS result.
print(result.columns)

['from', 'e0', 'v1', 'e1', 'v2', 'e2', 'to']
