# Integrating GraphFrames package

One option is to configure `PYSPARK_SUBMIT_ARGS`. With that all sessions will be initialized with the specified package.

In [1]:
import os

os.environ['PYSPARK_SUBMIT_ARGS'] = '--driver-memory 2g --packages graphframes:graphframes:0.8.1-spark3.0-s_2.12 pyspark-shell'

Better option is to provide a builder configuration option `spark.jars.packages`.

In [14]:
from pyspark import StorageLevel
from pyspark.sql import functions as F, SQLContext, SparkSession, Window
from pyspark.sql.types import *
from random import randint
import time
import datetime

spark = (SparkSession.builder
         .appName("graphframes")
         .master("spark://spark-master:7077")
         .config("spark.jars.packages", "graphframes:graphframes:0.8.1-spark3.0-s_2.12")
         .enableHiveSupport()
         .getOrCreate()
         )

In [10]:
import sys
sys.path

['/opt/workspace',
 '/tmp/spark-c1cc1732-44c6-41fb-a5cf-1ed1798160c6/userFiles-c40a5f68-ec40-40f0-be4e-603f65daaf61/org.slf4j_slf4j-api-1.7.16.jar',
 '/tmp/spark-c1cc1732-44c6-41fb-a5cf-1ed1798160c6/userFiles-c40a5f68-ec40-40f0-be4e-603f65daaf61/graphframes_graphframes-0.8.1-spark3.0-s_2.12.jar',
 '/tmp/spark-c1cc1732-44c6-41fb-a5cf-1ed1798160c6/userFiles-c40a5f68-ec40-40f0-be4e-603f65daaf61',
 '/usr/lib/python37.zip',
 '/usr/lib/python3.7',
 '/usr/lib/python3.7/lib-dynload',
 '',
 '/usr/local/lib/python3.7/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/usr/local/lib/python3.7/dist-packages/IPython/extensions',
 '/root/.ipython']

In [4]:
import graphframes

dir(graphframes.graphframe)

['Column',
 'DataFrame',
 'GraphFrame',
 'Pregel',
 'SQLContext',
 'SparkContext',
 'StorageLevel',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_from_java_gf',
 '_java_api',
 '_test',
 'basestring',
 'sys']

In [5]:
from pyspark.sql.functions import *

# Create a Vertex DataFrame with unique ID column "id"
v = spark.createDataFrame([
  ("a", "Alice", 34),
  ("b", "Bob", 36),
  ("c", "Charlie", 30),
], ["id", "name", "age"])
# Create an Edge DataFrame with "src" and "dst" columns
e = spark.createDataFrame([
  ("a", "b", "friend"),
  ("b", "c", "follow"),
  ("c", "b", "follow"),
], ["src", "dst", "relationship"])

In [6]:
from graphframes import *
g = GraphFrame(v, e)

In [7]:
g.inDegrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|  c|       1|
|  b|       2|
+---+--------+



In [8]:
g.edges.filter("relationship = 'follow'").count()

2

In [9]:
results = g.pageRank(resetProbability=0.01, maxIter=20)
results.vertices.select("id", "pagerank").show()

+---+------------------+
| id|          pagerank|
+---+------------------+
|  b|1.0905890109440908|
|  a|              0.01|
|  c|1.8994109890559092|
+---+------------------+



## Bike Rides

In [19]:
bikeStations = spark.read.option("header",True).csv("data/graphs/station.csv")
bikeStations.printSchema()
tripData = spark.read.option("header",True).csv("data/graphs/trip.csv")
tripData.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- long: string (nullable = true)
 |-- dock_count: string (nullable = true)
 |-- city: string (nullable = true)
 |-- installation_date: string (nullable = true)

root
 |-- id: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- start_station_id: string (nullable = true)
 |-- end_date: string (nullable = true)
 |-- end_station_name: string (nullable = true)
 |-- end_station_id: string (nullable = true)
 |-- bike_id: string (nullable = true)
 |-- subscription_type: string (nullable = true)
 |-- zip_code: string (nullable = true)



### Prepare Vertices

In [35]:
stationVertices = bikeStations.distinct()
stationVertices.show(truncate=False)

+---+---------------------------------+------------------+-------------------+----------+-------------+-----------------+
|id |name                             |lat               |long               |dock_count|city         |installation_date|
+---+---------------------------------+------------------+-------------------+----------+-------------+-----------------+
|67 |Market at 10th                   |37.776619000000004|-122.41738500000001|27        |San Francisco|8/23/2013        |
|10 |San Jose City Hall               |37.337391         |-121.886995        |15        |San Jose     |8/6/2013         |
|11 |MLK Library                      |37.335885         |-121.88566000000002|19        |San Jose     |8/6/2013         |
|34 |Palo Alto Caltrain Station       |37.443988         |-122.164759        |23        |Palo Alto    |8/14/2013        |
|42 |Davis at Jackson                 |37.79728          |-122.398436        |15        |San Francisco|8/19/2013        |
|32 |Castro Street and E

### Prepare Edges

In [46]:
tripEdges = tripData\
    .withColumnRenamed("start_station_name", "src")\
    .withColumnRenamed("end_station_name", "dst")

tripEdges.show(truncate=False)

+----+--------+---------------+------------------------------+----------------+---------------+------------------------------------+--------------+-------+-----------------+--------+
|id  |duration|start_date     |src                           |start_station_id|end_date       |dst                                 |end_station_id|bike_id|subscription_type|zip_code|
+----+--------+---------------+------------------------------+----------------+---------------+------------------------------------+--------------+-------+-----------------+--------+
|4576|63      |8/29/2013 14:13|South Van Ness at Market      |66              |8/29/2013 14:14|South Van Ness at Market            |66            |520    |Subscriber       |94127   |
|4607|70      |8/29/2013 14:42|San Jose City Hall            |10              |8/29/2013 14:43|San Jose City Hall                  |10            |661    |Subscriber       |95138   |
|4130|71      |8/29/2013 10:16|Mountain View City Hall       |27              |8/29/2

### Initialize the GraphFrame

In [25]:
stationGraph = GraphFrame(stationVertices, tripEdges)

### Simple Graph computations

In [52]:
print("Total Number of Stations: " + str(stationGraph.vertices.count()))
print("Total Number of Trips in Graph: " + str(stationGraph.edges.count()))
print("Total Number of Trips in Original Data: " + str(tripData.count()))

Total Number of Stations: 70
Total Number of Trips in Graph: 669959
Total Number of Trips in Original Data: 669959


### Most popular trips

In [37]:
topTrips = stationGraph\
    .edges\
    .groupBy("src", "dst")\
    .count()\
    .orderBy(desc("count"))\
    .limit(10)

topTrips.show(truncate=False)

+----------------------------------------+----------------------------------------+-----+
|src                                     |dst                                     |count|
+----------------------------------------+----------------------------------------+-----+
|San Francisco Caltrain 2 (330 Townsend) |Townsend at 7th                         |6216 |
|Harry Bridges Plaza (Ferry Building)    |Embarcadero at Sansome                  |6164 |
|Townsend at 7th                         |San Francisco Caltrain (Townsend at 4th)|5041 |
|2nd at Townsend                         |Harry Bridges Plaza (Ferry Building)    |4839 |
|Harry Bridges Plaza (Ferry Building)    |2nd at Townsend                         |4357 |
|Embarcadero at Sansome                  |Steuart at Market                       |4269 |
|Embarcadero at Folsom                   |San Francisco Caltrain (Townsend at 4th)|3967 |
|Steuart at Market                       |2nd at Townsend                         |3903 |
|2nd at So

### In Degree

In [44]:
inDeg = stationGraph.inDegrees
inDeg\
    .orderBy(desc("inDegree"))\
    .limit(5)\
    .show(truncate=False)

+----------------------------------------+--------+
|id                                      |inDegree|
+----------------------------------------+--------+
|San Francisco Caltrain (Townsend at 4th)|63179   |
|San Francisco Caltrain 2 (330 Townsend) |35117   |
|Harry Bridges Plaza (Ferry Building)    |33193   |
|Embarcadero at Sansome                  |30796   |
|2nd at Townsend                         |28529   |
+----------------------------------------+--------+



### Out Degree

In [43]:
outDeg = stationGraph.outDegrees
outDeg\
    .orderBy(desc("outDegree"))\
    .limit(5)\
    .show(truncate=False)

+---------------------------------------------+---------+
|id                                           |outDegree|
+---------------------------------------------+---------+
|San Francisco Caltrain (Townsend at 4th)     |49092    |
|San Francisco Caltrain 2 (330 Townsend)      |33742    |
|Harry Bridges Plaza (Ferry Building)         |32934    |
|Embarcadero at Sansome                       |27713    |
|Temporary Transbay Terminal (Howard at Beale)|26089    |
+---------------------------------------------+---------+



### Degree Ratio

In [49]:
degreeRatio = inDeg\
    .join(outDeg, inDeg["id"] == outDeg["id"])\
    .drop(outDeg["id"])\
    .selectExpr("id", "double(inDegree)/double(outDegree) as degreeRatio")

degreeRatio.cache()

DataFrame[id: string, degreeRatio: double]

#### Descending

In [48]:
degreeRatio\
    .orderBy(desc("degreeRatio"))\
    .limit(10)\
    .show(truncate=False)

+----------------------------------------+------------------+
|id                                      |degreeRatio       |
+----------------------------------------+------------------+
|Redwood City Medical Center             |1.4533762057877813|
|Redwood City Public Library             |1.300469483568075 |
|San Francisco Caltrain (Townsend at 4th)|1.286951030717836 |
|Washington at Kearny                    |1.2723671947809878|
|MLK Library                             |1.233038348082596 |
|SJSU 4th at San Carlos                  |1.2282051282051283|
|San Mateo County Center                 |1.2195121951219512|
|Broadway at Main                        |1.208955223880597 |
|University and Emerson                  |1.2056878306878307|
|Washington at Kearney                   |1.203804347826087 |
+----------------------------------------+------------------+



#### Ascending

In [47]:
degreeRatio\
    .orderBy(asc("degreeRatio"))\
    .limit(10)\
    .show(truncate=False)

+-------------------------------+------------------+
|id                             |degreeRatio       |
+-------------------------------+------------------+
|Grant Avenue at Columbus Avenue|0.564700110388814 |
|2nd at Folsom                  |0.6056461731493099|
|Powell at Post (Union Square)  |0.6887003841229193|
|San Jose City Hall             |0.6928541579607188|
|San Francisco City Hall        |0.7497243660418964|
|Beale at Market                |0.774906104780699 |
|Redwood City Caltrain Station  |0.8075933075933076|
|Golden Gate at Polk            |0.8153091800599291|
|Evelyn Park and Ride           |0.8218356328734253|
|Ryland Park                    |0.8248425872925015|
+-------------------------------+------------------+



In [8]:
spark.stop()