## Demo GraphFrames con PySpark



Autor: [Sebastian Ruiz Martinez](https://www.linkedin.com/in/sebastianruizmartinez/)

### Dataset

Referencia: [SF Bay Area Bike Share](https://www.kaggle.com/benhamner/sf-bay-area-bike-share)

In [0]:
# Trip CSV from the Databricks sample-datasets mount.
path = '/databricks-datasets/definitive-guide/data/bike-data/201508_trip_data.csv'

# No schema is passed and schema inference is left off, so every column is read as a string.
# Only the trip id and the station/terminal columns are kept for building the graph.
df_trips = (
    spark.read
    .option('header', True)
    .csv(path)
    .select('Trip ID', 'Start Station', 'Start Terminal', 'End Station', 'End Terminal')
)

# Expose the frame to the %sql cells under the name `trips`.
df_trips.createOrReplaceTempView('trips')

In [0]:
%sql
-- Peek at a handful of rows from the `trips` temp view.
SELECT *
FROM trips
LIMIT 5

Trip ID,Start Station,Start Terminal,End Station,End Terminal
913460,Harry Bridges Plaza (Ferry Building),50,San Francisco Caltrain (Townsend at 4th),70
913459,San Antonio Shopping Center,31,Mountain View City Hall,27
913455,Post at Kearny,47,2nd at South Park,64
913454,San Jose City Hall,10,San Salvador at 1st,8
913453,Embarcadero at Folsom,51,Embarcadero at Sansome,60


### Edges

In [0]:
# Build the edge list: one row per trip, with the endpoints in `src` / `dst`
# (the column names the graph is queried with below) and the trip id kept as an edge attribute.
edges = df_trips.select(
    df_trips['Start Terminal'].alias('src'),
    df_trips['End Terminal'].alias('dst'),
    df_trips['Trip ID'].alias('tripid'),
)

display(edges.limit(5))

src,dst,tripid
50,70,913460
31,27,913459
47,64,913455
10,8,913454
51,60,913453


### Vertices

In [0]:
# Collect every (terminal id, station name) pair seen at either end of a trip.
df_start_terminal = df_trips.selectExpr('`Start Terminal` AS id', '`Start Station` AS desc_terminal').distinct()
df_end_terminal = df_trips.selectExpr('`End Terminal` AS id', '`End Station` AS desc_terminal').distinct()

# A graph vertex must be identified by a unique `id`. distinct() on (id, name) pairs
# would keep a terminal twice if it appears under two different station names, which
# duplicates that vertex (and every triplet touching it). Collapse to one row per id,
# deterministically keeping the alphabetically-first name.
vertices = (
    df_start_terminal.union(df_end_terminal)
    .groupBy('id')
    .agg({'desc_terminal': 'min'})
    # The dict form of agg() names its output column `min(desc_terminal)`; restore the original name.
    .withColumnRenamed('min(desc_terminal)', 'desc_terminal')
)

display(vertices.limit(5))

id,desc_terminal
31,San Antonio Shopping Center
71,Powell at Post (Union Square)
10,San Jose City Hall
41,Clay at Battery
22,Redwood City Caltrain Station


### GraphFrames

In [0]:
# Import only the class that is used (instead of a wildcard) so it is clear where the name comes from.
from graphframes import GraphFrame

# Assemble the graph: `vertices` provides the `id` column, `edges` provides `src` / `dst`.
g = GraphFrame(vertices, edges)

In [0]:
# Preview a few vertices as exposed by the graph object.
display(g.vertices.limit(5))

id,desc_terminal
31,San Antonio Shopping Center
71,Powell at Post (Union Square)
10,San Jose City Hall
41,Clay at Battery
22,Redwood City Caltrain Station


In [0]:
# Preview a few edges as exposed by the graph object.
display(g.edges.limit(5))

src,dst,tripid
50,70,913460
31,27,913459
47,64,913455
10,8,913454
51,60,913453


### Degrees

<img border="0" alt="Degrees" src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/77/DirectedDegrees.svg/390px-DirectedDegrees.svg.png" width="400" height="300">

(inDegrees, outDegrees)

### inDegrees 
Número de edges que ingresan a un vertex

In [0]:
%sql
-- Count trips that end at terminal 70, for comparison with the graph's in-degree of that vertex.
SELECT count(*)
FROM trips
WHERE `End Terminal` = '70'

count(1)
34810


In [0]:
# `id` is a string column (the CSV was read without a schema), so compare against a
# string literal rather than relying on an implicit string-to-number cast.
display(g.inDegrees.filter("id = '70'"))

id,inDegree
70,34810


### outDegrees 
Número de edges que salen de un vertex

In [0]:
%sql
-- Count trips that start at terminal 70, for comparison with the graph's out-degree of that vertex.
SELECT count(*)
FROM trips
WHERE `Start Terminal` = '70'

count(1)
26304


In [0]:
# `id` is a string column (the CSV was read without a schema), so compare against a
# string literal rather than relying on an implicit string-to-number cast.
display(g.outDegrees.filter("id = '70'"))

id,outDegree
70,26304


### Triplets

<img border="0" alt="Triplets" src="https://spark.apache.org/docs/latest/img/triplet.png" width="400" height="50">

In [0]:
# Count the trips from terminal 31 to terminal 33. Counting on the DataFrame directly
# avoids converting every row to an RDD of Python objects just to count them.
g.triplets.filter("edge.src = '31' AND edge.dst = '33'").count()

In [0]:
# Show a few (src vertex, edge, dst vertex) triplets for trips from terminal 31 to terminal 33.
display(
    g.triplets
    .filter("edge.src = '31' AND edge.dst = '33'")
    .limit(5)
)

src,edge,dst
"List(31, San Antonio Shopping Center)","List(31, 33, 909889)","List(33, Rengstorff Avenue / California Street)"
"List(31, San Antonio Shopping Center)","List(31, 33, 904348)","List(33, Rengstorff Avenue / California Street)"
"List(31, San Antonio Shopping Center)","List(31, 33, 891372)","List(33, Rengstorff Avenue / California Street)"
"List(31, San Antonio Shopping Center)","List(31, 33, 872016)","List(33, Rengstorff Avenue / California Street)"
"List(31, San Antonio Shopping Center)","List(31, 33, 868145)","List(33, Rengstorff Avenue / California Street)"


### PageRank

<img border="0" alt="PageRank" src="https://upload.wikimedia.org/wikipedia/en/thumb/8/8b/PageRanks-Example.jpg/1024px-PageRanks-Example.jpg" width="400" height="300">

In [0]:
# Run PageRank for a fixed 10 iterations with a reset probability of 0.15.
results = g.pageRank(resetProbability=0.15, maxIter=10)

# Show the five vertices with the highest PageRank score.
display(
    results.vertices
    .orderBy('pagerank', ascending=False)
    .limit(5)
)

id,desc_terminal,pagerank
2,San Jose Diridon Caltrain Station,3.953111146508694
70,San Francisco Caltrain (Townsend at 4th),3.356879832961734
28,Mountain View Caltrain Station,2.453326956741646
22,Redwood City Caltrain Station,2.2698152011252226
69,San Francisco Caltrain 2 (330 Townsend),2.229142799484096
