Connected Components with GraphFrames in PySpark

In [2]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import Row
from graphframes import *
import timeit

In [3]:
# Start the wall-clock timer for the whole pipeline; the final cell reads `start`.
start = timeit.default_timer()
input_path = 'p2p-Gnutella09.txt'


def is_edge_line(line):
    """Return True for data lines, False for blank lines and '#' comment lines.

    The previous predicate indexed ``line[0]``, which raises IndexError on an
    empty line (for example a trailing blank line in the input file).
    """
    stripped = line.strip()
    return bool(stripped) and not stripped.startswith("#")


records = sc.textFile(input_path)
# Keep only edge rows. The filtered RDD is reused by several later actions
# (the count here, then the edge and vertex construction), so cache it
# instead of re-reading and re-filtering the file each time.
records_txt = records.filter(is_edge_line).cache()
records_txt.count()

26025

In [4]:
# Build the edge list: every surviving input line is split on the tab
# character into a [src, dst] pair, then the pairs become a two-column DataFrame.
temp_var = records_txt.map(lambda line: line.split("\t"))
edge_columns = ["src", "dst"]
edges = temp_var.toDF(edge_columns)

# Sanity check: report the number of edges, then preview the first five.
print(edges.count())
edges.show(5)

26025
+---+---+
|src|dst|
+---+---+
|  0|  1|
|  0|  2|
|  0|  3|
|  0|  4|
|  0|  5|
+---+---+
only showing top 5 rows



In [5]:
# Every distinct endpoint that appears on any edge line becomes a vertex.
endpoints = records_txt.flatMap(lambda line: line.split("\t"))
temp_var = endpoints.distinct()
print(temp_var.count())

# Row("id") is a Row factory with a single field named "id"; mapping it over
# the ids produces one-column rows, from which toDF() builds the DataFrame.
row = Row("id")
vertex_rows = temp_var.map(row)
vertices = vertex_rows.toDF()
vertices.show(5)

8127
+---+
| id|
+---+
|  0|
|  1|
|  4|
|  8|
|  9|
+---+
only showing top 5 rows



In [6]:
# Combine the vertex and edge DataFrames into a single graph object and
# print its summary (vertex and edge column names with their types).
g = GraphFrame(v=vertices, e=edges)
print(g)

GraphFrame(v:[id: string], e:[src: string, dst: string])


In [7]:
# A checkpoint directory is configured before running connected components;
# the GraphFrames documentation states the default algorithm requires one.
checkpoint_dir = "graphframes_cps"
sc.setCheckpointDir(checkpoint_dir)

# Label every vertex with the id of the component it belongs to, then
# preview the first 20 rows (the default for show()).
result = g.connectedComponents()
result.show(20)

+----+---------+
|  id|component|
+----+---------+
|   0|        0|
|   1|        0|
|   4|        0|
|   8|        0|
|   9|        0|
|  10|        0|
| 540|        0|
| 581|        0|
|1009|        0|
|2044|        0|
|2045|        0|
|2046|        0|
|2047|        0|
|2048|        0|
| 671|        0|
|1562|        0|
|1572|        0|
|2004|        0|
|2035|        0|
|2036|        0|
+----+---------+
only showing top 20 rows



In [8]:
# Count how many vertices fall into each connected component.
# This aggregate feeds two separate actions in the following cells (the
# largest and the smallest component lookups), so cache it to avoid
# recomputing the aggregation for each of them.
grouped = result.groupby('component').count().cache()

In [9]:
# Largest component: order the per-component sizes descending and keep the
# first row. No secondary sort key is given, so ties are not broken explicitly.
by_size_desc = grouped.orderBy(grouped["count"].desc())
largest = by_size_desc.take(1)
print(largest)

[Row(component=0, count=8104)]


In [10]:
# Smallest component: order the per-component sizes ascending (the default)
# and keep the first row. No secondary sort key is given, so when several
# components share the minimum size the one returned is not pinned down.
by_size_asc = grouped.orderBy("count")
smallest = by_size_asc.take(1)
print(smallest)

[Row(component=1228360646692, count=1)]


In [11]:
# Stop the timer that was started before loading the data and report the
# elapsed wall-clock time in seconds.
stop = timeit.default_timer()
elapsed = stop - start
print('Time: ', elapsed)

Time:  1101.8554163
