In [10]:
from graphframes import GraphFrame

In [11]:
# Input CSV files holding the vertices and the edges of the graph
inputPathVertexes = "data/Ex57/data/vertexes.csv"
inputPathEdges = "data/Ex57/data/edges.csv"
# Folder in which the final result is saved
outputPath = "resOut_ex57/"

In [12]:
# Load vertexes.csv into a DataFrame: the file is read as CSV with a header
# line, and the column types are inferred from the data
vDF = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load(inputPathVertexes)
)

In [13]:
# Inspect the vertex DataFrame: print its schema, then display its rows
vDF.printSchema()
vDF.show()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)

+---+-----+---+
| id| name|age|
+---+-----+---+
| u1|Alice| 34|
| u2|  Bob| 36|
| u3| John| 30|
| u4|David| 29|
| u5| Paul| 32|
| u6| Adel| 36|
| u7| Eddy| 60|
+---+-----+---+



In [14]:
# Load edges.csv into a DataFrame: the file is read as CSV with a header
# line, and the column types are inferred from the data
eDF = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load(inputPathEdges)
)

In [15]:
# Inspect the edge DataFrame: print its schema, then display its rows
eDF.printSchema()
eDF.show()

root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- linktype: string (nullable = true)

+---+---+--------+
|src|dst|linktype|
+---+---+--------+
| u1| u2|  friend|
| u1| u4|  friend|
| u1| u5|  friend|
| u2| u1|  friend|
| u2| u3|  follow|
| u3| u2|  follow|
| u4| u1|  friend|
| u4| u5|  friend|
| u5| u1|  friend|
| u5| u4|  friend|
| u5| u6|  follow|
| u6| u3|  follow|
+---+---+--------+



In [17]:
# Build the input graph: vDF supplies the vertices and eDF the edges
g = GraphFrame(v=vDF, e=eDF)

In [18]:
# For every vertex, compute the length of the shortest path to the landmark
# vertex u1. The result keeps the vertex columns and adds a `distances` map
# (landmark id -> number of hops).
shortPathsLengDF = g.shortestPaths(landmarks=["u1"])

In [20]:
# Inspect the shortest-path result: print its schema, then display its rows
shortPathsLengDF.printSchema()
shortPathsLengDF.show()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- distances: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = false)

+---+-----+---+---------+
| id| name|age|distances|
+---+-----+---+---------+
| u6| Adel| 36|[u1 -> 3]|
| u3| John| 30|[u1 -> 2]|
| u2|  Bob| 36|[u1 -> 1]|
| u4|David| 29|[u1 -> 1]|
| u5| Paul| 32|[u1 -> 1]|
| u1|Alice| 34|[u1 -> 0]|
| u7| Eddy| 60|       []|
+---+-----+---+---------+



In [23]:
# Keep only the users who can reach u1 in less than 3 "hops", leaving out
# u1 itself (its distance to u1 is 0)
selectedUsersDF = shortPathsLengDF.where("distances['u1']<3 AND id<>'u1' ")

In [24]:
# Inspect the selected users: print the schema, then display the rows
selectedUsersDF.printSchema()
selectedUsersDF.show()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- distances: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = false)

+---+-----+---+---------+
| id| name|age|distances|
+---+-----+---+---------+
| u3| John| 30|[u1 -> 2]|
| u2|  Bob| 36|[u1 -> 1]|
| u4|David| 29|[u1 -> 1]|
| u5| Paul| 32|[u1 -> 1]|
+---+-----+---+---------+



In [25]:
# Project the selected users onto two columns: the user name and the number
# of hops to u1 (taken from the `distances` map), exposed as numHops
usersNameNumHopsDF = selectedUsersDF.selectExpr(
    "name",
    "distances['u1'] AS numHops",
)

In [26]:
# Inspect the final (name, numHops) DataFrame: print its schema, then display its rows
usersNameNumHopsDF.printSchema()
usersNameNumHopsDF.show()

root
 |-- name: string (nullable = true)
 |-- numHops: integer (nullable = true)

+-----+-------+
| name|numHops|
+-----+-------+
| John|      2|
|  Bob|      1|
|David|      1|
| Paul|      1|
+-----+-------+



In [28]:
# Save the result in the output folder as CSV files with a header line.
# mode="overwrite" replaces the output of a previous run: with the default
# save mode the write raises an error when the folder already exists, so the
# notebook could not be re-run from top to bottom.
usersNameNumHopsDF.write.csv(outputPath, header=True, mode="overwrite")