In [19]:
from graphframes import GraphFrame

In [47]:
# Locations of the two CSV inputs: one file lists the vertexes, the other the edges
inputPathVertexes, inputPathEdges = (
    "data/Ex55/data/vertexes.csv",
    "data/Ex55/data/edges.csv",
)
# Folder the final result is written to
outputPath = "resOut_ex55/"

In [48]:
# Load vertexes.csv into a DataFrame: the first line of the file supplies the
# column names and the column types are inferred from the data
vDF = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load(inputPathVertexes)
)

In [49]:
# Check what was loaded: print the column names/types, then a preview of the rows
vDF.printSchema()
vDF.show()

root
 |-- id: string (nullable = true)
 |-- entityName: string (nullable = true)
 |-- name: string (nullable = true)

+---+----------+--------+
| id|entityName|    name|
+---+----------+--------+
| V1|      user|   Paolo|
| V2|     topic|     SQL|
| V3|      user|   David|
| V4|     topic|Big Data|
| V5|      user|    John|
+---+----------+--------+



In [50]:
# Load edges.csv into a DataFrame: the first line of the file supplies the
# column names and the column types are inferred from the data
eDF = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load(inputPathEdges)
)

In [51]:
# Check what was loaded: print the column names/types, then a preview of the rows
eDF.printSchema()
eDF.show()

root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- linktype: string (nullable = true)

+---+---+----------+
|src|dst|  linktype|
+---+---+----------+
| V1| V2|      like|
| V1| V3|    follow|
| V1| V4|    follow|
| V3| V2|      like|
| V3| V4|      like|
| V5| V2|  expertOf|
| V2| V4|correlated|
| V4| V2|correlated|
+---+---+----------+



In [65]:
# Keep only the edges whose linktype is 'follow'; the other link types are
# dropped here so that they never enter the graph built from this DataFrame
filteredEdfes = eDF.where("linktype='follow' ")

In [66]:
# Build the graph from the vertex DataFrame and the 'follow'-only edge DataFrame
g = GraphFrame(v=vDF, e=filteredEdfes)

In [67]:
# Motif search: every pair (v1, v2) such that an edge goes from v1 to v2.
# The edge is left unnamed ("[]"), so the result only has the v1 and v2 columns.
pathsDF = g.find("(v1)-[]->(v2)")

In [68]:
# Inspect the matched pairs: each of v1 and v2 is a struct holding a vertex's columns
pathsDF.printSchema()
pathsDF.show()

root
 |-- v1: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- v2: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)

+-----------------+--------------------+
|               v1|                  v2|
+-----------------+--------------------+
|[V1, user, Paolo]|   [V3, user, David]|
|[V1, user, Paolo]|[V4, topic, Big D...|
|[V3, user, David]|    [V2, topic, SQL]|
|[V3, user, David]|[V4, topic, Big D...|
+-----------------+--------------------+



In [69]:
# Keep the pairs whose source vertex is a user and whose destination is a topic
selectedPathsDF = pathsDF.where(
    "v1.entityName='user' AND v2.entityName='topic' "
)

In [70]:
# Inspect the (user, topic) pairs that passed the entityName filter
selectedPathsDF.printSchema()
selectedPathsDF.show()

root
 |-- v1: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- v2: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)

+-----------------+--------------------+
|               v1|                  v2|
+-----------------+--------------------+
|[V1, user, Paolo]|[V4, topic, Big D...|
|[V3, user, David]|    [V2, topic, SQL]|
|[V3, user, David]|[V4, topic, Big D...|
+-----------------+--------------------+



In [74]:
# Project each pair onto two flat columns: the user's name and the topic's name
userTopicDF = selectedPathsDF.selectExpr(
    [
        "v1.name as username",
        "v2.name as topic",
    ]
)

In [75]:
# Display the resulting (username, topic) rows
userTopicDF.show()

+--------+--------+
|username|   topic|
+--------+--------+
|   Paolo|Big Data|
|   David|     SQL|
|   David|Big Data|
+--------+--------+



In [73]:
# Save the result in the output folder as CSV with a header line.
# mode="overwrite" replaces whatever a previous run left in outputPath; with the
# default mode the write raises an error when the folder already exists, so the
# notebook could not be re-run from top to bottom.
userTopicDF.write.csv(outputPath, mode="overwrite", header=True)