In [1]:
from graphframes import GraphFrame

In [2]:
# Input CSV files holding the edges and the vertexes of the graph
inputPathEdges = "data/Ex56/data/edges.csv"
inputPathVertexes = "data/Ex56/data/vertexes.csv"

# Folder in which the final result is stored
outputPath = "resOut_ex56/"

In [3]:
# Load vertexes.csv into a DataFrame: the first line of the file is the
# header and the column types are inferred from the data
vDF = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(inputPathVertexes)
)

In [4]:
# Inspect the vertex DataFrame: schema first, then its first rows
# (at most 20 rows, long cell values truncated)
vDF.printSchema()
vDF.show(n=20, truncate=True)

root
 |-- id: string (nullable = true)
 |-- entityName: string (nullable = true)
 |-- name: string (nullable = true)

+---+----------+--------+
| id|entityName|    name|
+---+----------+--------+
| V1|      user|   Paolo|
| V2|     topic|     SQL|
| V3|      user|   David|
| V4|     topic|Big Data|
| V5|      user|    John|
+---+----------+--------+



In [7]:
# Load edges.csv into a DataFrame: the first line of the file is the
# header and the column types are inferred from the data
eDF = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(inputPathEdges)
)

In [8]:
# Inspect the edge DataFrame: schema first, then its first rows
# (at most 20 rows, long cell values truncated)
eDF.printSchema()
eDF.show(n=20, truncate=True)

root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- linktype: string (nullable = true)

+---+---+----------+
|src|dst|  linktype|
+---+---+----------+
| V1| V2|      like|
| V1| V3|    follow|
| V1| V4|    follow|
| V3| V2|    follow|
| V3| V4|    follow|
| V5| V2|  expertOf|
| V2| V4|correlated|
| V4| V2|correlated|
+---+---+----------+



In [9]:
# The analysis only needs the "follow" and "correlated" links, so every
# other edge type is discarded before the graph is built
filteredEdfes = eDF.where("linktype='follow' OR linktype='correlated' ")

In [10]:
# Build the graph from the vertex DataFrame and the pre-filtered edges
g = GraphFrame(v=vDF, e=filteredEdfes)

In [13]:
# Motif matching every path made of two consecutive edges:
# v1 -[e1]-> v2 -[e2]-> v3
twoEdgePathMotif = "(v1)-[e1]->(v2);(v2)-[e2]->(v3)"
pathsDF = g.find(twoEdgePathMotif)

In [14]:
# Inspect the matched paths: one struct column per motif element
# (v1, e1, v2, e2, v3); at most 20 rows, long cell values truncated
pathsDF.printSchema()
pathsDF.show(n=20, truncate=True)

root
 |-- v1: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- e1: struct (nullable = false)
 |    |-- src: string (nullable = true)
 |    |-- dst: string (nullable = true)
 |    |-- linktype: string (nullable = true)
 |-- v2: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- e2: struct (nullable = false)
 |    |-- src: string (nullable = true)
 |    |-- dst: string (nullable = true)
 |    |-- linktype: string (nullable = true)
 |-- v3: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  v1|                  e1|                  v2|                 

In [18]:
# Keep only the paths of the form
#   user -[follow]-> topic -[correlated]-> topic named "Big Data"
bigDataFollowerCondition = """v1.entityName='user' 
AND e1.linktype='follow'
AND v2.entityName='topic'
AND e2.linktype='correlated'
AND v3.entityName='topic' AND v3.name='Big Data' """
selectedPathsDF = pathsDF.where(bigDataFollowerCondition)

In [19]:
# Inspect the paths that satisfy the selection
# (at most 20 rows, long cell values truncated)
selectedPathsDF.printSchema()
selectedPathsDF.show(n=20, truncate=True)

root
 |-- v1: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- e1: struct (nullable = false)
 |    |-- src: string (nullable = true)
 |    |-- dst: string (nullable = true)
 |    |-- linktype: string (nullable = true)
 |-- v2: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- e2: struct (nullable = false)
 |    |-- src: string (nullable = true)
 |    |-- dst: string (nullable = true)
 |    |-- linktype: string (nullable = true)
 |-- v3: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)

+-----------------+----------------+----------------+--------------------+--------------------+
|               v1|              e1|              v2|                  e2|                  

In [21]:
# Extract the names of the selected users.
# selectedPathsDF holds one row per matching path, so a user who follows
# several topics correlated with "Big Data" would otherwise be listed once
# per path. Collapse to one row per user id before projecting the name;
# de-duplicating on the id (not on the name) keeps distinct users that
# happen to share the same name.
usersDF = (
    selectedPathsDF
    .selectExpr("v1.id as userid", "v1.name as username")
    .dropDuplicates(["userid"])
    .select("username")
)

In [22]:
# Display the selected user names (at most 20 rows, long values truncated)
usersDF.show(n=20, truncate=True)

+--------+
|username|
+--------+
|   David|
+--------+



In [23]:
# Save the result in the output folder as CSV files with a header line.
# mode="overwrite" replaces the content of a previous run: with the default
# save mode the write raises an error when the folder already exists, which
# makes the notebook fail on every re-run.
usersDF.write.csv(outputPath, header=True, mode="overwrite")