In [9]:
from graphframes import GraphFrame

In [10]:
# Relative paths of the two input CSV files that describe the graph:
# one file lists the vertexes, the other lists the edges.
inputPathVertexes = "data/Ex52/data/vertexes.csv"
inputPathEdges = "data/Ex52/data/edges.csv"
# Folder in which the final result of the notebook is saved.
outputPath = "resOut_ex52/"

In [11]:
# Load vertexes.csv into a DataFrame.
# header=True      -> the first line of the file provides the column names
# inferSchema=True -> column types are inferred from the data
vDF = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load(inputPathVertexes)
)

In [12]:
# Sanity check of the loaded vertexes: print the inferred schema,
# then preview the rows.
vDF.printSchema()
vDF.show()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)

+---+-----+---+
| id| name|age|
+---+-----+---+
| u1|Alice| 34|
| u2|  Bob| 36|
| u3| John| 30|
| u4|David| 29|
| u5| Paul| 32|
| u6| Adel| 36|
| u7| Eddy| 60|
+---+-----+---+



In [13]:
# Load edges.csv into a DataFrame.
# header=True      -> the first line of the file provides the column names
# inferSchema=True -> column types are inferred from the data
eDF = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load(inputPathEdges)
)

In [14]:
# Sanity check of the loaded edges: print the inferred schema,
# then preview the rows.
eDF.printSchema()
eDF.show()

root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- linktype: string (nullable = true)

+---+---+--------+
|src|dst|linktype|
+---+---+--------+
| u1| u2|  friend|
| u1| u4|  friend|
| u1| u5|  friend|
| u2| u1|  friend|
| u2| u3|  follow|
| u3| u2|  follow|
| u4| u1|  friend|
| u4| u5|  friend|
| u5| u1|  friend|
| u5| u4|  friend|
| u5| u6|  follow|
| u6| u3|  follow|
| u7| u6|  follow|
+---+---+--------+



In [23]:
# Followers are represented by edges whose linktype is "follow";
# every other kind of edge (e.g. "friend") is irrelevant for this analysis.
# Drop those edges up front so the graph is built from "follow" links only.
filteredEdges = eDF.filter(eDF.linktype == "follow")

In [24]:
# Create the input graph from the vertex DataFrame and the "follow"-only edges.
# vDF carries the vertex identifier in its `id` column, while the edges carry
# source and destination identifiers in `src` and `dst` (the column names
# GraphFrame relies on to link edges to vertexes).
g = GraphFrame(vDF, filteredEdges)

In [20]:
# Number of followers of each user = number of incoming "follow" edges
# (in-degree) of the corresponding vertex.
# NOTE: inDegrees only reports vertexes with at least one incoming edge, so
# users without followers are absent from the result (not listed with 0).
userNumFollowersDF = (
    g.inDegrees
    .withColumnRenamed("inDegree", "numFollowers")
)

In [21]:
# Inspect the result: schema first, then the (id, numFollowers) rows.
userNumFollowersDF.printSchema()
userNumFollowersDF.show()

root
 |-- id: string (nullable = true)
 |-- numFollowers: integer (nullable = false)

+---+------------+
| id|numFollowers|
+---+------------+
| u3|           2|
| u6|           2|
| u2|           1|
+---+------------+



In [22]:
# Save the result in the output folder as CSV, with a header line.
# mode="overwrite" replaces the content of a previous run: with the default
# save mode the write raises an error as soon as the output folder already
# exists, which makes the notebook fail on every re-run (Restart & Run All).
userNumFollowersDF.write.csv(outputPath, header=True, mode="overwrite")