In [1]:
from graphframes import *
from pyspark.sql.functions import *

In [19]:
# Vertices DataFrame: one row per person. The "id" column is compulsory.
vertex_columns = ["id", "name", "age"]
vertex_rows = [
    ("a", "Alice", 34),
    ("b", "Bob", 36),
    ("c", "Charlie", 37),
    ("d", "David", 29),
    ("e", "Esther", 32),
    ("f", "Fanny", 38),
    ("g", "Gabby", 60),
]
v = spark.createDataFrame(vertex_rows, schema=vertex_columns)

# Edges DataFrame: one row per directed relationship.
# The "src" and "dst" columns are compulsory and hold vertex ids.
edge_columns = ["src", "dst", "relationship"]
edge_rows = [
    ("a", "b", "friend"),
    ("b", "c", "follow"),
    ("c", "b", "follow"),
    ("f", "c", "follow"),
    ("e", "f", "follow"),
    ("e", "d", "friend"),
    ("d", "a", "friend"),
    ("a", "e", "friend"),
    ("g", "e", "follow"),
]
e = spark.createDataFrame(edge_rows, schema=edge_columns)

# Create a GraphFrame from the two DataFrames.
g = GraphFrame(v, e)

# Display the vertices first, then the edges.
for frame in (g.vertices, g.edges):
    frame.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    Bob| 36|
|  c|Charlie| 37|
|  d|  David| 29|
|  e| Esther| 32|
|  f|  Fanny| 38|
|  g|  Gabby| 60|
+---+-------+---+

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  b|  c|      follow|
|  c|  b|      follow|
|  f|  c|      follow|
|  e|  f|      follow|
|  e|  d|      friend|
|  d|  a|      friend|
|  a|  e|      friend|
|  g|  e|      follow|
+---+---+------------+



In [37]:
# Alice's two-hop neighbours: every vertex c reachable through a -> b -> c.
# A motif may bind different names to the same vertex, so paths that lead back
# to Alice are removed explicitly. The ids are compared with != rather than <:
# an ordering test would only exclude Alice because "a" is the smallest id.
ans = (
    g.find("(a)-[]->(b); (b)-[]->(c)")
    .filter("a.name = 'Alice' AND a.id != b.id AND a.id != c.id")
)

# Read the name field of the `c` vertex struct by field name. This avoids a
# Python UDF and does not depend on the position of "name" among the vertex
# columns.
ans.select(ans["c"]["name"].alias("Alice's two-hop neighbors' names")).show()

+--------------------------------+
|Alice's two-hop neighbors' names|
+--------------------------------+
|                           Fanny|
|                           David|
|                         Charlie|
+--------------------------------+



In [36]:
# Alice's two-hop neighbours (a -> b -> c) that have no edge back to Alice:
# the negated term !(c)-[]->(a) drops every c with an edge c -> a.
# A motif may bind different names to the same vertex, so paths that lead back
# to Alice are removed explicitly. The ids are compared with != rather than <:
# an ordering test would only exclude Alice because "a" is the smallest id.
ans = (
    g.find("(a)-[]->(b); (b)-[]->(c); !(c)-[]->(a)")
    .filter("a.name = 'Alice' AND a.id != b.id AND a.id != c.id")
)

# Read the name field of the `c` vertex struct by field name. This avoids a
# Python UDF and does not depend on the position of "name" among the vertex
# columns.
ans.select(ans["c"]["name"].alias("Alice's two-hop neighbors' names")).show()

                                                                                

+--------------------------------+
|Alice's two-hop neighbors' names|
+--------------------------------+
|                           Fanny|
|                         Charlie|
+--------------------------------+



In [41]:
# People who follow Charlie: keep the edges a -[e]-> b whose relationship is
# 'follow' and whose destination vertex is named Charlie; `a` is the follower.
ans = g.find("(a)-[e]->(b)").filter(
    "b.name = 'Charlie' AND e.relationship = 'follow'"
)

# Read the name field of the `a` vertex struct by field name. This avoids a
# Python UDF and does not depend on the position of "name" among the vertex
# columns.
ans.select(ans["a"]["name"].alias("Person follow Charlie")).show()

+---------------------+
|Person follow Charlie|
+---------------------+
|                Fanny|
|                  Bob|
+---------------------+



In [52]:
# People followed by at least two others.
# Start from every 'follow' edge a -[e]-> b; `b` is the person being followed.
ans = g.find("(a)-[e]->(b)").filter("e.relationship = 'follow'")

# Count the incoming 'follow' edges of each followed vertex.
fans_count = ans.groupby("b").count()

# Filter on `count` while that column is still present, then project down to
# the vertex struct. Filtering after select('b') would reference a column the
# projection has already dropped.
at_least_two_fans = fans_count.where("count >= 2").select("b")

# Read the name field of the `b` vertex struct by field name. This avoids a
# Python UDF and does not depend on the position of "name" among the vertex
# columns.
at_least_two_fans.select(
    at_least_two_fans["b"]["name"].alias("Person followed by at least two person")
).show()

+--------------------------------------+
|Person followed by at least two person|
+--------------------------------------+
|                               Charlie|
+--------------------------------------+

