In [0]:
#Import GraphFrame Packages
from functools import reduce
from pyspark.sql.functions import col, lit, when
from graphframes import *

In [0]:
# Vertex DataFrame for the first example graph; columns are id, name and age.
vertices = sqlContext.createDataFrame(
    [
        ("a", "Alice", 34),
        ("b", "Bob", 36),
        ("c", "Charlie", 30),
        ("d", "David", 29),
        ("e", "Esther", 32),
        ("f", "Fanny", 36),
        ("g", "Gabby", 60),
    ],
    schema=["id", "name", "age"],
)


In [0]:
# Edge DataFrame for the first example graph: each row is a directed
# src -> dst edge labelled with a relationship.
edges = sqlContext.createDataFrame(
    [
        ("a", "b", "friend"),
        ("b", "c", "follow"),
        ("c", "b", "follow"),
        ("f", "c", "follow"),
        ("e", "f", "follow"),
        ("e", "d", "friend"),
        ("d", "a", "friend"),
        ("a", "e", "friend"),
    ],
    schema=["src", "dst", "relationship"],
)


In [0]:
# Build a GraphFrame from the `vertices` and `edges` DataFrames,
# then print its summary representation.
g = GraphFrame(vertices, edges)
print(g)


GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])


In [0]:

# This example graph also comes with the GraphFrames package:
# load it through graphframes.examples.Graphs and print its summary.
from graphframes.examples import Graphs
same_g = Graphs(sqlContext).friends()
print(same_g)


GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])


In [0]:
# Display the vertex DataFrame of graph `g`.
display(g.vertices)


id,name,age
a,Alice,34
b,Bob,36
c,Charlie,30
d,David,29
e,Esther,32
f,Fanny,36
g,Gabby,60


In [0]:
# Display the edge DataFrame of graph `g`.
display(g.edges)



src,dst,relationship
a,b,friend
b,c,follow
c,b,follow
f,c,follow
e,f,follow
e,d,friend
d,a,friend
a,e,friend


In [0]:
# Number of edges whose relationship is "follow".
g.edges.where(col("relationship") == "follow").count()

Out[11]: 4

In [0]:
# Number of edges whose relationship is "friend".
g.edges.where(col("relationship") == "friend").count()

Out[12]: 4

In [0]:
# Display the incoming degree of each vertex (g.inDegrees).
display(g.inDegrees)



id,inDegree
b,2
c,2
f,1
d,1
a,1
e,1


In [0]:
# Display the outgoing degree of each vertex (g.outDegrees).
display(g.outDegrees)


id,outDegree
a,2
b,1
c,1
f,1
e,2
d,1


In [0]:
# Display the degree of each vertex (g.degrees).
display(g.degrees)

id,degree
b,3
a,3
c,3
f,2
e,3
d,2


In [0]:
# The vertices are an ordinary DataFrame, so aggregate queries run on them directly.
# Here: the age of the youngest person in the graph.
youngest = g.vertices.agg({"age": "min"})
display(youngest)

min(age)
29


In [0]:

# The edges are a DataFrame too, so they can be queried the same way.
# Here: count the 'follow' relationships in the graph.
numFollows = g.edges.where(col("relationship") == "follow").count()
print("The number of follow edges is", numFollows)

The number of follow edges is 4


In [0]:
# Motif query: search for pairs of vertices (a, b) with edges in both
# directions between them; e is bound to the a -> b edge and e2 to the b -> a edge.
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(a)")
display(motifs)


a,e,b,e2
"List(c, Charlie, 30)","List(c, b, follow)","List(b, Bob, 36)","List(b, c, follow)"
"List(b, Bob, 36)","List(b, c, follow)","List(c, Charlie, 30)","List(c, b, follow)"


In [0]:
# Narrow the bidirectional pairs in `motifs` down to those in which
# at least one of the two people (a or b) is older than 30.

filtered = motifs.filter("b.age > 30 or a.age > 30")
display(filtered)

a,e,b,e2
"List(c, Charlie, 30)","List(c, b, follow)","List(b, Bob, 36)","List(b, c, follow)"
"List(b, Bob, 36)","List(b, c, follow)","List(c, Charlie, 30)","List(c, b, follow)"


In [0]:
# Find chains of 4 vertices (a -> b -> c -> d) joined by the 3 edges ab, bc, cd.
chain4 = g.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[cd]->(d)")


# Query on the sequence of edges, carrying state (cnt) along the chain.
#  (a) Define the method for updating the state given the next element of the motif.
def cumFriends(cnt, edge):
    """Return a Column equal to ``cnt + 1`` if ``edge`` is a "friend" edge, else ``cnt``.

    cnt:  Column holding the running count so far.
    edge: name of a motif edge column (e.g. "ab"); its "relationship" field is inspected.
    """
    relationship = col(edge)["relationship"]
    return when(relationship == "friend", cnt + 1).otherwise(cnt)


#  (b) Fold the method over the motif's elements, starting from a literal 0.
#   In this case, the elements are the 3 edges.
#   The names live in `chain_edge_names` rather than `edges`, so the `edges`
#   DataFrame used to build the graph is not replaced by a plain Python list.
chain_edge_names = ["ab", "bc", "cd"]
numFriends = reduce(cumFriends, chain_edge_names, lit(0))

# Keep only the chains in which at least 2 of the 3 edges are "friend" edges.
chainWith2Friends2 = chain4.withColumn("num_friends", numFriends).where(numFriends >= 2)
display(chainWith2Friends2)

a,ab,b,bc,c,cd,d,num_friends
"List(a, Alice, 34)","List(a, e, friend)","List(e, Esther, 32)","List(e, d, friend)","List(d, David, 29)","List(d, a, friend)","List(a, Alice, 34)",3
"List(e, Esther, 32)","List(e, d, friend)","List(d, David, 29)","List(d, a, friend)","List(a, Alice, 34)","List(a, b, friend)","List(b, Bob, 36)",3
"List(d, David, 29)","List(d, a, friend)","List(a, Alice, 34)","List(a, b, friend)","List(b, Bob, 36)","List(b, c, follow)","List(c, Charlie, 30)",2
"List(d, David, 29)","List(d, a, friend)","List(a, Alice, 34)","List(a, e, friend)","List(e, Esther, 32)","List(e, d, friend)","List(d, David, 29)",3
"List(e, Esther, 32)","List(e, d, friend)","List(d, David, 29)","List(d, a, friend)","List(a, Alice, 34)","List(a, e, friend)","List(e, Esther, 32)",3
"List(d, David, 29)","List(d, a, friend)","List(a, Alice, 34)","List(a, e, friend)","List(e, Esther, 32)","List(e, f, follow)","List(f, Fanny, 36)",2


In [0]:
# Subgraph of `g`: keep the "friend" edges, keep the vertices older than 30,
# then drop every vertex left without an edge.
g2 = (
    g.filterEdges("relationship = 'friend'")
    .filterVertices("age > 30")
    .dropIsolatedVertices()
)

display(g2.vertices)

id,name,age
a,Alice,34
b,Bob,36
e,Esther,32


In [0]:
#Dataset 2

In [0]:
  # Vertex DataFrame for the second dataset; columns are id, name, firstname and age.
  vertices = sqlContext.createDataFrame(
      [
          ('1', 'Carter', 'Derrick', 50),
          ('2', 'May', 'Derrick', 26),
          ('3', 'Mills', 'Jeff', 80),
          ('4', 'Hood', 'Robert', 65),
          ('5', 'Banks', 'Mike', 93),
          ('98', 'Berg', 'Tim', 28),
          ('99', 'Page', 'Allan', 16),
      ],
      schema=['id', 'name', 'firstname', 'age'],
  )

In [0]:
# Edge DataFrame for the second dataset: directed src -> dst edges labelled with a type.
edges = sqlContext.createDataFrame(
    [
        ('1', '2', 'friend'),
        ('2', '1', 'friend'),
        ('3', '1', 'friend'),
        ('1', '3', 'friend'),
        ('2', '3', 'follows'),
        ('3', '4', 'friend'),
        ('4', '3', 'friend'),
        ('5', '3', 'friend'),
        ('3', '5', 'friend'),
        ('4', '5', 'follows'),
        ('98', '99', 'friend'),
        ('99', '98', 'friend'),
    ],
    schema=['src', 'dst', 'type'],
)

# Rebind `g` to the graph built from the second dataset.
g = GraphFrame(vertices, edges)

In [0]:
# Print the vertex DataFrame of the current graph `g` as a text table.
g.vertices.show()

+---+------+---------+---+
| id|  name|firstname|age|
+---+------+---------+---+
|  1|Carter|  Derrick| 50|
|  2|   May|  Derrick| 26|
|  3| Mills|     Jeff| 80|
|  4|  Hood|   Robert| 65|
|  5| Banks|     Mike| 93|
| 98|  Berg|      Tim| 28|
| 99|  Page|    Allan| 16|
+---+------+---------+---+



In [0]:
# Print the edge DataFrame of the current graph `g` as a text table.
g.edges.show()

+---+---+-------+
|src|dst|   type|
+---+---+-------+
|  1|  2| friend|
|  2|  1| friend|
|  3|  1| friend|
|  1|  3| friend|
|  2|  3|follows|
|  3|  4| friend|
|  4|  3| friend|
|  5|  3| friend|
|  3|  5| friend|
|  4|  5|follows|
| 98| 99| friend|
| 99| 98| friend|
+---+---+-------+



In [0]:
# Print the degree of each vertex of the current graph `g`.
g.degrees.show()

+---+------+
| id|degree|
+---+------+
|  1|     4|
|  2|     3|
|  3|     7|
|  4|     3|
|  5|     3|
| 98|     2|
| 99|     2|
+---+------+



In [0]:
# Minimum age over all vertices of the current graph `g`.
youngest = g.vertices.agg({"age": "min"})
display(youngest)

min(age)
16


In [0]:
# Maximum age over all vertices of the current graph `g`.
eldest = g.vertices.agg({"age": "max"})
display(eldest)

max(age)
93


In [0]:
# Number of edges whose type is "friend".
numfriend = g.edges.where(col("type") == "friend").count()
numfriend

Out[33]: 10

In [0]:
#Dataset 3
# Vertex DataFrame for the third dataset; columns are id, dept and designation.
v = sqlContext.createDataFrame(
    [
        ('BBB', 'IT', 'Teacher'),
        ('Shivam', 'IT', 'Student'),
        ('Prachi', 'IT', 'Teacher'),
        ('Shivani', 'DSAI', 'Student'),
        ('Vishal', 'DSAI', 'Student'),
        ('Saurabh', 'DSAI', 'Research Fellow'),
        ('Amit', 'CS', 'Research Guide'),
        ('Sunil', 'CS', 'Student'),
    ],
    schema=['id', 'dept', 'designation'],
)


In [0]:
# Edge DataFrame for the third dataset: directed src -> dst edges with a relationship label.
# NOTE(review): the recorded g3.edges.show() output lists the ResearchStud edge as
# BBB -> Shivani, while this cell defines BBB -> Saurabh; re-run to refresh the output
# or confirm which destination is intended.
e = sqlContext.createDataFrame(
    [
        ('BBB', 'Shivani', 'TeacherStudent'),
        ('Shivam', 'Shivani', 'Classmate'),
        ('Prachi', 'Vishal', 'Classmate'),
        ('BBB', 'Saurabh', 'ResearchStud'),
        ('Prachi', 'Vishal', 'TeacherStudent'),
        ('Amit', 'Saurabh', 'ResearchMate'),
        ('Prachi', 'Amit', 'TeacherStudent'),
        ('Amit', 'Sunil', 'TeacherStudent'),
    ],
    schema=['src', 'dst', 'relationship'],
)

In [0]:
# Build the third graph from the vertex DataFrame `v` and the edge DataFrame `e`.
g3 = GraphFrame(v, e)

In [0]:
# Print the vertex DataFrame of `g3` as a text table.
g3.vertices.show()

+-------+----+---------------+
|     id|dept|    designation|
+-------+----+---------------+
|    BBB|  IT|        Teacher|
| Shivam|  IT|        Student|
| Prachi|  IT|        Teacher|
|Shivani|DSAI|        Student|
| Vishal|DSAI|        Student|
|Saurabh|DSAI|Research Fellow|
|   Amit|  CS| Research Guide|
|  Sunil|  CS|        Student|
+-------+----+---------------+



In [0]:
# Print the edge DataFrame of `g3` as a text table.
g3.edges.show()

+------+-------+--------------+
|   src|    dst|  relationship|
+------+-------+--------------+
|   BBB|Shivani|TeacherStudent|
|Shivam|Shivani|     Classmate|
|Prachi| Vishal|     Classmate|
|   BBB|Shivani|  ResearchStud|
|Prachi| Vishal|TeacherStudent|
|  Amit|Saurabh|  ResearchMate|
|Prachi|   Amit|TeacherStudent|
|  Amit|  Sunil|TeacherStudent|
+------+-------+--------------+



In [0]:
# Number of vertices designated "Teacher".
teachers = g3.vertices.where(col("designation") == "Teacher").count()
teachers

Out[69]: 2

In [0]:
# Number of vertices designated "Student".
students = g3.vertices.where(col("designation") == "Student").count()
students

Out[70]: 4

In [0]:
# Vertices designated either Research Fellow or Research Guide.
# DataFrame.show() prints the table and returns None, so the filtered DataFrame
# is bound to `rfrg` first and shown afterwards; `rfrg` stays usable as a DataFrame.
rfrg = g3.vertices.filter("designation == 'Research Fellow' or designation =='Research Guide'")
rfrg.show()

+-------+----+---------------+
|     id|dept|    designation|
+-------+----+---------------+
|Saurabh|DSAI|Research Fellow|
|   Amit|  CS| Research Guide|
+-------+----+---------------+



In [0]:
# Edges with the Classmate relationship.
# show() returns None, so bind the filtered DataFrame before printing it;
# this keeps `classmate` usable as a DataFrame afterwards.
classmate = g3.edges.filter("relationship == 'Classmate'")
classmate.show()

+------+-------+------------+
|   src|    dst|relationship|
+------+-------+------------+
|Shivam|Shivani|   Classmate|
|Prachi| Vishal|   Classmate|
+------+-------+------------+



In [0]:
# Edges with the TeacherStudent relationship.
# show() returns None, so bind the filtered DataFrame before printing it;
# this keeps `teacherstudent` usable as a DataFrame afterwards.
teacherstudent = g3.edges.filter("relationship == 'TeacherStudent'")
teacherstudent.show()

+------+-------+--------------+
|   src|    dst|  relationship|
+------+-------+--------------+
|   BBB|Shivani|TeacherStudent|
|Prachi| Vishal|TeacherStudent|
|Prachi|   Amit|TeacherStudent|
|  Amit|  Sunil|TeacherStudent|
+------+-------+--------------+



In [0]:
# Edges with the ResearchMate relationship.
# Bound to `researchmate` so the name matches the query and does not clobber
# `teacherstudent`; show() returns None, so the DataFrame is bound before printing.
researchmate = g3.edges.filter("relationship == 'ResearchMate'")
researchmate.show()

+----+-------+------------+
| src|    dst|relationship|
+----+-------+------------+
|Amit|Saurabh|ResearchMate|
+----+-------+------------+



In [0]:
# Number of vertices in the IT department.
it = g3.vertices.where(col("dept") == "IT").count()
it

Out[88]: 3

In [0]:
# j1: edges labelled TeacherStudent; j2: vertices designated Research Guide.
j1 = g3.edges.where(col("relationship") == "TeacherStudent")
j2 = g3.vertices.where(col("designation") == "Research Guide")

In [0]:
# Inner-join the vertices in j2 with the edges in j1 that start at them
# (vertex id equals edge src), and print the result.
j2.join(j1, on=j2.id == j1.src, how="inner").show()

+----+----+--------------+----+-----+--------------+
|  id|dept|   designation| src|  dst|  relationship|
+----+----+--------------+----+-----+--------------+
|Amit|  CS|Research Guide|Amit|Sunil|TeacherStudent|
+----+----+--------------+----+-----+--------------+

