In [1]:
import os
import sys

# Point this kernel at the Spark 2 client install and expose its bundled
# Python libraries. os.path.join is used because SPARK_HOME ends with "/",
# so plain string concatenation produced a doubled "//" in every derived path.
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client/"
os.environ["PYLIB"] = os.path.join(os.environ["SPARK_HOME"], "python", "lib")

# Prepend both archives to sys.path; pyspark.zip is inserted last and so ends
# up ahead of the py4j archive.
# NOTE(review): the py4j version is hard-coded -- confirm it matches the
# archive actually shipped under $SPARK_HOME/python/lib.
sys.path.insert(0, os.path.join(os.environ["PYLIB"], "py4j-0.10.4-src.zip"))
sys.path.insert(0, os.path.join(os.environ["PYLIB"], "pyspark.zip"))


In [2]:
import pyspark
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession

In [3]:
from pyspark.sql import *
from pyspark.sql.functions import *


In [4]:
# Package coordinates of the GraphFrames build that Spark should load
graphframes_package = 'graphframes:graphframes:0.5.0-spark2.1-s_2.11'
conf = SparkConf()
conf.set('spark.jars.packages', graphframes_package)


In [5]:
# Reuse the running SparkContext if this cell is executed again; constructing
# a second SparkContext directly raises an error.
# Note: when a context already exists, getOrCreate returns it as-is and the
# settings in `conf` are not re-applied.
sc = SparkContext.getOrCreate(conf=conf)
# Wrap the context in a SparkSession to get the DataFrame API.
spark = SparkSession(sc)

In [6]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [7]:
import graphframes
from graphframes import GraphFrame

In [8]:
# One row per person; "id" is the vertex identifier that the edges refer to.
vertex_columns = ["id", "name", "age", "gender", "university"]
vertex_rows = [
    ("1", "Aryan", 23, "M", "NIT"),
    ("2", "Ram", 28, "M", "BITS"),
    ("3", "Samera", 25, "F", "IIT"),
    ("4", "Sachin", 27, "M", "NIT"),
    ("5", "Manoj", 27, "M", "NIT"),
    ("6", "Mytri", 27, "F", "BITS"),
    ("7", "Shiva", 27, "M", "IIT"),
]
vertices = spark.createDataFrame(vertex_rows, vertex_columns)

In [9]:
# Every friendship, listed once as a pair of vertex ids.
friendships = [
    ("1", "2"), ("1", "3"), ("1", "4"),
    ("2", "3"), ("3", "4"), ("3", "5"),
    ("3", "6"), ("3", "7"), ("2", "5"),
]

# Expand each friendship into two directed "friend" edges:
# first src -> dst, then the reverse dst -> src.
edge_rows = []
for src_id, dst_id in friendships:
    edge_rows.append((src_id, dst_id, "friend"))
    edge_rows.append((dst_id, src_id, "friend"))

edges = spark.createDataFrame(edge_rows, ["src", "dst", "relationship"])

### Creating the Graph 
#### to create a graph we require vertices and edges
* create two data frames for vertices and edges
* use GraphFrame(vertices,edges) to get the graph
* vertices should have a column called "id" that holds a unique identifier for each vertex
* edges should have two columns "src","dst"

In [10]:
# Build the graph from the vertex and edge DataFrames created above.
g = GraphFrame(vertices, edges)

In [11]:
# Print the graph's vertex DataFrame.
g.vertices.show()

+---+------+---+------+----------+
| id|  name|age|gender|university|
+---+------+---+------+----------+
|  1| Aryan| 23|     M|       NIT|
|  2|   Ram| 28|     M|      BITS|
|  3|Samera| 25|     F|       IIT|
|  4|Sachin| 27|     M|       NIT|
|  5| Manoj| 27|     M|       NIT|
|  6| Mytri| 27|     F|      BITS|
|  7| Shiva| 27|     M|       IIT|
+---+------+---+------+----------+



In [12]:
# Print the graph's edge DataFrame.
g.edges.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  1|  2|      friend|
|  2|  1|      friend|
|  1|  3|      friend|
|  3|  1|      friend|
|  1|  4|      friend|
|  4|  1|      friend|
|  2|  3|      friend|
|  3|  2|      friend|
|  3|  4|      friend|
|  4|  3|      friend|
|  3|  5|      friend|
|  5|  3|      friend|
|  3|  6|      friend|
|  6|  3|      friend|
|  3|  7|      friend|
|  7|  3|      friend|
|  2|  5|      friend|
|  5|  2|      friend|
+---+---+------------+



#### Queries on the Graph
* Queries similar to Data frame queries can be run on the graph frame
* We can query on the numerical and categorical columns of the graph frame

In [13]:
# Numerical filter: keep only the vertices whose age is above 25.
older_than_25 = g.vertices.where("age>25")
older_than_25.show()


+---+------+---+------+----------+
| id|  name|age|gender|university|
+---+------+---+------+----------+
|  2|   Ram| 28|     M|      BITS|
|  4|Sachin| 27|     M|       NIT|
|  5| Manoj| 27|     M|       NIT|
|  6| Mytri| 27|     F|      BITS|
|  7| Shiva| 27|     M|       IIT|
+---+------+---+------+----------+



In [14]:
# Categorical filter: keep only the vertices whose gender is 'M'.
male_vertices = g.vertices.where("gender='M'")
male_vertices.show()

+---+------+---+------+----------+
| id|  name|age|gender|university|
+---+------+---+------+----------+
|  1| Aryan| 23|     M|       NIT|
|  2|   Ram| 28|     M|      BITS|
|  4|Sachin| 27|     M|       NIT|
|  5| Manoj| 27|     M|       NIT|
|  7| Shiva| 27|     M|       IIT|
+---+------+---+------+----------+



##### Counting the number of inlinks -- inDegrees

In [15]:
## Every edge here is a "friend" relationship, so a vertex's inDegree is the
## number of "friend" edges pointing at it.
g.inDegrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|  7|       1|
|  3|       6|
|  5|       2|
|  6|       1|
|  1|       3|
|  4|       2|
|  2|       3|
+---+--------+



In [16]:
# Number of outgoing edges per vertex.
g.outDegrees.show()

+---+---------+
| id|outDegree|
+---+---------+
|  7|        1|
|  3|        6|
|  5|        2|
|  6|        1|
|  1|        3|
|  4|        2|
|  2|        3|
+---+---------+



##### To find the number of users having more than 2 friends

In [17]:
# Vertices with more than two incoming "friend" edges, listed in id order.
well_connected = g.inDegrees.where("inDegree > 2")
well_connected.orderBy("id").show()

+---+--------+
| id|inDegree|
+---+--------+
|  1|       3|
|  2|       3|
|  3|       6|
+---+--------+



### Motif Finding using DSL
* DSL should be given with (src vertex)-[e edge]->(dst vertex) 

In [18]:
# Count the matches of the simplest motif: a single edge e from vertex a to vertex b.
g.find("(a)-[e]->(b)").count()

18

In [19]:
# Both edges are named explicitly (e and e2), so each one appears as its own
# column in the result alongside a and b.
reciprocal_edges = g.find("(a)-[e]->(b);(b)-[e2]->(a)")
reciprocal_edges.show(truncate=False)

+-----------------------+--------------+-----------------------+--------------+
|a                      |e             |b                      |e2            |
+-----------------------+--------------+-----------------------+--------------+
|[1, Aryan, 23, M, NIT] |[1, 4, friend]|[4, Sachin, 27, M, NIT]|[4, 1, friend]|
|[4, Sachin, 27, M, NIT]|[4, 1, friend]|[1, Aryan, 23, M, NIT] |[1, 4, friend]|
|[3, Samera, 25, F, IIT]|[3, 2, friend]|[2, Ram, 28, M, BITS]  |[2, 3, friend]|
|[2, Ram, 28, M, BITS]  |[2, 1, friend]|[1, Aryan, 23, M, NIT] |[1, 2, friend]|
|[2, Ram, 28, M, BITS]  |[2, 5, friend]|[5, Manoj, 27, M, NIT] |[5, 2, friend]|
|[3, Samera, 25, F, IIT]|[3, 5, friend]|[5, Manoj, 27, M, NIT] |[5, 3, friend]|
|[1, Aryan, 23, M, NIT] |[1, 3, friend]|[3, Samera, 25, F, IIT]|[3, 1, friend]|
|[3, Samera, 25, F, IIT]|[3, 1, friend]|[1, Aryan, 23, M, NIT] |[1, 3, friend]|
|[5, Manoj, 27, M, NIT] |[5, 3, friend]|[3, Samera, 25, F, IIT]|[3, 5, friend]|
|[2, Ram, 28, M, BITS]  |[2, 3, friend]|

In [20]:
# Number of (a, b) matches connected by edges in both directions.
reciprocal_edges = g.find("(a)-[e]->(b);(b)-[e2]->(a)")
reciprocal_edges.count()

18

#### Motif finding with anonymous edges

In [21]:
# The edges are left unnamed ([]), so the result has only the vertex columns a and b.
reciprocal_pairs = g.find("(a)-[]->(b);(b)-[]->(a)")
reciprocal_pairs.show()

+--------------------+--------------------+
|                   a|                   b|
+--------------------+--------------------+
|[1, Aryan, 23, M,...|[4, Sachin, 27, M...|
|[4, Sachin, 27, M...|[1, Aryan, 23, M,...|
|[3, Samera, 25, F...|[2, Ram, 28, M, B...|
|[2, Ram, 28, M, B...|[1, Aryan, 23, M,...|
|[2, Ram, 28, M, B...|[5, Manoj, 27, M,...|
|[3, Samera, 25, F...|[5, Manoj, 27, M,...|
|[1, Aryan, 23, M,...|[3, Samera, 25, F...|
|[3, Samera, 25, F...|[1, Aryan, 23, M,...|
|[5, Manoj, 27, M,...|[3, Samera, 25, F...|
|[2, Ram, 28, M, B...|[3, Samera, 25, F...|
|[3, Samera, 25, F...|[7, Shiva, 27, M,...|
|[4, Sachin, 27, M...|[3, Samera, 25, F...|
|[6, Mytri, 27, F,...|[3, Samera, 25, F...|
|[1, Aryan, 23, M,...|[2, Ram, 28, M, B...|
|[3, Samera, 25, F...|[4, Sachin, 27, M...|
|[3, Samera, 25, F...|[6, Mytri, 27, F,...|
|[5, Manoj, 27, M,...|[2, Ram, 28, M, B...|
|[7, Shiva, 27, M,...|[3, Samera, 25, F...|
+--------------------+--------------------+



#### Motif finding with anonymous vertices

In [22]:
# The destination vertex is left unnamed (()), so the result has only the columns a and e.
g.find("(a)-[e]->()").show()

+--------------------+--------------+
|                   a|             e|
+--------------------+--------------+
|[7, Shiva, 27, M,...|[7, 3, friend]|
|[3, Samera, 25, F...|[3, 1, friend]|
|[3, Samera, 25, F...|[3, 2, friend]|
|[3, Samera, 25, F...|[3, 4, friend]|
|[3, Samera, 25, F...|[3, 5, friend]|
|[3, Samera, 25, F...|[3, 6, friend]|
|[3, Samera, 25, F...|[3, 7, friend]|
|[5, Manoj, 27, M,...|[5, 3, friend]|
|[5, Manoj, 27, M,...|[5, 2, friend]|
|[6, Mytri, 27, F,...|[6, 3, friend]|
|[1, Aryan, 23, M,...|[1, 2, friend]|
|[1, Aryan, 23, M,...|[1, 3, friend]|
|[1, Aryan, 23, M,...|[1, 4, friend]|
|[4, Sachin, 27, M...|[4, 1, friend]|
|[4, Sachin, 27, M...|[4, 3, friend]|
|[2, Ram, 28, M, B...|[2, 1, friend]|
|[2, Ram, 28, M, B...|[2, 3, friend]|
|[2, Ram, 28, M, B...|[2, 5, friend]|
+--------------------+--------------+



##### To find a unidirectional edge

In [23]:
## Find the links that go from a to b but have no link back from b to a.
one_way_links = g.find("(a)-[]->(b);!(b)-[]->(a)")
one_way_links.show()

+---+---+
|  a|  b|
+---+---+
+---+---+



##### To find mutual friends

In [24]:
# Two-hop paths a -> b -> c; print up to 100 rows without truncating the cells.
two_hop_paths = g.find("(a)-[]->(b);(b)-[]->(c)")
two_hop_paths.show(n=100, truncate=False)

+-----------------------+-----------------------+-----------------------+
|a                      |b                      |c                      |
+-----------------------+-----------------------+-----------------------+
|[7, Shiva, 27, M, IIT] |[3, Samera, 25, F, IIT]|[7, Shiva, 27, M, IIT] |
|[5, Manoj, 27, M, NIT] |[3, Samera, 25, F, IIT]|[7, Shiva, 27, M, IIT] |
|[6, Mytri, 27, F, BITS]|[3, Samera, 25, F, IIT]|[7, Shiva, 27, M, IIT] |
|[1, Aryan, 23, M, NIT] |[3, Samera, 25, F, IIT]|[7, Shiva, 27, M, IIT] |
|[4, Sachin, 27, M, NIT]|[3, Samera, 25, F, IIT]|[7, Shiva, 27, M, IIT] |
|[2, Ram, 28, M, BITS]  |[3, Samera, 25, F, IIT]|[7, Shiva, 27, M, IIT] |
|[3, Samera, 25, F, IIT]|[7, Shiva, 27, M, IIT] |[3, Samera, 25, F, IIT]|
|[3, Samera, 25, F, IIT]|[5, Manoj, 27, M, NIT] |[3, Samera, 25, F, IIT]|
|[2, Ram, 28, M, BITS]  |[5, Manoj, 27, M, NIT] |[3, Samera, 25, F, IIT]|
|[3, Samera, 25, F, IIT]|[6, Mytri, 27, F, BITS]|[3, Samera, 25, F, IIT]|
|[3, Samera, 25, F, IIT]|[1, Aryan, 23

In [25]:
# Two-hop paths a -> b -> c, dropping those that end at the vertex they started from.
two_hop_paths = g.find("(a)-[]->(b);(b)-[]->(c)")
distinct_ends = two_hop_paths.where("a.id!=c.id")
distinct_ends.show(n=10)

+--------------------+--------------------+--------------------+
|                   a|                   b|                   c|
+--------------------+--------------------+--------------------+
|[5, Manoj, 27, M,...|[3, Samera, 25, F...|[7, Shiva, 27, M,...|
|[6, Mytri, 27, F,...|[3, Samera, 25, F...|[7, Shiva, 27, M,...|
|[1, Aryan, 23, M,...|[3, Samera, 25, F...|[7, Shiva, 27, M,...|
|[4, Sachin, 27, M...|[3, Samera, 25, F...|[7, Shiva, 27, M,...|
|[2, Ram, 28, M, B...|[3, Samera, 25, F...|[7, Shiva, 27, M,...|
|[2, Ram, 28, M, B...|[5, Manoj, 27, M,...|[3, Samera, 25, F...|
|[4, Sachin, 27, M...|[1, Aryan, 23, M,...|[3, Samera, 25, F...|
|[2, Ram, 28, M, B...|[1, Aryan, 23, M,...|[3, Samera, 25, F...|
|[1, Aryan, 23, M,...|[4, Sachin, 27, M...|[3, Samera, 25, F...|
|[5, Manoj, 27, M,...|[2, Ram, 28, M, B...|[3, Samera, 25, F...|
+--------------------+--------------------+--------------------+
only showing top 10 rows



In [26]:
# Two-hop paths a -> b -> c with different end points: b is linked to both a and c.
motifs = g.find("(a)-[]->(b);(b)-[]->(c)").filter("a.id!=c.id")

# Keep only the end-point ids. The motif columns are named in lower case
# (a, c), so they are referenced that way rather than relying on
# case-insensitive column resolution.
Friends = motifs.selectExpr("a.id as A", "c.id as C")

# Rows per (A, C) pair = number of intermediate vertices b joining them.
# Vertex ids are strings, so compare against the string '1' rather than
# relying on an implicit string-to-number cast.
Friends.groupBy("A", "C").count().filter("A = '1'").show()

+---+---+-----+
|  A|  C|count|
+---+---+-----+
|  1|  4|    1|
|  1|  2|    1|
|  1|  5|    2|
|  1|  3|    2|
|  1|  7|    1|
|  1|  6|    1|
+---+---+-----+



In [27]:
# Project each two-hop path down to the ids of its two end points.
endpoint_exprs = ["A.id as A", "C.id as C"]
Friends = motifs.selectExpr(*endpoint_exprs)
Friends.show()

+---+---+
|  A|  C|
+---+---+
|  5|  7|
|  6|  7|
|  1|  7|
|  4|  7|
|  2|  7|
|  2|  3|
|  4|  3|
|  2|  3|
|  1|  3|
|  5|  3|
|  1|  3|
|  7|  5|
|  6|  5|
|  1|  5|
|  4|  5|
|  2|  5|
|  3|  5|
|  1|  5|
|  7|  6|
|  5|  6|
+---+---+
only showing top 20 rows



In [28]:
# For vertex '1': how many two-hop paths lead to each other vertex C.
# Vertex ids are strings, so compare against the string '1' rather than
# relying on an implicit string-to-number cast.
Friends.groupBy("A", "C").count().filter("A = '1'").show()

+---+---+-----+
|  A|  C|count|
+---+---+-----+
|  1|  4|    1|
|  1|  2|    1|
|  1|  5|    2|
|  1|  3|    2|
|  1|  7|    1|
|  1|  6|    1|
+---+---+-----+



In [29]:
# Run the triangle-count algorithm; the result adds a "count" column to each vertex row.
g.triangleCount().show()

+-----+---+------+---+------+----------+
|count| id|  name|age|gender|university|
+-----+---+------+---+------+----------+
|    0|  7| Shiva| 27|     M|       IIT|
|    3|  3|Samera| 25|     F|       IIT|
|    1|  5| Manoj| 27|     M|       NIT|
|    0|  6| Mytri| 27|     F|      BITS|
|    2|  1| Aryan| 23|     M|       NIT|
|    1|  4|Sachin| 27|     M|       NIT|
|    2|  2|   Ram| 28|     M|      BITS|
+-----+---+------+---+------+----------+



In [30]:
# Run PageRank for a fixed number of iterations.
pagerank_iterations = 5
results = g.pageRank(maxIter=pagerank_iterations)


In [31]:
# Show the PageRank score computed for each vertex id.
results.vertices.select("id", "pagerank").show()

+---+------------------+
| id|          pagerank|
+---+------------------+
|  1|1.1121941874035493|
|  3| 2.376845942965535|
|  2|1.1121941874035493|
|  4|0.7600653645833334|
|  7|0.4393174765303498|
|  6|0.4393174765303498|
|  5|0.7600653645833334|
+---+------------------+



In [32]:
# Vertex DataFrame: one row per person
v = spark.createDataFrame(
    [
        ("a", "Alice", 34),
        ("b", "Bob", 36),
        ("c", "Charlie", 30),
        ("d", "David", 29),
        ("e", "Esther", 32),
        ("f", "Fanny", 36),
        ("g", "Gabby", 60),
    ],
    ["id", "name", "age"]
)

# Edge DataFrame: directed "friend" / "follow" relationships
e = spark.createDataFrame(
    [
        ("a", "b", "friend"),
        ("b", "c", "follow"),
        ("c", "b", "follow"),
        ("f", "c", "follow"),
        ("e", "f", "follow"),
        ("e", "d", "friend"),
        ("d", "a", "friend"),
        ("a", "e", "friend"),
    ],
    ["src", "dst", "relationship"]
)

# Create a GraphFrame
graph = GraphFrame(v, e)

# "follow" edges only, then keep those where the follower is younger than
# the person being followed.
follow_edges = graph.find("(a)-[e]->(b)").filter("e.relationship = 'follow'")
paths = follow_edges.filter("a.age < b.age")
paths.show()

+----------------+--------------+--------------+
|               a|             e|             b|
+----------------+--------------+--------------+
| [e, Esther, 32]|[e, f, follow]|[f, Fanny, 36]|
|[c, Charlie, 30]|[c, b, follow]|  [b, Bob, 36]|
+----------------+--------------+--------------+



In [33]:
# For comparison: every edge of `graph` with its source and destination vertices, unfiltered.
graph.find("(a)-[e]->(b)").show()

+----------------+--------------+----------------+
|               a|             e|               b|
+----------------+--------------+----------------+
| [e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|
|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|
| [e, Esther, 32]|[e, d, friend]|  [d, David, 29]|
|  [f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|
|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|
|[c, Charlie, 30]|[c, b, follow]|    [b, Bob, 36]|
|  [a, Alice, 34]|[a, b, friend]|    [b, Bob, 36]|
|  [d, David, 29]|[d, a, friend]|  [a, Alice, 34]|
+----------------+--------------+----------------+



### For BFS use 'graph' created above

In [34]:
# Search from "Esther" for users of age < 32.
# Breadth-first search starting at "Esther" and ending at users of age < 32.
paths = graph.bfs(fromExpr="name = 'Esther'", toExpr="age < 32")
paths.show()

# The same search with an edge filter that excludes 'friend' edges and a
# maximum path length of 2.
non_friend_paths = graph.bfs(
    fromExpr="name = 'Esther'",
    toExpr="age < 32",
    edgeFilter="relationship != 'friend'",
    maxPathLength=2
)
non_friend_paths.show()

+---------------+--------------+--------------+
|           from|            e0|            to|
+---------------+--------------+--------------+
|[e, Esther, 32]|[e, d, friend]|[d, David, 29]|
+---------------+--------------+--------------+

+---------------+--------------+--------------+--------------+----------------+
|           from|            e0|            v1|            e1|              to|
+---------------+--------------+--------------+--------------+----------------+
|[e, Esther, 32]|[e, f, follow]|[f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|
+---------------+--------------+--------------+--------------+----------------+

