# GraphX Example

Source: https://spark.apache.org/docs/latest/graphx-programming-guide.html

## 1. Imports

In [1]:
import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

## 2. Example Property Graph

### 2.1 Building a Graph

In [2]:
// Define vertices of the graph: users

val users: RDD[(VertexId, (String, String))] = 
sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")),
                     (5L, ("franklin", "prof")), (2L, ("istoica", "prof"))))

users = ParallelCollectionRDD[0] at parallelize at <console>:37


ParallelCollectionRDD[0] at parallelize at <console>:37

In [3]:
// Create edges of the graphs: relationships

val relationships: RDD[Edge[String]] = 
sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"),
                     Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi")))

relationships = ParallelCollectionRDD[1] at parallelize at <console>:37


ParallelCollectionRDD[1] at parallelize at <console>:37

In [4]:
// Define a default user (for relationships with missing users)

val defaultUser = ("John Doe", "Missing")

defaultUser = (John Doe,Missing)


(John Doe,Missing)

In [5]:
// Build the initial graph

val graph = Graph(users, relationships, defaultUser)

graph = org.apache.spark.graphx.impl.GraphImpl@7c5b1a0


org.apache.spark.graphx.impl.GraphImpl@7c5b1a0

### 2.2 Deconstruct a Graph

In [6]:
// Count all users that are postdocs

graph.vertices.filter{case(id, (name, pos)) => pos == "postdoc"}.count

1

In [7]:
// Count all edges where src > dst ()

graph.edges.filter{case Edge(src, dst, prop) => src > dst}.count

1

In [8]:
// Get all the triplets to create an RDD of facts

val facts: RDD[String] = graph.triplets.map(triplet => 
    triplet.srcAttr._1 + "is the " + triplet.attr + " of " + 
    triplet.dstAttr._2)
facts.collect().foreach(println)

rxinis the collab of postdoc
franklinis the advisor of student
istoicais the colleague of prof
franklinis the pi of postdoc


facts = MapPartitionsRDD[23] at map at <console>:36


MapPartitionsRDD[23] at map at <console>:36

## 3. Graph Operators

In [9]:
// Example

graph.inDegrees.collect()

Array((3,1), (5,1), (7,2))

### 3.1 Property Operators

* `mapVertices`
* `mapEdges`
* `mapTriplets`

In [10]:
// New graph but only with names in the vertexs

val newGraph = graph.mapVertices{case(id, attr) => (attr._1)}

org.apache.spark.graphx.impl.GraphImpl@7ac6a235

newGraph = org.apache.spark.graphx.impl.GraphImpl@7ac6a235


In [11]:
graph.vertices.collect()

Array((2,(istoica,prof)), (3,(rxin,student)), (5,(franklin,prof)), (7,(jgonzal,postdoc)))

In [12]:
newGraph.vertices.collect()

Array((2,istoica), (3,rxin), (5,franklin), (7,jgonzal))

### 3.2 Structural Operators

* `reverse`
* `subgraph`
* `mask`
* `groupEdges`

In [13]:
// Reverse a graph

val reversedGraph = graph.reverse

reversedGraph = org.apache.spark.graphx.impl.GraphImpl@64e9fc1f


org.apache.spark.graphx.impl.GraphImpl@64e9fc1f

In [14]:
// Get a subgraph: subgraph(epred, vpred). 
// Exclude all the students from the graph

val subGraph = graph.subgraph(vpred = (id, attr) => attr._2 != "student")
subgraph.vertices.collect().foreach(println)

Name: Compile Error
Message: <console>:40: error: not found: value subgraph
       subgraph.vertices.collect().foreach(println)
       ^

StackTrace: 

In [15]:
// mask

val ccGraph = graph.connectedComponents()
val maskGraph = ccGraph.mask(subGraph)

Name: Compile Error
Message: <console>:37: error: not found: value subGraph
       val maskGraph = ccGraph.mask(subGraph)
                                    ^

StackTrace: 

### 3.3 Join Operators

* `joinVertices`
* `outerJoinVertices`

In [16]:
graph.vertices.collect().foreach(println)

(2,(istoica,prof))
(3,(rxin,student))
(5,(franklin,prof))
(7,(jgonzal,postdoc))


In [17]:
// joinVertices

val emailsRDD: RDD[(VertexId, String)] = 
sc.parallelize(Array((2L, "istoica@gmail.com"), (7L, "franklin@gmail.com")))

val graphWithEmails = graph.joinVertices(emailsRDD)((idx, a, b) => (a._1, b))
graphWithEmails.vertices.collect().foreach(println)

(2,(istoica,istoica@gmail.com))
(3,(rxin,student))
(5,(franklin,prof))
(7,(jgonzal,franklin@gmail.com))


emailsRDD = ParallelCollectionRDD[36] at parallelize at <console>:39
graphWithEmails = org.apache.spark.graphx.impl.GraphImpl@342f5975


org.apache.spark.graphx.impl.GraphImpl@342f5975

In [18]:
// outerJoinVertices

val graphWithEmailsFull = graph.outerJoinVertices(emailsRDD)((idx, a, b) => {
    
    b match {
        
        case Some(b) => (a._1, a._2, b)
        case None => (a._1, a._2, "Null")
    }
    
    
})
graphWithEmailsFull.vertices.collect().foreach(println)

(2,(istoica,prof,istoica@gmail.com))
(3,(rxin,student,Null))
(5,(franklin,prof,Null))
(7,(jgonzal,postdoc,franklin@gmail.com))


graphWithEmailsFull = org.apache.spark.graphx.impl.GraphImpl@1a688eda


org.apache.spark.graphx.impl.GraphImpl@1a688eda

## 4 Neighborhood Aggregation

* `aggregateMessages`
* `mapReduceTriplets`
* `degrees, inDegrees, outDegrees`
* `collectNeighborIds`, `collectNeighbor` 

In [19]:
graph.inDegrees.collect()

Array((3,1), (5,1), (7,2))

In [21]:
// Calculate the max degrees, inDegrees and outDegrees of the graph


def max(a: (VertexId, Int), b: (VertexId, Int)): (VertexId, Int) = {
    
    if (a._2 > b._2) a else b
    
}

val maxInDegree = graph.inDegrees.reduce(max)
val maxOutDegree = graph.outDegrees.reduce(max)
val maxDegree = graph.degrees.reduce(max)

maxInDegree = (7,2)
maxOutDegree = (5,2)
maxDegree = (5,3)


max: (a: (org.apache.spark.graphx.VertexId, Int), b: (org.apache.spark.graphx.VertexId, Int))(org.apache.spark.graphx.VertexId, Int)


(5,3)

In [26]:
// Collecting Neighbors: very expensive, use aggregateMessages instead

val direction: EdgeDirection = EdgeDirection.Out
val neihborVertexsIds = graph.collectNeighborIds(direction)
val neihborVertexs = graph.collectNeighbors(direction)

direction = EdgeDirection.Out
neihborVertexsIds = VertexRDDImpl[76] at RDD at VertexRDD.scala:57
neihborVertexs = VertexRDDImpl[82] at RDD at VertexRDD.scala:57


VertexRDDImpl[82] at RDD at VertexRDD.scala:57

## 5. Graph Algorithms

* `pageRank`
* `connectedComponents`
* `triangleCount`

In [27]:
// pageRank

val ranks = graph.pageRank(0.0001).vertices

ranks = VertexRDDImpl[172] at RDD at VertexRDD.scala:57


VertexRDDImpl[172] at RDD at VertexRDD.scala:57

In [28]:
// conectedComponets

val cc = graph.connectedComponents()

cc = org.apache.spark.graphx.impl.GraphImpl@5cef4be9


org.apache.spark.graphx.impl.GraphImpl@5cef4be9

In [29]:
// triangleCounts

val tc = graph.triangleCount()

tc = org.apache.spark.graphx.impl.GraphImpl@828c842


org.apache.spark.graphx.impl.GraphImpl@828c842