# Chapter 7: Analyzing Co-Ocurrence Networks with GraphX

In [1]:
import edu.umd.cloud9.collection.XMLInputFormat

import java.nio.charset.StandardCharsets
import java.security.MessageDigest

import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.{Text => TextHadoop}
import org.apache.hadoop.conf.Configuration

import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession, Row}
import org.apache.spark.sql.functions._

import scala.xml._

import org.apache.hadoop.io.{Text=>TextHadoop}


In [2]:
import org.apache.spark.sql.{functions => F}

import org.apache.spark.sql.{functions=>F}


## Getting the Data

In [3]:
def loadMedline(spark: SparkSession, path: String): Dataset[String] = {
    import spark.implicits._
    val conf = new Configuration()
    conf.set(XMLInputFormat.START_TAG_KEY, "<MedlineCitation ")
    conf.set(XMLInputFormat.END_TAG_KEY, "</MedlineCitation>")
    val sc = spark.sparkContext
    val in = sc.newAPIHadoopFile(path, classOf[XMLInputFormat],
      classOf[LongWritable], classOf[TextHadoop], conf)
    in.map(line => line._2.toString).toDS()
}

loadMedline: (spark: org.apache.spark.sql.SparkSession, path: String)org.apache.spark.sql.Dataset[String]


In [4]:
def majorTopics(record: String): Seq[String] = {
    val elem = XML.loadString(record)
    val dn = elem \\ "DescriptorName"
    val mt = dn.filter(n => (n \ "@MajorTopicYN").text == "Y")
    mt.map(n => n.text)
}

majorTopics: (record: String)Seq[String]


In [5]:
val medlineRaw: Dataset[String] = loadMedline(spark, "../data/medline")

medlineRaw = [value: string]


[value: string]

In [6]:
val medline: Dataset[Seq[String]] = medlineRaw.map(majorTopics).sample(0.01).cache()

medline = [value: array<string>]


[value: array<string>]

In [7]:
medline.count()

2376

In [8]:
medline.take(3)

[List(Drosophila Proteins, Interleukin-6, Nerve Growth Factors), List(Attitude of Health Personnel, Interprofessional Relations, Nursing Homes, Prejudice, Social Distance), List(Child, Hospitalized, Disabled Persons)]

## Analyzing the MeSH Major Topics and Their Co-Ocurrences

In [20]:
val topics = medline.flatMap(mesh => mesh).toDF("topic")

topics = [topic: string]


[topic: string]

In [21]:
val topicDist = topics.groupBy("topic").count().sort(F.col("count").desc).
withColumnRenamed("count", "cnt")

topicDist = [topic: string, cnt: bigint]


[topic: string, cnt: bigint]

In [22]:
topicDist.show()

+--------------------+---+
|               topic|cnt|
+--------------------+---+
|            Research| 15|
|        Tuberculosis| 13|
|           Hospitals| 11|
| Population Dynamics| 11|
|       Public Policy| 11|
|             Disease| 10|
|          Physicians| 10|
|Emigration and Im...|  9|
|       Ethnic Groups|  9|
|               Blood|  9|
|   Abortion, Induced|  9|
|Social Control, F...|  8|
| Wounds and Injuries|  8|
|           Neoplasms|  8|
|            Politics|  7|
|Government Regula...|  7|
| Models, Theoretical|  7|
|       Jurisprudence|  7|
|Tomography, X-Ray...|  7|
|          Anesthesia|  7|
+--------------------+---+
only showing top 20 rows



In [11]:
topicDist.groupBy("cnt").count().withColumnRenamed("count", "dist").sort(F.col("dist").desc).show()

+---+----+
|cnt|dist|
+---+----+
|  1|1381|
|  2| 296|
|  3| 104|
|  4|  36|
|  5|  18|
|  6|   8|
|  7|   7|
|  9|   4|
| 11|   3|
|  8|   3|
| 10|   2|
| 13|   1|
| 15|   1|
+---+----+



In [12]:
val list = List(1,2,3)
val combs = list.combinations(2)
combs.foreach(println)

List(1, 2)
List(1, 3)
List(2, 3)


list = List(1, 2, 3)
combs = empty iterator


empty iterator

In [13]:
val topicsPair = medline.flatMap(t => {t.sorted.combinations((2))}).toDF("pairs")

topicsPair = [pairs: array<string>]


[pairs: array<string>]

In [14]:
topicsPair.show(truncate=false)

+-----------------------------------------------------------+
|pairs                                                      |
+-----------------------------------------------------------+
|[Drosophila Proteins, Interleukin-6]                       |
|[Drosophila Proteins, Nerve Growth Factors]                |
|[Interleukin-6, Nerve Growth Factors]                      |
|[Attitude of Health Personnel, Interprofessional Relations]|
|[Attitude of Health Personnel, Nursing Homes]              |
|[Attitude of Health Personnel, Prejudice]                  |
|[Attitude of Health Personnel, Social Distance]            |
|[Interprofessional Relations, Nursing Homes]               |
|[Interprofessional Relations, Prejudice]                   |
|[Interprofessional Relations, Social Distance]             |
|[Nursing Homes, Prejudice]                                 |
|[Nursing Homes, Social Distance]                           |
|[Prejudice, Social Distance]                               |
|[Child,

In [15]:
val cooccurs = topicsPair.groupBy("pairs").count().withColumnRenamed("count", "cnt").cache()

cooccurs = [pairs: array<string>, cnt: bigint]


[pairs: array<string>, cnt: bigint]

In [16]:
cooccurs.count()

3536

In [17]:
cooccurs.select("pairs", "cnt").sort(F.col("cnt").desc).show(10, truncate=false)

+-----------------------------------------------------+---+
|pairs                                                |cnt|
+-----------------------------------------------------+---+
|[Government Regulation, Social Control, Formal]      |7  |
|[Emigration and Immigration, Population Dynamics]    |5  |
|[Niacin, Tuberculosis]                               |5  |
|[Emigration and Immigration, Transients and Migrants]|4  |
|[Population Dynamics, Urban Population]              |4  |
|[Demography, Population Dynamics]                    |4  |
|[Emigration and Immigration, Public Policy]          |4  |
|[Public Policy, Transients and Migrants]             |3  |
|[Anesthesia, Anesthesiology]                         |3  |
|[Demography, Population Characteristics]             |3  |
+-----------------------------------------------------+---+
only showing top 10 rows



## Constructing a Co-Occurrence Network with GraphX

In [18]:
def hashId(str: String): Long = {
    // This is effectively the same implementation as in Guava's Hashing, but 'inlined'
    // to avoid a dependency on Guava just for this. It creates a long from the first 8 bytes
    // of the (16 byte) MD5 hash, with first byte as least-significant byte in the long.
    val bytes = MessageDigest.getInstance("MD5").digest(str.getBytes(StandardCharsets.UTF_8))
    (bytes(0) & 0xFFL) |
    ((bytes(1) & 0xFFL) << 8) |
    ((bytes(2) & 0xFFL) << 16) |
    ((bytes(3) & 0xFFL) << 24) |
    ((bytes(4) & 0xFFL) << 32) |
    ((bytes(5) & 0xFFL) << 40) |
    ((bytes(6) & 0xFFL) << 48) |
    ((bytes(7) & 0xFFL) << 56)
}

hashId: (str: String)Long


In [25]:
val vertices = topics.map{case(Row(topic: String)) => (hashId(topic), topic)}.toDF("hash", "topic")

vertices = [hash: bigint, topic: string]


[hash: bigint, topic: string]

In [26]:
vertices.show(10, truncate=false)

+--------------------+----------------------------+
|hash                |topic                       |
+--------------------+----------------------------+
|6881210680665544257 |Drosophila Proteins         |
|-7785719421263410589|Interleukin-6               |
|5704854033070027507 |Nerve Growth Factors        |
|6427933560761129515 |Attitude of Health Personnel|
|-4689715188779541896|Interprofessional Relations |
|2257255427537747493 |Nursing Homes               |
|-1412680106657823228|Prejudice                   |
|2634198788896344991 |Social Distance             |
|-8347274417141191070|Child, Hospitalized         |
|-7910652893888327622|Disabled Persons            |
+--------------------+----------------------------+
only showing top 10 rows



In [27]:
vertices.count()

2757

In [28]:
val uniqueHashes = vertices.agg(countDistinct("hash")).take(1)

uniqueHashes = Array([1864])


0
1864


In [29]:
vertices.agg(countDistinct("topic")).take(1)

0
1864


In [31]:
cooccurs.sort(F.col("cnt").desc).show()

+--------------------+---+
|               pairs|cnt|
+--------------------+---+
|[Government Regul...|  7|
|[Emigration and I...|  5|
|[Niacin, Tubercul...|  5|
|[Emigration and I...|  4|
|[Population Dynam...|  4|
|[Demography, Popu...|  4|
|[Emigration and I...|  4|
|[Ethnic Groups, P...|  3|
|[Castration, Orch...|  3|
|[Ethnic Groups, P...|  3|
|[Population Growt...|  3|
|[Population Dynam...|  3|
|[Economics, Socio...|  3|
| [Isoniazid, Niacin]|  3|
|[Isoniazid, Tuber...|  3|
|[Emigration and I...|  3|
|[Abortion, Induce...|  3|
|[Physiology, Rese...|  3|
|[Demography, Urba...|  3|
|[Public Policy, T...|  3|
+--------------------+---+
only showing top 20 rows



In [32]:
val edges = cooccurs.map{case(Row(topics: Seq[_], cnt:Long)) =>
    val ids = topics.map(_.toString).map(hashId).sorted
    Edge(ids(0), ids(1), cnt)
}

edges = [srcId: bigint, dstId: bigint ... 1 more field]


[srcId: bigint, dstId: bigint ... 1 more field]

In [36]:
val vertexRDD = vertices.rdd.map{
    case(Row(hash: Long, topic:String)) => (hash, topic)
}

vertexRDD = MapPartitionsRDD[179] at map at <console>:61


MapPartitionsRDD[179] at map at <console>:61

In [37]:
val topicGraph = Graph(vertexRDD, edges.rdd)
topicGraph.cache()

topicGraph = org.apache.spark.graphx.impl.GraphImpl@292d1402


org.apache.spark.graphx.impl.GraphImpl@292d1402

In [38]:
vertexRDD.count()

2757

In [39]:
topicGraph.vertices.count()

1864

## Understanding the Structure of the Network

### Conected Components

In [40]:
val connectedComponentGraph = topicGraph.connectedComponents()

connectedComponentGraph = org.apache.spark.graphx.impl.GraphImpl@27857716


org.apache.spark.graphx.impl.GraphImpl@27857716

In [41]:
val componentDF = connectedComponentGraph.vertices.toDF("vid", "cid")
val componentCounts = componentDF.groupBy("cid").count()
componentCounts.count()

componentDF = [vid: bigint, cid: bigint]
componentCounts = [cid: bigint, count: bigint]


485

In [42]:
componentCounts.orderBy(F.desc("count")).show()

+--------------------+-----+
|                 cid|count|
+--------------------+-----+
|-9215470674759766104|  967|
|-7715393728639818274|   12|
|-7766162447367385796|   10|
|-6380589216187441282|   10|
|-8060929734674540133|    9|
|-8413990221343043350|    8|
|-7837548223451515988|    8|
|-5203417666596895117|    7|
|-6781964506890345472|    7|
|-5907308932485011342|    7|
|-3843194863954452561|    7|
|-3680134813602427771|    6|
|-7919268455626835567|    6|
|-6195495891807228194|    6|
|-6827738450567291831|    6|
|-9031507740888942675|    6|
|-4011363123023043990|    6|
|-8632193869969830941|    5|
|-2198264851255507998|    5|
|-5758547386235439761|    5|
+--------------------+-----+
only showing top 20 rows



In [43]:
val topicComponentDF = topicGraph.vertices.innerJoin(
      connectedComponentGraph.vertices) {
      (topicId, name, componentId) => (name, componentId.toLong)
}.values.toDF("topic", "cid")

topicComponentDF = [topic: string, cid: bigint]


[topic: string, cid: bigint]

In [44]:
topicComponentDF.show()

+--------------------+--------------------+
|               topic|                 cid|
+--------------------+--------------------+
|      Desert Climate|-9215470674759766104|
|          Phosphates|-8538331842254293523|
|  Fluorides, Topical|-4044118614182780436|
|         Archaeology|-9215470674759766104|
|Cardiovascular Ab...|-9215470674759766104|
|          Flavonoids| 4288808014668014282|
|             Protons|-7517960540023488744|
|   Social Adjustment|-9215470674759766104|
|    Disabled Persons|-8347274417141191070|
| Clinical Competence|-7919268455626835567|
|Water-Electrolyte...|-9215470674759766104|
|        Ear Diseases|-9215470674759766104|
|            Cicatrix|-6195495891807228194|
|      Bone and Bones|-9215470674759766104|
|    Coronary Disease|-1709118461927036481|
|Cerebrovascular D...|-9215470674759766104|
|      Blast Injuries|-9215470674759766104|
|Syphilis Serodiag...|-2518328612260514536|
|            Violence|-5539826757456174267|
|      Adenocarcinoma|  -8265595

In [45]:
topicComponentDF.where(F.col("cid")==="-9215470674759766104").show()

+--------------------+--------------------+
|               topic|                 cid|
+--------------------+--------------------+
|      Desert Climate|-9215470674759766104|
|         Archaeology|-9215470674759766104|
|Cardiovascular Ab...|-9215470674759766104|
|   Social Adjustment|-9215470674759766104|
|Water-Electrolyte...|-9215470674759766104|
|        Ear Diseases|-9215470674759766104|
|      Bone and Bones|-9215470674759766104|
|Cerebrovascular D...|-9215470674759766104|
|      Blast Injuries|-9215470674759766104|
|               Aging|-9215470674759766104|
|       Prescriptions|-9215470674759766104|
|Population Charac...|-9215470674759766104|
|Cell Physiologica...|-9215470674759766104|
|               Cebus|-9215470674759766104|
|                Soil|-9215470674759766104|
|         Emergencies|-9215470674759766104|
|                Beds|-9215470674759766104|
|  Psychopharmacology|-9215470674759766104|
|Cross-Cultural Co...|-9215470674759766104|
|          Urethritis|-921547067

In [52]:
topicDist.registerTempTable("topic_dist")

lastException: Throwable = null


In [53]:
val campy = spark.sql("SELECT * FROM topic_dist WHERE topic LIKE '%ampylobacter%'")
campy.show()

+-----+---+
|topic|cnt|
+-----+---+
+-----+---+



campy = [topic: string, cnt: bigint]


[topic: string, cnt: bigint]

## Degree Distribution

In [54]:
val degrees = topicGraph.degrees.cache()

degrees = VertexRDDImpl[480] at RDD at VertexRDD.scala:57


VertexRDDImpl[480] at RDD at VertexRDD.scala:57

In [56]:
degrees.take(1)

[(7510548076323648462,4)]

In [57]:
degrees.map(_._2).stats()

(count: 1598, mean: 4.425532, stdev: 5.031853, max: 57.000000, min: 1.000000)

In [60]:
val sign = medline.filter(x => x.size == 1)
sign.count()

sign = [value: array<string>]


476

In [61]:
val singTopic = sign.flatMap(topic => topic).distinct()
singTopic.count()

singTopic = [value: string]


434

In [62]:
val topic2 = topicsPair.flatMap(_.getAs[Seq[String]](0))
singTopic.except(topic2).count()

topic2 = [value: string]


266

In [71]:
val namesAndDegrees = degrees.innerJoin(topicGraph.vertices){
    (topicId, degree, name) => (name, degree.toInt)
}.values.toDF("topic", "degree")

namesAndDegrees = [topic: string, degree: int]


[topic: string, degree: int]

In [72]:
namesAndDegrees.sort(desc("degree")).show()

+--------------------+------+
|               topic|degree|
+--------------------+------+
|            Research|    57|
|             Disease|    45|
|       Ethnic Groups|    44|
|       Public Policy|    39|
|Social Control, F...|    39|
|Socioeconomic Fac...|    38|
|Government Regula...|    37|
|           Economics|    36|
|Population Charac...|    34|
| Population Dynamics|    34|
|   Abortion, Induced|    34|
|          Physiology|    32|
|       Jurisprudence|    30|
|        Tuberculosis|    29|
|          Physicians|    28|
| Wounds and Injuries|    27|
|               Blood|    27|
| Models, Theoretical|    27|
|       Communication|    26|
|Emigration and Im...|    25|
+--------------------+------+
only showing top 20 rows



## Filtering Out Noisy Edges

In [75]:
def chiSq(YY: Long, YB: Long, YA: Long, T: Long): Double = {
    val NB = T - YB
    val NA = T - YA
    val YN = YA - YY
    val NY = YB - YY
    val NN = T - NY - YN - YY
    val inner = math.abs(YY * NN - YN * NY) - T / 2.0
    T * math.pow(inner, 2) / (YA * NA * YB * NB)
}

chiSq: (YY: Long, YB: Long, YA: Long, T: Long)Double


In [76]:
val T = medline.count()
val topicDistRdd = topicDist.map { case Row(topic: String, cnt: Long) => (hashId(topic), cnt) }.rdd
val topicDistGraph = Graph(topicDistRdd, topicGraph.edges)
val chiSquaredGraph = topicDistGraph.mapTriplets(triplet =>
    chiSq(triplet.attr, triplet.srcAttr, triplet.dstAttr, T)
)
chiSquaredGraph.edges.map(x => x.attr).stats()

T = 2376
topicDistRdd = MapPartitionsRDD[579] at rdd at <console>:78
topicDistGraph = org.apache.spark.graphx.impl.GraphImpl@37fb48ee
chiSquaredGraph = org.apache.spark.graphx.impl.GraphImpl@403831ae


(count: 3536, mean: 265.646876, stdev: 216.392021, max: 1790.919255, min: 4.039522)

In [77]:
val interesting = chiSquaredGraph.subgraph(triplet => triplet.attr > 19.5)

interesting = org.apache.spark.graphx.impl.GraphImpl@6c38fd4c


org.apache.spark.graphx.impl.GraphImpl@6c38fd4c

In [78]:
interesting.edges.count

3419

## Analyzing the Filtered Graph

In [79]:
val interestingComponentGraph = interesting.connectedComponents()
val icDF = interestingComponentGraph.vertices.toDF("vid", "cid")
val icCountDF = icDF.groupBy("cid").count()
icCountDF.count()
icCountDF.orderBy(desc("count")).show()

+--------------------+-----+
|                 cid|count|
+--------------------+-----+
|-9215470674759766104|  967|
|-7715393728639818274|   12|
|-7766162447367385796|   10|
|-6380589216187441282|   10|
|-8060929734674540133|    9|
|-7837548223451515988|    8|
|-8413990221343043350|    8|
|-3843194863954452561|    7|
|-6781964506890345472|    7|
|-5203417666596895117|    7|
|-5907308932485011342|    7|
|-9031507740888942675|    6|
|-4011363123023043990|    6|
|-3680134813602427771|    6|
|-7919268455626835567|    6|
|-6827738450567291831|    6|
|-6195495891807228194|    6|
|-9181449310634160080|    5|
|-8632193869969830941|    5|
|-8924989764442168089|    5|
+--------------------+-----+
only showing top 20 rows



interestingComponentGraph = org.apache.spark.graphx.impl.GraphImpl@549d1940
icDF = [vid: bigint, cid: bigint]
icCountDF = [cid: bigint, count: bigint]


[cid: bigint, count: bigint]

In [80]:
val interestingDegrees = interesting.degrees.cache()
interestingDegrees.map(_._2).stats()
interestingDegrees.innerJoin(topicGraph.vertices) {
    (topicId, degree, name) => (name, degree)
}.toDF("topic", "degree").orderBy(desc("degree")).show()

+--------------------+--------------------+
|               topic|              degree|
+--------------------+--------------------+
|-1087266232809183895|[p-Methoxy-N-meth...|
| 6603518327358446980|[gamma-Globulins, 4]|
| 8580170059617337124|[beta-Thalassemia...|
| 8614349794773596331|  [Zingiberaceae, 1]|
| 3082545485912749937|       [Zea mays, 6]|
|-3676948314256878067|[Wounds, Nonpenet...|
|-6855486773038346001|[Wounds and Injur...|
|-3517691285840111112|  [Wound Healing, 5]|
|-5624674739178296995|[World Health Org...|
| 1929404773684187663|           [Work, 4]|
| 3395907313796951971|[Women, Working, 13]|
|-8429595694780809751|[Women's Rights, 12]|
| 5994786774935028247| [Women's Health, 6]|
|-3904843440664529999|          [Women, 8]|
| 6062097931118342599|[Withholding Trea...|
| 8535524441063792297|    [Wheelchairs, 1]|
| 4073205060170489512| [Weightlessness, 2]|
|-8609663971641751753|       [Weaning, 11]|
| 3473622954274011948|[Water-Electrolyt...|
| 1896331054814819218|[Water Pur

interestingDegrees = VertexRDDImpl[832] at RDD at VertexRDD.scala:57


VertexRDDImpl[832] at RDD at VertexRDD.scala:57

## Cliques and Clustering Coefficients

In [81]:
def avgClusteringCoef(graph: Graph[_, _]): Double = {
    val triCountGraph = graph.triangleCount()
    val maxTrisGraph = graph.degrees.mapValues(d => d * (d - 1) / 2.0)
    val clusterCoefGraph = triCountGraph.vertices.innerJoin(maxTrisGraph) {
      (vertexId, triCount, maxTris) => if (maxTris == 0) 0 else triCount / maxTris
    }
    clusterCoefGraph.map(_._2).sum() / graph.vertices.count()
}

avgClusteringCoef: (graph: org.apache.spark.graphx.Graph[_, _])Double


In [82]:
val avgCC = avgClusteringCoef(interesting)

avgCC = 0.5216933686879061


0.5216933686879061

## Computing Average Path Length with Pregel

In [84]:
def mergeMaps(m1: Map[VertexId, Int], m2: Map[VertexId, Int]): Map[VertexId, Int] = {
    def minThatExists(k: VertexId): Int = {
      math.min(
        m1.getOrElse(k, Int.MaxValue),
        m2.getOrElse(k, Int.MaxValue))
    }

    (m1.keySet ++ m2.keySet).map(k => (k, minThatExists(k))).toMap
  }

mergeMaps: (m1: Map[org.apache.spark.graphx.VertexId,Int], m2: Map[org.apache.spark.graphx.VertexId,Int])Map[org.apache.spark.graphx.VertexId,Int]


In [85]:
  def update(id: VertexId, state: Map[VertexId, Int], msg: Map[VertexId, Int])
    : Map[VertexId, Int] = {
    mergeMaps(state, msg)
  }

update: (id: org.apache.spark.graphx.VertexId, state: Map[org.apache.spark.graphx.VertexId,Int], msg: Map[org.apache.spark.graphx.VertexId,Int])Map[org.apache.spark.graphx.VertexId,Int]


In [86]:
  def checkIncrement(a: Map[VertexId, Int], b: Map[VertexId, Int], bid: VertexId)
    : Iterator[(VertexId, Map[VertexId, Int])] = {
    val aplus = a.map { case (v, d) => v -> (d + 1) }
    if (b != mergeMaps(aplus, b)) {
      Iterator((bid, aplus))
    } else {
      Iterator.empty
    }
  }

checkIncrement: (a: Map[org.apache.spark.graphx.VertexId,Int], b: Map[org.apache.spark.graphx.VertexId,Int], bid: org.apache.spark.graphx.VertexId)Iterator[(org.apache.spark.graphx.VertexId, Map[org.apache.spark.graphx.VertexId,Int])]


In [87]:
  def iterate(e: EdgeTriplet[Map[VertexId, Int], _]): Iterator[(VertexId, Map[VertexId, Int])] = {
    checkIncrement(e.srcAttr, e.dstAttr, e.dstId) ++
    checkIncrement(e.dstAttr, e.srcAttr, e.srcId)
  }

iterate: (e: org.apache.spark.graphx.EdgeTriplet[Map[org.apache.spark.graphx.VertexId,Int], _])Iterator[(org.apache.spark.graphx.VertexId, Map[org.apache.spark.graphx.VertexId,Int])]


In [88]:
def samplePathLengths[V, E](graph: Graph[V, E], fraction: Double = 0.02)
    : RDD[(VertexId, VertexId, Int)] = {
    val replacement = false
    val sample = graph.vertices.map(v => v._1).sample(
      replacement, fraction, 1729L)
    val ids = sample.collect().toSet

    val mapGraph = graph.mapVertices((id, v) => {
      if (ids.contains(id)) {
        Map(id -> 0)
      } else {
        Map[VertexId, Int]()
      }
    })

    val start = Map[VertexId, Int]()
    val res = mapGraph.ops.pregel(start)(update, iterate, mergeMaps)
    res.vertices.flatMap { case (id, m) =>
      m.map { case (k, v) =>
        if (id < k) {
          (id, k, v)
        } else {
          (k, id, v)
        }
      }
    }.distinct().cache()
  }

samplePathLengths: [V, E](graph: org.apache.spark.graphx.Graph[V,E], fraction: Double)org.apache.spark.rdd.RDD[(org.apache.spark.graphx.VertexId, org.apache.spark.graphx.VertexId, Int)]


In [89]:
val paths = samplePathLengths(interesting)
paths.map(_._3).filter(_ > 0).stats()

paths = MapPartitionsRDD[1161] at distinct at <console>:76


(count: 14416, mean: 6.092952, stdev: 1.972259, max: 14.000000, min: 1.000000)

In [90]:
val hist = paths.map(_._3).countByValue()
hist.toSeq.sorted.foreach(println)

(0,30)
(1,90)
(2,353)
(3,894)
(4,1669)
(5,2509)
(6,3040)
(7,2496)
(8,1776)
(9,970)
(10,394)
(11,162)
(12,55)
(13,7)
(14,1)


hist = Map(0 -> 30, 5 -> 2509, 10 -> 394, 14 -> 1, 1 -> 90, 6 -> 3040, 9 -> 970, 13 -> 7, 2 -> 353, 12 -> 55, 7 -> 2496, 3 -> 894, 11 -> 162, 8 -> 1776, 4 -> 1669)


Map(0 -> 30, 5 -> 2509, 10 -> 394, 14 -> 1, 1 -> 90, 6 -> 3040, 9 -> 970, 13 -> 7, 2 -> 353, 12 -> 55, 7 -> 2496, 3 -> 894, 11 -> 162, 8 -> 1776, 4 -> 1669)