In [1]:
from graphframes import *
from pyspark import SparkContext

In [2]:
from pyspark.sql import SparkSession
# Obtain the session used for all reads below; getOrCreate() reuses an
# already-running session when there is one instead of building a second.
spark = SparkSession.builder.getOrCreate()

In [3]:
# Vertex and edge tables are read from CSV files whose first row is a header.
# NOTE(review): GraphFrame presumably expects an "id" column in the vertex
# file and "src"/"dst" columns in the edge file — confirm against the data.
v = spark.read.option("header", True).csv("data/social-nodes.csv")
e = spark.read.option("header", True).csv("data/social-relationships.csv")

# Graph built from the two tables; every later cell works from `g`.
g = GraphFrame(v, e)

In [4]:
# Per-vertex degree tables exposed by the GraphFrame.
total_degree = g.degrees
in_degree = g.inDegrees
out_degree = g.outDegrees

# Left-join the in/out tables onto the total-degree table so that a vertex
# missing from one of them still gets a row; the resulting nulls are turned
# into 0 by fillna(0).
degree_summary = (
    total_degree
    .join(in_degree, "id", how="left")
    .join(out_degree, "id", how="left")
    .fillna(0)
)

# Show the vertices with the most incoming edges first.
degree_summary.sort("inDegree", ascending=False).show()

+-------+------+--------+---------+
|     id|degree|inDegree|outDegree|
+-------+------+--------+---------+
|   Doug|     6|       5|        1|
|  Alice|     7|       3|        4|
|Michael|     5|       2|        3|
|Bridget|     5|       2|        3|
|Charles|     2|       1|        1|
|    Amy|     1|       1|        0|
|   Mark|     3|       1|        2|
|  David|     2|       1|        1|
|  James|     1|       0|        1|
+-------+------+--------+---------+



In [5]:
from graphframes.lib import AggregateMessages as AM 
from pyspark.sql import functions as F
from pyspark.sql.types import *
from operator import itemgetter

In [6]:
def collect_paths(paths):
    """Return the distinct entries of ``paths`` in first-seen order.

    This function is registered as a Python UDF, so it receives plain Python
    values (a list), not Spark columns.  Calling the Spark aggregate
    ``F.collect_set`` here cannot work; the de-duplication is therefore done
    in pure Python.

    Args:
        paths: list of hashable values (strings, per the UDF's declared
            ``ArrayType(StringType())`` return type), or None.

    Returns:
        A list with duplicates removed; ``[]`` for empty or None input.
    """
    if not paths:
        return []
    # dict.fromkeys keeps the first occurrence of each value and preserves
    # insertion order, giving a deterministic "set" as a list.
    return list(dict.fromkeys(paths))


# Spark UDF wrapper around collect_paths, declared to return an array of strings.
collect_paths_udf = F.udf(collect_paths, ArrayType(StringType()))

# Return type shared by the path-handling UDFs: an array of structs, each
# holding a string "id" and an integer "distance".
paths_type = ArrayType(
    StructType([
        StructField("id", StringType()),
        StructField("distance", IntegerType()),
    ])
)


def flatten(ids):
    """Merge several lists of (id, distance) pairs into one list.

    When the same id appears more than once, the smallest distance is kept,
    so the result holds the shortest known distance to each id regardless of
    the order in which the incoming lists arrive.

    Args:
        ids: iterable of iterables of 2-item (id, distance) sequences;
            None or empty is treated as "no pairs".

    Returns:
        A list of (id, distance) tuples, one per distinct id, sorted by id.
    """
    shortest = {}
    for sublist in ids or []:
        for node_id, distance in sublist or []:
            known = shortest.get(node_id)
            # Keep only the best (smallest) distance seen for this id.
            if known is None or distance < known:
                shortest[node_id] = distance
    return sorted(shortest.items(), key=itemgetter(0))


# Expose flatten to Spark as a UDF whose result is declared as paths_type.
flatten_udf = F.udf(flatten, paths_type)


def new_paths(paths, id):
    """Build the path list a vertex sends to a neighbour.

    Every (id, distance) pair in ``paths`` is passed on with its distance
    increased by one, except a pair whose id equals ``id``.  An entry for
    ``id`` itself at distance 1 is then added at the end.

    Args:
        paths: iterable of 2-item (id, distance) sequences.
        id: identifier of the vertex the paths belong to.

    Returns:
        A list of ``{"id": ..., "distance": ...}`` dicts.
    """
    extended = []
    for node_id, distance in paths:
        if node_id == id:
            # The sender's own entry is re-added below with distance 1.
            continue
        extended.append({"id": node_id, "distance": distance + 1})
    extended.append({"id": id, "distance": 1})
    return extended


# Expose new_paths to Spark as a UDF whose result is declared as paths_type.
new_paths_udf = F.udf(new_paths, paths_type)


def merge_paths(ids, new_ids, id):
    """Combine a vertex's known paths with newly received ones.

    Pairs whose id equals ``id`` are discarded.  For every remaining id the
    smallest distance among ``ids`` and ``new_ids`` is kept.

    Args:
        ids: list of 2-item (id, distance) sequences already known.
        new_ids: list of newly received (id, distance) sequences; may be
            None or empty.
        id: identifier of the vertex itself.

    Returns:
        A list of ``{"id": ..., "distance": ...}`` dicts, one per distinct id.
    """
    incoming = new_ids or []
    candidates = [
        (node_id, distance)
        for node_id, distance in ids + incoming
        if node_id != id
    ]
    # Largest distance first: later (smaller) entries overwrite earlier ones
    # when the dict is filled, leaving the shortest distance per id.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    shortest = {}
    for node_id, distance in candidates:
        shortest[node_id] = distance
    return [
        {"id": node_id, "distance": distance}
        for node_id, distance in shortest.items()
    ]


# Expose merge_paths to Spark as a UDF whose result is declared as paths_type.
merge_paths_udf = F.udf(merge_paths, paths_type)


def calculate_closeness(ids):
    """Compute a closeness score from a list of (id, distance) pairs.

    The score is the number of pairs divided by the sum of their distances.

    Args:
        ids: list of 2-item (id, distance) sequences; may be empty or None.

    Returns:
        A float.  ``0.0`` when there are no pairs or the distances sum to
        zero.  The result is always a float because this function backs a
        UDF declared as DoubleType, and PySpark can yield null when a Python
        int is returned for a double column.
    """
    if not ids:
        return 0.0
    total_distance = sum(distance for _, distance in ids)
    if total_distance == 0:
        return 0.0
    return len(ids) * 1.0 / total_distance


# Expose calculate_closeness to Spark as a UDF declared to return a double.
closeness_udf = F.udf(calculate_closeness, DoubleType())

In [7]:
# Start every vertex with an empty "ids" array; the propagation loop further
# down replaces it with that vertex's (id, distance) entries.
vertices = g.vertices.withColumn("ids", F.array())
# NOTE(review): getCachedDataFrame is a GraphFrames helper; presumably it
# returns a cached copy suited to iterative use — confirm in the GraphFrames docs.
cached_vertices = AM.getCachedDataFrame(vertices)
# Working graph: the augmented vertices with the original, unchanged edges.
g2 = GraphFrame(cached_vertices, g.edges)

In [8]:
# Sanity check: bring the vertex rows to the driver to confirm that every
# vertex starts with an empty "ids" list.
g2.vertices.collect()

[Row(id='Alice', ids=[]),
 Row(id='Bridget', ids=[]),
 Row(id='Charles', ids=[]),
 Row(id='Doug', ids=[]),
 Row(id='Mark', ids=[]),
 Row(id='Michael', ids=[]),
 Row(id='David', ids=[]),
 Row(id='Amy', ids=[]),
 Row(id='James', ids=[])]

In [9]:
# The message expressions do not change between passes, so build them once.
# Each endpoint of an edge sends its own path list (extended by one hop via
# new_paths_udf) to the endpoint at the other side of the edge.
msg_dst = new_paths_udf(AM.src["ids"], AM.src["id"])
msg_src = new_paths_udf(AM.dst["ids"], AM.dst["id"])

# One pass per vertex in the graph.  Note that g2 is rebound on every pass,
# so re-running this cell continues from the previous result rather than
# starting from the empty "ids" lists.
for i in range(g2.vertices.count()):
    # Gather, per vertex, the set of path lists received from its neighbours.
    agg = g2.aggregateMessages(
        F.collect_set(AM.msg).alias("agg"),
        sendToSrc=msg_src,
        sendToDst=msg_dst,
    )
    # Collapse the received lists into a single "newIds" list per vertex.
    res = agg.withColumn("newIds", flatten_udf("agg")).drop("agg")

    # Left-outer join keeps vertices that received nothing on this pass;
    # merge_paths_udf then folds "newIds" into the existing "ids".
    joined = g2.vertices.join(res, on="id", how="left_outer")
    new_vertices = (
        joined
        .withColumn("mergedIds", merge_paths_udf("ids", "newIds", "id"))
        .drop("ids", "newIds")
        .withColumnRenamed("mergedIds", "ids")
    )

    cached_new_vertices = AM.getCachedDataFrame(new_vertices)
    g2 = GraphFrame(cached_new_vertices, g2.edges)

# Score every vertex from its final path list and show the highest first.
closeness_table = g2.vertices.withColumn("closeness", closeness_udf("ids"))
closeness_table.sort("closeness", ascending=False).show(truncate=False)

+-------+-----------------------------------------------------------------+------------------+
|id     |ids                                                              |closeness         |
+-------+-----------------------------------------------------------------+------------------+
|Alice  |[[Charles, 1], [Mark, 1], [Bridget, 1], [Doug, 1], [Michael, 1]] |1.0               |
|Doug   |[[Charles, 1], [Mark, 1], [Alice, 1], [Bridget, 1], [Michael, 1]]|1.0               |
|David  |[[James, 1], [Amy, 1]]                                           |1.0               |
|Bridget|[[Charles, 2], [Mark, 2], [Alice, 1], [Doug, 1], [Michael, 1]]   |0.7142857142857143|
|Michael|[[Charles, 2], [Mark, 2], [Alice, 1], [Doug, 1], [Bridget, 1]]   |0.7142857142857143|
|James  |[[Amy, 2], [David, 1]]                                           |0.6666666666666666|
|Amy    |[[James, 2], [David, 1]]                                         |0.6666666666666666|
|Mark   |[[Bridget, 2], [Charles, 2], [Michael, 2]