### PySpark Friends of Friends

We are going to reproduce the map/reduce Hadoop example in Spark.
First things first, let's run it as a map/reduce.  The `line_to_triples` function
implements the pattern that we used for MR.
> Node A with neighbors B and C proposes candidate triples to its neighbors
    * B A C to node B (A<C, else B C A)
    * C A B to node C (A<B else C B A)
Each triple will get up to two proposals from its neighbors, and we reduce them. If there are two matching proposals, we have a triple.

In [None]:
import numpy as np

def line_to_triples(line: str):
    """Expand one adjacency line into candidate friend-of-friend triples.

    ``line`` holds whitespace-separated integer ids: the first id is the
    source node and the remaining ids are its neighbors.  For every pair of
    neighbors ``(fi, fj)`` (in input order) two triples are emitted, one
    addressed to each neighbor::

        [fj, lo, hi]   # lo/hi = source and fi in ascending order
        [fi, lo, hi]   # lo/hi = source and fj in ascending order

    Returns:
        A list of 3-element lists of plain Python ``int``.  Plain ints (not
        NumPy scalars) are used so that ``str(triple)`` is always of the form
        ``"[8, 1, 5]"`` regardless of the installed NumPy version.  A line
        with fewer than two neighbors (including an empty line) yields ``[]``.

    Raises:
        ValueError: if a token cannot be parsed as an integer.
    """
    fids = [int(token) for token in line.split()]
    if len(fids) < 3:
        # No neighbor pair exists, so there is nothing to propose.
        return []

    source, friends = fids[0], fids[1:]
    ret = []
    for pos, fi in enumerate(friends):
        for fj in friends[pos + 1:]:
            # Proposal sent to fj: the other two members, smaller id first.
            ret.append([fj, min(source, fi), max(source, fi)])
            # Proposal sent to fi: the other two members, smaller id first.
            ret.append([fi, min(source, fj), max(source, fj)])
    return ret

In [None]:
Simple test to show what it does.

In [144]:
# Demo: source node 1 with neighbors 5, 8, 7 and 9; the returned list of
# triples is displayed as the cell output.
line_to_triples ("1 5 8 7 9")

[[8, 1, 5],
 [5, 1, 8],
 [7, 1, 5],
 [5, 1, 7],
 [9, 1, 5],
 [5, 1, 9],
 [7, 1, 8],
 [8, 1, 7],
 [9, 1, 8],
 [8, 1, 9],
 [9, 1, 7],
 [7, 1, 9]]

#### Map/Reduce style implementations

Now, using the map/reduce equivalent, use `line_to_triples` to write a program on simple input. You will have to use the wordcount style `<triple>, 1` to get a simpler reducer to work.

In [None]:
from pyspark import SparkContext

inputdir = "../data/simple.input"
outdir = "/tmp/output1"

# Word-count style job: emit (<triple>, 1) for every proposed triple, sum the
# counts per triple, and keep the triples that were proposed more than once.
sc = SparkContext("local", "App Name")
try:
    rdd = sc.textFile(f"{inputdir}/*")
    rdd = rdd.flatMap(line_to_triples)
    # The triple (a list) is stringified so it can act as the reduce key.
    rdd = rdd.map(lambda x: (str(x), 1))
    rdd = rdd.reduceByKey(lambda x, y: x + y)
    rdd = rdd.filter(lambda x: x[1] > 1)
    rdd = rdd.map(lambda x: x[0])
    rdd.saveAsTextFile(outdir)
finally:
    # Stop the context even when a stage fails (for example when `outdir`
    # already exists); otherwise the next SparkContext(...) call in this
    # kernel would find a context still running.
    sc.stop()

In [None]:
# Stop the SparkContext bound to `sc`. This repeats the stop at the end of the
# preceding job cell; presumably kept as a manual cleanup for when that cell
# aborts before reaching its own sc.stop() -- confirm.
sc.stop()  

In [None]:
### Helper functions for join/partitions

In [None]:
def add_index(idx, part):
    """Tag every record of a partition with that partition's index.

    Written for ``mapPartitionsWithIndex``: ``idx`` is the partition index
    and ``part`` is an iterable over the partition's records.  Lazily yields
    ``(str(record), str(idx))`` pairs, so the stringified record can serve
    as a join key and the stringified index as its value.
    """
    yield from ((str(record), str(idx)) for record in part)

In [None]:
def filter_diff_idx(x):
    """Return the key of a joined record whose two index values differ.

    ``x`` is shaped like ``(key, (left_idx, right_idx))``.  The key ``x[0]``
    is returned when the two indices are unequal; otherwise the result is
    ``None``.
    """
    return x[0] if x[1][0] != x[1][1] else None

In [None]:
# Demo: the indices '0' and '1' differ, so this evaluates to the key
# '[200, 100, 300]'.
filter_diff_idx(('[200, 100, 300]', ('0', '1')))

In [None]:

# Demo: print the (str(record), str(index)) pairs that add_index yields for
# partition index 4 and the records 7, 2, 4.
for t in add_index( 4, [7, 2, 4] ):
    print(t)

In [None]:
#### Join based implementation

In [None]:
from pyspark import SparkContext

inputdir = "../data/simple.input"
outdir = "/tmp/output2"

# Join-based job: tag each proposed triple with the index of the partition it
# came from, self-join on the triple, and keep the triples whose two joined
# partition indices differ.
sc = SparkContext("local", "App Name")
try:
    rdd = sc.textFile(f"{inputdir}/*")
    rdd = rdd.flatMap(line_to_triples)
    rdd = rdd.mapPartitionsWithIndex(add_index)
    # NOTE(review): a self-join pairs every value with every value of the same
    # key, so a triple proposed from partitions a and b passes the filter as
    # both (a, b) and (b, a) and appears to be written twice, while two
    # proposals from the same partition share an index and are dropped --
    # confirm this is the intended behaviour.
    rdd = rdd.join(rdd)
    # filter_diff_idx returns None for records to discard.
    rdd = rdd.map(filter_diff_idx).filter(lambda x: x is not None)
    rdd.saveAsTextFile(outdir)
finally:
    # Stop the context even when a stage fails so the kernel is left without
    # a running SparkContext.
    sc.stop()

In [None]:
##### Try to do map partitions again

In [None]:
def filter_part(part):
    """Yield the keys of joined records whose two index values differ.

    ``part`` is an iterable of records shaped like
    ``(key, (left_idx, right_idx))`` -- i.e. a whole partition, as handed to
    a ``mapPartitions`` callback.  Records with equal indices are skipped.
    """
    yield from (rec[0] for rec in part if rec[1][0] != rec[1][1])

In [151]:
### Timings

In [None]:
##### Medium fast version?

In [146]:
%%timeit -n1 -r1
from pyspark import SparkContext

inputdir = "../data/fof.input"
outdir = "/tmp/outputfof"

from pyspark import SparkContext
sc = SparkContext("local", "App Name",)
rdd = sc.textFile(f"{inputdir}/*")
rdd = rdd.flatMap(line_to_triples)
rdd = rdd.mapPartitionsWithIndex(add_index)
rdd = rdd.join(rdd)
rdd = rdd.map(filter_diff_idx).filter(lambda x: x!= None)
rdd.saveAsTextFile(outdir)

sc.stop()

                                                                                

8min 53s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
##### Slow version?

In [147]:
%%timeit -n1 -r1
from pyspark import SparkContext

inputdir = "../data/fof.input"
outdir = "/tmp/outputfof2"

from pyspark import SparkContext
sc = SparkContext("local", "App Name",)
rdd = sc.textFile(f"{inputdir}/*")
rdd = rdd.flatMap(line_to_triples)
rdd = rdd.map(lambda x: (str(x), 1))
rdd = rdd.reduceByKey(lambda x, y: x + y)
rdd = rdd.filter(lambda x: x[1] > 1)
rdd = rdd.map(lambda x: x[0])
rdd.saveAsTextFile(outdir)

sc.stop()                

                                                                                

3min 31s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
#### Faster version

In [150]:
%%timeit -n1 -r1
from pyspark import SparkContext

inputdir = "../data/fof.input"
outdir = "/tmp/outputfof4"

from pyspark import SparkContext
sc = SparkContext("local", "App Name",)
rdd = sc.textFile(f"{inputdir}/*")
rdd = rdd.flatMap(line_to_triples)
rdd = rdd.mapPartitionsWithIndex(add_index)
rdd = rdd.join(rdd)
rdd = rdd.map(filter_part)
rdd.saveAsTextFile(outdir)

sc.stop()

                                                                                

8min 56s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
