## Test of the hierarchical merge in the FOF algorithm

#### the idea is this: 

* after the local FOF stage, each partition reports the particles it holds in the overlap region
* do a `reduceByKey` or `treeAggregate` of some sort to collect the groups that the same particles belong to
* produce a mapping $G \rightarrow G_1$ and distribute it to all hosts in the form of a broadcast lookup table

In [1]:
import numpy as np
import sys
# Raise the interpreter's recursion limit to ten times its current value.
# NOTE(review): presumably needed by recursive code used later on — confirm.
sys.setrecursionlimit(sys.getrecursionlimit()*10)

import findspark
# Presumably makes the local Spark installation importable so that the
# `pyspark` import further down works — TODO confirm against findspark docs.
findspark.init()

import matplotlib.pylab as plt
# Render figures inline in the notebook.
%matplotlib inline
import matplotlib.patches as patches
# Use matplotlib's 'bmh' style sheet for the plots.
plt.style.use('bmh')



In [2]:
# Load the line_profiler IPython extension.
%load_ext line_profiler
import line_profiler

# NOTE(review): newer Cython releases appear to expose these defaults through
# `Options.get_directive_defaults()` instead — confirm this import works with
# the installed Cython version.
from Cython.Compiler.Options import directive_defaults

# Turn on the `linetrace` and `binding` Cython compiler directives by default;
# presumably so that Cython code can be line-profiled — TODO confirm.
directive_defaults['linetrace'] = True
directive_defaults['binding'] = True

In [3]:
import spark_fof
import spark_fof_c
from fof import fof
%load_ext Cython

In [4]:
import spark_fof_c

In [5]:
def plot_rectangle(rec, ax=None):
    """Outline a rectangle, or every rectangle in a nested list/tuple.

    Parameters
    ----------
    rec : object or list/tuple of such objects
        A single rectangle exposes ``mins`` and ``maxes`` attributes; only the
        first two components of ``maxes - mins`` are used as width and height.
        Lists and tuples are walked recursively, in order.
    ax : matplotlib axes, optional
        Axes to draw on.  When omitted, ``plt.subplot(aspect='equal')`` is
        used.

    Returns
    -------
    None
    """
    axes = plt.subplot(aspect='equal') if ax is None else ax

    # Leaf case: a single rectangle-like object.
    if not isinstance(rec, (list, tuple)):
        extent = rec.maxes - rec.mins
        outline = patches.Rectangle(rec.mins, extent[0], extent[1],
                                    fill=False, zorder=-1)
        axes.add_patch(outline)
        return

    # Container case: draw every member on the same axes.
    for member in rec:
        plot_rectangle(member, axes)

## Start Spark

In [6]:
import os

# Point Spark at the local ./conf directory and request 4G for the driver
# through environment variables.
os.environ.update({
    'SPARK_CONF_DIR': './conf',
    'SPARK_DRIVER_MEMORY': '4G',
})

In [7]:
import pyspark
from pyspark import SparkContext, SparkConf
import pynbody

In [8]:
# Build the Spark configuration used to create the SparkContext.
conf = SparkConf()

# Enable Spark's Python profiling option and set the executor/driver memory.
# NOTE(review): SPARK_DRIVER_MEMORY is also set to '4G' in the environment
# earlier in this notebook — confirm which of the two settings takes effect.
conf.set('spark.python.profile', 'true')
conf.set('spark.executor.memory', '3G')
conf.set('spark.driver.memory', '4G')

<pyspark.conf.SparkConf at 0x1179784d0>

In [9]:
# Create the SparkContext with the 'local[4]' master and the configuration
# built above.
sc = SparkContext(master='local[4]', conf=conf)

In [10]:
# Register the FOF sources and compiled extensions with the SparkContext,
# one file at a time, in the order listed.
for shipped_file in ('spark_fof.py',
                     'spark_fof_c.pyx',
                     'spark_fof_c.c',
                     'spark_fof_c.so',
                     'fof.so'):
    sc.addPyFile(shipped_file)

In [18]:
# Run parameters handed to spark_fof.FOFAnalyzer below.
# NOTE(review): N presumably controls the domain decomposition — confirm
# against FOFAnalyzer.
N = 2
# Also passed to fof.run later on; presumably the FOF linking length — confirm.
tau = 7.8125e-4
# Lower and upper corner passed to FOFAnalyzer; presumably the bounds of the
# simulation volume — TODO confirm.
mins = np.array([-.5,-.5,-.5])
maxs= np.array([.5,.5,.5])
# Presumably the minimum number of particles for a group to be kept — confirm.
nMinMembers = 8
# Construct the analyzer on the headerless snapshot file.
fof_analyzer = spark_fof.FOFAnalyzer(sc, '/Users/rok/polybox/euclid256.nat_no_header', nMinMembers, N, tau, mins, maxs)

In [None]:
# Time the evaluation of `fof_analyzer.groups` and show how many groups it holds.
%time len(fof_analyzer.groups)

### Check that the results make sense

In [None]:
# Collect the contents of `particle_rdd` and join the pieces into one array.
# NOTE(review): presumably each collected element is a per-partition particle
# array — confirm against spark_fof.
ps = np.concatenate(fof_analyzer.particle_rdd.collect())

In [None]:
# The collected particle array must have as many entries as the snapshot that
# pynbody loads from disk.
expected_count = len(pynbody.load('/Users/rok/polybox/euclid256.nat'))
assert len(ps) == expected_count

# Run the reference FOF implementation on the collected particles.
n_groups = fof.run(ps, tau, nMinMembers)

In [None]:
# Report the group count returned by the reference `fof.run` call next to the
# number of groups found by the Spark implementation.  A single parenthesised
# argument prints the same text with Python 2's print statement and with
# Python 3's print() function.
print('number of groups to %d particle = %d' % (nMinMembers, n_groups))
print('number of groups via spark-fof = %d' % (len(fof_analyzer.groups)))