## test hierarchical merge in FOF algorithm

#### the idea is this: 

* after the local FOF stage, each partition reports the particles it holds in the overlap region
* do a reduceByKey or treeAggregate of some sort to collect the groups that contain the same particles
* produce a mapping $G \rightarrow G_1$ and distribute it to all hosts in the form of a broadcast lookup table

In [1]:
import numpy as np
import sys
# Raise the interpreter's recursion limit to ten times its current value.
sys.setrecursionlimit(sys.getrecursionlimit()*10)

import findspark
# Let findspark locate the Spark installation so `pyspark` can be imported later in the notebook.
findspark.init()

# import matplotlib.pylab as plt
# %matplotlib inline
# import matplotlib.patches as patches
# plt.style.use('bmh')

In [2]:
%load_ext line_profiler
import line_profiler

from Cython.Compiler.Options import directive_defaults

# Switch on the `linetrace` and `binding` compiler directives in Cython's
# default directive table.
# NOTE(review): presumably required so that line_profiler can trace Cython
# functions compiled in this session — confirm.
directive_defaults['linetrace'] = True
directive_defaults['binding'] = True

In [3]:
import spark_fof
import spark_fof_c
from fof import fof
%load_ext Cython

In [4]:
def plot_rectangle(rec, ax=None):
    """Draw the outline of one rectangle, or of every rectangle in a nested list/tuple.

    Parameters
    ----------
    rec : object with ``mins`` and ``maxes`` attributes, or a list/tuple of such
        A list or tuple is walked recursively; anything else is drawn as a
        single unfilled rectangle anchored at ``rec.mins`` whose width and
        height are the first two components of ``rec.maxes - rec.mins``.
    ax : axes-like object, optional
        Target that receives the patches through ``add_patch``. When omitted,
        a new equal-aspect subplot is created with ``plt.subplot``.

    Notes
    -----
    Uses the module-level names ``plt`` and ``patches``; they must be
    available in the namespace when a drawing path is reached.
    """
    if ax is None:
        ax = plt.subplot(aspect='equal')

    # Leaf case first: a single rectangle is added to the axes and we are done.
    if not isinstance(rec, (list, tuple)):
        extent = rec.maxes - rec.mins
        outline = patches.Rectangle(rec.mins, extent[0], extent[1], fill=False, zorder=-1)
        ax.add_patch(outline)
        return

    # Container case: draw each member onto the same axes.
    for member in rec:
        plot_rectangle(member, ax)

## Start Spark

In [5]:
import os
# Point Spark at the configuration directory ./conf (relative to the working directory).
os.environ['SPARK_CONF_DIR'] = './conf'
# Request 4G of driver memory through the environment; this is set before the SparkContext is created.
os.environ['SPARK_DRIVER_MEMORY'] = '4G'

In [6]:
import pyspark
from pyspark import SparkContext, SparkConf
import pynbody



In [7]:
# Spark settings for this session: turn on the Python profiler and set the
# executor and driver memory sizes.
conf = SparkConf()
conf.setAll([
    ('spark.python.profile', 'true'),
    ('spark.executor.memory', '3G'),
    ('spark.driver.memory', '4G'),
])

<pyspark.conf.SparkConf at 0x11798ffd0>

In [8]:
# Start a Spark context in local mode ('local[4]': four worker threads) with the settings held in `conf`.
sc = SparkContext(master='local[4]', conf=conf)

In [9]:
# Register the spark_fof sources and compiled extensions with the Spark
# context, in the same order as before.
for dependency in ('spark_fof.py',
                   'spark_fof_c.pyx',
                   'spark_fof_c.c',
                   'spark_fof_c.so',
                   'fof.so'):
    sc.addPyFile(dependency)

In [10]:
# Parameters of the distributed FOF run; all of them are handed to FOFAnalyzer below.
# NOTE(review): the meaning of N is not visible in this notebook — confirm against spark_fof.FOFAnalyzer.
N = 1
# Equals 0.2/256. NOTE(review): presumably the FOF linking length (0.2 x mean
# inter-particle spacing for 256^3 particles in a unit box) — confirm.
tau = 7.8125e-4
# NOTE(review): presumably the lower and upper corners of the simulation domain — confirm.
mins = np.array([-.5,-.5,-.5])
maxs= np.array([.5,.5,.5])
# NOTE(review): presumably the minimum number of particles a group must have to be kept — confirm.
nMinMembers = 8
# Build the analyzer on the Spark context from the header-less snapshot file.
fof_analyzer = spark_fof.FOFAnalyzer(sc, '/Users/rok/polybox/euclid256.nat_no_header', nMinMembers, N, tau, mins, maxs)

In [14]:
%time len(fof_analyzer.groups)

CPU times: user 2.21 s, sys: 73.5 ms, total: 2.28 s
Wall time: 1min 50s


105330

In [12]:
sc.show_profiles()

Profile of RDD<id=2>
         74304 function calls (74268 primitive calls) in 3.043 seconds

   Ordered by: internal time, cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     8210    2.749    0.000    2.749    0.000 {method 'read' of 'file' objects}
     4096    0.149    0.000    0.196    0.000 spark_tipsy.py:31(convert_to_fof_particle)
     4096    0.036    0.000    0.036    0.000 {numpy.core.multiarray.zeros}
     4114    0.027    0.000    3.026    0.001 spark_tipsy.py:38(convert_to_fof_particle_partition)
     4114    0.018    0.000    2.784    0.001 serializers.py:155(_read_with_length)
     4114    0.013    0.000    3.039    0.001 rdd.py:1004(<genexpr>)
     4114    0.010    0.000    2.598    0.001 serializers.py:542(read_int)
     4096    0.010    0.000    0.010    0.000 {numpy.core.multiarray.frombuffer}
     4096    0.007    0.000    0.009    0.000 spark_tipsy.py:12(addInPlace)
     4096    0.006    0.000    0.015    0.000 accumulators.

### Check that the results make sense

In [11]:
ps = np.concatenate(fof_analyzer.particle_rdd.collect())

In [None]:
# Sanity check: the particles collected from Spark must match, in number,
# the snapshot as loaded by pynbody.
n_reference = len(pynbody.load('/Users/rok/polybox/euclid256.nat'))
assert len(ps) == n_reference

# Run the serial FOF on the collected particles with the same tau and
# nMinMembers, for comparison with the Spark result.
n_groups = fof.run(ps, tau, nMinMembers)

In [16]:
# Report the group count from the serial FOF run next to the spark-fof count.
print('number of groups to %d particle = %d' % (nMinMembers, n_groups))
print('number of groups via spark-fof = %d' % len(fof_analyzer.groups))

number of groups to 8 particle = 105761
number of groups via spark-fof = 105330


In [56]:
ps_fof = np.concatenate(fof_analyzer.final_fof_rdd.collect())

In [60]:
len(ps)

16777216

In [58]:
len(ps_fof)

16819349

In [None]:
# Keep only the non-ghost particles of every element of merged_rdd.
# Python's `not` cannot be applied to a multi-element numpy array (it raises
# "ValueError: The truth value of an array ... is ambiguous"), so the flag is
# negated element-wise and used directly as a boolean mask.
# NOTE(review): assumes each RDD element is a numpy structured array with an
# 'is_ghost' field — confirm against spark_fof.
fof_analyzer.merged_rdd.map(lambda p: p[np.logical_not(p['is_ghost'])])

In [None]:
%%cython 

def filter_ghostsa
