## Make a simple synthetic dataset to test hierarchical merge in FOF algorithm

#### the idea is this: 

* after the local FOF stage, each partition reports the particles it holds in the overlap region
* do a reduceByKey or treeAggregate of some sort to collect the groups belonging to the same particles
* produce a mapping of $G -> G_1$ and distribute to all hosts in form of broadcast lookup table

In [1]:
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline

import sys
sys.setrecursionlimit(sys.getrecursionlimit()*10)
import matplotlib.patches as patches

plt.style.use('bmh')

In [2]:
import findspark
findspark.init()

In [3]:
def plot_rectangle(rec, ax=None):
    if ax is None: 
        ax = plt.subplot(aspect='equal')
    
    if isinstance(rec, (list, tuple)):
        for r in rec: 
            plot_rectangle(r,ax)
    
    else:
        size = (rec.maxes-rec.mins)
        ax.add_patch(patches.Rectangle(rec.mins, size[0], size[1], fill=False, zorder=-1))

## Set up data

In [4]:
# create the arrays
from spark_fof.spark_fof_c import pdt
pdt_tipsy = np.dtype({'names': ("mass", "x", "y", "z", "vx", "vy", "vz", "eps", "phi"),
                                  'formats': ('f','f','f','f','f','f','f','f','f')})

pdt_tipsy = np.dtype([('mass', 'f4'),('pos', 'f4', 3),('vel', 'f4', 3), ('eps', 'f4'), ('phi', 'f4')])
# nps = 1000000
# ngs = 1
# particles = np.zeros(nps, dtype=pdt)
# done_ps = 0
# #centers = np.random.rand(ngs,3)*1.7 - 0.85
# centers = np.array([0,0,0]).reshape(1,3)
# for group, center in zip(range(ngs), centers): 
#     print group, center
#     group_ps = nps/ngs
#     if nps - (done_ps + group_ps) < group_ps:
#         group_ps = nps - done_ps 
#     particles['pos'][done_ps:done_ps+group_ps] = \
#         np.random.multivariate_normal(center, [[.5,0,0],[0,.5,0],[0,0,.5]], group_ps)
#     done_ps += group_ps
   
# particles['iOrder'] = range(nps)

## Start Spark

In [5]:
import findspark
findspark.init()

In [6]:
import os
os.environ['SPARK_CONF_DIR'] = './conf'
os.environ['SPARK_DRIVER_MEMORY'] = '8G'

In [7]:
import pyspark
from pyspark import SparkContext, SparkConf
import pynbody

In [8]:
conf = SparkConf()

In [9]:
conf.set('spark.python.profile', 'true')
conf.set('spark.executor.memory', '8G')
conf.set('spark.driver.memory', '8G')
conf.set('spark.driver.maxResultSize', '5G')
conf.set('spark.executor.count', 27)

<pyspark.conf.SparkConf at 0x2accda6cca90>

In [10]:
import subprocess, re

In [11]:
from IPython.display import HTML

jobid = 30373055

job_peek = subprocess.check_output(["bpeek", str(jobid)])

master_url, master_webui = re.findall('(spark://\S+:\d{4}|http://\S+:\d{4})', job_peek)

HTML("""<p>master running at %s</p>
     <p>Web UI available <a target='_blank' href='%s\'>here</a>
     """%(master_url,master_webui))

In [12]:
sc = SparkContext(master=master_url, conf=conf)

In [13]:
# sc.addPyFile('spark_fof.py')
# # sc.addPyFile('spark_util.py')
# sc.addPyFile('spark_fof_c.pyx')
# sc.addPyFile('spark_fof_c.c')
# sc.addPyFile('spark_fof_c.so')
# sc.addPyFile('fof.so')


In [14]:
import spark_fof

## Set up the domains

In [15]:
N = 2
tau = 7.8125e-4
mins = np.array([-.5,-.5,-.5], dtype=np.float)
maxs= np.array([.5,.5,.5], dtype=np.float)
domain_containers = spark_fof.spark_fof.setup_domain(N,tau,maxs,mins)

### Make the base RDD

In [16]:
def convert_to_fof_particle(s): 
    p_arr = np.frombuffer(s, pdt_tipsy)
    
    new_arr = np.zeros(len(p_arr), dtype=pdt)
    new_arr['pos'] = p_arr['pos']
    
    return new_arr

In [17]:
pdt_lc = np.dtype([('pos', 'f4', 3),('vel', 'f4', 3)])

In [18]:
import re

get_block_ids = re.compile('blk\.(\d+)\.(\d+)\.(\d+)i')

In [19]:
def get_minmax(i): 
    mins = []
    maxs = []
    for arr in i: 
        mins.append(arr['pos'].min(axis=0))
        maxs.append(arr['pos'].max(axis=0))
    
    yield (np.concatenate(mins).reshape(len(mins),3).min(axis=0), np.concatenate(maxs).reshape(len(mins),3).max(axis=0))

In [20]:
from pyspark.accumulators import AccumulatorParam

class dictAdd(AccumulatorParam):
    def zero(self, value):
        return {i:0 for i in range(len(value))}
    def addInPlace(self, val1, val2): 
        for k, v in val2.iteritems(): 
            val1[k] += v
        return val1

In [21]:
def make_lc_rdd(sc, path):
    from glob import glob

    def set_particle_IDs_partition(index, iterator): 
        p_counts = partition_counts.value
        local_index = 0
        start_index = sum([p_counts[i] for i in range(index)])
        for arr in iterator:
            arr['iOrder'] = range(start_index + local_index, start_index + local_index + len(arr))
            local_index += len(arr)
            yield arr
    
    def read_file(index, i, chunksize=1024000): 
        for f in i:
            with open(f,'rb') as f: 
                header = f.read(62500)
                while True:
                    chunk = f.read(chunksize*24)
                    if len(chunk): 
                        p_arr = np.frombuffer(chunk, pdt_lc)
                        new_arr = np.zeros(len(p_arr), dtype=pdt)
                        new_arr['pos'] = p_arr['pos']
                        npart_acc.add({index: len(p_arr)})
                        yield new_arr
                    else: 
                        break
                    
    files = glob('/cluster/home/roskarr/projects/euclid/2Tlc-final/*/*')

    ids = map(lambda x: tuple(map(int, get_block_ids.findall(x)[0])), files)
    ids_map = {x:i for i,x in enumerate(ids)}
    ids_map_b = sc.broadcast(ids_map)
    nfiles = len(files)

    print 'Number of input files: ', nfiles
    
    # set the partition count accumulator
    npart_acc = sc.accumulator({i:0 for i in range(nfiles)}, dictAdd())
    
    rec_rdd = (sc.parallelize(zip(ids,files), numSlices=27).partitionBy(nfiles, lambda x: ids_map_b.value[x])
                 .values()
                 .mapPartitionsWithIndex(read_file))
    
    partition_counts = sc.broadcast(npart_acc.value)
    
    return rec_rdd.mapPartitionsWithIndex(set_particle_IDs_partition)

In [22]:
%%time 
p_rdd = make_lc_rdd(sc, '2Tlc-final/')
#p_rdd.count()

Number of input files:  27
CPU times: user 0 ns, sys: 20 ms, total: 20 ms
Wall time: 931 ms


In [24]:
def read_tipsy_output(filename, chunksize = 2048): 
    """
    Read a tipsy file and set the sequential particle IDs
    
    This scans through the data twice -- first to get partition particle counts
    and a second time to actually set the particle IDs.
    """
    
    # helper functions
    def convert_to_fof_particle_partition(index, iterator): 
        for s in iterator: 
            a = convert_to_fof_particle(s)
            if count: 
                npart_acc.add({index: len(a)})
            yield a

    def set_particle_IDs_partition(index, iterator): 
        p_counts = partition_counts.value
        local_index = 0
        start_index = sum([p_counts[i] for i in range(index)])
        for arr in iterator:
            arr['iOrder'] = range(start_index + local_index, start_index + local_index + len(arr))
            local_index += len(arr)
            yield arr
    
    rec_rdd = sc.binaryRecords(filename, pdt_tipsy.itemsize*chunksize)
    nPartitions = rec_rdd.getNumPartitions()
    # set the partition count accumulator
    npart_acc = sc.accumulator({i:0 for i in range(nPartitions)}, dictAdd())
    count=True
    # read the data and count the particles per partition
    rec_rdd = rec_rdd.mapPartitionsWithIndex(convert_to_fof_particle_partition)
    rec_rdd.count()
    count=False

    partition_counts = sc.broadcast(npart_acc.value)

    return rec_rdd.mapPartitionsWithIndex(set_particle_IDs_partition)

In [32]:
#p_rdd = read_tipsy_output('/Users/rok/polybox/euclid256.nat_no_header')

In [33]:
#p_rdd.cache().count()

In [162]:
# %%time 
# nMinMembers = 1
# n_groups = fof.run(ps, tau, nMinMembers)
# print 'number of groups to %d particle = %d'%(nMinMembers, n_groups)

number of groups to 1 particle = 7251094
CPU times: user 18.3 s, sys: 64.1 ms, total: 18.3 s
Wall time: 18.4 s


### Partition particles into domains and set the partition part of local group ID

In [34]:
# partitioning duplicates the particles that are located in the boundary regions
# part_rdd = (p_rdd.mapPartitions(lambda particles: spark_fof.partition_particles_cython(particles, domain_containers, tau, mins, maxs))
#                  .partitionBy(len(domain_containers))
#                  .values())

### Run FOF and Merge groups

In [27]:
fof_analyzer = spark_fof.spark_fof.FOFAnalyzer(sc, p_rdd, 1, 2, tau/100, [-.5,-.5,-.5], [.5,.5,.5])

In [28]:
fof_analyzer.partitioned_rdd = fof_analyzer.particle_rdd

In [29]:
fof_rdd = fof_analyzer.run_fof()

In [30]:
fof_rdd.count()

7538

In [33]:
merged_rdd = fof_analyzer.merge_groups(0)

In [34]:
merged = merged_rdd.collect()

In [35]:
merged_arr = np.fromiter(merged, pdt)

In [36]:
groups = np.unique(merged_arr['iGroup'])

In [39]:
len(groups)

7251094

In [37]:
merged_arr['iGroup'].max()

30065658097

In [38]:
particles = np.fromiter(ps,pdt)

In [43]:
ps_0 = np.fromiter(part_rdd.glom().collect()[0], pdt)

In [34]:
fof.run(particles, tau, 1)

7251094

In [45]:
fof.run(ps_0, tau, 1)

861190

In [None]:
f, axs = plt.subplots(1,2, subplot_kw={'aspect':'equal'}, figsize=(12,12))
for group in groups: 
    inds = np.where(merged_arr['iGroup'] == group)[0]
    print group, len(inds)
    axs[0].plot(merged_arr['pos'][inds,1], merged_arr['pos'][inds,0], ',')
    axs[1].plot(merged_arr['pos'][inds,2], merged_arr['pos'][inds,0], ',')
for ax in axs:
    ax.set_xlim(-1,1); ax.set_ylim(-1,1)

In [None]:
sc.show_profiles()

In [20]:
%time fof.run(particles, tau)

NameError: name 'particles' is not defined