## Make a simple synthetic dataset to test hierarchical merge in FOF algorithm

#### the idea is this: 

* after the local FOF stage, each partition reports the particles it holds in the overlap region
* do a reduceByKey or treeAggregate of some sort to collect the groups belonging to the same particles
* produce a mapping $G \rightarrow G_1$ and distribute it to all hosts in the form of a broadcast lookup table

In [1]:
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline

import sys
sys.setrecursionlimit(sys.getrecursionlimit()*10)
import matplotlib.patches as patches

plt.style.use('bmh')



In [2]:
import spark_fof
import spark_fof_c
import fof

In [3]:
def plot_rectangle(rec, ax=None):
    """Draw the outline of one rectangle, or of every rectangle in a nested list/tuple.

    Parameters
    ----------
    rec : object with ``mins`` and ``maxes`` attributes, or a list/tuple of such
        objects (lists/tuples may be nested and are walked recursively).
    ax : axes object, optional
        Axes to draw on. When omitted, a new equal-aspect subplot is created
        with ``plt.subplot``.

    Only the first two components of the extent are used, so the outline is
    the projection onto the first two coordinates.
    """
    if ax is None:
        ax = plt.subplot(aspect='equal')

    if not isinstance(rec, (list, tuple)):
        # Leaf case: a single rectangle, drawn unfilled and behind other artists.
        extent = rec.maxes - rec.mins
        outline = patches.Rectangle(rec.mins, extent[0], extent[1], fill=False, zorder=-1)
        ax.add_patch(outline)
        return

    # Container case: draw every member on the same axes.
    for member in rec:
        plot_rectangle(member, ax)

## Set up data

In [4]:
# create the arrays
from spark_fof_c import pdt
# Record layout used to decode the raw tipsy particle file: nine float32
# values per particle (36 bytes), with position and velocity exposed as
# 3-vectors so they can be copied in one assignment.
# (An earlier flat-field definition with separate x/y/z and vx/vy/vz fields
# was immediately overwritten by this one and has been removed.)
pdt_tipsy = np.dtype([('mass', 'f4'),('pos', 'f4', 3),('vel', 'f4', 3), ('eps', 'f4'), ('phi', 'f4')])
# nps = 1000000
# ngs = 1
# particles = np.zeros(nps, dtype=pdt)
# done_ps = 0
# #centers = np.random.rand(ngs,3)*1.7 - 0.85
# centers = np.array([0,0,0]).reshape(1,3)
# for group, center in zip(range(ngs), centers): 
#     print group, center
#     group_ps = nps/ngs
#     if nps - (done_ps + group_ps) < group_ps:
#         group_ps = nps - done_ps 
#     particles['pos'][done_ps:done_ps+group_ps] = \
#         np.random.multivariate_normal(center, [[.5,0,0],[0,.5,0],[0,0,.5]], group_ps)
#     done_ps += group_ps
   
# particles['iOrder'] = range(nps)

In [5]:
from spark_fof_c import pdt

## Start Spark

In [6]:
import findspark
findspark.init()

In [7]:
import os
# Environment for the Spark launch; both variables are set before the
# SparkContext is created further down in the notebook.
# NOTE(review): presumably read by Spark's launcher scripts — confirm they
# take effect for this deployment.
os.environ['SPARK_CONF_DIR'] = './conf'
os.environ['SPARK_DRIVER_MEMORY'] = '4G'

In [8]:
import pyspark
from pyspark import SparkContext, SparkConf
import pynbody

In [9]:
conf = SparkConf()

In [10]:
# Turn on the Python profiler; its results are displayed by the
# sc.show_profiles() cell at the end of the notebook.
conf.set('spark.python.profile', 'true')
# Memory requested for executors and for the driver.
# NOTE(review): the driver value repeats SPARK_DRIVER_MEMORY set via os.environ
# earlier — confirm which of the two is actually honoured.
conf.set('spark.executor.memory', '3G')
conf.set('spark.driver.memory', '4G')


<pyspark.conf.SparkConf at 0x116837710>

In [11]:
sc = SparkContext(master='local[*]', conf=conf, batchSize=0)

In [12]:
# Register the FOF sources and compiled extension modules with the
# SparkContext, in the same order as before.
for dependency in ('spark_fof.py', 'spark_util.py', 'spark_fof_c.pyx',
                   'spark_fof_c.c', 'spark_fof_c.so', 'fof.so'):
    sc.addPyFile(dependency)

## Set up the domains

In [13]:
# Domain decomposition parameters.
# With N = 1 the recorded output below lists 8 domain rectangles, i.e. the
# unit cube is split once along each of the three axes.
N = 1
# NOTE(review): 7.8125e-4 equals 0.2/256; presumably the FOF linking length
# for the 256^3-particle input — confirm.
tau = 7.8125e-4
# Use the builtin float as the dtype: the np.float alias was deprecated in
# NumPy 1.20 and removed in 1.24, while float behaves the same on every version.
mins = np.array([-.5,-.5,-.5], dtype=float)
maxs = np.array([.5,.5,.5], dtype=float)
# Argument order (maxs before mins) is kept exactly as in the original call.
domain_containers = spark_fof.setup_domain(N,tau,maxs,mins)

In [14]:
domain_containers

[<DomainRectangle [(-0.5, 0.0), (-0.5, 0.0), (-0.5, 0.0)]>,
 <DomainRectangle [(0.0, 0.5), (-0.5, 0.0), (-0.5, 0.0)]>,
 <DomainRectangle [(-0.5, 0.0), (0.0, 0.5), (-0.5, 0.0)]>,
 <DomainRectangle [(0.0, 0.5), (0.0, 0.5), (-0.5, 0.0)]>,
 <DomainRectangle [(-0.5, 0.0), (-0.5, 0.0), (0.0, 0.5)]>,
 <DomainRectangle [(0.0, 0.5), (-0.5, 0.0), (0.0, 0.5)]>,
 <DomainRectangle [(-0.5, 0.0), (0.0, 0.5), (0.0, 0.5)]>,
 <DomainRectangle [(0.0, 0.5), (0.0, 0.5), (0.0, 0.5)]>]

In [15]:
# f, ax = plt.subplots(subplot_kw={'aspect':'equal'}, figsize=(15,15))
# pynbody.plot.image(s.d, width=1, units = 'Msol Mpc^-2', cmap=plt.cm.Greys, show_cbar=False, subplot=ax)
# #plot_rectangle(domain_containers[0].bufferRectangle, ax=ax)
# for p in particles[::1000000]: 
#     plot_rectangle(domain_containers[spark_fof.get_bin_cython(p['pos'],2**N, np.array(mins), np.array(maxs))], ax=ax)
#     plot_rectangle(domain_containers[spark_fof.get_bin_cython(p['pos'], 2**N, np.array(mins),np.array(maxs))].bufferRectangle, ax=ax)
#     ax.plot(p['pos'][0], p['pos'][1], '.')
# plt.draw()
# ax.set_xlim(-.5,.5)
# ax.set_ylim(-.5,.5)

### Make the base RDD

In [16]:
def convert_to_fof_particle(s):
    """Decode a raw chunk of tipsy records into an array of FOF particles.

    Parameters
    ----------
    s : bytes-like
        Raw binary data whose length is a whole number of ``pdt_tipsy``
        records.

    Returns
    -------
    numpy.ndarray
        Array of dtype ``pdt`` with one entry per input record. Only the
        ``pos`` field is copied from the input; every other field is left
        at zero.

    Raises
    ------
    ValueError
        If the length of ``s`` is not a multiple of the record size.
    """
    # np.frombuffer replaces the deprecated binary mode of np.fromstring.
    # It returns a read-only view, which is sufficient because the data is
    # only read here before being copied into the freshly allocated output.
    p_arr = np.frombuffer(s, dtype=pdt_tipsy)

    new_arr = np.zeros(len(p_arr), dtype=pdt)
    new_arr['pos'] = p_arr['pos']

    return new_arr

In [17]:
def set_particle_IDs(p_i):
    """Store a particle's index in its ``iOrder`` field.

    Parameters
    ----------
    p_i : tuple
        A ``(particle, index)`` pair; in this notebook the pairs come from
        ``zipWithIndex``.

    Returns
    -------
    The same particle object, modified in place.
    """
    # Unpack inside the body: tuple parameters in the signature are a
    # SyntaxError on Python 3, while this form works on both 2 and 3.
    p, i = p_i
    p['iOrder'] = i
    return p

In [18]:
# Base RDD: read the file in fixed-size binary records of 2048 tipsy particles,
# decode each record into FOF particles, number the particles with
# zipWithIndex, write that number into iOrder, and cache the result.
p_rdd = (
    sc.binaryRecords('/Users/rok/polybox/euclid256.nat_no_header', pdt_tipsy.itemsize*2048)
      .flatMap(convert_to_fof_particle)
      .zipWithIndex()
      .map(set_particle_IDs)
      .cache()
)

In [19]:
%time p_rdd.count()

CPU times: user 13.9 ms, sys: 4.06 ms, total: 18 ms
Wall time: 6.47 s


16777216

In [20]:
p_rdd.take(10)

[([-0.49415767192840576, -0.49715662002563477, -0.4951761066913605], 0, 0),
 ([-0.4937679171562195, -0.4991474151611328, -0.4966009259223938], 0, 1),
 ([-0.48686617612838745, -0.4905261993408203, -0.4930003881454468], 0, 2),
 ([-0.4987441599369049, -0.4919763207435608, -0.4865908622741699], 0, 3),
 ([-0.4862843155860901, -0.4971228241920471, -0.49436184763908386], 0, 4),
 ([-0.49742138385772705, -0.48984673619270325, -0.4977671802043915], 0, 5),
 ([-0.4875064194202423, -0.4853893220424652, -0.48677125573158264], 0, 6),
 ([-0.4771958291530609, -0.4996482729911804, -0.4922342896461487], 0, 7),
 ([-0.47889092564582825, -0.49586814641952515, -0.4979572296142578], 0, 8),
 ([-0.47619882225990295, -0.49780598282814026, -0.4988156855106354], 0, 9)]

In [21]:
ps = np.fromiter(p_rdd.collect(), pdt)

In [22]:
%time fof.run(ps, tau, 1)

CPU times: user 17.8 s, sys: 78.6 ms, total: 17.9 s
Wall time: 17.9 s


7251094

### Partition particles into domains and set the partition part of local group ID

In [23]:
# Partitioning duplicates the particles that are located in the boundary regions.
# The helper's output is shuffled with partitionBy into one Spark partition per
# domain container, after which the keys are dropped with values().
part_rdd = (
    p_rdd
    .mapPartitions(
        lambda chunk: spark_fof.partition_particles_cython(chunk, domain_containers, tau, mins, maxs))
    .partitionBy(len(domain_containers))
    .values()
)

### Run the local FOF

In [24]:
from spark_util import spark_cython

In [25]:
import fof

In [30]:
def run_local_fof(index, particle_iter, tau, nMinMembers):
    """Run FOF on the particles of one partition and return them as an array.

    Parameters
    ----------
    index : partition index (accepted for ``mapPartitionsWithIndex``; unused).
    particle_iter : iterable of particle records convertible to dtype ``pdt``.
    tau, nMinMembers : forwarded unchanged to ``fof.run``.

    Returns
    -------
    numpy.ndarray
        The partition's particles as an array of dtype ``pdt``. The return
        value of ``fof.run`` is discarded.
        NOTE(review): presumably ``fof.run`` labels the array in place — confirm.
    """
    local_particles = np.fromiter(particle_iter, pdt)
    # Nothing to do for an empty partition; hand back the empty array as is.
    if len(local_particles) == 0:
        return local_particles
    fof.run(local_particles, tau, nMinMembers)
    return local_particles

def encode_gid(pid, cid, bits=32):
    """Pack two integers into one by concatenating their binary representations.

    ``pid`` fills the upper ``bits`` bits and ``cid`` the lower ``bits`` bits.

    Parameters
    ----------
    pid, cid : int
        Values to combine.
    bits : int, optional
        Width of each half. 32 yields a ``numpy.int64``, 16 yields a
        ``numpy.int32``.

    Raises
    ------
    RuntimeError
        If ``bits`` is neither 16 nor 32.
    """
    # Choose the field width and result type first, then share one encode path.
    if bits == 32:
        width, as_result = 32, np.int64
    elif bits == 16:
        width, as_result = 16, np.int32
    else:
        raise RuntimeError('Group encoding must use either 16 or 32 bit integers')

    upper = np.binary_repr(pid, width=width)
    lower = np.binary_repr(cid, width=width)
    return as_result(int(upper + lower, 2))

def set_group_id(partition_index, particle_iter):
    """Tag each particle's local group ID with the partition it came from.

    The partition index is shifted left by 32 bits and OR-ed into ``iGroup``,
    so the existing group ID keeps the low 32 bits and the partition index
    sits above them.

    Parameters
    ----------
    partition_index : int
        Index of the Spark partition (supplied by ``mapPartitionsWithIndex``).
    particle_iter : iterable of particle records convertible to dtype ``pdt``.

    Returns
    -------
    numpy.ndarray
        Array of dtype ``pdt`` with the updated ``iGroup`` field.
    """
    part_arr = np.fromiter(particle_iter, pdt)
    # One vectorized OR over the whole column replaces the per-particle loop
    # that round-tripped every value through bin() and np.int64(str, 2); the
    # result is the same and no longer depends on np.int64 accepting a base.
    # NOTE(review): assumes iGroup is a 64-bit integer field (the notebook's
    # recorded maximum of 30065658097 does not fit in 32 bits) — confirm
    # against the definition of pdt.
    part_arr['iGroup'] = (np.int64(partition_index) << 32) | part_arr['iGroup']
    return part_arr

In [31]:
# Run the local FOF on every partition (nMinMembers = 1), then fold the
# partition index into each particle's group ID; cache the labelled particles.
fof_rdd = (
    part_rdd
    .mapPartitionsWithIndex(lambda idx, chunk: run_local_fof(idx, chunk, tau, 1))
    .mapPartitionsWithIndex(set_group_id)
    .cache()
)

### Group Merging stage

In [32]:
# Build the analyzer on the locally-labelled particles. The two literal lists
# repeat the values of `mins` and `maxs` from the domain setup cell.
fof_analyzer = spark_fof.FOFAnalyzer(sc, N, tau, fof_rdd, [-.5,-.5,-.5], [.5,.5,.5])

In [33]:
# Merge groups across partitions.
# NOTE(review): the meaning of the argument 0 is not visible in this notebook —
# confirm against FOFAnalyzer.merge_groups.
merged_rdd = fof_analyzer.merge_groups(0)

In [34]:
merged = merged_rdd.collect()

In [35]:
merged_arr = np.fromiter(merged, pdt)

In [36]:
groups = np.unique(merged_arr['iGroup'])

In [39]:
len(groups)

7251094

In [37]:
merged_arr['iGroup'].max()

30065658097

In [38]:
particles = np.fromiter(ps,pdt)

In [43]:
ps_0 = np.fromiter(part_rdd.glom().collect()[0], pdt)

In [34]:
fof.run(particles, tau, 1)

7251094

In [45]:
fof.run(ps_0, tau, 1)

861190

In [None]:
# Plot every group in two projections: column 1 vs column 0 of 'pos' on the
# left axes and column 2 vs column 0 on the right axes.
f, axs = plt.subplots(1, 2, subplot_kw={'aspect': 'equal'}, figsize=(12, 12))

# Sort the particle indices by group ID once and locate each group's slice with
# searchsorted, instead of scanning the whole array with np.where for every
# group. The stable sort keeps indices within a group in ascending order, which
# is the order np.where would return them in.
order = np.argsort(merged_arr['iGroup'], kind='mergesort')
sorted_gids = merged_arr['iGroup'][order]
starts = np.searchsorted(sorted_gids, groups, side='left')
stops = np.searchsorted(sorted_gids, groups, side='right')

for group, start, stop in zip(groups, starts, stops):
    inds = order[start:stop]
    # String formatting prints the same text on Python 2 and 3, unlike the
    # Python 2-only print statement.
    print('%s %s' % (group, len(inds)))
    axs[0].plot(merged_arr['pos'][inds,1], merged_arr['pos'][inds,0], ',')
    axs[1].plot(merged_arr['pos'][inds,2], merged_arr['pos'][inds,0], ',')

for ax in axs:
    ax.set_xlim(-1,1)
    ax.set_ylim(-1,1)

In [None]:
sc.show_profiles()

In [20]:
%time fof.run(particles, tau)

NameError: name 'particles' is not defined