## Make a simple synthetic dataset to test the hierarchical merge in the FOF algorithm

#### The idea is this:

* After the local FOF stage, each partition reports the particles it holds in the overlap region.
* Do a `reduceByKey` or `treeAggregate` of some sort to collect, for each such particle, the groups it belongs to.
* Produce a mapping $G \rightarrow G_1$ and distribute it to all hosts in the form of a broadcast lookup table.

In [1]:
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline

import sys
sys.setrecursionlimit(sys.getrecursionlimit()*10)
import matplotlib.patches as patches

plt.style.use('fivethirtyeight')

In [2]:
import os
os.environ['SPARK_HOME'] = os.path.join(os.path.expanduser('~'), 'spark')
import findspark
findspark.init()

In [3]:
def plot_rectangle(rec, ax=None):
    if ax is None: 
        ax = plt.subplot(aspect='equal')
    
    if isinstance(rec, (list, tuple)):
        for r in rec: 
            plot_rectangle(r,ax)
    
    else:
        size = (rec.maxes-rec.mins)
        ax.add_patch(patches.Rectangle(rec.mins, size[0], size[1], fill=False, zorder=-1))
    plt.draw()
    plt.show()

## Start Spark

In [4]:
import os
os.environ['SPARK_CONF_DIR'] = './conf'
os.environ['SPARK_DRIVER_MEMORY'] = '8G'

In [5]:
import pyspark
from pyspark import SparkContext, SparkConf
import pynbody
import sparkhpc

In [6]:
%%writefile job.template
#!/bin/sh
#BSUB -J {jobname}
#BSUB -W {walltime} # runtime to request
#BSUB -o {jobname}-%J.log # output extra o means overwrite
#BSUB -n {ncores} # requesting ncores cores
#BSUB -R "rusage[mem={memory}, scratch=10000] span[hosts=-1]" # take any available core with mem MB of memory

# setup the spark paths
export SPARK_HOME={spark_home}
export SPARK_LOCAL_DIRS=$__LSF_JOB_TMPDIR__
export LOCAL_DIRS=$SPARK_LOCAL_DIRS
export SPARK_WORKER_DIR=$__LSF_JOB_TMPDIR__/work
#export SPARK_CONF_DIR=/cluster/home/roskarr/Projects/spark-fof/conf

sparkcluster launch --memory {memory}M


Overwriting job.template


In [7]:
#sj = sparkhpc.sparkjob.LSFSparkJob(ncores=27, memory=5500, walltime='4:00', template='./job.template')
sj = sparkhpc.sparkjob.LSFSparkJob(ncores=9,memory=10000,walltime='6:00', template='./job.template')
#sj = sparkhpc.sparkjob.LSFSparkJob(clusterid=0)

In [8]:
sj.wait_to_start()

INFO:sparkhpc:Submitted cluster 0


In [9]:
sc = sparkhpc.start_spark(master=sj.master_url, spark_conf='../conf', profiling=True, executor_memory='1500M')

In [10]:
sc.defaultParallelism

9

In [11]:
sj

### Run FOF 

In [12]:
import spark_fof

In [13]:
reload(spark_fof.spark_fof)
reload(spark_fof.spark_fof_c)
reload(spark_fof)


<module 'spark_fof' from '/cluster/project/sis/ri/roskarr/spark-fof/spark_fof/__init__.pyc'>

In [14]:
%%time
# Input directory handed to LCFOFAnalyzer below (absolute, cluster-specific path).
#path = '/cluster/home/roskarr/work/2Tlc-final/'
path = '/cluster/home/roskarr/projects/euclid/2Tlc-final/'

# domain parameters
# The domain is a cube spanning [-31*diff, 31*diff] along each of the three axes.
# presumably `diff` is the edge length of one domain cell -- TODO confirm
diff = np.float32(0.033068776)
global_min = -31*diff
global_max = 31*diff

dom_maxs = np.array([global_max]*3, dtype=np.float64)
dom_mins = np.array([global_min]*3, dtype=np.float64)

# tau is passed positionally to LCFOFAnalyzer; presumably the FOF linking length -- TODO confirm
#tau = diff*5./125.
tau = 0.2/12600
# NOTE(review): buffer_tau is assigned here but never read in this cell -- the call
# below passes buffer_tau=tau*2 instead. Confirm which of the two values is intended.
buffer_tau = diff*5./150.

# NOTE(review): the meaning of the positional arguments 64 and 62 is not visible in this
# notebook; verify them against the LCFOFAnalyzer signature.
fof_analyzer = spark_fof.spark_fof.LCFOFAnalyzer(sc, path, 64, 62, tau, dom_mins, dom_maxs, blockids=range(30,33), buffer_tau=tau*2)

Number of input files:  27
Total number of particles:  1972749082
CPU times: user 8.21 s, sys: 334 ms, total: 8.54 s
Wall time: 8.94 s


In [15]:
len(fof_analyzer.groups)

domain group mapping build took 88.913373 seconds
Final group map build took 0.678887 seconds


1269075

In [16]:
# NOTE(review): this stops the SparkContext and the cluster job, yet the cells further
# down still use `sc` and RDDs obtained from `fof_analyzer`. On a top-to-bottom run this
# cell has to be executed last.
sc.stop()
sj.stop()

INFO:sparkhpc:Job <34013133> is being terminated



In [87]:
import gc
import time
from itertools import izip
# RDD exposed by the analyzer; the cells below filter it and count group ids in it.
merged_rdd = fof_analyzer.merged_rdd
# Disabled: inverse of the group merge map and its broadcast. count_groups_partition
# takes such a broadcast as its gr_map_inv_b argument.
# group_merge_map = fof_analyzer.group_merge_map
# gr_map_inv = {v:k for (k,v) in group_merge_map.iteritems()}
# gr_map_inv_b = sc.broadcast(gr_map_inv)
# NOTE(review): nPartitions is not referenced by any of the visible cells.
nPartitions = 27*5
# Minimum summed member count a group needs to pass the final filter.
nMinMembers = 64
from collections import Counter

def count_groups_partition(particle_arrays, gr_map_inv_b, nMinMembers):
    """Count group members across every particle array of one partition.

    Parameters
    ----------
    particle_arrays : iterable
        Arrays that yield per-particle group ids when indexed with ``'iGroup'``.
        May be empty.
    gr_map_inv_b : object
        Broadcast-style wrapper; ``gr_map_inv_b.value`` must support ``in``
        tests on group ids.
    nMinMembers : int
        Groups with at least this many members in the partition are emitted.

    Returns
    -------
    generator of (group id, count)
        One pair per group that is either a key of ``gr_map_inv_b.value`` or
        has ``count >= nMinMembers``. Counts are summed over all arrays of the
        partition; the order of the pairs is unspecified.
    """
    global_counts = Counter()

    timein = time.time()
    for p_arr in particle_arrays:
        gs, counts = np.unique(p_arr['iGroup'], return_counts=True)
        # tolist() hands back plain Python ints, which keeps the Counter keys
        # and the emitted pairs free of numpy scalar types.
        for g, cnt in izip(gs.tolist(), counts.tolist()):
            global_counts[g] += cnt
    print('unique:  %s' % (time.time() - timein))

    gr_map_inv = gr_map_inv_b.value
    # Groups present in the inverse map are always emitted, whatever their local
    # count; all other groups must reach the size threshold.
    return ((g, global_counts[g]) for g in global_counts
            if (g in gr_map_inv) or (global_counts[g] >= nMinMembers))

def count_groups(p):
    timein = time.time()
    gs, counts = np.unique(p['iGroup'], return_counts=True)
    print 'unique: ', time.time()-timein, 'ngroups: ', len(gs)
    return ((g,cnt) for g,cnt in izip(gs,counts))

domain group mapping build took 84.133303 seconds


In [88]:
def _drop_ghost_flag_2(p):
    """Return the entries of ``p`` whose ``'is_ghost'`` field differs from 2."""
    keep_idx = np.nonzero(p['is_ghost'] != 2)[0]
    return p[keep_idx]

no_ghosts_rdd = merged_rdd.map(_drop_ghost_flag_2)

In [89]:
no_ghosts_rdd.cache()

PythonRDD[224] at RDD at PythonRDD.scala:48

In [90]:
no_ghosts_rdd.count()

204

In [91]:
group_counts = no_ghosts_rdd.flatMap(spark_fof.spark_fof_c.count_groups_cython)

In [92]:
group_counts

PythonRDD[226] at RDD at PythonRDD.scala:48

In [96]:
%time group_counts.count()

CPU times: user 25 ms, sys: 1 ms, total: 26 ms
Wall time: 25 s


813061056

In [97]:
group_counts2 = no_ghosts_rdd.flatMap(count_groups)
group_counts2

PythonRDD[231] at RDD at PythonRDD.scala:48

In [98]:
%time group_counts2.count()

CPU times: user 20 ms, sys: 999 µs, total: 21 ms
Wall time: 33.8 s


813061056

In [99]:
from pyspark.sql import Row

In [100]:
sqc = pyspark.sql.SQLContext(sc)

In [101]:
def _group_count_to_row(pair):
    """Turn one (group id, count) pair into a Row with plain-int fields."""
    gid, count = pair
    return Row(gid=int(gid), count=int(count))

gc_df = sqc.createDataFrame(group_counts.map(_group_count_to_row))

In [102]:
%%time 
final_groups = gc_df.groupBy('gid').sum().filter('sum(count) >= %d'%nMinMembers).select('gid', 'sum(count)').collect()
print len(final_groups)

1269066
CPU times: user 7.91 s, sys: 279 ms, total: 8.19 s
Wall time: 10min 25s


In [103]:
sc.show_profiles()

Profile of RDD<id=1>
         1593 function calls in 0.013 seconds

   Ordered by: internal time, cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       81    0.002    0.000    0.010    0.000 rdd.py:1713(add_shuffle_key)
       27    0.002    0.000    0.002    0.000 {cPickle.load}
       27    0.001    0.000    0.004    0.000 broadcast.py:82(load)
       27    0.001    0.000    0.001    0.000 {open}
       27    0.001    0.000    0.001    0.000 {cPickle.dumps}
       54    0.001    0.000    0.001    0.000 serializers.py:143(_write_with_length)
       27    0.000    0.000    0.001    0.000 serializers.py:217(load_stream)
       54    0.000    0.000    0.001    0.000 serializers.py:155(_read_with_length)
       27    0.000    0.000    0.005    0.000 broadcast.py:92(value)
       27    0.000    0.000    0.001    0.000 rdd.py:61(portable_hash)
       27    0.000    0.000    0.013    0.000 worker.py:165(process)
       27    0.000    0.000    0.005  

In [69]:
x = no_ghosts_rdd.first()

In [106]:
%timeit res = list(count_groups_cython(x))

1 loop, best of 3: 806 ms per loop


In [100]:
len(res)

4234195

In [96]:
%time gs, counts = np.unique(x['iGroup'], return_counts=True)

CPU times: user 452 ms, sys: 51 ms, total: 503 ms
Wall time: 502 ms


In [97]:
%timeit list(count_groups(x))

unique:  0.498481988907 ngroups:  4234195
unique:  0.499393939972 ngroups:  4234195
unique:  0.499611854553 ngroups:  4234195
unique:  0.500025987625 ngroups:  4234195
1 loop, best of 3: 1.68 s per loop
