In [1]:
import findspark
findspark.init()
import pyspark
import numpy as np
import os

In [2]:
from pyspark import SparkContext, SparkConf

# how many cores do we have for the driver
ncores = int(os.environ.get('LSB_DJOB_NUMPROC', 1)) 

# here we set the memory we want spark to use for the driver JVM
os.environ['SPARK_DRIVER_MEMORY'] = '%dG'%(ncores*0.7)

In [3]:
conf = SparkConf()

conf.set('spark.executor.instances', 44)
conf.set('spark.executor.cores', 4)

sc = SparkContext('yarn-client', conf=conf)

In [6]:
bytes_per_particle = 4*6 # 4 bytes, 6 components

lcp = np.dtype([('x','f4'),('y','f4'), ('z','f4'), ('vx','f4'), ('vy','f4'), ('vz','f4')])

In [7]:
from glob import glob
filelist = glob('/cluster/home03/sdid/roskarr/work/euclid/output/euclid.*.lcp.*')

In [8]:
import shutil
for f in filelist: 
    if os.path.getsize(f) < bytes_per_particle*10:
        shutil.move(f,'/cluster/home03/sdid/roskarr/work/euclid/output/smallfiles/')

In [9]:
file_rdd = sc.binaryRecords('file:///cluster/home03/sdid/roskarr/work/euclid/output/euclid.*.lcp.*', bytes_per_particle)\
             .map(lambda x: np.fromstring(x,lcp))

In [10]:
file_rdd.cache().count()

19073685

In [11]:
mins = np.ndarray(1, dtype=lcp)
maxs = np.ndarray(1, dtype=lcp)
for n in lcp.names: 
    mins[n] = 1e500
    maxs[n] = -1e500

In [12]:
def get_mins(p1, p2): 
    p_min = np.ndarray(1,dtype=lcp)
    for n in lcp.names: 
        if p1[n] < p2[n]: p_min[n] = p1[n]
        else            : p_min[n] = p2[n]
    return p_min

def get_maxs(p1, p2): 
    p_max = np.ndarray(1,dtype=lcp)
    for n in lcp.names: 
        if p1[n] > p2[n]: p_max[n] = p1[n]
        else            : p_max[n] = p2[n]
    return p_max

In [13]:
p_mins = file_rdd.reduce(get_mins)

In [14]:
p_maxs = file_rdd.reduce(get_maxs)

In [15]:
from math import floor
def get_bin(p, nbins, p_mins, p_maxs): 
    dx = (p_maxs['x'] - p_mins['x'])/float(nbins)
    dy = (p_maxs['y'] - p_mins['y'])/float(nbins)
    dz = (p_maxs['z'] - p_mins['z'])/float(nbins)
    xbin = floor((p['x'] + 1)/dx)
    ybin = floor((p['y'] + 1)/dy)
    zbin = floor((p['z'] + 1)/dz)
    
    return int(xbin + ybin*nbins + zbin*nbins*nbins)

In [16]:
numPartitions = sc.defaultParallelism*10

In [17]:
mapped_rdd = file_rdd.keyBy(lambda p: get_bin(p, 20, p_mins, p_maxs)).partitionBy(numPartitions, lambda p: p%numPartitions)

In [18]:
mapped_rdd.keys().max()

7029

In [19]:
box_counts = mapped_rdd.countByKey()

In [20]:
len(box_counts)

5096

In [None]:
ps = file_rdd.collect()

In [None]:
particles = file_rdd.filter(lambda p: abs(p['z']) < 1e-2)\
                    .map(lambda p: (p['x'][0], p['y'][0])).collect()

In [None]:
%matplotlib notebook

In [None]:
import matplotlib.pylab as plt

In [None]:
xs = [x[0] for x in particles]

In [None]:
ys = [x[1] for x in particles]

In [None]:
plt.figure()
plt.plot(xs, ys,',')

In [None]:
import pynbody

In [None]:
s = pynbody.load('/cluster/home03/sdid/roskarr/work/euclid/output/allsteps.tipsy')

In [None]:
plt.figure()
plt.plot(s['vx'], s['vz'], '.')

In [None]:
pynbody.plot.image(s)