# Preparation and Reading Input Data


    conda install dask
    conda install -c conda-forge  graphviz

In [1]:
%matplotlib inline
import sys
import numpy as np
import random
import os, time, sys, datetime
import sklearn.metrics.pairwise
import scipy.spatial.distance
from scipy.spatial.distance import cdist
import dask
import dask.array as da
import dask.multiprocessing
from dask.diagnostics import ProgressBar
from dask.diagnostics import ResourceProfiler
from dask.dot import dot_graph
from dask.array.core import map_blocks
import dask.bag as db
#from multiprocessing.pool import ThreadPool

In [5]:
# --- Result reporting --------------------------------------------------
# Timing results are written as CSV files named <prefix><timestamp>.csv
# inside RESULT_DIR (a path relative to the current working directory).
RESULT_DIR = "results"
RESULT_FILE_PREFIX = "pair-distance-"
HEADER_CSV = "Scenario, Type, Time"

# --- Scratch space for the Dask npy stacks -----------------------------
# Dask has issues with an NFS home directory on Comet, so a scratch file
# system is used here. Alternatives that were tried earlier:
#   BASE_DIRECTORY = os.getcwd()
#   BASE_DIRECTORY = '/scratch/luckow/7146882'
BASE_DIRECTORY = '/oasis/scratch/comet/luckow/temp_project'
OUT_DIR = os.path.join(BASE_DIRECTORY, "npy_stack")

# --- Input data ---------------------------------------------------------
# Atom-position arrays stored as .npy files, one per data set.
FILENAMES = [
    "../132k_dataset/atom_pos_132K.npy",
    "../145K_dataset/atom_pos_145K.npy",
    "../300K_dataset/atom_pos_291K.npy",
    '../840K_dataset/atom_pos_839K.npy',
]

# Data set loaded by the NumPy cells further down.
scenario = FILENAMES[0]

## Prepare npy stacks for Dask

In [6]:
# Number of rows (atoms) per Dask chunk; it is also appended to the name of
# each npy stack directory.
# NOTE(review): later cells open stacks whose names end in "_4096", which this
# cell only produces when CHUNKSIZE is 4096 -- confirm which size is intended.
CHUNKSIZE=8192

# Convert every input .npy file into a chunked Dask npy stack under OUT_DIR.
for filename in FILENAMES:
    print(filename)
    atoms = np.load(filename)
    a_da = da.from_array(atoms, chunks=(CHUNKSIZE, 3))
    out_file = os.path.join(OUT_DIR, os.path.basename(filename) + "_" + str(CHUNKSIZE))
    # Create the target directory only when it is missing. Unlike a bare
    # `except: pass`, genuine failures (permissions, bad path) still surface.
    if not os.path.isdir(out_file):
        os.makedirs(out_file)
    da.to_npy_stack(out_file, a_da)

../132k_dataset/atom_pos_132K.npy
../145K_dataset/atom_pos_145K.npy
../300K_dataset/atom_pos_291K.npy
../840K_dataset/atom_pos_839K.npy


In [4]:
# Show which npy stacks currently exist in the output directory.
os.listdir(OUT_DIR)

['atom_pos_291K.npy_4096',
 'atom_pos_132K.npy_4096',
 'atom_pos_145K.npy_4096',
 'atom_pos_839K.npy_4096']

## Load Data Numpy

In [6]:
# Time how long it takes to read the selected data set from disk.
start = time.time()
atoms = np.load(scenario)
end = time.time()

# Create the results directory on first use; any other failure (for example
# missing permissions) is allowed to propagate instead of being swallowed.
if not os.path.isdir(RESULT_DIR):
    os.mkdir(RESULT_DIR)

results = []
d = datetime.datetime.now()
result_filename = RESULT_FILE_PREFIX + d.strftime("%Y%m%d-%H%M%S") + ".csv"
# The handle stays open (and is flushed) so that further rows can be appended.
f = open(os.path.join(RESULT_DIR, result_filename), "w")
f.write(HEADER_CSV + "\n")
result_line = scenario + ",Read," + str(end - start)
# Terminate the row; without the newline a following row would be glued on.
f.write(result_line + "\n")
f.flush()

# Work on a small subset for the slow pure-Python implementation.
a = atoms[:100]
print("Loaded file: " + result_line)
print("Use subset of Shape %s" % str(a.shape))

Loaded file: ../132k_dataset/atom_pos_132K.npy,Read,0.00250887870789
Use subset of Shape (100, 3)


# Distance Calculation - Single Core

## Giannis

In [None]:
%%time

def get_distance(Atom1, Atom2):
    """Return the Euclidean distance between two coordinate vectors.

    Args:
        Atom1: 1-D NumPy array holding the coordinates of the first atom.
        Atom2: 1-D NumPy array of the same length for the second atom.

    Returns:
        The square root of the summed squared coordinate differences.
    """
    delta = Atom1 - Atom2
    # The builtin sum iterates over the first axis of the squared deltas.
    squared_total = sum(delta ** 2)
    return np.sqrt(squared_total)

def n_dim_input_to_numpy_array(temp):
    """Parse a comma-separated string of numbers into a 1-D float array.

    Args:
        temp: String such as "1.0,2.5,3", one number per comma-separated field.

    Returns:
        A 1-D NumPy array of dtype float64 with one entry per field.

    Raises:
        ValueError: If a field cannot be converted by ``float``.
    """
    # A list comprehension avoids depending on ``map`` returning a list
    # (Python 2 only), and ``np.array`` replaces ``np.asfarray``, which is no
    # longer available in NumPy 2.0.
    values = [float(field) for field in temp.split(',')]
    return np.array(values, dtype=np.float64)
    
    
# The pairwise distance table is symmetric, so for work on the main diagonal
# only half of the elements need to be computed: the inner loop can start at
# i + 1 instead of 0.
# (The original note mentions "CUs" -- presumably compute units of a blocked
# variant of this algorithm; TODO confirm.)

cutoff = 15                  # distance threshold applied in distance_giannis
j_dim = 0
reading_start_point_i = 0
WINDOW_SIZE = a.shape[0]     # number of atoms in the current subset `a`
print(WINDOW_SIZE)


def distance_giannis():
    """Compute the cutoff mask for the subset ``a`` with plain Python loops.

    Reads the module-level names ``a``, ``WINDOW_SIZE`` and ``cutoff`` and
    calls ``get_distance`` for every pair (i, j) with j > i.

    Returns:
        A (WINDOW_SIZE, WINDOW_SIZE) boolean array. Entry [i, j] with j > i is
        True when the distance between a[i] and a[j] is <= cutoff. The
        diagonal and the lower triangle are never visited and stay False.
    """
    # np.zeros instead of np.empty: np.empty leaves the unvisited diagonal and
    # lower triangle filled with arbitrary memory contents.
    distances = np.zeros((WINDOW_SIZE, WINDOW_SIZE), dtype='bool')
    for i in range(0, WINDOW_SIZE):
        # The matrix is symmetric, so only the upper triangle is evaluated.
        for j in range(i + 1, WINDOW_SIZE):
            distances[i, j] = get_distance(a[i], a[j]) <= cutoff
    return distances

## Python Plain NumPy

In [27]:
a=atoms[:10000]

In [31]:
%%time
# All-pairs Euclidean distances via broadcasting: (N, 1, 3) - (1, N, 3) gives
# an (N, N, 3) difference array, which is squared, summed over the coordinate
# axis and square-rooted. Despite its name, dist_sq holds distances, not
# squared distances.
dist_sq = np.sqrt(np.square(a[:, None, :] - a[None, :, :]).sum(axis=-1))

CPU times: user 3.87 s, sys: 683 ms, total: 4.55 s
Wall time: 4.54 s


In [32]:
dist_sq.shape

(10000, 10000)

## SciPy (`scipy.spatial.distance.cdist`)

In [33]:
%%time
# Pairwise Euclidean distances between all rows of `a`, this time computed
# with scipy.spatial.distance.cdist instead of NumPy broadcasting.
dist_sq = cdist(a, a, 'euclidean')

CPU times: user 429 ms, sys: 181 ms, total: 610 ms
Wall time: 607 ms


In [34]:
dist_sq.shape

(10000, 10000)

# Dask

In [32]:
%%time
# Open the on-disk npy stack of the 132K data set as a chunked Dask array.
# NOTE(review): the "_4096" suffix is hard-coded; the preparation cell names
# its stacks after CHUNKSIZE -- confirm that this directory exists.
a_dask=da.from_npy_stack(os.path.join(OUT_DIR,'atom_pos_132K.npy_4096'))
#a_dask=da.from_array(a, chunks=(2048,3))
# Broadcast the array against itself to form all pairwise coordinate differences.
diff=a_dask[:, np.newaxis, :] - a_dask[np.newaxis, :, :]
# Boolean mask that is True where the Euclidean distance EXCEEDS 15.0.
# NOTE(review): distance_giannis marks pairs with distance <= cutoff instead --
# confirm which direction is intended.
res=da.sqrt(da.sum((diff) ** 2, axis=-1))>15.0
#res.visualize()
# Materialise the whole N x N mask in memory as a NumPy array.
out = res.compute()

CPU times: user 13min 22s, sys: 2min 7s, total: 15min 29s
Wall time: 47.7 s


## Dask Threads

In [None]:
def benchmark_dask(filename, cutoff=15, number_threads=40):
    """Benchmark the all-pairs distance cutoff with Dask's threaded scheduler.

    Opens the npy stack at ``filename``, builds the pairwise Euclidean
    distance graph by broadcasting, computes the boolean mask
    ``distance > cutoff``, saves it with ``np.save`` to
    ``<basename(filename)>_out.npy`` in the current working directory, and
    prints one CSV-style timing line each for read, compute, write and total.

    Args:
        filename: Path of an npy stack directory readable by
            ``da.from_npy_stack``.
        cutoff: Distance threshold the pairwise distances are compared against.
        number_threads: Number of worker threads used for the computation.
            Previously this argument was accepted but silently ignored.

    NOTE(review): the mask marks pairs that are farther apart than ``cutoff``,
    whereas ``distance_giannis`` marks pairs within the cutoff -- confirm
    which one is intended.
    """
    start = time.time()
    a_dask = da.from_npy_stack(filename)
    end_read = time.time()

    diff = a_dask[:, np.newaxis, :] - a_dask[np.newaxis, :, :]
    res = da.sqrt(da.sum(diff ** 2, axis=-1)) > cutoff
    with ProgressBar():
    #with dask.set_options(get=dask.multiprocessing.get):
        # num_workers sizes the pool of the threaded scheduler, so the
        # number_threads argument actually takes effect.
        out = res.compute(num_workers=number_threads)
        end_compute = time.time()

    np.save(os.path.basename(filename) + "_out.npy", out)
    #da.to_npy_stack("out.npy", out)
    end_out_write = time.time()

    results = [
        "%s,dask,thread,read_file, %.4f" % (filename, end_read - start),
        "%s,dask,thread,compute, %.4f" % (filename, end_compute - end_read),
        "%s,dask,thread,write_file, %.4f" % (filename, end_out_write - end_compute),
        "%s,dask,thread,total, %.4f" % (filename, end_out_write - start),
    ]
    print("\n".join(results))

## Dask Distributed

In [None]:
from distributed import Client, progress

hostname = '198.202.114.170:8786'

def benchmark_dask_distributed(filename, cutoff=15, number_threads=40):
    """Benchmark the all-pairs distance cutoff on a Dask distributed cluster.

    Connects to the scheduler at the module-level ``hostname``, opens the npy
    stack at ``filename``, computes the boolean mask ``distance > cutoff``
    through ``client.get``, writes the result to a scratch file under
    ``/data/tmp`` (deleted again once the write has been timed) and prints one
    CSV-style timing line each for read, compute, write and total.

    Args:
        filename: Path of an npy stack directory readable by
            ``da.from_npy_stack``.
        cutoff: Distance threshold the pairwise distances are compared against.
        number_threads: Currently unused by this function.

    NOTE(review): a new ``Client`` is created on every call and never shut
    down (the shutdown call at the end is commented out) -- confirm this is
    intentional.
    """
    client = Client(hostname)
    results = []
    start = time.time()
    a_dask = da.from_npy_stack(filename)
    end_read = time.time()

    # Broadcast rows against columns to obtain every pairwise difference.
    rows = a_dask[:, np.newaxis, :]
    cols = a_dask[np.newaxis, :, :]
    res = da.sqrt(da.sum((rows - cols) ** 2, axis=-1)) > cutoff

    with ProgressBar(), dask.set_options(get=client.get):
        out = res.compute()
        end_compute = time.time()
        print("end compute")

        outfile = os.path.join("/data/tmp", os.path.basename(filename) + "_out.npy")
        np.save(outfile, out)
        #da.to_npy_stack("out.npy", out)
        end_out_write = time.time()
        # The file only exists to time the write; remove it right away.
        os.remove(outfile)

        for template, elapsed in (
            ("%s,dask,thread-distributed,read_file, %.4f", end_read - start),
            ("%s,dask,thread-distributed,compute, %.4f", end_compute - end_read),
            ("%s,dask,thread-distributed,write_file, %.4f", end_out_write - end_compute),
            ("%s,dask,thread-distributed,total, %.4f", end_out_write - start),
        ):
            results.append(template % (filename, elapsed))
        print("\n".join(results))
    #client.shutdown()

## Benchmark Execution

In [None]:
dask_scenarios = [os.path.abspath(os.path.join(OUT_DIR, i)) for i in os.listdir(OUT_DIR)]
dask_scenarios

In [None]:
# Run the distributed benchmark for every npy stack in OUT_DIR whose absolute
# path contains "_4096".
dask_scenarios = [os.path.abspath(os.path.join(OUT_DIR, entry)) for entry in os.listdir(OUT_DIR)]
for s in dask_scenarios:
    if "_4096" not in s:  # and '839K' not in s:
        continue
    print("Process: %s" % s)
    benchmark_dask_distributed(s)

## Map Blocks
(still exploratory)

In [41]:
#a_dask = da.from_array(a, chunks=(2, 3))
a_dask=da.from_npy_stack(os.path.join(OUT_DIR,'atom_pos_132K.npy_4096'))
a_dask.shape

(131072, 3)

In [42]:
a_dask[0].compute()

memmap([ 458.09997559,  510.39996338,   59.09999847], dtype=float32)

In [None]:
da.

In [33]:
%%time

# Apply SciPy's cdist block-wise to the chunks of a_dask.
# NOTE(review): map_blocks presumably pairs up corresponding blocks of its
# array arguments, so this would only compute distances between each chunk and
# itself rather than the full all-pairs matrix -- confirm before relying on it.
# NOTE: this rebinds `f`, the name an earlier cell uses for the open results
# CSV file handle.
f = map_blocks(lambda a, b: cdist(a,b, 'euclidean'), a_dask, a_dask)
#f = map_blocks(lambda a, b: map_func_test, a_dask, a_dask, chunks=(100,100))
res=f.compute()

CPU times: user 3.32 s, sys: 2.06 s, total: 5.38 s
Wall time: 2.01 s


In [None]:
da.f

## Dask Bag Implementation

* Too slow and not scalable enough
* Cartesian product does not scale sufficiently
* `map_partitions` has a bug in Dask 0.13: it does not work with a Dask Bag created from a Cartesian product

In [45]:
a_db=db.from_sequence(a_dask, partition_size=8192)

In [46]:
all_pairs = a_db.product(a_db)

In [47]:
all_pairs.npartitions

256

In [49]:
%%time
number_pairs = all_pairs.count().compute()

CPU times: user 5min 37s, sys: 43.8 s, total: 6min 21s
Wall time: 5min 31s


In [50]:
%%time 
print "Processing All Pairs of Length: %d"%number_pairs

def dask_dist(a, b, cutoff=15):
    """Return the Euclidean distance between ``a`` and ``b`` along the last axis.

    Args:
        a: First coordinate array.
        b: Second coordinate array, compatible with ``a`` for subtraction.
        cutoff: Currently unused; the raw distance is returned, no mask.

    Returns:
        The result of ``da.sqrt`` applied to the sum of squared differences
        over the last axis.
    """
    squared = (a - b) ** 2
    return da.sqrt(da.sum(squared, axis=-1))

res=all_pairs.map(lambda a: (a[0], a[1], dask_dist(a[0], a[1]))).compute()

Processing All Pairs of Length: 17179869184


Exception in thread Thread-50:
Traceback (most recent call last):
  File "/home/luckow/anaconda2/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/home/luckow/anaconda2/lib/python2.7/threading.py", line 754, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/home/luckow/anaconda2/lib/python2.7/multiprocessing/pool.py", line 326, in _handle_workers
    pool._maintain_pool()
  File "/home/luckow/anaconda2/lib/python2.7/multiprocessing/pool.py", line 230, in _maintain_pool
    self._repopulate_pool()
  File "/home/luckow/anaconda2/lib/python2.7/multiprocessing/pool.py", line 223, in _repopulate_pool
    w.start()
  File "/home/luckow/anaconda2/lib/python2.7/multiprocessing/process.py", line 130, in start
    self._popen = Popen(self)
  File "/home/luckow/anaconda2/lib/python2.7/multiprocessing/forking.py", line 121, in __init__
    self.pid = os.fork()
OSError: [Errno 12] Cannot allocate memory



KeyboardInterrupt: 

# MXNet

In [None]:
import mxnet as mx

In [None]:
%%time
# Copy the NumPy subset `a` into an MXNet NDArray.
a_mx=mx.nd.array(a)
# Reshape to (1, N, D) and (N, 1, D), subtract, and square element-wise
# (power of 2) to obtain the squared pairwise coordinate differences.
diff=(mx.ndarray.power(a_mx.reshape((1, a_mx.shape[0], a_mx.shape[1])) - a_mx.reshape((a_mx.shape[0], 1, a_mx.shape[1])), 2))
# Sum over the coordinate axis, take the square root, and convert the result
# back to a NumPy array.
dist=mx.ndarray.sqrt(mx.ndarray.sum_axis(diff, axis=2)).asnumpy()

In [None]:
dist

# Tensorflow

In [None]:
import tensorflow as tf

In [None]:
tf.__version__

# Benchmark All

In [None]:
# gc is used below to release each coordinate array, but the notebook's import
# cell does not import it.
import gc

# NOTE(review): `files`, `NUMBER_EXECUTORS_SCENARIOS` and
# `benchmark_mdanalysis_dense` are not defined in any visible cell of this
# notebook; this cell needs them in the kernel namespace -- confirm where they
# come from.
# NOTE: the three constants below overwrite the values set in the
# configuration cell at the top of the notebook.
RESULT_DIR="results"
RESULT_FILE_PREFIX="mdanalysis-distance-spark-"
HEADER_CSV="Scenario, NumberAtoms, NumberExecutors, Time"

# Create the results directory on first use; other errors (for example
# missing permissions) propagate instead of being silently swallowed.
if not os.path.isdir(RESULT_DIR):
    os.mkdir(RESULT_DIR)

results = []
d = datetime.datetime.now()
result_filename = RESULT_FILE_PREFIX + d.strftime("%Y%m%d-%H%M%S") + ".csv"
f = open(os.path.join(RESULT_DIR, result_filename), "w")
try:
    f.write(HEADER_CSV + "\n")

    # Repeat the complete sweep ten times.
    for repetition in range(10):
        for file_name in files:
            print("Process: " + file_name)
            coord = np.loadtxt(file_name, dtype='float32')
            # A distinct name for the inner loop variable avoids shadowing the
            # repetition counter of the outer loop.
            for number_executors in NUMBER_EXECUTORS_SCENARIOS:
                #result=benchmark_spark(coord, number_executors)
                result = benchmark_mdanalysis_dense(coord, number_executors)
                #result=benchmark__tf_batch(coord, 500)
                results.append(result)
                # Flush after every row so partial results survive a crash.
                f.write(result + "\n")
                f.flush()
            # Free the coordinates before the next file is loaded.
            del coord
            gc.collect()
finally:
    # Close the results file even when a benchmark raises.
    f.close()
print("Finished run")
print("Finished run")

# Testing and Playing around with Numpy Broadcasting

In [17]:
a_np = np.arange(15).reshape(5,3)
a_np

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [18]:
a_np.shape[-1]

3

In [19]:
x=a_np[:, np.newaxis, :]
y=a_np[np.newaxis,:,:]

In [20]:
y.shape

(1, 5, 3)

In [21]:
x.shape

(5, 1, 3)

In [22]:
(x-y).shape

(5, 5, 3)

In [23]:
x.stack(x)

AttributeError: 'numpy.ndarray' object has no attribute 'stack'