In [2]:
from sklearn.neighbors import NearestNeighbors, BallTree, KDTree
import numpy as np
import networkx as nx
import sys, os
from scipy.spatial.distance import cdist
import time

In [3]:
#from multiprocessing.pool import ThreadPool
RESULT_FILE_PREFIX="tree-pair-distance-"
# NOTE(review): the benchmark loop in this notebook prints four comma-separated
# fields (method, machine, n, seconds) while this header names three columns --
# confirm the intended layout before writing results with this header.
HEADER_CSV="Scenario, Type, Time"
#BASE_DIRECTORY=os.getcwd()
# Dask has issues with NFS home directory on Comet
# BASE_DIRECTORY='/scratch/luckow/7146882'
BASE_DIRECTORY='/oasis/scratch/comet/luckow/temp_project'
#BASE_DIRECTORY='/scratch/luckow/7218009/'
OUT_DIR=os.path.join(BASE_DIRECTORY, "npy_stack")
# Single definition of RESULT_DIR: an earlier bare "results" assignment was
# always overwritten by this one, so it has been dropped.
RESULT_DIR=os.path.join(BASE_DIRECTORY, "results")

# Relative paths to the atom-position arrays, ordered by increasing size.
FILENAMES=["../132k_dataset/atom_pos_132K.npy", "../145K_dataset/atom_pos_145K.npy", 
          "../300K_dataset/atom_pos_291K.npy", '../840K_dataset/atom_pos_839K.npy']

# Dummy Data and scikit-learn

In [4]:
# Small synthetic data set: `number_points` rows of three consecutive integers
# each (0, 1, 2 / 3, 4, 5 / ...).
number_points = 10
points_local_np = np.arange(3 * number_points).reshape((number_points, 3))

In [5]:
def pairwise_distance_balltree(points_np, cutoff=15.0, leaf_size=40):
    """Find all pairs of points lying within ``cutoff`` of each other using a BallTree.

    A scikit-learn ``BallTree`` is built over ``points_np`` and then queried
    with every point as a query center. Each unordered pair is reported once,
    as ``(i, j)`` with ``i < j``; self-pairs are dropped.

    Parameters
    ----------
    points_np : array-like
        Point coordinates, one row per point, as accepted by ``BallTree``.
    cutoff : float, optional
        Search radius handed to ``query_radius``. Defaults to 15.0, the value
        that was previously hard-coded.
    leaf_size : int, optional
        Leaf size handed to the ``BallTree`` constructor. Defaults to 40, the
        value that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_pairs, 2)``; each row holds the indices
        (into ``points_np``) of one neighboring pair. Rows are ordered by
        source index, then in the order the tree returned the neighbors.
    """
    tree = BallTree(points_np, leaf_size=leaf_size)
    # One index array per query point (an object array of ragged arrays).
    neighbors = tree.query_radius(points_np, cutoff)

    # Expand the ragged result into two flat, aligned index vectors without
    # creating a Python tuple/list per pair.
    counts = np.fromiter((len(dest) for dest in neighbors),
                         dtype=np.intp, count=len(neighbors))
    sources = np.repeat(np.arange(len(neighbors)), counts)
    targets = np.concatenate(list(neighbors))

    # Keep i < j only: removes self-matches and the mirrored (j, i) duplicates.
    keep = sources < targets
    return np.column_stack((sources[keep], targets[keep]))

In [6]:
def pairwise_distance_kdtree(points_np, cutoff=15.0, leaf_size=40):
    """Find all pairs of points lying within ``cutoff`` of each other using a KDTree.

    A scikit-learn ``KDTree`` is built over ``points_np`` and then queried
    with every point as a query center. Each unordered pair is reported once,
    as ``(i, j)`` with ``i < j``; self-pairs are dropped.

    Parameters
    ----------
    points_np : array-like
        Point coordinates, one row per point, as accepted by ``KDTree``.
    cutoff : float, optional
        Search radius handed to ``query_radius``. Defaults to 15.0, the value
        that was previously hard-coded.
    leaf_size : int, optional
        Leaf size handed to the ``KDTree`` constructor. Defaults to 40, the
        value that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_pairs, 2)``; each row holds the indices
        (into ``points_np``) of one neighboring pair. Rows are ordered by
        source index, then in the order the tree returned the neighbors.
    """
    tree = KDTree(points_np, leaf_size=leaf_size)
    # One index array per query point (an object array of ragged arrays).
    neighbors = tree.query_radius(points_np, cutoff)

    # Expand the ragged result into two flat, aligned index vectors without
    # creating a Python tuple/list per pair.
    counts = np.fromiter((len(dest) for dest in neighbors),
                         dtype=np.intp, count=len(neighbors))
    sources = np.repeat(np.arange(len(neighbors)), counts)
    targets = np.concatenate(list(neighbors))

    # Keep i < j only: removes self-matches and the mirrored (j, i) duplicates.
    keep = sources < targets
    return np.column_stack((sources[keep], targets[keep]))

In [7]:
def pairwise_distance_cdist(points_np, cutoff=15.0):
    """Find all pairs of points closer than ``cutoff`` via a dense distance matrix.

    Brute-force reference implementation: the full ``n x n`` distance matrix
    is computed with ``scipy.spatial.distance.cdist``, so memory and time grow
    quadratically with the number of points.

    Parameters
    ----------
    points_np : array-like of shape (n_points, n_dims)
        Point coordinates, one row per point.
    cutoff : float, optional
        Pairs with distance strictly less than this value are reported.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_pairs, 2)`` with rows ``(i, j)``, ``i < j``,
        in row-major order. The shape is ``(0, 2)`` when no pair qualifies.
    """
    distances = cdist(points_np, points_np)
    # k=1 keeps the strictly upper triangle, i.e. i < j: this drops the
    # zero-distance diagonal and the mirrored (j, i) duplicates in one step.
    close_pairs = np.triu(distances < cutoff, k=1)
    # argwhere yields an (n_pairs, 2) index array directly. The previous
    # np.array(zip(...)) only produced that shape on Python 2; on Python 3 it
    # wraps the zip iterator in a 0-d object array and the indexing fails.
    return np.argwhere(close_pairs)

In [8]:
# Read the 132K atom-position file and keep only its first 50,000 rows, which
# the comparison cells below operate on.
atoms = np.load("../132k_dataset/atom_pos_132K.npy")[:50000]

In [11]:
%%time
# Time the dense-matrix (cdist) pair search on the loaded atoms; its result is
# kept as the reference for comparison with the tree-based search.
res_cdist=pairwise_distance_cdist(atoms)

CPU times: user 27.1 s, sys: 41.9 s, total: 1min 8s
Wall time: 2min 17s


In [12]:
%%time
res_tree=pairwise_distance_tree(atoms)

CPU times: user 1.68 s, sys: 164 ms, total: 1.84 s
Wall time: 1.87 s


In [13]:
# Number of index pairs returned by the tree-based search.
len(res_tree)

290917

In [14]:
# Number of index pairs returned by the cdist-based search.
len(res_cdist)

290917

In [15]:
# Preview the first five pairs of the tree-based result.
res_tree[:5]

array([[ 0, 11],
       [ 0,  8],
       [ 0,  4],
       [ 0, 32],
       [ 0,  7]])

In [16]:
# Preview the first five pairs of the cdist-based result.
res_cdist[:5]

array([[0, 4],
       [0, 5],
       [0, 6],
       [0, 7],
       [0, 8]])

In [19]:
# Turn each pair list into an undirected graph so the two results can be
# compared as edge sets, independent of row order.
g1, g2 = (nx.from_edgelist(pairs) for pairs in (res_tree, res_cdist))

In [25]:
# Edges present in exactly one of the two graphs; an empty result means both
# searches found the same pairs. nx.difference(g1, g2) alone would only reveal
# edges missing from g2, not edges missing from g1.
nx.symmetric_difference(g1, g2).edges()

[]

In [11]:
# Benchmark the three pair-search implementations on growing prefixes of the
# largest dataset. Each measurement is printed as "<method>, <machine>, <n>, <seconds>".
machine = "comet"
# cdist materializes a full n x n distance matrix, so it is only run for
# subsets smaller than this many points.
cdist_max_points = 160000
# Load once up front: the file does not change between the 10 repetitions.
atoms = np.load("../840K_dataset/atom_pos_839K.npy")
for i in range(10):
    for n in [10000, 20000, 40000, 80000, 160000, 320000, 640000, 839000]:
        a = atoms[:n]
        methods = [("BallTree", pairwise_distance_balltree),
                   ("KDTree", pairwise_distance_kdtree)]
        if n < cdist_max_points:
            methods.append(("cdist", pairwise_distance_cdist))
        for label, method in methods:
            start = time.time()
            res = method(a)
            end = time.time()
            print ("%s, %s, %d, %.4f"%(label, machine, n, end-start))

BallTree, comet, 10000, 0.2110
KDTree, comet, 10000, 0.2302
cdist, comet, 10000, 1.0401
BallTree, comet, 20000, 0.5451
KDTree, comet, 20000, 0.6486
cdist, comet, 20000, 4.0374
BallTree, comet, 40000, 1.2871
KDTree, comet, 40000, 1.4574
cdist, comet, 40000, 15.8857
BallTree, comet, 80000, 2.8230
KDTree, comet, 80000, 2.9835
cdist, comet, 80000, 62.9494
BallTree, comet, 160000, 6.3881
KDTree, comet, 160000, 6.6423
BallTree, comet, 320000, 13.8700
KDTree, comet, 320000, 15.0973
BallTree, comet, 640000, 33.7613
KDTree, comet, 640000, 39.3834
BallTree, comet, 839000, 50.6539
KDTree, comet, 839000, 54.7261
BallTree, comet, 10000, 0.1957
KDTree, comet, 10000, 0.2436
cdist, comet, 10000, 1.0404
BallTree, comet, 20000, 0.4418
KDTree, comet, 20000, 0.5222
cdist, comet, 20000, 4.0370
BallTree, comet, 40000, 0.9857
KDTree, comet, 40000, 1.4793
cdist, comet, 40000, 15.8929
BallTree, comet, 80000, 2.8776
KDTree, comet, 80000, 2.9804
cdist, comet, 80000, 63.0422
BallTree, comet, 160000, 6.2109
KDTree

In [30]:
# List the files in the 840K dataset directory.
!ls ../840K_dataset 

atom_pos_839K.npy
