# Mapper Function for KMeans

Optimized computation of distance between points and centroids

In [2]:
import sklearn
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np
import time

## Read data from file

In [3]:
# Path of the dataset to cluster. The commented-out line points at an
# alternative input file (presumably a small test set, judging by its
# name -- TODO confirm); swap the two lines to use it instead.
#data_file = "/data/kmeans/dataset_200_2d.in"
data_file = "/data/kmeans/dataset_10M_2d.in"

In [4]:
!head -n 2 $data_file

-4594.04586276
12194.3621136


In [5]:
start = time.time()
data = np.loadtxt(data_file)
print "Loading Time: %.2f sec"%(time.time()-start)

Loading Time: 42.77 sec


In [6]:
# Problem size parameters.
num_dimensions = 2    # coordinates per point
num_clusters = 5000   # number of centroids to pick

# `data` is still a flat array of coordinate values at this point (it is
# reshaped to (num_points, num_dimensions) further down). Use explicit floor
# division so num_points stays an integer regardless of the division
# semantics in effect (Python 3 / `from __future__ import division`);
# reshape() requires integer dimensions.
num_points = len(data) // num_dimensions

## Reshape the flat input array into one row per point (num_points × num_dimensions)

In [7]:
# Turn the flat value array into one row per point:
# shape becomes (num_points, num_dimensions).
data = np.reshape(data, (num_points, num_dimensions))

In [8]:
print "Data Shape: " + str(data.shape)
print "First point: " + str(data[0])

Data Shape: (10000000, 2)
First point: [ -4594.04586276  12194.3621136 ]


## Pick `num_clusters` random points from the data as initial centroids

In [9]:
# Draw num_clusters distinct row indices at random (no replacement) and take
# the corresponding points as the initial centroids.
clusters = data[np.random.choice(len(data), size=num_clusters, replace=False)]

In [10]:
print "First Centroid: " + str(clusters[0])

First Centroid: [-17427.9065159    6115.02101404]


In [11]:
print "First Distance: " + str(np.sqrt(sum((data[0] - clusters[0]) ** 2)))

First Distance: 14200.9284017


## Compute Distance between all points and centroids

Sklearn Documentation: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.euclidean_distances.html

In [None]:
start = time.time()
distance = sklearn.metrics.pairwise.euclidean_distances(data, clusters)
print "Distance Computation Time (sklearn): %.2f sec"%(time.time()-start)

In [None]:
# Display the shape of the matrix returned by euclidean_distances
# (expected: one row per point, one column per centroid -- TODO confirm).
distance.shape

# Dask Testing

Dask Paper: http://nipy.bic.berkeley.edu:5000/download/24

Documentation: http://dask.pydata.org/en/latest/array-creation.html

In [12]:
import dask.array as da

In [13]:
# Wrap the in-memory point array as a dask array; chunks=10000 is the block
# size dask uses to split the array into pieces.
dask_data = da.from_array(data, chunks=10000)

In [14]:
# Wrap the centroid array as a dask array, split into blocks of size 100.
dask_clusters = da.from_array(clusters, chunks=100)

In [15]:
import multiprocessing
from multiprocessing.pool import ThreadPool

# Hand dask a pool of 12 worker processes to execute its tasks on.
# Disabled alternative: a thread pool with the default number of workers.
#pool = ThreadPool()
pool = multiprocessing.Pool(processes=12)
da.set_options(pool=pool)

# Disabled experiment: calling scikit-learn directly on the dask arrays.
#start = time.time()
#distance = sklearn.metrics.pairwise.euclidean_distances(dask_data, dask_clusters)
#print "Distance Computation Time (sklearn/dask): %.2f sec"%(time.time()-start)

<dask.context.set_options at 0x7f9a6844dad0>

In [None]:
# Pairwise Euclidean distance between every point and every centroid.
#
# Broadcasting: dask_data[:, :, None] is (points, dims, 1) and
# dask_clusters.T[None, :, :] is (1, dims, clusters), so their difference is
# (points, dims, clusters).
#
# The squared differences must be summed over the dimension axis (axis=1)
# BEFORE taking the square root. Applying sqrt to the squares first gives
# |a - b| per coordinate, and summing that yields the Manhattan distance
# instead of the Euclidean one.
#
# This only builds the dask expression; nothing is computed until the result
# is materialized.
squared_diff = (dask_data[:, :, None] - dask_clusters.T[None, :, :]) ** 2
result = da.sqrt(squared_diff.sum(axis=1))

In [None]:
start = time.time()
dist_np=np.array(result) 
print "Distance Computation Time (sklearn/dask): %.2f sec"%(time.time()-start)

In [None]:
# Display row 1 of the materialized result: the values computed for the
# second data point, one entry per centroid.
dist_np[1]