# Mapper Function for KMeans

Optimized computation of distance between points and centroids

In [1]:
import sklearn
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np
import time

## Read data from file

In [2]:
# Path of the input file read by the loading cell below. It is an absolute
# path, so the file must exist at this location on the machine running the
# notebook.
# Alternative input (presumably a smaller data set, judging by its name);
# uncomment to use it instead:
#data_file = "/data/kmeans/dataset_200_2d.in"
data_file = "/data/kmeans/dataset_10M_2d.in"

In [3]:
# Shell escape: print the first two lines of the input file to check its layout.
!head -n 2 $data_file

-4594.04586276
12194.3621136


In [4]:
# Parse the whole text file into a NumPy array and report how long the
# parsing took (wall-clock seconds).
start = time.time()
data = np.loadtxt(data_file)
load_seconds = time.time() - start
print("Loading Time: %.2f sec" % load_seconds)

Loading Time: 42.41 sec


In [5]:
# Problem size: every point has `num_dimensions` coordinates and the
# clustering uses `num_clusters` centroids.
num_dimensions = 2
num_clusters = 1000
# Derive the point count from the total number of values using integer
# division. `//` keeps the result an int on Python 3 (a float here would be
# rejected by reshape), and `data.size` is unaffected by the array's shape,
# so re-running this cell after `data` has been reshaped to 2-D still gives
# the right count (len(data) would then count rows, not values).
num_points = data.size // num_dimensions

## Reshape the loaded data into points with `num_dimensions` coordinates

In [6]:
# Turn the loaded values into one row per point. Passing -1 lets NumPy infer
# the row count, which gives the same result as an explicit point count
# whenever the number of values is a multiple of `num_dimensions` (and raises
# otherwise). Unlike an explicit count, it is safe to re-run this cell: an
# already reshaped array keeps its shape.
data = data.reshape(-1, num_dimensions)

In [7]:
# Sanity check after reshaping: report the array's shape and its first row.
shape_text = str(data.shape)
first_point_text = str(data[0])
print("Data Shape: " + shape_text)
print("First point: " + first_point_text)

Data Shape: (10000000, 2)
First point: [ -4594.04586276  12194.3621136 ]


## Extract `num_clusters` random points from the data as initial centroids

In [8]:
# Pick `num_clusters` distinct rows of `data` as the initial centroids.
# The indices come from a dedicated, seeded generator so that a fresh
# Restart & Run All selects the same centroids every time; change the seed
# to try a different initialisation.
centroid_rng = np.random.RandomState(42)
centroid_rows = centroid_rng.choice(data.shape[0], num_clusters, replace=False)
clusters = data[centroid_rows, :]

In [9]:
# Spot check: show the first of the sampled centroids.
first_centroid = clusters[0]
print("First Centroid: " + str(first_centroid))

First Centroid: [ 13801.7574992    1004.71996604]


In [None]:
# Hand-computed Euclidean distance between the first point and the first
# centroid: square root of the summed squared coordinate differences.
offset = data[0] - clusters[0]
first_distance = np.sqrt(sum(offset ** 2))
print("First Distance: " + str(first_distance))

First Distance: 21531.6899644


## Compute the distances between all points and all centroids

Sklearn Documentation: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.euclidean_distances.html

In [None]:
# Time one dense pairwise-distance computation with scikit-learn.
# NOTE(review): the result holds one entry per (point, centroid) pair. For the
# 10,000,000 x 2 data set and 1000 centroids used in this notebook that is
# 1e10 entries (roughly 80 GB if stored as float64), so this cell presumably
# needs a large-memory machine or a smaller input - confirm before running.
start = time.time()
# Call the function through the name imported at the top of the notebook
# instead of walking `sklearn.metrics.pairwise`, which is only reachable
# as an attribute because that submodule happened to be imported already.
distance = euclidean_distances(data, clusters)
print("Distance Computation Time (sklearn): %.2f sec" % (time.time() - start))

In [None]:
# Display the shape of the distance matrix computed in the previous cell
# (expected: one row per point, one column per centroid - confirm).
distance.shape

# Dask Testing

Dask Paper: http://nipy.bic.berkeley.edu:5000/download/24

Documentation: http://dask.pydata.org/en/latest/array-creation.html

In [None]:
import dask.array as da

In [None]:
# Wrap the NumPy array in a dask array split into about 24 row blocks
# (NOTE(review): 24 presumably matches the number of available cores - confirm).
# `data` is 2-D, so the chunk specification needs one entry per axis: rows are
# split, while every block keeps all coordinates of its points. Integer
# division keeps the chunk size an int on Python 3, and max() guards against a
# zero chunk size for inputs with fewer than 24 points.
rows_per_chunk = max(1, data.shape[0] // 24)
dask_data = da.from_array(data, chunks=(rows_per_chunk, data.shape[1]))

In [None]:
# Display the dask array's summary (shape, dtype and chunking) to verify
# how the data was split.
dask_data

In [None]:
# Time the same distance computation, but driven by dask: the scikit-learn
# kernel is applied independently to every row block of `dask_data`.
# (Previously this cell passed the plain NumPy `data` to scikit-learn, so the
# dask array was never used and the timing did not measure dask at all.)
# NOTE(review): .compute() still materialises the full points x centroids
# matrix in memory - confirm the machine can hold it.
start = time.time()
# Make sure each block holds complete points (a single chunk along the
# coordinate axis); the kernel needs all coordinates of a point at once.
point_blocks = dask_data.rechunk({1: data.shape[1]})
distance = point_blocks.map_blocks(
    euclidean_distances,
    Y=clusters,
    # Each block keeps its rows but gets one column per centroid.
    chunks=(point_blocks.chunks[0], (clusters.shape[0],)),
    dtype=data.dtype,
).compute()
print("Distance Computation Time (sklearn/dask): %.2f sec" % (time.time() - start))