# KMeans Spark

https://spark.apache.org/docs/latest/mllib-clustering.html#k-means

1. Start Spark Cluster:

Link to iPython Notebook: [Spark on Stampede](../02_hadoop_on_hpc/Stampede.ipynb)

In [1]:
from numpy import array
from math import sqrt

%run ../env.py
%run ../util/init_spark.py

from pilot_hadoop import PilotComputeService as PilotSparkComputeService

pilotcompute_description = {
    "service_url": "yarn-client://sc15.radical-cybertools.org",
    "number_of_processes": 5
}
pilot_spark = PilotSparkComputeService.create_pilot(pilotcompute_description=pilotcompute_description)
sc = pilot_spark.get_spark_context()

SPARK Home: /usr/hdp/2.3.0.0-2557/spark-1.5.0/


## MLLib KMeans

In [62]:
from pyspark.mllib.clustering import KMeans, KMeansModel
# Load and parse the data

# index for points
idx = []
for i in range(1000000):
    idx.extend([i]*3)    
idx_rdd = sc.parallelize(idx, numSlices=100)

# 3D points stored on 3 lines
data = sc.textFile("/data/kmeans/dataset_1M_3d.in")
parsedData = data.map(lambda x: float(x))

In [63]:
parsedData.count()

3000000

In [64]:
# merge data
parsedData = parsedData.repartition(1)
idx_rdd = idx_rdd.repartition(1)
rdd = idx_rdd.zip(parsedData)

In [72]:
input_data = rdd.groupByKey().mapValues(list).values()
input_data = input_data.repartition(100)

In [79]:
input_data.count()

1000000

In [76]:
input_data.getNumPartitions()

100

In [77]:
start = time.time()
# Build the model (cluster the data)
clusters = KMeans.train(input_data, 50, maxIterations=10, runs=1, initializationMode="random")
end = time.time()
print "Training Time %.2f"%(end-start)

Training Time 19.2


In [78]:
# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = input_data.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))


Within Set Sum of Squared Error = 5522621904.56


In [5]:
str(clusters)

'<pyspark.mllib.clustering.KMeansModel object at 0x2b03471b3810>'

## Scikit-Learn

In [83]:
import numpy as np
start = time.time()
data = np.loadtxt("/data/kmeans/pilot-hadoop/dataset_1M_3d.in")
print "Loading Time: %.2f sec"%(time.time()-start)

Loading Time: 6.41 sec


In [84]:
num_dimensions = 2
num_clusters = 5000
num_points = len(data)/num_dimensions 

In [85]:
data=data.reshape(num_points, num_dimensions)

In [None]:
start = time.time()
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=50)
results = kmeans.fit_predict(data)
print "KMeans Training Time: %.2f sec"%(time.time()-start)