In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [3]:
from pyspark.ml.clustering import KMeans

In [4]:
# Read the sample K-means data, stored in libsvm format.
dataset = (
    spark.read
    .format('libsvm')
    .load('sample_kmeans_data.txt')
)

In [5]:
# Preview the loaded rows; the DataFrame has a `label` and a `features` column.
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [6]:
# Clustering only needs the feature vectors, so keep just the `features` column.
final_data = dataset.select('features')

In [7]:
# Check that only the `features` column remains.
final_data.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



In [8]:
# Unlabeled data such as final_data is the more realistic scenario for clustering.
# Configure the K-means estimator with 2 clusters and a fixed random seed.
kmeans = KMeans(k=2, seed=1)

In [9]:
# Fit the model.
# K-means is unsupervised, so a DataFrame holding only the features column is enough.
model = kmeans.fit(final_data)

In [10]:
# Within Set Sum of Squared Errors (WSSSE): the sum of squared distances from
# each point to its nearest cluster center.
# This differs from the evaluation metrics presented earlier for regression
# and classification.
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in Spark 3.0;
# the training summary reports the same cost for the data the model was fit on.
wssse = model.summary.trainingCost

In [11]:
# Show the clustering cost computed above.
print(wssse)

0.11999999999994547


In [15]:
# Centers of the 2 clusters, returned as a list with one array per cluster.
centers = model.clusterCenters()
centers  # bare last expression so the notebook displays the value

[array([0.1, 0.1, 0.1]), array([9.1, 9.1, 9.1])]

In [16]:
# Note that final_data itself still has only the features column:
# the cluster assignments have not been generated yet.
final_data.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



In [17]:
# Generate the cluster label for each row: transform() returns the features
# together with a `prediction` column holding the assigned cluster.
results = model.transform(final_data)

In [18]:
# Each row now carries the index of its assigned cluster in `prediction`.
results.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         0|
|(3,[0,1,2],[0.1,0...|         0|
|(3,[0,1,2],[0.2,0...|         0|
|(3,[0,1,2],[9.0,9...|         1|
|(3,[0,1,2],[9.1,9...|         1|
|(3,[0,1,2],[9.2,9...|         1|
+--------------------+----------+



If you set 3 clusters with "setK(3)", the features will be grouped into 3 different predictions (there will be 3 centers), and so on. WSSSE will decrease as you increase K, because the sum of squared distances from each point to its nearest center decreases.