### Tutorial on K-Means clustering algorithm

#### For more information please visit:
##### https://spark.apache.org/docs/latest/ml-clustering.html

##### https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.clustering.KMeans

In [2]:
# Import libraries for this tutorial
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer,StandardScaler
from pyspark.ml.clustering import KMeans

In [3]:
# Obtain the SparkSession for this notebook, registered under the app name 'clustering'
spark = (
    SparkSession.builder
    .appName('clustering')
    .getOrCreate()
)

In [4]:
# Load the sample K-Means data file, which is stored in libsvm format
data = spark.read.load("sample_kmeans_data.txt", format="libsvm")

In [6]:
# Print the schema of the input data: a double `label` column and a vector `features` column
data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [5]:
# Preview the rows that were loaded
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [7]:
# Create a K-Means clustering estimator with the number of clusters (k) as an argument.
# A fixed seed makes the centroid initialisation repeatable, so re-running the notebook
# yields the same cluster ids (the value 1 itself is arbitrary).
kmeans = KMeans(k=2, seed=1)

In [8]:
# Build the clustering model by fitting the estimator to the loaded data
model = kmeans.fit(data)

In [9]:
# Get the model centroids (returned as a list with one array per cluster)
model.clusterCenters()

[array([ 0.1,  0.1,  0.1]), array([ 9.1,  9.1,  9.1])]

In [9]:
# Find the predictions: the transformed frame carries an extra `prediction` column
# holding the cluster id assigned to each row
model.transform(data).show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|           (3,[],[])|         1|
|  1.0|(3,[0,1,2],[0.1,0...|         1|
|  2.0|(3,[0,1,2],[0.2,0...|         1|
|  3.0|(3,[0,1,2],[9.0,9...|         0|
|  4.0|(3,[0,1,2],[9.1,9...|         0|
|  5.0|(3,[0,1,2],[9.2,9...|         0|
+-----+--------------------+----------+



In [10]:
# Compute the WSSSE (within-set sum of squared errors) cost.
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in Spark 3.0;
# the training summary (Spark >= 2.4) reports the cost on the training data instead.
wssse = model.summary.trainingCost
print("WSSSE: ", wssse)

WSSSE:  0.11999999999994547


### Work on the seeds dataset

In [11]:
# Load the seeds CSV, taking column names from the header row and inferring column types
data = (
    spark.read
    .format("csv")
    .options(inferSchema=True, header=True)
    .load("seeds_dataset.csv")
)

In [12]:
# Print the schema of the seeds data: seven double-typed measurement columns
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [13]:
# Preview the first rows of the seeds data
data.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|             4.805|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|             5.175|
|14.38|    14.21|     0.8951|             5.386|             3.312|   2.4619999999999997|             4.956|
|14.69|    14.49|  

In [14]:
# Pack every column of the seeds data into a single vector column named 'features',
# then preview the assembled frame
assembler = VectorAssembler(
    inputCols=data.columns,
    outputCol='features',
)
feature_data = assembler.transform(data)
feature_data.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|[15.26,14.84,0.87...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|[14.88,14.57,0.88...|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|[14.29,14.09,0.90...|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|             4.805|[13.84,13.94,0.89...|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355| 

In [15]:
# Create a K-Means estimator for three clusters over the assembled 'features' column
# (fitting happens in the next cell).
# A fixed seed makes the centroid initialisation repeatable across runs; the value 1 is arbitrary.
kmeans = KMeans(k=3, featuresCol='features', seed=1)

In [16]:
# Fit the estimator to the assembled (unscaled) feature vectors
model = kmeans.fit(feature_data)

In [17]:
# Compute the cost (within-set sum of squared errors).
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in Spark 3.0;
# the training summary (Spark >= 2.4) reports the cost on the training data instead.
print("WSSSE: ", model.summary.trainingCost)

WSSSE:  587.3193471468309


In [18]:
# Show only the cluster id assigned to each row (show() prints just the first 20 rows)
model.transform(feature_data).select("prediction").show()

+----------+
|prediction|
+----------+
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         0|
|         1|
|         1|
|         0|
+----------+
only showing top 20 rows



In [19]:
# With feature normalization.
# withMean / withStd are written out at their default values: the features are
# divided by their standard deviation but not mean-centred.
feature_norm = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=False,
    withStd=True,
)

In [20]:
# Fit the scaler on the assembled features, then apply it to the same rows;
# the result gains a `scaledFeatures` column
norm_data = feature_norm.fit(feature_data).transform(feature_data)

In [21]:
# Preview the frame with the raw `features` and the `scaledFeatures` side by side
norm_data.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|      scaledFeatures|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|[15.26,14.84,0.87...|[5.24452795332028...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|[14.88,14.57,0.88...|[5.11393027165175...|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|[14.29,14.09,0.90...|[4.91116018695588...|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|     

In [22]:
# Create a K-Means estimator for three clusters that reads the 'scaledFeatures' column
# (fitting happens in the next cell).
# A fixed seed makes the centroid initialisation repeatable across runs; the value 1 is arbitrary.
kmeans = KMeans(k=3, featuresCol='scaledFeatures', seed=1)

In [23]:
# Fit the estimator to the scaled feature vectors
model = kmeans.fit(norm_data)

In [24]:
# Compute the cost (within-set sum of squared errors) on the scaled features.
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in Spark 3.0;
# the training summary (Spark >= 2.4) reports the cost on the training data instead.
print("WSSSE: ", model.summary.trainingCost)

WSSSE:  429.0317159380717


In [25]:
# Show only the cluster id assigned to each row by the model trained on scaled features
model.transform(norm_data).select("prediction").show()

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         1|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         2|
+----------+
only showing top 20 rows



### Working with hacked dataset

In [26]:
# Load the hack CSV, taking column names from the header row and inferring column types
data = (
    spark.read
    .format("csv")
    .options(inferSchema=True, header=True)
    .load("hack_data.csv")
)

In [27]:
# Print the schema: numeric session metrics plus one string column, `Location`
data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [28]:
# Preview the first rows of the hack data
data.show()

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|             Bolivia|            70.8|
|                   20.0|            408.5|              0|             3.57

In [29]:
# Inspect the first record; head(1) returns a list containing one Row
data.head(1)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37)]

In [30]:
# List the column names (note that 'Bytes Transferred' contains a space)
data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [31]:
# Create a vector assembler over the numeric columns; the string column
# 'Location' is deliberately left out of the feature vector
assembler = VectorAssembler(
    inputCols=[
        'Session_Connection_Time',
        'Bytes Transferred',
        'Kali_Trace_Used',
        'Servers_Corrupted',
        'Pages_Corrupted',
        'WPM_Typing_Speed',
    ],
    outputCol='features',
)

In [32]:
# Apply the assembler so each row gains a `features` vector, then preview the result
features_data = assembler.transform(data)
features_data.show()

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|            features|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|[8.0,391.09,1.0,2...|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|[20.0,720.99,0.0,...|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|[31.0,356.32,1.0,...|
|                    2.0|           228.08|              1|             2.48|     

In [33]:
# Scaler for the assembled features.
# withMean / withStd are written out at their default values: the features are
# divided by their standard deviation but not mean-centred.
norm = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withMean=False,
    withStd=True,
)

In [34]:
# Fit the scaler on the assembled features, then apply it to the same rows;
# the result gains a `scaledFeatures` column
norm_data = norm.fit(features_data).transform(features_data)

In [35]:
# Preview the frame with the raw `features` and the `scaledFeatures` side by side
norm_data.show()

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------------+--------------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|            features|      scaledFeatures|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------------+--------------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|[8.0,391.09,1.0,2...|[0.56785108466505...|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|[20.0,720.99,0.0,...|[1.41962771166263...|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58

In [36]:
# Create two K-Means estimators over the scaled features, one for k=2 and one for k=3,
# so the two candidate cluster counts can be compared.
# Both share a fixed seed so the comparison is repeatable across runs; the value 1 is arbitrary.
kmeans2 = KMeans(k=2, featuresCol='scaledFeatures', seed=1)
kmeans3 = KMeans(k=3, featuresCol='scaledFeatures', seed=1)

In [37]:
# Fit the model for each candidate k on the same scaled data
model2 = kmeans2.fit(norm_data)
model3 = kmeans3.fit(norm_data)

In [38]:
# Compute the cost (within-set sum of squared errors) of each model.
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in Spark 3.0;
# the training summary (Spark >= 2.4) reports the cost on the training data instead.
print("Model2: ", model2.summary.trainingCost)
print("Model3: ", model3.summary.trainingCost)

Model2:  601.7707512676716
Model3:  434.1492898715845


In [39]:
# Centroids of the k=2 model; the model was fit on `scaledFeatures`, so the
# coordinates are in scaled units, one value per assembled input column
model2.clusterCenters()

[array([ 2.99991988,  2.92319035,  1.05261534,  3.20390443,  4.51321315,
         3.28474   ]),
 array([ 1.26023837,  1.31829808,  0.99280765,  1.36491885,  2.5625043 ,
         5.26676612])]

In [40]:
# Centroids of the k=3 model, in the same scaled feature space as the k=2 model
model3.clusterCenters()

[array([ 2.99991988,  2.92319035,  1.05261534,  3.20390443,  4.51321315,
         3.28474   ]),
 array([ 1.30217042,  1.25830099,  0.        ,  1.35793211,  2.57251009,
         5.24230473]),
 array([ 1.21780112,  1.37901802,  1.99757683,  1.37198977,  2.55237797,
         5.29152222])]

In [41]:
# Count how many rows each model assigns to each cluster id
preds2 = model2.transform(norm_data).groupBy("prediction").count()
preds3 = model3.transform(norm_data).groupBy("prediction").count()

In [42]:
# Cluster sizes for the k=2 model
preds2.show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [43]:
# Cluster sizes for the k=3 model
preds3.show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   84|
|         2|   83|
|         0|  167|
+----------+-----+



In [44]:
# Sweep k from 2 to 8 and print the training cost for each value, to see how the
# cost falls as clusters are added.
for kk in range(2, 9):
    # Same fixed seed for every k so the sweep is repeatable across runs (value 1 is arbitrary)
    kmeans = KMeans(k=kk, featuresCol='scaledFeatures', seed=1)
    model = kmeans.fit(norm_data)
    # KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in Spark 3.0;
    # the training summary (Spark >= 2.4) reports the cost on the training data instead.
    wssse = model.summary.trainingCost
    print("The cost for k = %d is %f"%(kk, wssse))

The cost for k = 2 is 601.770751
The cost for k = 3 is 434.149290
The cost for k = 4 is 415.161397
The cost for k = 5 is 248.884629
The cost for k = 6 is 239.028015
The cost for k = 7 is 226.714016
The cost for k = 8 is 202.143083
