In [1]:
import os
import findspark
# Locate the Spark installation. Prefer SPARK_HOME from the environment so the
# notebook is not tied to one machine's directory layout; fall back to the
# original local install path when the variable is unset or empty.
findspark.init(os.environ.get('SPARK_HOME') or '/home/shashank/spark-2.3.2-bin-hadoop2.7')
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse) the Spark session for this notebook under the app name 'ca_clust'.
spark = (
    SparkSession.builder
    .appName('ca_clust')
    .getOrCreate()
)

In [4]:
from pyspark.ml.clustering import KMeans

In [5]:
# Load the seeds CSV: the first line is treated as a header and column types are inferred.
data = spark.read.csv(
    'seeds_dataset.csv',
    header=True,
    inferSchema=True,
)

In [7]:
# Check the column names and the types inferred while reading the CSV.
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [8]:
from pyspark.ml.feature import VectorAssembler

In [9]:
# List the column names; the assembler below takes all of them as input columns.
data.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [10]:
# Pack every column of `data` into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=data.columns,
    outputCol='features',
)

In [11]:
# Apply the assembler to the raw data; the result carries the original columns
# plus the assembled 'features' vector column.
final_data = assembler.transform(data)

In [13]:
# Inspect the first row to verify the assembled 'features' vector.
final_data.head()

Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]))

In [14]:
from pyspark.ml.feature import StandardScaler

In [15]:
# Scaler that reads the 'features' vector and writes its result to 'scaledFeatures'.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
)

In [16]:
# Fit the scaler and append its output column to final_data.
# Any existing scaler output column is dropped first so this cell can be re-run:
# without the drop, a second execution fails because final_data already contains
# the 'scaledFeatures' column. DataFrame.drop is a no-op when the column is absent,
# so the first execution produces the same result as before.
assembled_data = final_data.drop(scaler.getOutputCol())
scaler_model = scaler.fit(assembled_data)
final_data = scaler_model.transform(assembled_data)

In [17]:
# Preview the frame; it now has a scaledFeatures column alongside features.
final_data.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|      scaledFeatures|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|[15.26,14.84,0.87...|[5.24452795332028...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|[14.88,14.57,0.88...|[5.11393027165175...|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|[14.29,14.09,0.90...|[4.91116018695588...|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|     

In [18]:
# k-means estimator: 3 clusters over the 'scaledFeatures' column, at most 100
# iterations, with a fixed seed.
kmeans = KMeans(
    featuresCol='scaledFeatures',
    k=3,
    maxIter=100,
    seed=101,
)

In [19]:
# Fit the k-means estimator on final_data (it reads the 'scaledFeatures' column).
clust_model = kmeans.fit(final_data)

In [21]:
# Within-set sum of squared errors (WSSSE) of the fitted model on final_data.
# NOTE(review): computeCost is reportedly deprecated in later Spark releases —
# confirm before moving this notebook off Spark 2.3.x.
clust_model.computeCost(final_data)

429.07559671506715

In [23]:
# One center per cluster (k=3). The model was fit on 'scaledFeatures', so these
# coordinates are in the scaled feature space, not the original units.
clust_model.clusterCenters()

[array([ 4.87257659, 10.88120146, 37.27692543, 12.3410157 ,  8.55443412,
         1.81649011, 10.32998598]),
 array([ 6.31670546, 12.37109759, 37.39491396, 13.91155062,  9.748067  ,
         2.39849968, 12.2661748 ]),
 array([ 4.06105916, 10.13979506, 35.80536984, 11.82133095,  7.50395937,
         3.27184732, 10.42126018])]

In [25]:
# Run the fitted model over final_data; the result is queried below through its
# 'prediction' column (the cluster assigned to each row).
clusters = clust_model.transform(final_data)

In [26]:
# Register the clustered frame as a temp view named 'clusters' so it can be queried with spark.sql.
clusters.createOrReplaceTempView('clusters')

In [30]:
# Per-cluster averages of kernel length and width, grouped by the predicted cluster.
cluster_summary_sql = "FROM clusters SELECT prediction as Clusters, AVG(length_of_kernel) AS Mean_Length, AVG(width_of_kernel) AS Mean_Width GROUP BY prediction"
spark.sql(cluster_summary_sql).show()

+--------+------------------+------------------+
|Clusters|       Mean_Length|        Mean_Width|
+--------+------------------+------------------+
|       1|            6.1637|3.6819857142857138|
|       2|5.2376000000000005|2.8343538461538467|
|       0| 5.467853333333332|3.2311333333333336|
+--------+------------------+------------------+

