In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so the
# notebook is not tied to one user's home directory; fall back to the original
# hardcoded location when the variable is unset or empty.
spark_home = os.environ.get('SPARK_HOME') or '/home/siddharth/spark-2.4.1-bin-hadoop2.7/'
findspark.init(spark_home)

from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName('kmeans_example')
    .getOrCreate()
)

In [3]:
# Load the seeds dataset; the CSV carries a header row and column types are inferred.
data = spark.read.csv(
    './Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Clustering/seeds_dataset.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# List the column names of the loaded DataFrame (seven measurement columns).
data.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [5]:
# Check the inferred schema: every column should have been read as a double.
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [6]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler,VectorAssembler

In [7]:
# Pack every column of `data` into one vector column named 'unscaled_feat'.
assembler = VectorAssembler(
    outputCol='unscaled_feat',
    inputCols=data.columns,
)

In [8]:
# Assemble the feature vector and keep only that column.
df = (
    assembler
    .transform(data)
    .select('unscaled_feat')
)
df.show()

+--------------------+
|       unscaled_feat|
+--------------------+
|[15.26,14.84,0.87...|
|[14.88,14.57,0.88...|
|[14.29,14.09,0.90...|
|[13.84,13.94,0.89...|
|[16.14,14.99,0.90...|
|[14.38,14.21,0.89...|
|[14.69,14.49,0.87...|
|[14.11,14.1,0.891...|
|[16.63,15.46,0.87...|
|[16.44,15.25,0.88...|
|[15.26,14.85,0.86...|
|[14.03,14.16,0.87...|
|[13.89,14.02,0.88...|
|[13.78,14.06,0.87...|
|[13.74,14.05,0.87...|
|[14.59,14.28,0.89...|
|[13.99,13.83,0.91...|
|[15.69,14.75,0.90...|
|[14.7,14.21,0.915...|
|[12.72,13.57,0.86...|
+--------------------+
only showing top 20 rows



In [9]:
# Scale each feature to unit standard deviation without centering.
# withMean=False / withStd=True are StandardScaler's defaults, spelled out here
# so the reader can see that the scaled values are not mean-shifted.
scaler = StandardScaler(
    inputCol='unscaled_feat',
    outputCol='features',
    withMean=False,
    withStd=True,
)

In [10]:
# Fit the scaler on the assembled vectors, then apply it to the same rows;
# the scaled vectors land in the 'features' column.
scaler_model = scaler.fit(df)
df_scaled = scaler_model.transform(df)

In [11]:
# Configure k-means for three clusters on the scaled vectors; the seed is
# fixed so repeated runs start from the same random state.
kmeans = KMeans(
    k=3,
    seed=24,
    featuresCol='features',
)

In [12]:
# Fit the clustering model on the scaled feature vectors.
kmeans_model = kmeans.fit(df_scaled)

In [13]:
# Evaluate the fitted model's cost on the same data it was trained on.
# NOTE(review): computeCost is reported as deprecated since Spark 2.4 and removed
# in 3.0 — confirm against the PySpark docs; the suggested replacements are
# kmeans_model.summary.trainingCost or ClusteringEvaluator (values may differ slightly).
kmeans_model.computeCost(df_scaled)

429.07559671506715

In [14]:
# Assign every row to a cluster; the result gains a `prediction` column.
df_labeled = kmeans_model.transform(df_scaled)

In [15]:
# Preview the labeled rows: raw vector, scaled vector and assigned cluster.
df_labeled.show()

+--------------------+--------------------+----------+
|       unscaled_feat|            features|prediction|
+--------------------+--------------------+----------+
|[15.26,14.84,0.87...|[5.24452795332028...|         1|
|[14.88,14.57,0.88...|[5.11393027165175...|         1|
|[14.29,14.09,0.90...|[4.91116018695588...|         1|
|[13.84,13.94,0.89...|[4.75650503761158...|         1|
|[16.14,14.99,0.90...|[5.54696468981581...|         1|
|[14.38,14.21,0.89...|[4.94209121682475...|         1|
|[14.69,14.49,0.87...|[5.04863143081749...|         1|
|[14.11,14.1,0.891...|[4.84929812721816...|         1|
|[16.63,15.46,0.87...|[5.71536696354628...|         0|
|[16.44,15.25,0.88...|[5.65006812271202...|         0|
|[15.26,14.85,0.86...|[5.24452795332028...|         1|
|[14.03,14.16,0.87...|[4.82180387844584...|         1|
|[13.89,14.02,0.88...|[4.77368894309428...|         1|
|[13.78,14.06,0.87...|[4.73588435103234...|         1|
|[13.74,14.05,0.87...|[4.72213722664617...|         1|
|[14.59,14