In [2]:
import os

import findspark

# findspark must run before `import pyspark` so that pyspark is importable.
# Prefer an externally configured SPARK_HOME so the notebook is not tied to a
# single machine's layout; fall back to the original install path when the
# variable is unset or empty.
findspark.init(os.environ.get('SPARK_HOME') or '/home/ubuntu/spark-2.3.2-bin-hadoop2.7')
import pyspark

from pyspark.sql import SparkSession

# getOrCreate() reuses an existing session if one is already active,
# otherwise it starts a new one named 'cluster_example'.
spark = SparkSession.builder.appName('cluster_example').getOrCreate()

In [3]:
# Location of the seeds CSV, relative to the notebook's working directory.
seeds_csv_path = '../Python-and-Spark-for-Big-Data/Spark_for_Machine_Learning/Clustering/seeds_dataset.csv'

# header=True takes column names from the first line of the file;
# inferSchema=True asks Spark to detect each column's type from the data.
data = spark.read.csv(seeds_csv_path, header=True, inferSchema=True)

In [4]:
# Inspect the column names and the types Spark inferred for the loaded CSV.
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [5]:
# Peek at a single row to sanity-check the parsed values.
data.show(1)

+-----+---------+-----------+----------------+---------------+---------------------+----------------+
| area|perimeter|compactness|length_of_kernel|width_of_kernel|asymmetry_coefficient|length_of_groove|
+-----+---------+-----------+----------------+---------------+---------------------+----------------+
|15.26|    14.84|      0.871|           5.763|          3.312|                2.221|            5.22|
+-----+---------+-----------+----------------+---------------+---------------------+----------------+
only showing top 1 row



In [6]:
from pyspark.ml.clustering import KMeans

In [7]:
from pyspark.ml.feature import VectorAssembler

In [8]:
# List the column names; all of them are passed to the VectorAssembler as inputs.
data.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [10]:
# There is no label column in this dataset, so every column is treated as a
# feature and packed into a single vector column named 'features'.
feature_columns = data.columns
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [11]:
# Append the assembled 'features' vector column to the raw measurement columns.
final_data = assembler.transform(data)

In [12]:
# Confirm that the 'features' vector column was added alongside the originals.
final_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



In [13]:
from pyspark.ml.feature import StandardScaler

In [14]:
# Scale each feature to unit standard deviation without centering it.
# withStd=True / withMean=False are StandardScaler's defaults, spelled out here
# so the choice is visible to the reader.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withStd=True,
    withMean=False,
)

In [15]:
# Fit the scaler on the assembled data to learn the per-feature scaling statistics.
scaler_model = scaler.fit(final_data)

In [16]:
# Add the 'scaledFeatures' column used for clustering.
# This cell reassigns final_data from itself, so a second run would otherwise
# fail because the output column already exists. Dropping any stale
# 'scaledFeatures' first makes the cell safe to re-run; DataFrame.drop is a
# no-op when the column is absent, so the first run behaves exactly as before.
final_data = scaler_model.transform(final_data.drop('scaledFeatures'))

In [18]:
# Look at one row to compare the raw 'features' vector with 'scaledFeatures'.
final_data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

In [19]:
# Cluster on the scaled vectors into k=3 groups.
# K-means initialisation is randomised; fixing the seed keeps the cluster
# assignments and the reported cost reproducible across kernel restarts.
# NOTE: cluster ids/values may differ from outputs captured before the seed was pinned.
kmeans = KMeans(featuresCol='scaledFeatures', k=3, seed=42)

In [20]:
# Fit k-means; the estimator reads the column configured as featuresCol ('scaledFeatures').
model = kmeans.fit(final_data)

In [21]:
# Report the Within Set Sum of Squared Errors: the sum of squared distances
# from each point to its nearest cluster center (lower means tighter clusters).
print('WSSSE: ')
wssse = model.computeCost(final_data)
print(wssse)

WSSSE: 
428.60820118716356


In [22]:
# Cluster centers, one array per cluster. The model was fit on 'scaledFeatures',
# so these coordinates are in the scaled feature space, not the original units.
centers = model.clusterCenters()

In [23]:
# Display the learned centers (a list with one array per cluster).
print(centers)

[array([  4.96198582,  10.97871333,  37.30930808,  12.44647267,
         8.62880781,   1.80061978,  10.41913733]), array([  6.35645488,  12.40730852,  37.41990178,  13.93860446,
         9.7892399 ,   2.41585013,  12.29286107]), array([  4.07497225,  10.14410142,  35.89816849,  11.80812742,
         7.54416916,   3.15410901,  10.38031464])]


In [25]:
# Assign each row to a cluster, then display only the resulting cluster ids.
predictions = model.transform(final_data)
predictions.select('prediction').show()

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         1|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         2|
+----------+
only showing top 20 rows

