For certain Mahchine learning algorithms, it is good idea to scale your data.
Drops in model performance can occur with highly dimensional data, so we will practice
scalling features using pySpark!

In [29]:
from pyspark.sql import SparkSession

In [30]:
spark = SparkSession.builder.appName("cluster").getOrCreate()

In [31]:
dataset = spark.read.csv('seeds_dataset.csv',inferSchema=True, header=True)

In [32]:
dataset.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [33]:
dataset.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)]

In [51]:
from pyspark.ml.clustering import KMeans

In [52]:
from pyspark.ml.feature import VectorAssembler

In [53]:
dataset.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [54]:
assembler = VectorAssembler(inputCols = dataset.columns,
                           outputCol='features')

In [38]:
final_data = assembler.transform(dataset)

In [39]:
final_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



In [40]:
from pyspark.ml.feature import StandardScaler

In [41]:
scaler = StandardScaler(inputCol='features',
                       outputCol = 'scaledFeatures')

In [42]:
scaler_model = scaler.fit(final_data)

In [43]:
final_data = scaler_model.transform(final_data)

In [44]:
final_data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

In [45]:
kmeans = KMeans(featuresCol='scaledFeatures', k=3)

In [55]:
model = kmeans.fit(final_data)

In [57]:
print("WSSE")
print(model.computeCost(final_data))

WSSE


AttributeError: 'KMeansModel' object has no attribute 'computeCost'

In [48]:
centers = model.clusterCenters()
print(centers)

[array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
        2.41585013, 12.29286107]), array([ 4.07135818, 10.14438097, 35.86461803, 11.81349589,  7.53471695,
        3.18317127, 10.39230304]), array([ 4.94114963, 10.95557919, 37.3028184 , 12.42383591,  8.60815545,
        1.80983376, 10.40657797])]


In [49]:
model.transform(final_data).select('prediction').show()

+----------+
|prediction|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         0|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         1|
+----------+
only showing top 20 rows

