## K-MEANS

### LOAD THE DATASET

#### Using the Banknote Authentication Dataset - <http://archive.ics.uci.edu/ml/datasets/banknote+authentication>

In [1]:
# Load the CSV (read without a header row) and name its five columns
# explicitly; inferSchema lets Spark derive the column types from the data.
banknoteDataset = (
    spark.read
    .csv("data_banknote_authentication.csv", sep=",", inferSchema=True)
    .toDF("variance", "skewness", "curtosis", "entropy", "class")
)
print(banknoteDataset.head())
# printSchema() writes the schema to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
banknoteDataset.printSchema()

Row(variance=3.6216, skewness=8.6661, curtosis=-2.8073, entropy=-0.44699, class=0)
root
 |-- variance: double (nullable = true)
 |-- skewness: double (nullable = true)
 |-- curtosis: double (nullable = true)
 |-- entropy: double (nullable = true)
 |-- class: integer (nullable = true)

None


In [2]:
# Preview the first five rows of the raw dataset.
banknoteDataset.show(n=5)

+--------+--------+--------+--------+-----+
|variance|skewness|curtosis| entropy|class|
+--------+--------+--------+--------+-----+
|  3.6216|  8.6661| -2.8073|-0.44699|    0|
|  4.5459|  8.1674| -2.4586| -1.4621|    0|
|   3.866| -2.6383|  1.9242| 0.10645|    0|
|  3.4566|  9.5228| -4.0112| -3.5944|    0|
| 0.32924| -4.4552|  4.5718| -0.9888|    0|
+--------+--------+--------+--------+-----+
only showing top 5 rows



In [3]:
# Summary statistics (count, mean, stddev, min, max) for every column,
# used here to check the value ranges of the features before scaling.
banknoteDataset.describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+
|summary|          variance|          skewness|          curtosis|           entropy|             class|
+-------+------------------+------------------+------------------+------------------+------------------+
|  count|              1372|              1372|              1372|              1372|              1372|
|   mean|0.4337352570699707|1.9223531206393603|1.3976271172667651|-1.191656520043731|0.4446064139941691|
| stddev|2.8427625862785577| 5.869046743695513| 4.310030090106595| 2.101013137359609|0.4971032701256608|
|    min|           -7.0421|          -13.7731|           -5.2861|           -8.5482|                 0|
|    max|            6.8248|           12.9516|           17.9274|            2.4495|                 1|
+-------+------------------+------------------+------------------+------------------+------------------+



### TRANSFORM THE 4 FEATURE COLUMNS INTO A SINGLE `features` VECTOR COLUMN

In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

In [5]:
# Combine the four numeric measurement columns into one vector column
# named "features". The "class" label is deliberately left out.
vector_assembler = VectorAssembler(
    inputCols=["variance", "skewness", "curtosis", "entropy"],
    outputCol="features",
)

In [6]:
# Append the assembled "features" vector column to every row, then preview
# the first three rows to confirm the new column is present.
assembled_data = vector_assembler.transform(banknoteDataset)
assembled_data.show(n=3)

+--------+--------+--------+--------+-----+--------------------+
|variance|skewness|curtosis| entropy|class|            features|
+--------+--------+--------+--------+-----+--------------------+
|  3.6216|  8.6661| -2.8073|-0.44699|    0|[3.6216,8.6661,-2...|
|  4.5459|  8.1674| -2.4586| -1.4621|    0|[4.5459,8.1674,-2...|
|   3.866| -2.6383|  1.9242| 0.10645|    0|[3.866,-2.6383,1....|
+--------+--------+--------+--------+-----+--------------------+
only showing top 3 rows



### Feature scaling - scale each feature to unit standard deviation (the mean is not subtracted)

In [7]:
# Scale every feature to unit standard deviation so that no single feature
# dominates the distance computation. The two flags below are the
# StandardScaler defaults, spelled out to make clear that values are divided
# by the standard deviation but NOT mean-centred.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=False,
    withStd=True,
)

In [8]:
# Fit the scaler on the assembled data to learn the per-feature statistics.
scaler_model = scaler.fit(assembled_data)
# Apply the fitted scaler; this adds the "scaledFeatures" vector column.
scaled_data = scaler_model.transform(assembled_data)
# Confirm the schema now contains both "features" and "scaledFeatures".
scaled_data.printSchema()

root
 |-- variance: double (nullable = true)
 |-- skewness: double (nullable = true)
 |-- curtosis: double (nullable = true)
 |-- entropy: double (nullable = true)
 |-- class: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)



In [9]:
# Inspect the scaled vectors for the first five rows.
scaled_data.select("scaledFeatures").show(n=5)

+--------------------+
|      scaledFeatures|
+--------------------+
|[1.27397202196227...|
|[1.59911348979409...|
|[1.35994473075606...|
|[1.21592989041164...|
|[0.11581691752564...|
+--------------------+
only showing top 5 rows



### Building the models - K-Means

In [10]:
from pyspark.ml.clustering import KMeans

In [11]:
# K-Means starts from randomly initialised centroids, so fix the seed to keep
# the cluster ids and cluster sizes reproducible from run to run.
KMEANS_SEED = 42

# Two estimators on the scaled features: k=2 and k=3, built with the same
# seed so the two models can be compared.
k_means_2 = KMeans(featuresCol="scaledFeatures", k=2, seed=KMEANS_SEED)
k_means_3 = KMeans(featuresCol="scaledFeatures", k=3, seed=KMEANS_SEED)

In [12]:
# Fit both K-Means estimators on the same scaled dataset. The "class" column
# is not part of "scaledFeatures", so the label plays no role in clustering.
model_k2 = k_means_2.fit(scaled_data)
model_k3 = k_means_3.fit(scaled_data)

In [13]:
# Assign every row to one of the three clusters (adds a "prediction" column),
# then report how many rows landed in each cluster.
model_k3_data = model_k3.transform(scaled_data)
(
    model_k3_data
    .groupBy("prediction")
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  659|
|         2|  241|
|         0|  472|
+----------+-----+



In [14]:
# Assign every row to one of the two clusters (adds a "prediction" column),
# then report how many rows landed in each cluster.
model_k2_data = model_k2.transform(scaled_data)
(
    model_k2_data
    .groupBy("prediction")
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  711|
|         0|  661|
+----------+-----+



In [15]:
# Preview the k=2 assignments next to the original "class" label.
model_k2_data.show(n=5)

+--------+--------+--------+--------+-----+--------------------+--------------------+----------+
|variance|skewness|curtosis| entropy|class|            features|      scaledFeatures|prediction|
+--------+--------+--------+--------+-----+--------------------+--------------------+----------+
|  3.6216|  8.6661| -2.8073|-0.44699|    0|[3.6216,8.6661,-2...|[1.27397202196227...|         0|
|  4.5459|  8.1674| -2.4586| -1.4621|    0|[4.5459,8.1674,-2...|[1.59911348979409...|         0|
|   3.866| -2.6383|  1.9242| 0.10645|    0|[3.866,-2.6383,1....|[1.35994473075606...|         1|
|  3.4566|  9.5228| -4.0112| -3.5944|    0|[3.4566,9.5228,-4...|[1.21592989041164...|         0|
| 0.32924| -4.4552|  4.5718| -0.9888|    0|[0.32924,-4.4552,...|[0.11581691752564...|         1|
+--------+--------+--------+--------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [16]:
# Retrieve and print the centroid of each cluster found by the k=2 model.
# Coordinates are in the scaled feature space the model was trained on.
centers = model_k2.clusterCenters()
print("Cluster Centers: ")
for centroid in centers:
    print(centroid)

Cluster Centers: 
[ 0.19752416  1.1344166  -0.36542882 -1.2493032 ]
[ 0.11078734 -0.4225924   0.96547291  0.06697034]
