In [1]:
from pyspark.sql import SparkSession
#from pyspark.ml.stat import Correlation
#import pyspark.sql.functions as F

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
df = spark.read.csv("iris.csv", inferSchema=True,
                   header=True)

In [4]:
df.show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

In [5]:
df.select("species").show()

+-------+
|species|
+-------+
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
+-------+
only showing top 20 rows



In [6]:
df.count()

150

In [7]:
len(df.columns)

5

In [8]:
df.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)



In [9]:
df.describe().show()

+-------+------------------+-------------------+------------------+------------------+---------+
|summary|      sepal_length|        sepal_width|      petal_length|       petal_width|  species|
+-------+------------------+-------------------+------------------+------------------+---------+
|  count|               150|                150|               150|               150|      150|
|   mean| 5.843333333333335|  3.057333333333334|3.7580000000000027| 1.199333333333334|     null|
| stddev|0.8280661279778637|0.43586628493669793|1.7652982332594662|0.7622376689603467|     null|
|    min|               4.3|                2.0|               1.0|               0.1|   setosa|
|    max|               7.9|                4.4|               6.9|               2.5|virginica|
+-------+------------------+-------------------+------------------+------------------+---------+



In [10]:
df.head(5)

[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, species='setosa'),
 Row(sepal_length=4.9, sepal_width=3.0, petal_length=1.4, petal_width=0.2, species='setosa'),
 Row(sepal_length=4.7, sepal_width=3.2, petal_length=1.3, petal_width=0.2, species='setosa'),
 Row(sepal_length=4.6, sepal_width=3.1, petal_length=1.5, petal_width=0.2, species='setosa'),
 Row(sepal_length=5.0, sepal_width=3.6, petal_length=1.4, petal_width=0.2, species='setosa')]

In [11]:
df.groupBy('species').count().show()

+----------+-----+
|   species|count|
+----------+-----+
| virginica|   50|
|versicolor|   50|
|    setosa|   50|
+----------+-----+



In [12]:
df.groupBy('sepal_length').count().show()

+------------+-----+
|sepal_length|count|
+------------+-----+
|         5.4|    6|
|         7.0|    1|
|         6.1|    6|
|         7.7|    4|
|         6.6|    2|
|         4.5|    1|
|         5.7|    8|
|         6.7|    8|
|         7.4|    1|
|         6.5|    5|
|         4.9|    6|
|         6.2|    4|
|         5.1|    9|
|         7.3|    1|
|         4.3|    1|
|         7.9|    1|
|         4.7|    2|
|         5.3|    1|
|         7.2|    3|
|         7.6|    1|
+------------+-----+
only showing top 20 rows



In [13]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [14]:
df.columns

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

In [15]:
input_cols=['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

In [16]:
vec_assembler = VectorAssembler(inputCols = input_cols, outputCol="features")

In [17]:
final_data = vec_assembler.transform(df)

In [27]:
final_data.show()

+------------+-----------+------------+-----------+-------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|species|         features|
+------------+-----------+------------+-----------+-------+-----------------+
|         5.1|        3.5|         1.4|        0.2| setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2| setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2| setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2| setosa|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2| setosa|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4| setosa|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3| setosa|[4.6,3.4,1.4,0.3]|
|         5.0|        3.4|         1.5|        0.2| setosa|[5.0,3.4,1.5,0.2]|
|         4.4|        2.9|         1.4|        0.2| setosa|[4.4,2.9,1.4,0.2]|
|         4.9|        3.1|         1.5|        0.1| setosa|[4.9,

In [18]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [19]:
kmeans = KMeans(featuresCol='features',k=3)

In [20]:
model = kmeans.fit(final_data)

In [21]:
model.transform(final_data).groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   50|
|         2|   61|
|         0|   39|
+----------+-----+



In [22]:
predictions=model.transform(final_data)

In [23]:
predictions.groupBy('species','prediction').count().show()

'''
As it can be observed, the setosa species is perfectly grouped along with
versicolor, almost being captured in the same cluster, but verginica seems to fall
within two different groups. K-means can produce different results every time as
it chooses the starting point (centroid) randomly every time. Hence, the results
that you might get in you K-means clustering might be totally different from
these results unless we use a seed to reproduce the results. The seed ensures the
split and the initial centroid values remain consistent throughout the analysis.
'''

+----------+----------+-----+
|   species|prediction|count|
+----------+----------+-----+
| virginica|         2|   14|
| virginica|         0|   36|
|versicolor|         0|    3|
|    setosa|         1|   50|
|versicolor|         2|   47|
+----------+----------+-----+



'\nAs it can be observed, the setosa species is perfectly grouped along with\nversicolor, almost being captured in the same cluster, but verginica seems to fall\nwithin two different groups. K-means can produce different results every time as\nit chooses the starting point (centroid) randomly every time. Hence, the results\nthat you might get in you K-means clustering might be totally different from\nthese results unless we use a seed to reproduce the results. The seed ensures the\nsplit and the initial centroid values remain consistent throughout the analysis.\n'

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [25]:
# visualization of clusters
pandas_df = predictions.toPandas()
pandas_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,features,prediction
0,5.1,3.5,1.4,0.2,setosa,"[5.1, 3.5, 1.4, 0.2]",1
1,4.9,3.0,1.4,0.2,setosa,"[4.9, 3.0, 1.4, 0.2]",1
2,4.7,3.2,1.3,0.2,setosa,"[4.7, 3.2, 1.3, 0.2]",1
3,4.6,3.1,1.5,0.2,setosa,"[4.6, 3.1, 1.5, 0.2]",1
4,5.0,3.6,1.4,0.2,setosa,"[5.0, 3.6, 1.4, 0.2]",1
