# Iris Clustering

In [None]:
from mpl_toolkits import mplot3d
from pyspark.sql import DataFrameReader
from pyspark.sql import SparkSession
from pyspark.ml.feature import IndexToString, Normalizer, StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.clustering import KMeans, BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from helpers.helper_functions import translate_to_file_string

## Select the Iris data file

In [None]:
# Location of the Iris data set, relative to this notebook.
irisRelativePath = "../data/iris.data"
# NOTE(review): translate_to_file_string presumably turns the relative path
# into a file location string Spark can read - confirm in helpers.helper_functions.
inputFile = translate_to_file_string(irisRelativePath)

## Create the Spark Session 

In [None]:
# Create (or reuse) the SparkSession for this notebook
spark = (SparkSession
       .builder
       .appName("IrisClustering")
       .getOrCreate())

# The file is read without a header row, so the columns arrive as _c0.._c4;
# this mapping gives them descriptive names.
columnNames = {
    "_c0": "sepal length",
    "_c1": "sepal width",
    "_c2": "petal length",
    "_c3": "petal width",
    "_c4": "class",
}

# Create a DataFrame using an inferred schema
df = (spark.read
      .option("header", "false")
      .option("inferSchema", "true")
      .option("delimiter", ",")
      .csv(inputFile))
for oldName, newName in columnNames.items():
    df = df.withColumnRenamed(oldName, newName)

# printSchema() prints the schema itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
df.printSchema()

## Data Preparation
### Transform labels into index

In [None]:
# Map the string values of the "class" column to numeric indices in "label".
# The indexer is fitted here on the full DataFrame, so labelIndexer is the
# resulting fitted model rather than the unfitted estimator.
classIndexer = StringIndexer(inputCol="class", outputCol="label")
labelIndexer = classIndexer.fit(df)

### Build the feature vector

In [None]:
# Every column except the class name is a numeric feature.
# A single comprehension replaces the copy()/remove()/list() sequence.
featureCols = [name for name in df.columns if name != "class"]

# Combine the feature columns into one vector column named "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

## Build KMeans

In [None]:
# KMeans estimator; a fixed seed keeps the initialisation reproducible.
# k and maxIter are supplied later through the parameter grid.
km = KMeans(
    featuresCol="features",
    predictionCol="prediction",
    seed=12345,
)

## Model Tuning
### Build a parameter grid

In [None]:
# Candidate hyperparameters: every combination of k and maxIter
# (3 values x 2 values = 6 parameter maps).
paramGrid = (
    ParamGridBuilder()
    .addGrid(km.k, [3, 4, 5])
    .addGrid(km.maxIter, [10, 100])
    .build()
)

### Build a pipeline

In [None]:
# Stage order matters: index the class label, assemble the feature vector,
# then cluster. The KMeans stage is therefore at index 2.
kmeansStages = [labelIndexer, assembler, km]
pipeline = Pipeline(stages=kmeansStages)

## Build an evaluator

In [None]:
# Silhouette-based evaluator. The arguments below are the documented
# defaults, spelled out so the link to the KMeans output columns is visible.
evaluator = ClusteringEvaluator(
    predictionCol="prediction",
    featuresCol="features",
    metricName="silhouette",
)

## Build the Cross Validator

In [None]:
 cv = CrossValidator(estimator=pipeline, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=2, parallelism=2)

## Train the Model 

In [None]:
# Run the cross validation on the full data set; the result exposes the best
# pipeline found via cvModel.bestModel.
cvModel = cv.fit(dataset=df)

## Find the best model

In [None]:
# The winning pipeline has three stages; the fitted KMeans model is the
# last one (index 2).
bestKMeansPipeline = cvModel.bestModel
kmModel = bestKMeansPipeline.stages[2]

# Show the parameters of the selected model
print(kmModel.explainParams())

# List the centre of every cluster
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for centroid in centers:
    print(centroid)

## Test the model 

In [None]:
# Assign every row to a cluster with the best model.
# Note: this is the same DataFrame the model was fitted on.
predictions = cvModel.transform(dataset=df)

# Score the clustering with the evaluator's metric
silhouette = evaluator.evaluate(dataset=predictions)
print("Silhouette with squared euclidean distance = " , silhouette)

## Visualize the clusters

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

# Bring only the plotted columns to the driver, once, instead of running a
# separate Spark job for every filter/toPandas/collect.
plotColumns = ["sepal length", "petal width", "petal length", "prediction"]
clusterFrame = predictions.select(*plotColumns).toPandas()

# The parameter grid allows k = 3, 4 or 5, so plot whichever cluster ids are
# actually present rather than assuming exactly clusters 0-2.
# groupby yields the groups in ascending cluster-id order.
clusterColors = ["DarkBlue", "DarkGreen", "DarkRed", "DarkOrange", "Purple"]
clusterGroups = [
    (int(clusterId), members)
    for clusterId, members in clusterFrame.groupby("prediction")
]

# Two-dimensional scatter plot: sepal length vs. petal width
ax = None
for clusterId, members in clusterGroups:
    ax = members.plot.scatter(
        x='sepal length',
        y='petal width',
        color=clusterColors[clusterId % len(clusterColors)],
        label='Cluster {}'.format(clusterId),
        ax=ax,
    )
plt.show()

# Three-dimensional scatter plot: sepal length, petal width, petal length
fig = plt.figure()
ax = plt.axes(projection='3d')
for clusterId, members in clusterGroups:
    ax.scatter3D(
        members['sepal length'],
        members['petal width'],
        members['petal length'],
        color=clusterColors[clusterId % len(clusterColors)],
        label='Cluster {}'.format(clusterId),
    )
ax.set_xlabel('sepal length')
ax.set_ylabel('petal width')
ax.set_zlabel('petal length')
# The labels passed to scatter3D only become visible once a legend is drawn.
ax.legend()
plt.show()

## Bisecting k-means

Build the Bisecting KMeans

In [None]:
# Bisecting k-means estimator with a fixed seed. k=2 is only the initial
# setting; the parameter grid built for bkm.k supplies the values that are
# evaluated during cross validation.
bkm = BisectingKMeans(
    featuresCol="features",
    predictionCol="prediction",
    k=2,
    seed=12345,
)

Parameter grid for bisecting k-means

In [None]:
# Candidate hyperparameters for bisecting k-means: every combination of
# k and maxIter (4 values x 2 values = 8 parameter maps).
paramGridBkm = (
    ParamGridBuilder()
    .addGrid(bkm.k, [2, 3, 4, 5])
    .addGrid(bkm.maxIter, [10, 100])
    .build()
)

Pipeline for bisecting k-means

In [None]:
# Same preprocessing stages as the KMeans pipeline, with the bisecting
# k-means estimator as the final stage (index 2).
bisectingStages = [labelIndexer, assembler, bkm]
pipelineBkm = Pipeline(stages=bisectingStages)

Build the cross validation

In [None]:
# 2-fold cross validation over the bisecting k-means parameter grid,
# fitting up to two models in parallel.
cvbkm = CrossValidator(
    estimator=pipelineBkm,
    estimatorParamMaps=paramGridBkm,
    evaluator=evaluator,
    numFolds=2,
    parallelism=2,
)

## Train the Model

In [None]:
# Run the cross validation for bisecting k-means on the full data set.
cvModebkml = cvbkm.fit(dataset=df)

### Find the best model

In [None]:
# The winning pipeline has three stages; the fitted bisecting k-means model
# is the last one (index 2).
bestBisectingPipeline = cvModebkml.bestModel
kmModelbkm = bestBisectingPipeline.stages[2]

# Show the parameters of the selected model
print(kmModelbkm.explainParams())

# List the centre of every cluster (rebinds `centers` from the KMeans run)
centers = kmModelbkm.clusterCenters()
print("Cluster Centers: ")
for centroid in centers:
    print(centroid)

### Test the model

In [None]:
# Assign every row to a cluster with the best bisecting k-means model.
# This rebinds `predictions` and `silhouette` from the KMeans run, and scores
# the same DataFrame the model was fitted on.
predictions = cvModebkml.transform(dataset=df)

# Score the clustering with the evaluator's metric
silhouette = evaluator.evaluate(dataset=predictions)
print("Silhouette with squared euclidean distance = " , silhouette)

In [None]:
# Shut down the SparkSession created at the top of the notebook; cells that
# use `spark` or the DataFrames derived from it cannot be re-run afterwards
# without creating a new session.
spark.stop()