In [1]:
from pyspark.ml.classification import NaiveBayes, MultilayerPerceptronClassifier, DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.clustering import KMeans, BisectingKMeans
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as tp

In [2]:
spark = SparkSession.builder.appName("IRIS Classification").getOrCreate()

In [3]:
spark

In [4]:
df = spark.read.csv("/user/ana012815/iris.data.csv", header = True, inferSchema = True)

In [5]:
df = df.toDF("sepal_length","sepal_width","petal_length","petal_width","category")

In [6]:
vectorAssembler = VectorAssembler(inputCols = ["sepal_length","sepal_width","petal_length","petal_width"], outputCol = "features")
df_vector = vectorAssembler.transform(df)

In [7]:
df_vector.show()

+------------+-----------+------------+-----------+-----------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|   category|         features|
+------------+-----------+------------+-----------+-----------+-----------------+
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|[5.0,3.4,1.5,0.2]|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|[4.4,2.9,1.4,0.2]|
|         4.9|        3.1|         1.5|        0.1|Iris-setosa|[4.9,3.1,1.5,0.1]|
|         5.4|  

In [25]:
cat_encoder = StringIndexer(inputCol = "category", outputCol = "label")
df_vector = cat_encoder.fit(train).transform(train)

IllegalArgumentException: requirement failed: Output column label already exists.

In [27]:
kmeans = KMeans().setK(3)
kmeans = kmeans.setSeed(1)
kmeans_model = kmeans.fit(df_vector)

In [28]:
kmeans_model.clusterCenters()

[array([5.95      , 2.78571429, 4.41428571, 1.4452381 ]),
 array([4.96071429, 3.38571429, 1.42857143, 0.23928571]),
 array([6.75925926, 2.98888889, 5.68888889, 2.08518519])]

In [29]:
bkmeans = BisectingKMeans().setK(3)
bkmeans = bkmeans.setSeed(1)
bkmeans_model = bkmeans.fit(df_vector)

In [30]:
bkmeans_model.clusterCenters()

[array([4.96206897, 3.34827586, 1.49310345, 0.26551724]),
 array([5.96904762, 2.79761905, 4.45714286, 1.47857143]),
 array([6.79615385, 2.99615385, 5.71153846, 2.07307692])]