## Classification on IRIS Data with pyspark

In [1]:
import pyspark
from pyspark.sql import SQLContext, SparkSession

spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/14 10:10:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler, StringIndexer

#### Read data from url

In [3]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

In [4]:
from pyspark import SparkFiles
spark.sparkContext.addFile(url)

In [5]:
iris_df = spark.read.csv("file://"+SparkFiles.get("iris.data"),
                           header=False,
                          inferSchema=True)

                                                                                

In [6]:
iris_df.show(5)

+---+---+---+---+-----------+
|_c0|_c1|_c2|_c3|        _c4|
+---+---+---+---+-----------+
|5.1|3.5|1.4|0.2|Iris-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|
+---+---+---+---+-----------+
only showing top 5 rows



#### Dataprep

In [7]:
from functools import reduce

newColumns = ["sepallength", "sepalwidth", "petallength", "petalwidth", "species"]
oldColumns = iris_df.schema.names

iris_df = reduce(lambda iris_df, idx: iris_df.withColumnRenamed(oldColumns[idx], newColumns[idx]), range(len(oldColumns)), iris_df)
iris_df.printSchema()
iris_df.show()

root
 |-- sepallength: double (nullable = true)
 |-- sepalwidth: double (nullable = true)
 |-- petallength: double (nullable = true)
 |-- petalwidth: double (nullable = true)
 |-- species: string (nullable = true)

+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|    species|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|
|        5.4|       3.9|        1.7|       0.4|Iris-setosa|
|        4.6|       3.4|        1.4|       0.3|Iris-setosa|
|        5.0|       3.4|        1.5|       0.2|Iris-setosa|
|        4.4|       2.9|        1.4|       0.2|Iris-setosa|
|        4.9|       3.1|        1.5|       0.1|Iris-setosa|
|    

In [8]:
vectorAssembler = VectorAssembler(inputCols=["sepallength", "sepalwidth", "petallength", "petalwidth"],
                                 outputCol="features")

In [9]:
viris_df = vectorAssembler.transform(iris_df)
viris_df.show(5)

+-----------+----------+-----------+----------+-----------+-----------------+
|sepallength|sepalwidth|petallength|petalwidth|    species|         features|
+-----------+----------+-----------+----------+-----------+-----------------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
+-----------+----------+-----------+----------+-----------+-----------------+
only showing top 5 rows



In [10]:
indexer = StringIndexer(inputCol="species",
                       outputCol="label")

iviris_df = indexer.fit(viris_df).transform(viris_df)
iviris_df.show(5)

+-----------+----------+-----------+----------+-----------+-----------------+-----+
|sepallength|sepalwidth|petallength|petalwidth|    species|         features|label|
+-----------+----------+-----------+----------+-----------+-----------------+-----+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|  0.0|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|  0.0|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|  0.0|
+-----------+----------+-----------+----------+-----------+-----------------+-----+
only showing top 5 rows



#### Split Data

In [11]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [12]:
splits = iviris_df.randomSplit(weights=[0.6,0.4], seed=1)

In [13]:
train_df = splits[0]
test_df = splits[1]

In [14]:
train_df.count(), test_df.count()

(98, 52)

In [21]:
train_df.show(5)

+-----------+----------+-----------+----------+-----------+-----------------+-----+
|sepallength|sepalwidth|petallength|petalwidth|    species|         features|label|
+-----------+----------+-----------+----------+-----------+-----------------+-----+
|        4.4|       2.9|        1.4|       0.2|Iris-setosa|[4.4,2.9,1.4,0.2]|  0.0|
|        4.4|       3.0|        1.3|       0.2|Iris-setosa|[4.4,3.0,1.3,0.2]|  0.0|
|        4.4|       3.2|        1.3|       0.2|Iris-setosa|[4.4,3.2,1.3,0.2]|  0.0|
|        4.6|       3.2|        1.4|       0.2|Iris-setosa|[4.6,3.2,1.4,0.2]|  0.0|
|        4.6|       3.4|        1.4|       0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|  0.0|
+-----------+----------+-----------+----------+-----------+-----------------+-----+
only showing top 5 rows



In [22]:
test_df.show(5)

+-----------+----------+-----------+----------+-----------+-----------------+-----+
|sepallength|sepalwidth|petallength|petalwidth|    species|         features|label|
+-----------+----------+-----------+----------+-----------+-----------------+-----+
|        4.3|       3.0|        1.1|       0.1|Iris-setosa|[4.3,3.0,1.1,0.1]|  0.0|
|        4.5|       2.3|        1.3|       0.3|Iris-setosa|[4.5,2.3,1.3,0.3]|  0.0|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|  0.0|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|
|        4.8|       3.1|        1.6|       0.2|Iris-setosa|[4.8,3.1,1.6,0.2]|  0.0|
+-----------+----------+-----------+----------+-----------+-----------------+-----+
only showing top 5 rows



#### Naive Bayes Algorithm

In [15]:
#fit train data, transform test data

nb = NaiveBayes(modelType='multinomial')
nbmodel = nb.fit(train_df)
predictions_df = nbmodel.transform(test_df)

In [16]:
predictions_df.take(1)

22/08/14 10:11:01 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/08/14 10:11:01 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


[Row(sepallength=4.3, sepalwidth=3.0, petallength=1.1, petalwidth=0.1, species='Iris-setosa', features=DenseVector([4.3, 3.0, 1.1, 0.1]), label=0.0, rawPrediction=DenseVector([-9.9894, -11.3476, -11.902]), probability=DenseVector([0.7118, 0.183, 0.1051]), prediction=0.0)]

In [17]:
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                             predictionCol="prediction",
                                             metricName="accuracy")

nbaccuracy = evaluator.evaluate(predictions_df)

In [18]:
nbaccuracy

0.9807692307692307

In [19]:
evaluator1 = MulticlassClassificationEvaluator(labelCol="label",
                                             predictionCol="prediction",
                                             metricName="f1")

nbf1 = evaluator1.evaluate(predictions_df)

In [20]:
nbf1

0.9807504323633356

We have Accuracy of 98% and F1 of 98%

#### Multi-Layer Perceptron

In [23]:
train_df.take(1)

[Row(sepallength=4.4, sepalwidth=2.9, petallength=1.4, petalwidth=0.2, species='Iris-setosa', features=DenseVector([4.4, 2.9, 1.4, 0.2]), label=0.0)]

In [24]:
test_df.take(1)

[Row(sepallength=4.3, sepalwidth=3.0, petallength=1.1, petalwidth=0.1, species='Iris-setosa', features=DenseVector([4.3, 3.0, 1.1, 0.1]), label=0.0)]

In [25]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [26]:
layers = [4, 5, 5, 3]

mlp = MultilayerPerceptronClassifier(layers=layers, seed=1)
mlp_model = mlp.fit(train_df)
mlp_pred = mlp_model.transform(test_df)

In [27]:
mlp_eval = MulticlassClassificationEvaluator(metricName="accuracy")
mlp_accuracy = mlp_eval.evaluate(mlp_pred)
mlp_accuracy

0.6923076923076923