# Iris Classification using PySpark-ml 

In [1]:
import os

import pandas as pd
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer,IndexToString,VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col

In [2]:
# Build the Spark session.
# Before Spark 2.0.0 the main connection objects were SparkContext, SQLContext and HiveContext;
# SparkSession wraps them behind one object.
spark = (
    SparkSession.builder
    .appName("Iris_Spark_ML")
    .getOrCreate()
)

# The session owns a SparkContext, which can be accessed directly.
sc = spark.sparkContext
print(sc.version)

2.0.2


In [3]:
# Location of the Iris CSV. It can be overridden with the IRIS_CSV_PATH environment
# variable so the notebook is not tied to one machine; the default is the original path.
IRIS_CSV_PATH = os.environ.get('IRIS_CSV_PATH', '/home/ramscrux7757/SPARK/Iris.csv')

# Read the data with Spark's built-in CSV reader.
# header=True takes the column names from the first row; inferSchema=True lets Spark
# pick column types from the data instead of loading every column as a string.
Iris = spark.read.csv(IRIS_CSV_PATH, header=True, inferSchema=True)

In [4]:
# Shape of the data set: number of rows, then number of columns
n_rows = Iris.count()
n_cols = len(Iris.columns)
print(n_rows, n_cols)

(150, 6)


In [5]:
# Column names as read from the CSV header row
Iris.columns

['Id',
 'SepalLengthCm',
 'SepalWidthCm',
 'PetalLengthCm',
 'PetalWidthCm',
 'Species']

In [6]:
# Rename the measurement columns to their standard names (drop the 'Cm' unit suffix).
# The chain is wrapped in parentheses instead of backslash continuations, so no line
# depends on a trailing '\' being followed by more code.
Iris = (
    Iris.withColumnRenamed('SepalLengthCm', 'SepalLength')
        .withColumnRenamed('SepalWidthCm', 'SepalWidth')
        .withColumnRenamed('PetalLengthCm', 'PetalLength')
        .withColumnRenamed('PetalWidthCm', 'PetalWidth')
)

In [7]:
# Preview the first 5 rows.
# If you want a pandas view instead, always limit() first so only a few rows are converted:
Iris.limit(5).show()
#Iris.limit(5).toPandas()

+---+-----------+----------+-----------+----------+-----------+
| Id|SepalLength|SepalWidth|PetalLength|PetalWidth|    Species|
+---+-----------+----------+-----------+----------+-----------+
|  1|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|  2|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|  3|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|  4|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|  5|        5.0|       3.6|        1.4|       0.2|Iris-setosa|
+---+-----------+----------+-----------+----------+-----------+



In [8]:
# Drop the row identifier; it is not a predictor.
# The column is spelled 'Id' in the data. DataFrame.drop() is a silent no-op for a name it
# cannot resolve, so the exact spelling matters when spark.sql.caseSensitive is enabled.
Iris = Iris.drop('Id')
print(Iris.dtypes)
# printSchema() prints the schema itself and returns None, so it is not wrapped in print().
Iris.printSchema()

[('SepalLength', 'double'), ('SepalWidth', 'double'), ('PetalLength', 'double'), ('PetalWidth', 'double'), ('Species', 'string')]
root
 |-- SepalLength: double (nullable = true)
 |-- SepalWidth: double (nullable = true)
 |-- PetalLength: double (nullable = true)
 |-- PetalWidth: double (nullable = true)
 |-- Species: string (nullable = true)

None


In [9]:
# Summary statistics (count, mean, stddev, min, max) for the numeric columns
Iris.describe().show()

+-------+------------------+-------------------+------------------+------------------+
|summary|       SepalLength|         SepalWidth|       PetalLength|        PetalWidth|
+-------+------------------+-------------------+------------------+------------------+
|  count|               150|                150|               150|               150|
|   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|
| stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|
|    min|               4.3|                2.0|               1.0|               0.1|
|    max|               7.9|                4.4|               6.9|               2.5|
+-------+------------------+-------------------+------------------+------------------+



In [10]:
# Distinct classes present in 'Species', queried through Spark SQL on a temporary view
Iris.createOrReplaceTempView('View1')
distinct_species = spark.sql('select Distinct(Species) from View1')
distinct_species.show()

+---------------+
|        Species|
+---------------+
| Iris-virginica|
|    Iris-setosa|
|Iris-versicolor|
+---------------+



In [11]:
# Number of rows in each Species class
species_counts = Iris.groupBy("Species").count()
species_counts.show()

+---------------+-----+
|        Species|count|
+---------------+-----+
| Iris-virginica|   50|
|    Iris-setosa|   50|
|Iris-versicolor|   50|
+---------------+-----+



In [12]:
# Count missing values in each column.
# Missing CSV fields are loaded as null, which isnan() does not detect, so every column is
# checked for null; NaN is checked in addition for floating-point columns only, because
# isnan() is only meaningful for float/double data.
missing_counts = []
for name, dtype in Iris.dtypes:
    is_missing = col(name).isNull()
    if dtype in ('float', 'double'):
        is_missing = is_missing | isnan(col(name))
    # when() without otherwise() yields null for non-matching rows, and count() skips nulls,
    # so this counts exactly the rows flagged as missing.
    missing_counts.append(count(when(is_missing, 1)).alias(name))
Iris.select(missing_counts).show()

+-----------+----------+-----------+----------+-------+
|SepalLength|SepalWidth|PetalLength|PetalWidth|Species|
+-----------+----------+-----------+----------+-------+
|          0|         0|          0|         0|      0|
+-----------+----------+-----------+----------+-------+



In [13]:
# Spark ML specific transformations.
# Two columns are added to the data set: 'features' (one vector holding all predictors)
# and 'Label' (the Species string encoded as a numeric index).
# The ML algorithm consumes these two columns directly.

# Prepare the data by indexing the classes and putting the features into a vector.
speciesIndexer = StringIndexer(inputCol="Species", outputCol="Label")
vectorAssembler = VectorAssembler(inputCols=["SepalLength","SepalWidth","PetalLength","PetalWidth"],
                                  outputCol="features")

iris_pred_vector = vectorAssembler.transform(Iris)           # adds the 'features' vector column
index_model = speciesIndexer.fit(iris_pred_vector)           # learns the Species -> index mapping
iris_data_indexed = index_model.transform(iris_pred_vector)  # adds the 'Label' column

# Preview 5 rows as a pandas frame. toPandas() has to be applied to the DataFrame inside
# the print(...) call: print() returns None on Python 3, so chaining .toPandas() onto the
# result of print() raises AttributeError.
print(iris_data_indexed.limit(5).toPandas())

   SepalLength  SepalWidth  PetalLength  PetalWidth      Species  \
0          5.1         3.5          1.4         0.2  Iris-setosa   
1          4.9         3.0          1.4         0.2  Iris-setosa   
2          4.7         3.2          1.3         0.2  Iris-setosa   
3          4.6         3.1          1.5         0.2  Iris-setosa   
4          5.0         3.6          1.4         0.2  Iris-setosa   

               features  Label  
0  [5.1, 3.5, 1.4, 0.2]    0.0  
1  [4.9, 3.0, 1.4, 0.2]    0.0  
2  [4.7, 3.2, 1.3, 0.2]    0.0  
3  [4.6, 3.1, 1.5, 0.2]    0.0  
4  [5.0, 3.6, 1.4, 0.2]    0.0  


In [14]:
# Training a classification model

# One seed for every stochastic step in this cell, so the run does not depend on
# library-chosen defaults.
SEED = 123

# Split the data into training (70%) and test (30%) sets.
train, test = iris_data_indexed.randomSplit([0.7, 0.3], seed=SEED)

# Configure the classifier and then train it using the training set.
# The forest is seeded explicitly as well: it is a randomized algorithm, and without
# a fixed seed the fitted model (and the reported accuracy) may change between sessions.
rf = RandomForestClassifier(labelCol='Label', featuresCol='features',
                            numTrees=100, maxDepth=4, maxBins=32, seed=SEED)
model = rf.fit(train)

In [18]:
# Score the test set with the fitted classifier
predictions = model.transform(test)

# Translate the numeric predictions back into species names, using the label
# order that the fitted indexer learned
species_labels = index_model.labels
converter = IndexToString(
    inputCol="prediction",
    outputCol="PredictedSpecies",
    labels=species_labels,
)
converted = converter.transform(predictions)

# Show actual and predicted species side by side for the first 10 rows
actual_vs_predicted = converted.select(['Species','PredictedSpecies'])
actual_vs_predicted.show(10)

+---------------+----------------+
|        Species|PredictedSpecies|
+---------------+----------------+
|    Iris-setosa|     Iris-setosa|
|    Iris-setosa|     Iris-setosa|
|    Iris-setosa|     Iris-setosa|
|    Iris-setosa|     Iris-setosa|
|    Iris-setosa|     Iris-setosa|
|    Iris-setosa|     Iris-setosa|
|Iris-versicolor| Iris-versicolor|
| Iris-virginica| Iris-versicolor|
|    Iris-setosa|     Iris-setosa|
|    Iris-setosa|     Iris-setosa|
+---------------+----------------+
only showing top 10 rows



In [21]:
# Model evaluation: accuracy of the predicted index against the true 'Label' on the test set
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="Label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
accuracy_message = "Test set accuracy = " + str(accuracy)
print(accuracy_message)

Test set accuracy = 0.96


In [37]:
# List the hyper-parameters available for the selected classifier,
# with a description of each one and its default/current value
param_help = rf.explainParams()
print(param_help)

cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. (default: False)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. (default: 10)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2 (0.0-1.0], [1-n]. (default: auto)
featuresCol: features column name. (default: features, current: features)
impurity: Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini (default: gini)
labelCol: label column name. (default: label, current: speciesIndex)
maxBins: Max number of bins for discretizing continuous features. 

In [33]:
# Cross-validation and parameter tuning: build the hyper-parameter grid.
# Every combination of tree depth and forest size is tried (3 x 3 = 9 candidate settings).
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [2, 4, 6])
             .addGrid(rf.numTrees, [50, 100, 200])
             .build())

In [31]:
# Create a 5-fold CrossValidator over paramGrid, scored with the accuracy evaluator.
# The seed fixes how rows are assigned to folds so the search is repeatable.
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator,
                    numFolds=5, seed=123)

# Run cross-validation. This is the slow cell: 9 grid points x 5 folds = 45 forest fits
# (50-200 trees each), plus a final refit of the best setting on the full training set.
cvModel = cv.fit(train)

# Use the test set here so we measure accuracy on data the search never saw.
# cvModel applies the best model found by the cross-validation.
# A separate name keeps the single-model `predictions` from being overwritten, so
# re-running the earlier accuracy cell still scores the model it was written for.
cv_predictions = cvModel.transform(test)

# Evaluate the best model (left as the last expression so the notebook displays it)
evaluator.evaluate(cv_predictions)

0.94

In [34]:
# Shut down the Spark session now that the analysis is finished
spark.stop()