In [19]:
!pip install pyspark



In [22]:
#Initialize SparkSession and SparkContext
from pyspark.sql import SparkSession
from pyspark import SparkContext

In [23]:
#Create a Spark Session
SpSession = SparkSession.builder.master("local[*]").getOrCreate()

In [24]:
#Get the Spark Context from Spark Session    
SpContext = SpSession.sparkContext

In [39]:
#Load Data

#Load the CSV file into a RDD
irisData = SpContext.textFile("/content/drive/My Drive/iris.csv")
irisData.cache()
irisData.count()

#Remove the first line (contains headers)
dataLines = irisData.filter(lambda x: "Sepal" not in x)
dataLines.count()

150

In [41]:
#Cleanup Data

from pyspark.sql import Row
#Create a Data Frame from the data
parts = dataLines.map(lambda l: l.split(","))
irisMap = parts.map(lambda p: Row(SEPAL_LENGTH=float(p[0]),\
                                SEPAL_WIDTH=float(p[1]), \
                                PETAL_LENGTH=float(p[2]), \
                                PETAL_WIDTH=float(p[3]), \
                                SPECIES=p[4] ))
                                
# Infer the schema, and register the DataFrame as a table.
irisDf = SpSession.createDataFrame(irisMap)
irisDf.cache()

#Add a numeric indexer for the label/target column
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="SPECIES", outputCol="IND_SPECIES")
si_model = stringIndexer.fit(irisDf)
irisNormDf = si_model.transform(irisDf)

irisNormDf.select("SPECIES","IND_SPECIES").distinct().show()
irisNormDf.cache()

+----------+-----------+
|   SPECIES|IND_SPECIES|
+----------+-----------+
|    setosa|        0.0|
| virginica|        2.0|
|versicolor|        1.0|
+----------+-----------+



DataFrame[SEPAL_LENGTH: double, SEPAL_WIDTH: double, PETAL_LENGTH: double, PETAL_WIDTH: double, SPECIES: string, IND_SPECIES: double]

In [43]:
#Perform Data Analytics


#See standard parameters
irisNormDf.describe().show()



+-------+------------------+------------------+------------------+------------------+---------+------------------+
|summary|      SEPAL_LENGTH|       SEPAL_WIDTH|      PETAL_LENGTH|       PETAL_WIDTH|  SPECIES|       IND_SPECIES|
+-------+------------------+------------------+------------------+------------------+---------+------------------+
|  count|               150|               150|               150|               150|      150|               150|
|   mean| 5.843333333333332|3.0573333333333337| 3.758000000000001|1.1993333333333331|     null|               1.0|
| stddev|0.8280661279778634|0.4358662849366978|1.7652982332594662|0.7622376689603467|     null|0.8192319205190406|
|    min|               4.3|               2.0|               1.0|               0.1|   setosa|               0.0|
|    max|               7.9|               4.4|               6.9|               2.5|virginica|               2.0|
+-------+------------------+------------------+------------------+--------------

In [44]:

#Prepare data for ML

#Transform to a Data Frame for input to Machine Learing
#Drop columns that are not required (low correlation)

from pyspark.ml.linalg import Vectors
def transformToLabeledPoint(row) :
    lp = ( row["SPECIES"], row["IND_SPECIES"], \
                Vectors.dense([row["SEPAL_LENGTH"],\
                        row["SEPAL_WIDTH"], \
                        row["PETAL_LENGTH"], \
                        row["PETAL_WIDTH"]]))
    return lp
    
irisLp = irisNormDf.rdd.map(transformToLabeledPoint)
irisLpDf = SpSession.createDataFrame(irisLp,["species","label", "features"])
irisLpDf.select("species","label","features").show(10)
irisLpDf.cache()



+-------+-----+-----------------+
|species|label|         features|
+-------+-----+-----------------+
| setosa|  0.0|[5.1,3.5,1.4,0.2]|
| setosa|  0.0|[4.9,3.0,1.4,0.2]|
| setosa|  0.0|[4.7,3.2,1.3,0.2]|
| setosa|  0.0|[4.6,3.1,1.5,0.2]|
| setosa|  0.0|[5.0,3.6,1.4,0.2]|
| setosa|  0.0|[5.4,3.9,1.7,0.4]|
| setosa|  0.0|[4.6,3.4,1.4,0.3]|
| setosa|  0.0|[5.0,3.4,1.5,0.2]|
| setosa|  0.0|[4.4,2.9,1.4,0.2]|
| setosa|  0.0|[4.9,3.1,1.5,0.1]|
+-------+-----+-----------------+
only showing top 10 rows



DataFrame[species: string, label: double, features: vector]

In [35]:
#Prepare data for ML


#Transform to a Data Frame for input to Machine Learing
#Drop columns that are not required (low correlation)

from pyspark.ml.linalg import Vectors
def transformToLabeledPoint(row) :
    lp = ( row["MPG"], Vectors.dense([row["ACCELERATION"],\
                        row["DISPLACEMENT"], \
                        row["WEIGHT"]]))
    return lp
    
autoLp = autoMap.map(transformToLabeledPoint)
autoDF = SpSession.createDataFrame(autoLp,["label", "features"])
autoDF.select("label","features").show(10)

+-----+-------------------+
|label|           features|
+-----+-------------------+
| 18.0|[12.0,307.0,3504.0]|
| 15.0|[11.5,350.0,3693.0]|
| 18.0|[11.0,318.0,3436.0]|
| 16.0|[12.0,304.0,3433.0]|
| 17.0|[10.5,302.0,3449.0]|
| 15.0|[10.0,429.0,4341.0]|
| 14.0| [9.0,454.0,4354.0]|
| 14.0| [8.5,440.0,4312.0]|
| 14.0|[10.0,455.0,4425.0]|
| 15.0| [8.5,390.0,3850.0]|
+-----+-------------------+
only showing top 10 rows



In [45]:
#Perform Machine Learning


#Split into training and testing data
(trainingData, testData) = irisLpDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.show()

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Create the model
dtClassifer = DecisionTreeClassifier(maxDepth=2, labelCol="label",\
                featuresCol="features")
dtModel = dtClassifer.fit(trainingData)

dtModel.numNodes
dtModel.depth

#Predict on the test data
predictions = dtModel.transform(testData)
predictions.select("prediction","species","label").show()

#Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
evaluator.evaluate(predictions)      

#Draw a confusion matrix
predictions.groupBy("label","prediction").count().show()

+----------+-----+-----------------+
|   species|label|         features|
+----------+-----+-----------------+
|    setosa|  0.0|[4.8,3.0,1.4,0.1]|
|    setosa|  0.0|[5.0,3.4,1.5,0.2]|
|    setosa|  0.0|[5.1,3.3,1.7,0.5]|
|versicolor|  1.0|[5.2,2.7,3.9,1.4]|
|versicolor|  1.0|[5.6,2.9,3.6,1.3]|
|versicolor|  1.0|[7.0,3.2,4.7,1.4]|
| virginica|  2.0|[6.4,3.2,5.3,2.3]|
| virginica|  2.0|[6.5,3.0,5.5,1.8]|
| virginica|  2.0|[6.7,2.5,5.8,1.8]|
| virginica|  2.0|[6.7,3.0,5.2,2.3]|
| virginica|  2.0|[6.7,3.3,5.7,2.1]|
| virginica|  2.0|[6.7,3.3,5.7,2.5]|
| virginica|  2.0|[6.8,3.2,5.9,2.3]|
+----------+-----+-----------------+

+----------+----------+-----+
|prediction|   species|label|
+----------+----------+-----+
|       0.0|    setosa|  0.0|
|       0.0|    setosa|  0.0|
|       0.0|    setosa|  0.0|
|       1.0|versicolor|  1.0|
|       1.0|versicolor|  1.0|
|       1.0|versicolor|  1.0|
|       2.0| virginica|  2.0|
|       2.0| virginica|  2.0|
|       2.0| virginica|  2.0|
|       2.