In [42]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [43]:
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [44]:
# Directory that holds the input data files.
# Defaults to the original author's local checkout; set the IRIS_DATA_DIR
# environment variable to run the notebook on another machine without
# editing this cell.
data_path = os.environ.get("IRIS_DATA_DIR", '/Users/quangly/github/Spark SQL/Data')

In [45]:
# Exploratory data analysis / clustering: look for natural groupings in the data.
# Divide the data set into 3 logical groups.

In [46]:
# Load the iris file as CSV and let Spark infer the column types.
# No header option is set, so the columns get positional names (_c0, _c1, ...).
csv_path = data_path + "/iris.txt"
iris_df = (
    spark.read
    .format("csv")
    .option("inferSchema", True)
    .load(csv_path)
)

In [47]:
# Group based on the four measurements (sepal length/width, petal length/width).
# ML libraries work with vectors (like an array),
# so create a vector column called "features" that stores all 4 values.

In [48]:
# Preview the first 5 rows of the freshly loaded frame.
iris_df.show(5)

+---+---+---+---+-----------+
|_c0|_c1|_c2|_c3|        _c4|
+---+---+---+---+-----------+
|5.1|3.5|1.4|0.2|Iris-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|
+---+---+---+---+-----------+
only showing top 5 rows



In [49]:
# Give the positional CSV columns (_c0 .. _c4) meaningful names.
# toDF renames strictly by position, so this cell can be re-run safely:
# the previous select(col("_c0")...) form failed on a second run because
# the _c* columns no longer existed once iris_df had been reassigned.
# toDF expects exactly as many names as the frame has columns (five here).
iris_df = iris_df.toDF(
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
    "species",
)


In [50]:
# Fetch the first row to confirm the renamed columns.
iris_df.take(1)

[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, species='Iris-setosa')]

In [51]:
# Assembler that packs the four measurement columns into a single
# vector column named "features".
vectorAssembler = VectorAssembler(
    inputCols=[
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
    ],
    outputCol="features",
)

In [52]:
#vectorized iris df

In [53]:
# Apply the assembler: viris_df is iris_df plus the "features" vector column.
viris_df = vectorAssembler.transform(iris_df)

In [54]:
# Display the vectorized frame to check the new "features" column.
viris_df.show()

+------------+-----------+------------+-----------+-----------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|    species|         features|
+------------+-----------+------------+-----------+-----------+-----------------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|[5.0,3.4,1.5,0.2]|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|[4.4,2.9,1.4,0.2]|
|         4.9|  

In [55]:
# Part of preprocessing: convert the label name (species name) into a numeric value.
# StringIndexer maps each distinct species string to a numeric index.

In [59]:
# Encode the "species" string as a numeric "label" column, then preview
# the first 3 rows of the indexed frame.
indexer = StringIndexer(
    inputCol="species",
    outputCol="label",
)
ivirus_df = (
    indexer
    .fit(viris_df)
    .transform(viris_df)
)
ivirus_df.show(3)

+------------+-----------+------------+-----------+-----------+-----------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|    species|         features|label|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|  0.0|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
only showing top 3 rows



In [60]:
#Naive Bayes

In [66]:
# Split the labelled data into training and test sets with 0.6 / 0.4 weights.
# A fixed seed keeps the split -- and therefore the row counts and the
# accuracy computed from it -- repeatable for the same input data instead of
# changing on every run.
splits = ivirus_df.randomSplit([0.6, 0.4], seed=42)
train_df, test_df = splits

In [67]:
# Row counts, one per line: training split, test split, full vectorized frame.
print(
    train_df.count(),
    test_df.count(),
    viris_df.count(),
    sep="\n",
)

97
53
150


In [68]:
# Multinomial Naive Bayes estimator. The feature and label column names are
# spelled out explicitly; they are the estimator's defaults and match the
# "features" / "label" columns built earlier in the notebook.
# NOTE(review): the inputs are continuous measurements rather than counts;
# a Gaussian model type may suit them better if the installed Spark version
# offers one -- confirm before changing.
nb = NaiveBayes(
    featuresCol="features",
    labelCol="label",
    modelType="multinomial",
)

In [69]:
# Fit the Naive Bayes model on the training split.
nbmodel = nb.fit(train_df)

In [70]:
# Score the held-out test split with the fitted model.
predictions_df = nbmodel.transform(test_df)

In [75]:
# Inspect one scored row (includes rawPrediction, probability and prediction).
predictions_df.take(1)

[Row(sepal_length=4.4, sepal_width=3.2, petal_length=1.3, petal_width=0.2, species='Iris-setosa', features=DenseVector([4.4, 3.2, 1.3, 0.2]), label=0.0, rawPrediction=DenseVector([-10.8631, -12.3804, -12.7991]), probability=DenseVector([0.7334, 0.1608, 0.1058]), prediction=0.0)]

In [None]:
# prediction=0.0 is the label index of Iris-setosa, which matches this row's label (0.0), so the prediction is correct.

In [76]:
# Evaluator that compares the "prediction" column against "label"
# and reports accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

In [77]:
# Accuracy of the Naive Bayes predictions on the test split.
nbaccuracy = evaluator.evaluate(predictions_df)

In [78]:
# Display the accuracy value as the cell output.
nbaccuracy

0.7547169811320755