In [1]:
# https://scikit-learn.org/stable/modules/tree.html
# Decision Trees (DTs) are a non-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features. A tree can be seen as a piecewise constant approximation.

# For instance, in the example below, decision trees learn from data to approximate a sine curve with a set of if-then-else decision rules. The deeper the tree, the more complex the decision rules and the fitter the model.


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.classification import DecisionTreeClassifier

In [3]:
spark = SparkSession.builder.getOrCreate()

In [4]:
data_path = '/Users/quangly/github/Spark SQL/Data'

In [5]:
# exploratory data analysis, Clustering. natural groupings
#divide data set into 3 logical datasets 

In [6]:
csv_path = data_path + "/iris.txt"
iris_df = spark.read.format("csv") \
                .option("inferSchema", True) \
                .load(csv_path)

In [7]:
# group based on cpu, free memory, session count
# ml libraries work with vectors (like an array)
# create vector to store all 3 values called "features"

In [8]:
iris_df.show(5)

+---+---+---+---+-----------+
|_c0|_c1|_c2|_c3|        _c4|
+---+---+---+---+-----------+
|5.1|3.5|1.4|0.2|Iris-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|
+---+---+---+---+-----------+
only showing top 5 rows



In [9]:
iris_df = iris_df.select(col("_c0").alias("sepal_length"),
        col("_c1").alias("sepal_width"),
        col("_c2").alias("petal_length"),
        col("_c3").alias("petal_width"),
        col("_c4").alias("species"))


In [10]:
iris_df.take(1)

[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, species='Iris-setosa')]

In [11]:
vectorAssembler = VectorAssembler(inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"], outputCol="features")

In [12]:
#vectorized iris df

In [13]:
viris_df = vectorAssembler.transform(iris_df)

In [14]:
viris_df.show()

+------------+-----------+------------+-----------+-----------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|    species|         features|
+------------+-----------+------------+-----------+-----------+-----------------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|[5.0,3.4,1.5,0.2]|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|[4.4,2.9,1.4,0.2]|
|         4.9|  

In [15]:
# Part of PreProcessing. Convert label name (species name) into numeric value.
# StringIndexer - 

In [16]:
indexer = StringIndexer(inputCol="species", outputCol="label")
ivirus_df = indexer.fit(viris_df).transform(viris_df)
ivirus_df.show(3)

+------------+-----------+------------+-----------+-----------+-----------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|    species|         features|label|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|  0.0|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
only showing top 3 rows



In [17]:
#Naive Bayes

In [18]:
splits = ivirus_df.randomSplit([0.6,0.4])
train_df = splits[0]
test_df = splits[1]

In [19]:
print(train_df.count())
print(test_df.count())
print(viris_df.count())

85
65
150


In [20]:
nb = NaiveBayes(modelType="multinomial")

In [21]:
nbmodel = nb.fit(train_df)

In [22]:
predictions_df = nbmodel.transform(test_df)

In [23]:
predictions_df.take(1)

24/06/26 14:36:41 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/06/26 14:36:41 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


[Row(sepal_length=4.4, sepal_width=3.0, petal_length=1.3, petal_width=0.2, species='Iris-setosa', features=DenseVector([4.4, 3.0, 1.3, 0.2]), label=0.0, rawPrediction=DenseVector([-10.8085, -11.8672, -12.3854]), probability=DenseVector([0.6437, 0.2233, 0.133]), prediction=0.0)]

In [24]:
# prediction=0.0 is index of the species Setosa which is correct

In [25]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [26]:
nbaccuracy = evaluator.evaluate(predictions_df)

In [27]:
nbaccuracy

0.9846153846153847

In [43]:
#first layer has same number of nodes as inputs. 4 measures so 4. last layer has same number of types of output 3
# layers in between so the multilayer perceptron has learn how to classify correctly. 2 rows 5 neurons each
layers = [4,5,5, 3]

In [44]:
mlp = MultilayerPerceptronClassifier(layers = layers, seed=1)

In [45]:
mlp_model = mlp.fit(train_df)

In [46]:
mlp_predictions = mlp_model.transform(test_df)

In [47]:
mlp_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

In [48]:
mlp_accuracy = mlp_evaluator.evaluate(mlp_predictions)

In [49]:
mlp_accuracy

0.676923076923077

In [50]:
#higher than Naive Bayes

In [51]:
# Decision Trees

In [52]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

In [53]:
dt_model = dt.fit(train_df)

In [54]:
dt_predictions = dt_model.transform(test_df)

In [55]:
dt_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [56]:
dt_accuracy = dt_evaluator.evaluate(dt_predictions)

In [57]:
dt_accuracy

0.9538461538461539

In [58]:
# Decision Tree and MLP worked well. but Decition trees dont' require us to make any configuration decisions
# Naive Bayes can work well when attributes are indepedent of each other.