#### Clase 3

In [1]:
import findspark
# Must run before `import pyspark`: findspark locates the local Spark
# installation so that the pyspark package can be imported from this kernel.
findspark.init()

import pyspark

In [2]:
import seaborn as sns
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (an already-running session is
# reused by getOrCreate instead of starting a second one).
spark = (
    SparkSession.builder
    .appName('iris')
    .getOrCreate()
)

# Fetch the iris dataset through seaborn, then hand it to Spark so the rest
# of the notebook works on a distributed Spark DataFrame.
iris = sns.load_dataset('iris')
iris_df = spark.createDataFrame(iris)

In [3]:
# Preview the first 5 rows of the Spark DataFrame.
iris_df.show(5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [4]:
from pyspark.ml.feature import StringIndexer

# Encode the string `species` column as a numeric `label` column, which is
# the form the Spark ML classifiers expect for the target variable.
indexer = StringIndexer(inputCol="species", outputCol="label")

# StringIndexer refuses to run when its output column already exists, so
# re-running this cell on the already-indexed `iris_df` used to raise.
# Dropping a stale output column first (a no-op on the first run, when the
# column is absent) makes the cell safe to re-execute.
iris_unindexed = iris_df.drop(indexer.getOutputCol())
iris_df = indexer.fit(iris_unindexed).transform(iris_unindexed)
iris_df.show(5)

+------------+-----------+------------+-----------+-------+-----+
|sepal_length|sepal_width|petal_length|petal_width|species|label|
+------------+-----------+------------+-----------+-------+-----+
|         5.1|        3.5|         1.4|        0.2| setosa|  0.0|
|         4.9|        3.0|         1.4|        0.2| setosa|  0.0|
|         4.7|        3.2|         1.3|        0.2| setosa|  0.0|
|         4.6|        3.1|         1.5|        0.2| setosa|  0.0|
|         5.0|        3.6|         1.4|        0.2| setosa|  0.0|
+------------+-----------+------------+-----------+-------+-----+
only showing top 5 rows



In [5]:
from pyspark.ml.feature import VectorAssembler

# Combine the four numeric measurement columns into a single vector column
# named `features`, the input format used by the Spark ML estimators.
assembler = VectorAssembler(
    inputCols=[
        'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
    ],
    outputCol='features'
)

# VectorAssembler fails if its output column already exists, which made this
# cell crash when executed a second time. Drop a stale output column first
# (a no-op when the column is absent) so the cell can be re-run safely.
iris_df = assembler.transform(iris_df.drop(assembler.getOutputCol()))

In [6]:
# Preview the first 5 rows, now including the assembled `features` column.
iris_df.show(5)

+------------+-----------+------------+-----------+-------+-----+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|species|label|         features|
+------------+-----------+------------+-----------+-------+-----+-----------------+
|         5.1|        3.5|         1.4|        0.2| setosa|  0.0|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2| setosa|  0.0|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2| setosa|  0.0|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2| setosa|  0.0|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2| setosa|  0.0|[5.0,3.6,1.4,0.2]|
+------------+-----------+------------+-----------+-------+-----+-----------------+
only showing top 5 rows



In [7]:
# Split the data into training (~80%) and test (~20%) sets.
# A fixed seed keeps the split, and therefore every metric computed from it,
# reproducible across kernel restarts; without it each run samples a
# different partition of the rows.
train_data, test_data = iris_df.randomSplit([0.8, 0.2], seed=42)

In [8]:
# Preview the first 5 rows of the training split.
train_data.show(5)

+------------+-----------+------------+-----------+-------+-----+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|species|label|         features|
+------------+-----------+------------+-----------+-------+-----+-----------------+
|         4.4|        2.9|         1.4|        0.2| setosa|  0.0|[4.4,2.9,1.4,0.2]|
|         4.6|        3.1|         1.5|        0.2| setosa|  0.0|[4.6,3.1,1.5,0.2]|
|         4.6|        3.4|         1.4|        0.3| setosa|  0.0|[4.6,3.4,1.4,0.3]|
|         4.8|        3.4|         1.6|        0.2| setosa|  0.0|[4.8,3.4,1.6,0.2]|
|         4.9|        3.0|         1.4|        0.2| setosa|  0.0|[4.9,3.0,1.4,0.2]|
+------------+-----------+------------+-----------+-------+-----+-----------------+
only showing top 5 rows



In [9]:
from pyspark.ml.classification import RandomForestClassifier

# Random Forest Classifier with 100 trees, trained on the assembled
# `features` vector to predict the indexed `label`.
# Random forests draw random row and feature samples for every tree, so an
# explicit seed is set to make the fitted model repeatable between runs.
rf = RandomForestClassifier(
    featuresCol='features', labelCol='label',
    numTrees=100,
    seed=42
)

# Fit on the training split only; the test split stays unseen for evaluation.
rf_model = rf.fit(train_data)

In [10]:
# Score the held-out test data with the fitted random forest. The resulting
# DataFrame carries the extra rawPrediction, probability and prediction
# columns visible in the output below.
predictions = rf_model.transform(test_data)
predictions.show(5)

# Alternative model, intentionally left disabled: uncomment the lines below to
# use logistic regression instead (doing so overwrites `predictions`).
# from pyspark.ml.classification import LogisticRegression

# # Create a Logistic Regression model and fit it to the training data
# lr = LogisticRegression(featuresCol='features', labelCol='label')
# lr_model = lr.fit(train_data)

# # Make predictions on the test data
# predictions = lr_model.transform(test_data)

+------------+-----------+------------+-----------+-------+-----+-----------------+---------------+----------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|species|label|         features|  rawPrediction|     probability|prediction|
+------------+-----------+------------+-----------+-------+-----+-----------------+---------------+----------------+----------+
|         4.7|        3.2|         1.3|        0.2| setosa|  0.0|[4.7,3.2,1.3,0.2]|[100.0,0.0,0.0]|   [1.0,0.0,0.0]|       0.0|
|         4.9|        3.1|         1.5|        0.1| setosa|  0.0|[4.9,3.1,1.5,0.1]|[100.0,0.0,0.0]|   [1.0,0.0,0.0]|       0.0|
|         5.1|        3.5|         1.4|        0.2| setosa|  0.0|[5.1,3.5,1.4,0.2]|[100.0,0.0,0.0]|   [1.0,0.0,0.0]|       0.0|
|         5.4|        3.9|         1.7|        0.4| setosa|  0.0|[5.4,3.9,1.7,0.4]|[100.0,0.0,0.0]|   [1.0,0.0,0.0]|       0.0|
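|         5.8|        4.0|         1.2|        0.2| setosa|  0.0|[5.8,4.0,1.2,0.2]| [94.0,5.0,1.0]|[0.94,0.05,0.01]|       0.0|
+------------+-----------+------------+-----------+-------+-----+-----------------+---------------+----------------+----------+
only showing top 5 rows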

In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

# Pin the prediction column to double before handing it to the metrics code.
predictions = predictions.withColumn(
    "prediction",
    predictions["prediction"].cast("double"),
)

# Overall accuracy, computed with the DataFrame-based evaluator.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)

# The RDD-based metrics class is fed the (prediction, label) column pair
# as an RDD.
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)

# Materialise the confusion matrix as a dense array for printing.
confusion_matrix = metrics.confusionMatrix().toArray()

print("Confusion Matrix:\n", confusion_matrix)
print("Accuracy = %g" % accuracy)



Confusion Matrix:
 [[11.  0.  0.]
 [ 0. 11.  0.]
 [ 0.  1.  7.]]
Accuracy = 0.966667


In [12]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()