In [453]:
import urllib.request
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier, RandomForestClassifier, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [454]:

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

urllib.request.urlretrieve(url, "iris.csv")

('iris.csv', <http.client.HTTPMessage at 0x105de6b50>)

In [455]:
# Definir los nombres de las columnas del Iris Dataset
columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]

In [456]:
spark = SparkSession.builder.appName("IrisPrediction").getOrCreate()

In [457]:
data = spark.read.csv("iris.csv", header=False, inferSchema=True)
data = data.toDF(*columns)
data.show(5)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



In [458]:
feature_columns =["sepal_length", "sepal_width", "petal_length", "petal_width"]

In [459]:
# Usar VectorAssembler para combinar las columnas numéricas en un solo vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data)
data.show(5)

+------------+-----------+------------+-----------+-----------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|    species|         features|
+------------+-----------+------------+-----------+-----------+-----------------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
+------------+-----------+------------+-----------+-----------+-----------------+
only showing top 5 rows



In [460]:
# 2. Transformar la columna 'species' usando StringIndexer
indexer = StringIndexer(inputCol="species", outputCol="label")
data = indexer.fit(data).transform(data)
data.show(5)

+------------+-----------+------------+-----------+-----------+-----------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|    species|         features|label|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|  0.0|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|  0.0|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|  0.0|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
only showing top 5 rows



In [461]:
# Dividir los datos en conjunto de entrenamiento y prueba (80% entrenamiento, 20% prueba)
train_data, test_data = data.randomSplit([0.8, 0.2])


In [462]:
# 4. Normalización de las características numéricas usando StandardScaler
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=False)
scaler_model = scaler.fit(train_data)
train_data_scaled = scaler_model.transform(train_data)
test_data_scaled = scaler_model.transform(test_data)

In [463]:
# Evaluador para medir la precisión
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [464]:
# 6. Calcular la predicción y precisión usando Decision Tree Classifier
dt = DecisionTreeClassifier(labelCol="label", featuresCol="scaled_features")
dt_model = dt.fit(train_data_scaled)
dt_predictions = dt_model.transform(test_data_scaled)
dt_accuracy = evaluator.evaluate(dt_predictions)
print(f"Decision Tree Accuracy: {dt_accuracy}")

Decision Tree Accuracy: 0.9642857142857143


In [465]:
# Usar GBTClassifier en combinación con One-vs-Rest para multiclase
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=100)

ovr = OneVsRest(classifier=gbt)

# Entrenar el modelo
ovr_model = ovr.fit(train_data_scaled)

# Realizar predicciones
gbt_predictions = ovr_model.transform(test_data_scaled)

# Evaluar la precisión
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
gbt_accuracy = evaluator.evaluate(gbt_predictions)
print(f"One-vs-Rest GBTClassifier Accuracy: {gbt_accuracy}")

One-vs-Rest GBTClassifier Accuracy: 1.0


                                                                                

In [466]:
# 5. Calcular la predicción y precisión usando Random Forest Classifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=20)
rf_model = rf.fit(train_data_scaled)
rf_predictions = rf_model.transform(test_data_scaled)
rf_accuracy = evaluator.evaluate(rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy}")

Random Forest Accuracy: 1.0


In [467]:
# Finalizar la sesión de Spark
spark.stop()