In [15]:
import os

# 1. Install OpenJDK 21 (if not already done in a previous cell)
!apt-get update -qq
!apt-get install -qq openjdk-21-jdk-headless

# 2. Verify where it landed (if needed)
!ls /usr/lib/jvm | grep 21

# 3. Point to JDK 21
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# 4. Install PySpark via pip (make sure this happens AFTER setting JAVA_HOME)
!pip install pyspark --quiet

# 5. Import and start Spark
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
      .master("local[*]")
      .appName("ML-Pipleline_IrisData")
      .getOrCreate()
)


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
java-1.21.0-openjdk-amd64
java-21-openjdk-amd64


In [16]:
# Attach Google Drive to the Colab filesystem.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,  # remount even if a mount from an earlier run exists
)

Mounted at /content/drive


In [17]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType
from sklearn.datasets import load_iris
from pyspark.ml.feature import StringIndexer


In [18]:
# Fetch the Iris dataset bundled with scikit-learn and keep the feature
# matrix and the class labels under separate names.
iris = load_iris()
iris_data, iris_target = iris.data, iris.target

In [19]:
# Schema of the Spark DataFrame: four measurement columns stored as doubles,
# followed by an integer class label. Every column is declared nullable.
measurement_fields = [
    StructField(column_name, DoubleType(), True)
    for column_name in ("sepal_length", "sepal_width", "petal_length", "petal_width")
]
schema = StructType(measurement_fields + [StructField("label", IntegerType(), True)])

# Build one tuple per sample, converting each value to a built-in float/int
# so the rows match the schema above, then hand the rows to Spark.
data = spark.createDataFrame(
    [
        tuple(float(value) for value in measurements[:4]) + (int(target),)
        for measurements, target in zip(iris_data, iris_target)
    ],
    schema=schema,
)

In [20]:
# Preview the first 20 rows (the default row count, written out explicitly).
data.show(n=20)

+------------+-----------+------------+-----------+-----+
|sepal_length|sepal_width|petal_length|petal_width|label|
+------------+-----------+------------+-----------+-----+
|         5.1|        3.5|         1.4|        0.2|    0|
|         4.9|        3.0|         1.4|        0.2|    0|
|         4.7|        3.2|         1.3|        0.2|    0|
|         4.6|        3.1|         1.5|        0.2|    0|
|         5.0|        3.6|         1.4|        0.2|    0|
|         5.4|        3.9|         1.7|        0.4|    0|
|         4.6|        3.4|         1.4|        0.3|    0|
|         5.0|        3.4|         1.5|        0.2|    0|
|         4.4|        2.9|         1.4|        0.2|    0|
|         4.9|        3.1|         1.5|        0.1|    0|
|         5.4|        3.7|         1.5|        0.2|    0|
|         4.8|        3.4|         1.6|        0.2|    0|
|         4.8|        3.0|         1.4|        0.1|    0|
|         4.3|        3.0|         1.1|        0.1|    0|
|         5.8|

In [21]:
# 80/20 train/test split; the fixed seed keeps the split repeatable.
trainingData, testData = data.randomSplit(weights=[0.8, 0.2], seed=1234)

In [22]:
# Every column except the target ("label") is a model input.
feature_columns = [column for column in data.columns if column != "label"]

# Stage 1: pack the input columns into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)

# Stage 2: map the raw "label" values to indices in a new "indexedLabel" column.
indexer = StringIndexer(
    inputCol="label",
    outputCol="indexedLabel",
)

In [23]:
# Random forest classifier trained on the assembled "features" vector against
# the indexed label column.
# An explicit seed makes training repeatable across kernel restarts; without
# it PySpark falls back to a default seed that can differ between sessions.
# The value matches the seed already used for the train/test split.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="features",
    seed=1234,
)

In [24]:
# Chain the stages in execution order: assemble features -> index labels ->
# fit the random forest.
pipeline = Pipeline().setStages([assembler, indexer, rf])

In [25]:
# Hyperparameter search space: every combination of tree depth and forest
# size listed below (3 x 3 candidate settings).
grid_builder = ParamGridBuilder()
grid_builder.addGrid(rf.maxDepth, [5, 10, 15])
grid_builder.addGrid(rf.numTrees, [20, 50, 100])
paramGrid = grid_builder.build()

In [26]:
# Accuracy evaluator comparing the "prediction" column with "indexedLabel".
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)

# 3-fold cross-validation of the whole pipeline over the parameter grid.
# The explicit seed fixes how rows are assigned to folds, so the selected
# model and the reported accuracy are repeatable across kernel restarts.
# The value matches the seed already used for the train/test split.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=1234,
)

# Run the search on the training split only; the test split stays unseen.
cvModel = crossval.fit(trainingData)

# Score the held-out test split with the best pipeline found by the search.
predictions = cvModel.transform(testData)

# Report accuracy on the held-out test split.
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy)


Accuracy = 0.945946


In [27]:

# Stop the Spark session created in the setup cell. Cells that use `spark`
# will fail after this until the session is created again.
spark.stop()