### 1. Set up Spark context and SparkSession

In [2]:
import os

# 1. Install OpenJDK 21 (if not already done in a previous cell)
!apt-get update -qq
!apt-get install -qq openjdk-21-jdk-headless

# 2. Verify where it landed (if needed)
!ls /usr/lib/jvm | grep 21

# 3. Point to JDK 21
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# 4. Install PySpark via pip (make sure this happens AFTER setting JAVA_HOME)
!pip install pyspark --quiet

# 5. Import and start Spark
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
      .master("local[*]")
      .appName("PySpark-RandomForestClassifier_Iris")
      .getOrCreate()
)

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package openjdk-21-jre-headless:amd64.
(Reading database ... 126109 files and directories currently installed.)
Preparing to unpack .../openjdk-21-jre-headless_21.0.7+6~us1-0ubuntu1~22.04_amd64.deb ...
Unpacking openjdk-21-jre-headless:amd64 (21.0.7+6~us1-0ubuntu1~22.04) ...
Selecting previously unselected package openjdk-21-jdk-headless:amd64.
Preparing to unpack .../openjdk-21-jdk-headless_21.0.7+6~us1-0ubuntu1~22.04_amd64.deb ...
Unpacking openjdk-21-jdk-headless:amd64 (21.0.7+6~us1-0ubuntu1~22.04) ...
Setting up openjdk-21-jre-headless:amd64 (21.0.7+6~us1-0ubuntu1~22.04) ...
update-alternatives: using /usr/lib/jvm/java-21-openjdk-amd64/bin/java to provide /usr/bin/java (java) in auto mode
update-alternatives: using /usr/lib/jvm/java-21-openjdk-amd64/bin/jpackage to

# Mount Google Drive

In [3]:
# Mounting Google Drive at /content/drive.
# force_remount=True remounts even if the drive is already mounted, so the
# cell can be re-run safely.
# NOTE(review): the dataset loaded later is read from /content/iris-data.csv,
# which is not under this mount point — confirm whether the mount is needed.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

### 2. Load dataset

In [5]:
# Read the Iris CSV into a Spark DataFrame. The first row supplies the column
# names, and column types are inferred from the data instead of defaulting to
# strings.
iris_data = spark.read.csv(
    path="/content/iris-data.csv",
    inferSchema=True,
    header=True,
)

In [6]:
# Every column except the target is a feature. The target ("class") and its
# indexed counterpart ("label", added by the StringIndexer cell) are excluded
# by name rather than by position, so this cell yields the same feature list
# whether it runs before or after the indexing cell and regardless of where
# the target column sits in the CSV.
feature_cols = [
    column for column in iris_data.columns if column not in ("class", "label")
]

In [7]:
# Convert string labels into numerical labels
indexer = StringIndexer(inputCol="class", outputCol="label")
# This cell overwrites iris_data, so on a re-run the frame already carries a
# "label" column and StringIndexer would refuse to write its output column.
# Dropping it first (a no-op when the column is absent) keeps the cell
# re-runnable.
iris_unlabeled = iris_data.drop("label")
iris_data = indexer.fit(iris_unlabeled).transform(iris_unlabeled)

In [8]:
# Pack the individual feature columns into one vector column named "features",
# which is the column the classifier below is configured to read.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=feature_cols,
)
data = assembler.transform(iris_data)

In [9]:
# Random 80/20 train/test partition; the seed is fixed so the split is repeatable.
training_data, testing_data = data.randomSplit(weights=[0.8, 0.2], seed=123)

In [10]:
# Random forest hyperparameters: number of trees and maximum depth of each tree.
num_trees, max_depth = 10, 5

In [11]:
# Configure a RandomForestClassifier with the customized parameters.
# (Training happens in the next cell via rf.fit.)
# The seed is fixed — using the same value as the train/test split — so that
# the random sampling inside the forest is repeatable and a fresh
# "Restart & Run All" reproduces the reported metrics.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=num_trees,
    maxDepth=max_depth,
    seed=123,
)

In [12]:
# Train the random forest on the training split; `model` is the fitted result
# used for prediction and feature importances below.
model = rf.fit(training_data)

In [13]:
# Make predictions on the testing data.
# The resulting DataFrame is what the evaluator scores in the next cell.
predictions = model.transform(testing_data)

In [14]:
# Score the predictions: accuracy compares the "label" column against the
# model's "prediction" column (the evaluator's default prediction column,
# spelled out here for clarity).
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

In [15]:
# Report the test-set accuracy rounded to two decimal places.
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.97


In [16]:
# Show the feature importances.
# The vector prints as (size, [indices], [values]); the indices presumably
# follow the order of feature_cols given to the VectorAssembler — verify
# before mapping values back to column names.
print("Feature Importances: ", model.featureImportances)

Feature Importances:  (4,[0,1,2,3],[0.14855900919958878,0.017477117982863583,0.4161820723775073,0.4177818004400404])


In [17]:
# Stop the Spark session.
# After this, cells that use `spark` need the setup cell to be run again.
spark.stop()