# This workshop demonstrates how to apply an SVM classifier to a multi-class classification problem


# 1. Set up the Spark environment and SparkSession

In [1]:
import os

# 1. Install OpenJDK 21 (if not already done in a previous cell)
!apt-get update -qq
!apt-get install -qq openjdk-21-jdk-headless

# 2. Verify where it landed (if needed)
!ls /usr/lib/jvm | grep 21

# 3. Point to JDK 21
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# 4. Install PySpark via pip (make sure this happens AFTER setting JAVA_HOME)
!pip install pyspark --quiet

# 5. Import and start Spark
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
      .master("local[*]")
      .appName("PySpark-SVMClassifier_Iris")
      .getOrCreate()
)

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package openjdk-21-jre-headless:amd64.
(Reading database ... 126109 files and directories currently installed.)
Preparing to unpack .../openjdk-21-jre-headless_21.0.7+6~us1-0ubuntu1~22.04_amd64.deb ...
Unpacking openjdk-21-jre-headless:amd64 (21.0.7+6~us1-0ubuntu1~22.04) ...
Selecting previously unselected package openjdk-21-jdk-headless:amd64.
Preparing to unpack .../openjdk-21-jdk-headless_21.0.7+6~us1-0ubuntu1~22.04_amd64.deb ...
Unpacking openjdk-21-jdk-headless:amd64 (21.0.7+6~us1-0ubuntu1~22.04) ...
Setting up openjdk-21-jre-headless:amd64 (21.0.7+6~us1-0ubuntu1~22.04) ...
update-alternatives: using /usr/lib/jvm/java-21-openjdk-amd64/bin/java to provide /usr/bin/java (java) in auto mode
update-alternatives: using /usr/lib/jvm/java-21-openjdk-amd64/bin/jpackage to

# Mount Google Drive

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; force_remount=True requests a fresh mount
mount_point = '/content/drive'
drive.mount(mount_point, force_remount=True)

Mounted at /content/drive


In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

from pyspark.ml.classification import OneVsRest
from pyspark.ml.feature import VectorAssembler, StringIndexer


In [4]:
# Read the Iris CSV into a Spark DataFrame.
# The first row is used as the header and column types are inferred.
# Change iris_csv_path if your copy of the dataset lives somewhere else.
iris_csv_path = "/content/iris-data.csv"
data = spark.read.csv(
    iris_csv_path,
    header=True,
    inferSchema=True,
)


In [5]:
# Names of the input columns that will be combined into the feature vector
feature_columns = [
    "sepal length",
    "sepal width",
    "petal length",
    "petal width",
]



In [6]:
# Encode the string "class" column as a numeric "label" column
indexer = StringIndexer(inputCol="class", outputCol="label")
indexer_model = indexer.fit(data)
data = indexer_model.transform(data)

In [7]:
# Pack the individual feature columns into a single "features" vector column
assembler = VectorAssembler(
    outputCol="features",
    inputCols=feature_columns,
)
data = assembler.transform(data)

In [8]:


# Randomly split the rows into train/test sets with 0.8 / 0.2 weights (seeded with 123)
split_weights = [0.8, 0.2]
train_data, test_data = data.randomSplit(split_weights, seed=123)

# Linear SVM estimator that reads the "label" column, limited to 100 iterations
svm = LinearSVC(labelCol="label", maxIter=100)


In [9]:
# Wrap the SVM in a one-vs-rest (OvR) classifier that uses the "label" column
ovr_classifier = OneVsRest(
    labelCol="label",
    classifier=svm,
)

In [10]:
# Train the OvR model on the training split
ovr_model = ovr_classifier.fit(train_data)

In [11]:
# Apply the fitted OvR model to the test split to obtain predictions
predictions = ovr_model.transform(test_data)

In [12]:
# Score the predictions: compare the "prediction" column against "label" using accuracy
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)


In [13]:
# Report the accuracy as a percentage with two decimal places
accuracy_message = f"Accuracy: {accuracy * 100:.2f}%"
print(accuracy_message)

Accuracy: 93.10%


In [14]:
# Keep only the (prediction, label) columns and expose them as an RDD,
# which is the input that MulticlassMetrics is given below
prediction_label_df = predictions.select("prediction", "label")
prediction_and_label = prediction_label_df.rdd

In [15]:
# Instantiate MulticlassMetrics (RDD-based pyspark.mllib API) from the (prediction, label) rows
metrics = MulticlassMetrics(prediction_and_label)



In [16]:
# Show the confusion matrix as an array
print("Confusion Matrix:")
confusion_matrix = metrics.confusionMatrix().toArray()
print(confusion_matrix)

Confusion Matrix:
[[13.  0.  0.]
 [ 0.  6.  1.]
 [ 0.  1.  8.]]


In [17]:
# Recall for label index 0, which this notebook treats as the "Setosa" class.
# NOTE(review): the index is assigned by the StringIndexer fitted earlier;
# confirm that 0 really maps to Setosa before relying on this number.
setosa_label_index = 0
setosa_recall = metrics.recall(setosa_label_index)

# Report the recall with two decimal places
print(f"Recall for Setosa class: {setosa_recall:.2f}")

Recall for Setosa class: 1.00


In [18]:
# Stop the Spark session now that the workflow is finished
spark.stop()