In [1]:
import os

# 1. Install OpenJDK 21 (if not already done in a previous cell)
!apt-get update -qq
!apt-get install -qq openjdk-21-jdk-headless

# 2. Verify where it landed (if needed)
!ls /usr/lib/jvm | grep 21

# 3. Point to JDK 21
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# 4. Install PySpark via pip (make sure this happens AFTER setting JAVA_HOME)
!pip install pyspark --quiet

# 5. Import and start Spark
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
      .master("local[*]")
      .appName("PySpark-MLP-Classifier")
      .getOrCreate()
)


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package openjdk-21-jre-headless:amd64.
(Reading database ... 126102 files and directories currently installed.)
Preparing to unpack .../openjdk-21-jre-headless_21.0.7+6~us1-0ubuntu1~22.04_amd64.deb ...
Unpacking openjdk-21-jre-headless:amd64 (21.0.7+6~us1-0ubuntu1~22.04) ...
Selecting previously unselected package openjdk-21-jdk-headless:amd64.
Preparing to unpack .../openjdk-21-jdk-headless_21.0.7+6~us1-0ubuntu1~22.04_amd64.deb ...
Unpacking openjdk-21-jdk-headless:amd64 (21.0.7+6~us1-0ubuntu1~22.04) ...
Setting up openjdk-21-jre-headless:amd64 (21.0.7+6~us1-0ubuntu1~22.04) ...
update-alternatives: using /usr/lib/jvm/java-21-openjdk-amd64/bin/java to provide /usr/bin/java (java) in auto mode
update-alternatives: using /usr/lib/jvm/java-21-openjdk-amd64/bin/jpackage to

In [2]:
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# 1 Load Data set

In [3]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
df = spark.read.format('com.databricks.spark.csv').\
                               options(header='true', \
                               inferschema='true').load("/content/WineData.csv",header=True);
df.show(5)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|
|          7.8|            0.76|       0.04|           2.3|    0.092|               15.0|                54.0|  0.997|3.26|     0.65|    9.8|      5|
|         11.2|            0.28|       0.56|           1.9|    0.075|               17.0|           

In [4]:
df.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [5]:
df.select('quality').distinct().collect()

[Row(quality=6),
 Row(quality=3),
 Row(quality=5),
 Row(quality=4),
 Row(quality=8),
 Row(quality=7)]

In [6]:
# Convert to float format
def string_to_float(x):
    return float(x)

#
def condition(r):
    if (0<= r <= 4):
        label = "low"
    elif(4< r <= 6):
        label = "medium"
    else:
        label = "high"
    return label

In [7]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, DoubleType
string_to_float_udf = udf(string_to_float, DoubleType())
quality_udf = udf(lambda x: condition(x), StringType())

In [8]:
df.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [9]:
df.show()

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|
|          7.8|            0.76|       0.04|           2.3|    0.092|               15.0|                54.0|  0.997|3.26|     0.65|    9.8|      5|
|         11.2|            0.28|       0.56|           1.9|    0.075|               17.0|           

In [10]:
# convert the data to dense vector
def transData(data):
    return data.rdd.map(lambda r: [r[-1], Vectors.dense(r[:-1])]).toDF(['label','features'])

In [11]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

data= transData(df)
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    5|[7.4,0.7,0.0,1.9,...|
|    5|[7.8,0.88,0.0,2.6...|
|    5|[7.8,0.76,0.04,2....|
|    6|[11.2,0.28,0.56,1...|
|    5|[7.4,0.7,0.0,1.9,...|
|    5|[7.4,0.66,0.0,1.8...|
|    5|[7.9,0.6,0.06,1.6...|
|    7|[7.3,0.65,0.0,1.2...|
|    7|[7.8,0.58,0.02,2....|
|    5|[7.5,0.5,0.36,6.1...|
|    5|[6.7,0.58,0.08,1....|
|    5|[7.5,0.5,0.36,6.1...|
|    5|[5.6,0.615,0.0,1....|
|    5|[7.8,0.61,0.29,1....|
|    5|[8.9,0.62,0.18,3....|
|    5|[8.9,0.62,0.19,3....|
|    7|[8.5,0.28,0.56,1....|
|    5|[8.1,0.56,0.28,1....|
|    4|[7.4,0.59,0.08,4....|
|    6|[7.9,0.32,0.51,1....|
+-----+--------------------+
only showing top 20 rows



In [12]:
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
labelIndexer.transform(data).show(6)

+-----+--------------------+------------+
|label|            features|indexedLabel|
+-----+--------------------+------------+
|    5|[7.4,0.7,0.0,1.9,...|         0.0|
|    5|[7.8,0.88,0.0,2.6...|         0.0|
|    5|[7.8,0.76,0.04,2....|         0.0|
|    6|[11.2,0.28,0.56,1...|         1.0|
|    5|[7.4,0.7,0.0,1.9,...|         0.0|
|    5|[7.4,0.66,0.0,1.8...|         0.0|
+-----+--------------------+------------+
only showing top 6 rows



In [13]:
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =VectorIndexer(inputCol="features", \
                              outputCol="indexedFeatures", \
                              maxCategories=4).fit(data)

featureIndexer.transform(data).show(6)

+-----+--------------------+--------------------+
|label|            features|     indexedFeatures|
+-----+--------------------+--------------------+
|    5|[7.4,0.7,0.0,1.9,...|[7.4,0.7,0.0,1.9,...|
|    5|[7.8,0.88,0.0,2.6...|[7.8,0.88,0.0,2.6...|
|    5|[7.8,0.76,0.04,2....|[7.8,0.76,0.04,2....|
|    6|[11.2,0.28,0.56,1...|[11.2,0.28,0.56,1...|
|    5|[7.4,0.7,0.0,1.9,...|[7.4,0.7,0.0,1.9,...|
|    5|[7.4,0.66,0.0,1.8...|[7.4,0.66,0.0,1.8...|
+-----+--------------------+--------------------+
only showing top 6 rows



In [14]:
data.printSchema()

root
 |-- label: long (nullable = true)
 |-- features: vector (nullable = true)



In [15]:
# Split the data into train and test
(trainingData, testData) = data.randomSplit([0.6, 0.4],seed=218)

In [16]:
data.select('label').distinct().collect()

[Row(label=7),
 Row(label=6),
 Row(label=5),
 Row(label=3),
 Row(label=8),
 Row(label=4)]

# Construct MLP


In [17]:
# specify layers for the neural network:
# input layer of size 11 (features), two intermediate of size 5 and 4
# and output of size  (classes)
layers = [11, 5, 4, 4, 3 , 6]

# create the trainer and set its parameters
FNN = MultilayerPerceptronClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",\
                                         maxIter=100, layers=layers, blockSize=128, seed=1234)

# Create Pipeline

In [18]:
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

In [19]:
# Chain indexers and forest in a Pipeline
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, FNN, labelConverter])

# Model traning and prediction

In [20]:
# train the model
# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

In [21]:
# Make predictions.
predictions = model.transform(testData)

In [22]:
# Select example rows to display.
predictions.select("features","label","predictedLabel").show(5)

+--------------------+-----+--------------+
|            features|label|predictedLabel|
+--------------------+-----+--------------+
|[6.8,0.815,0.0,1....|    3|             5|
|[7.1,0.875,0.05,5...|    3|             5|
|[7.6,1.58,0.0,2.1...|    3|             5|
|[8.3,1.02,0.02,3....|    3|             5|
|[10.4,0.61,0.49,2...|    3|             5|
+--------------------+-----+--------------+
only showing top 5 rows



# Evaluation

In [23]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Predictions accuracy = %g, Test Error = %g" % (accuracy,(1.0 - accuracy)))

Predictions accuracy = 0.574018, Test Error = 0.425982


In [24]:
from pyspark.mllib.evaluation import MulticlassMetrics
import sklearn
import numpy as np

In [25]:
y_true = predictions.select("label")
y_true.show(5)
y_pred = predictions.select("predictedLabel")
y_pred.show(5)
y_true_array = np.array(y_true.collect())
y_pred_array = np.array(y_pred.collect())


+-----+
|label|
+-----+
|    3|
|    3|
|    3|
|    3|
|    3|
+-----+
only showing top 5 rows

+--------------+
|predictedLabel|
+--------------+
|             5|
|             5|
|             5|
|             5|
|             5|
+--------------+
only showing top 5 rows



In [29]:
y_pred_array = [int(i) for i in y_pred_array]

  y_pred_array = [int(i) for i in y_pred_array]


In [30]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true_array, y_pred_array))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00        21
           5       0.66      0.74      0.70       290
           6       0.54      0.48      0.51       253
           7       0.39      0.51      0.44        85
           8       0.00      0.00      0.00         8

    accuracy                           0.57       662
   macro avg       0.26      0.29      0.27       662
weighted avg       0.55      0.57      0.56       662



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
spark.stop()