In [11]:
!pip install pyspark


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline


In [13]:
spark=SparkSession.builder\
.master("local[*]")\
.appName("WineQuality")\
.getOrCreate()
sc=spark.sparkContext

In [14]:
spark

In [15]:
#Mount (connect to) Google drive to be able to read from it (copy data files into HDFS)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
winequality = spark.read.csv('/content/drive/My Drive/Winequality.csv', header = True, inferSchema = True)
winequality.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)
 |-- Id: integer (nullable = true)



In [23]:
# Create a feature vector using the VectorAssembler
assembler = VectorAssembler(inputCols=['fixed acidity', 'volatile acidity', 'citric acid',
                                       'residual sugar', 'chlorides', 'free sulfur dioxide',
                                       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'],
                            outputCol='features')
winequality1 = assembler.transform(winequality)

In [24]:
winequality1.show(5)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+---+--------------------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality| Id|            features|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+---+--------------------+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|  0|[7.4,0.7,0.0,1.9,...|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|  1|[7.8,0.88,0.0,2.6...|
|          7.8|            0.76|       0.04|           2.3|    0.092|               15.0|                54.0|  0.997|3.26|  

In [26]:
# Split the dataset into training and testing sets
#Test size is set to 0.2 - This will split the data into 80% and 20%. 20% of the data will be used for testing the model and 80% will be used for training the model.
train, test = winequality1.randomSplit([0.8, 0.2], seed=42)

In [27]:
# Create and train a RandomForestClassifier
rf = RandomForestClassifier(featuresCol="features", labelCol="quality", numTrees=100)

In [28]:

model = rf.fit(train)

In [29]:
# Make predictions on the testing data
predictions = model.transform(test)

In [30]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="quality", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.6802030456852792


In [31]:
evaluator = MulticlassClassificationEvaluator(labelCol="quality", predictionCol="prediction", metricName="weightedPrecision")
evaluator.setMetricName("weightedPrecision")
precision = evaluator.evaluate(predictions)
print("Precision: ", precision)

# Compute the recall
evaluator = MulticlassClassificationEvaluator(labelCol="quality", predictionCol="prediction", metricName="weightedRecall")
evaluator.setMetricName("weightedRecall")
recall = evaluator.evaluate(predictions)
print("Recall: ", recall)

# Compute the F1 score
evaluator = MulticlassClassificationEvaluator(labelCol="quality", predictionCol="prediction", metricName="f1")
evaluator.setMetricName("f1")
f1_score = evaluator.evaluate(predictions)
print("F1 score: ", f1_score)

Precision:  0.6429921727264634
Recall:  0.6802030456852792
F1 score:  0.6591589423688393
