<a href="https://colab.research.google.com/github/stevejj4/Apache-Spark/blob/main/Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
!pip install findspark
!pip install pyspark



In [19]:
# importing the necessary libraies
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [20]:
# Creating spark session
spark = SparkSession.builder.appName('Classification using SparkML').getOrCreate()


In [21]:
# Downloading the dataset
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0231EN-SkillsNetwork/datasets/drybeans.csv


--2024-08-17 08:05:48--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0231EN-SkillsNetwork/datasets/drybeans.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2484759 (2.4M) [text/csv]
Saving to: ‘drybeans.csv.1’


2024-08-17 08:05:49 (8.38 MB/s) - ‘drybeans.csv.1’ saved [2484759/2484759]



In [22]:
# Loading the dataset
df = spark.read.csv('drybeans.csv', header=True, inferSchema=True)

In [23]:
# prining the size of the dataset
print((df.count(), len(df.columns)))
df.printSchema()

(13611, 17)
root
 |-- Area: integer (nullable = true)
 |-- Perimeter: double (nullable = true)
 |-- MajorAxisLength: double (nullable = true)
 |-- MinorAxisLength: double (nullable = true)
 |-- AspectRation: double (nullable = true)
 |-- Eccentricity: double (nullable = true)
 |-- ConvexArea: integer (nullable = true)
 |-- EquivDiameter: double (nullable = true)
 |-- Extent: double (nullable = true)
 |-- Solidity: double (nullable = true)
 |-- roundness: double (nullable = true)
 |-- Compactness: double (nullable = true)
 |-- ShapeFactor1: double (nullable = true)
 |-- ShapeFactor2: double (nullable = true)
 |-- ShapeFactor3: double (nullable = true)
 |-- ShapeFactor4: double (nullable = true)
 |-- Class: string (nullable = true)



In [24]:
# printing the first five rows of Area, Perimeter, Solidity, roundness,compactness,class
df.select('Area', 'Perimeter', 'Solidity', 'roundness', 'compactness', 'class').show(5)

+-----+---------+-----------+-----------+-----------+-----+
| Area|Perimeter|   Solidity|  roundness|compactness|class|
+-----+---------+-----------+-----------+-----------+-----+
|28395|  610.291|0.988855999|0.958027126|0.913357755|SEKER|
|28734|  638.018|0.984985603|0.887033637|0.953860842|SEKER|
|29380|   624.11|0.989558774|0.947849473|0.908774239|SEKER|
|30008|  645.884|0.976695743|0.903936374|0.928328835|SEKER|
|30140|  620.134| 0.99089325|0.984877069|0.970515523|SEKER|
+-----+---------+-----------+-----------+-----------+-----+
only showing top 5 rows



In [28]:
# Value counts for the columns class from the dataset
df.groupBy('Class').count().show()

+--------+-----+
|   Class|count|
+--------+-----+
|    CALI| 1630|
|   SEKER| 2027|
|    SIRA| 2636|
|   HOROZ| 1928|
|  BOMBAY|  522|
|BARBUNYA| 1322|
|DERMASON| 3546|
+--------+-----+



In [27]:
# Converting class column from string to numerical value
indexer = StringIndexer(inputCol='Class', outputCol='label')
df = indexer.fit(df).transform(df)
df.select('class', 'label').show(5)

+-----+-----+
|class|label|
+-----+-----+
|SEKER|  2.0|
|SEKER|  2.0|
|SEKER|  2.0|
|SEKER|  2.0|
|SEKER|  2.0|
+-----+-----+
only showing top 5 rows



In [30]:
# Printing the values of column label
df.groupBy('label').count().orderBy('count', ascending=False).show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 3546|
|  1.0| 2636|
|  2.0| 2027|
|  3.0| 1928|
|  4.0| 1630|
|  5.0| 1322|
|  6.0|  522|
+-----+-----+



In [32]:
# labe; column and the input columns
# preparing feature vector
vector_assembler = VectorAssembler(inputCols=['Area', 'Perimeter', 'Solidity', 'roundness', 'Compactness'], outputCol='features')
df = vector_assembler.transform(df)
df.show(5)

+-----+---------+---------------+---------------+------------+------------+----------+-------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+-----+-----+--------------------+
| Area|Perimeter|MajorAxisLength|MinorAxisLength|AspectRation|Eccentricity|ConvexArea|EquivDiameter|     Extent|   Solidity|  roundness|Compactness|ShapeFactor1|ShapeFactor2|ShapeFactor3|ShapeFactor4|Class|label|            features|
+-----+---------+---------------+---------------+------------+------------+----------+-------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+-----+-----+--------------------+
|28395|  610.291|    208.1781167|     173.888747| 1.197191424| 0.549812187|     28715|  190.1410973|0.763922518|0.988855999|0.958027126|0.913357755| 0.007331506| 0.003147289| 0.834222388| 0.998723889|SEKER|  2.0|[28395.0,610.291,...|
|28734|  638.018|    200.5247957|    182.7344194| 1.097356461| 0

In [33]:
# display for only the assembled features and lalel column
df.select('features', 'label').show(5, truncate=False)


+-----------------------------------------------------+-----+
|features                                             |label|
+-----------------------------------------------------+-----+
|[28395.0,610.291,0.988855999,0.958027126,0.913357755]|2.0  |
|[28734.0,638.018,0.984985603,0.887033637,0.953860842]|2.0  |
|[29380.0,624.11,0.989558774,0.947849473,0.908774239] |2.0  |
|[30008.0,645.884,0.976695743,0.903936374,0.928328835]|2.0  |
|[30140.0,620.134,0.99089325,0.984877069,0.970515523] |2.0  |
+-----------------------------------------------------+-----+
only showing top 5 rows



In [34]:
# train test split
train_df, test_df = df.randomSplit([0.7, 0.30])

In [35]:
# building the model logistic regression
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol='features', labelCol='label')
lr_model = lr.fit(train_df)


In [36]:
# make predictions on testing data
lr_predictions = lr_model.transform(test_df)

In [37]:
# model evaluation
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
lr_accuracy = evaluator.evaluate(lr_predictions)
print('Accuracy:', lr_accuracy)

Accuracy: 0.9188538405971587


In [38]:
# The confusion matrix
lr_predictions.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|    8|
|  3.0|       5.0|    3|
|  0.0|       5.0|    1|
|  5.0|       1.0|   13|
|  5.0|       2.0|    4|
|  5.0|       4.0|   24|
|  1.0|       1.0|  737|
|  4.0|       5.0|   15|
|  0.0|       1.0|   75|
|  2.0|       2.0|  611|
|  1.0|       0.0|   73|
|  3.0|       1.0|    7|
|  5.0|       3.0|    3|
|  2.0|       3.0|    1|
|  6.0|       6.0|  147|
|  1.0|       4.0|    3|
|  4.0|       4.0|  433|
|  1.0|       5.0|    2|
|  3.0|       4.0|    9|
|  2.0|       1.0|   21|
+-----+----------+-----+
only showing top 20 rows



In [39]:
# The precision
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='weightedPrecision')
lr_precision = evaluator.evaluate(lr_predictions)
print('Precision:', lr_precision)

Precision: 0.919017466333299


In [40]:
# the recall
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='weightedRecall')
lr_recall = evaluator.evaluate(lr_predictions)
print('Recall:', lr_recall)

Recall: 0.9188538405971587


In [41]:
# the balance of precision and recall
lr_f1 = 2 * (lr_precision * lr_recall) / (lr_precision + lr_recall)
print('F1 Score:', lr_f1)

F1 Score: 0.9189356461814268


# This means the model is doing a good job at identifying different classes of dry beans based on these features.