In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=943e27fcac9f587deeb443086f275c82d16ce78d9b2a66c9476b420bb6ce47f9
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [26]:
import pyspark.ml.tuning
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier,LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer,VectorAssembler,OneHotEncoder
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for this notebook. It is bound to the
# lowercase name `spark` because that is the name the data-loading cell reads;
# binding it to `Spark` left `spark` undefined on a fresh kernel.
spark = SparkSession.builder.appName("IRIS").getOrCreate()

In [27]:
# Read the Iris CSV: the first row supplies the column names and the column
# types are inferred from the values.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('/content/IRIS.csv')
)

# Print a preview of the loaded rows.
data.show()

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|
|         4.9|        3.1|         1.5|        0.1|Iris-setosa|
|         5.4|        3.7|         1.5|        0.2|Iris-setosa|
|         4.8|        3.4|         1.6|        0.2|Iris-setosa|
|         4.8|        3.0|         1.4| 

In [28]:
# List the distinct class labels present in the `species` column.
(
    data
    .select('species')
    .distinct()
    .show()
)

+---------------+
|        species|
+---------------+
| Iris-virginica|
|    Iris-setosa|
|Iris-versicolor|
+---------------+



In [29]:
# Stage 1: encode the string `species` column as a numeric `label` column.
stringIndexer = StringIndexer(
    inputCol="species",
    outputCol="label",
)

# Stage 2: pack the four measurement columns into a single `features` vector.
assembler = VectorAssembler(
    inputCols=[
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
    ],
    outputCol="features",
)

# 80/20 train/test split; the seed is passed explicitly to randomSplit.
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=42)

In [30]:
# Decision-tree classifier appended to the shared indexing/assembly stages.
dt = DecisionTreeClassifier(featuresCol='features', labelCol='label')
pipeline = Pipeline(stages=[stringIndexer, assembler, dt])

# Hyper-parameter grid: 3 tree depths x 3 minimum leaf sizes.
paramGrid = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [3, 5, 7])
    .addGrid(dt.minInstancesPerNode, [1, 3, 5])
    .build()
)

In [25]:
# A single accuracy evaluator serves both model selection inside
# cross-validation and the final scoring on the held-out split.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# 5-fold cross-validation over the parameter grid. The explicit seed fixes the
# fold assignment so repeated runs select the same model.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)
cvModel = crossval.fit(train_data)

# Score the best pipeline found by cross-validation on the test split.
best_model = cvModel.bestModel
predictions = best_model.transform(test_data)
accuracy = evaluator.evaluate(predictions)

print(f"DecisionTreeClassifier Test Accuracy: {accuracy:.2f}")

DecisionTreeClassifier Test Accuracy: 0.96


In [24]:
# Random-forest classifier appended to the shared indexing/assembly stages.
# The forest is seeded so its random choices repeat between runs.
rt = RandomForestClassifier(labelCol='label', featuresCol='features', seed=42)
pipeline = Pipeline(stages=[stringIndexer, assembler, rt])

# Hyper-parameter grid: 3 tree depths x 3 minimum leaf sizes.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rt.maxDepth, [3, 5, 7])
    .addGrid(rt.minInstancesPerNode, [1, 3, 5])
    .build()
)

# A single accuracy evaluator serves both model selection inside
# cross-validation and the final scoring on the held-out split.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# 5-fold cross-validation; the explicit seed fixes the fold assignment.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)
cvModel = crossval.fit(train_data)

# Score the best pipeline found by cross-validation on the test split.
best_model = cvModel.bestModel
predictions = best_model.transform(test_data)
accuracy = evaluator.evaluate(predictions)

print(f" RandomForestClassifier Test Accuracy: {accuracy:.2f}")

 RandomForestClassifier Test Accuracy: 0.96


In [31]:
# Naive Bayes classifier appended to the shared indexing/assembly stages.
nt = NaiveBayes(labelCol='label', featuresCol='features')
pipeline = Pipeline(stages=[stringIndexer, assembler, nt])

# Hyper-parameter grid over the smoothing parameter only.
paramGrid = ParamGridBuilder().addGrid(nt.smoothing, [0.0, 0.5, 1.0]).build()

# A single accuracy evaluator serves both model selection inside
# cross-validation and the final scoring on the held-out split.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# 5-fold cross-validation; the explicit seed fixes the fold assignment so
# repeated runs select the same model.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)
cvModel = crossval.fit(train_data)

# Score the best pipeline found by cross-validation on the test split.
best_model = cvModel.bestModel
predictions = best_model.transform(test_data)
accuracy = evaluator.evaluate(predictions)
print(f"NaiveBayes Test Accuracy: {accuracy:.2f}")

NaiveBayes Test Accuracy: 1.00


In [32]:
# Logistic-regression classifier appended to the shared indexing/assembly stages.
lt = LogisticRegression(labelCol='label', featuresCol='features')
pipeline = Pipeline(stages=[stringIndexer, assembler, lt])

# Hyper-parameter grid: regularisation strength x elastic-net mixing x
# iteration cap (3 x 3 x 3 = 27 candidates).
paramGrid = (
    ParamGridBuilder()
    .addGrid(lt.regParam, [0.01, 0.1, 1.0])
    .addGrid(lt.elasticNetParam, [0.0, 0.5, 1.0])
    .addGrid(lt.maxIter, [10, 50, 100])
    .build()
)

# A single accuracy evaluator serves both model selection inside
# cross-validation and the final scoring on the held-out split.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# 5-fold cross-validation; the explicit seed fixes the fold assignment so
# repeated runs select the same model.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)
cvModel = crossval.fit(train_data)

# Score the best pipeline found by cross-validation on the test split.
best_model = cvModel.bestModel
predictions = best_model.transform(test_data)
accuracy = evaluator.evaluate(predictions)

print(f"LogisticRegression Test Accuracy: {accuracy:.2f}")


LogisticRegression Test Accuracy: 1.00
