In [None]:
!pip install pyspark

In [99]:
# We import the necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer

In [100]:
# We create SparkSession. Then we read our dataset as a dataframe
spark = SparkSession.builder.appName("mllib_siniflandirma").getOrCreate()
irisDataset = spark.read.csv("IRIS.csv", header=True, inferSchema=True)

In [107]:
# The "species" column does not come as an integer in the database,
# That's why we convert the species column to a numeric label column using the StringIndexer.
indexer = StringIndexer(inputCol="species", outputCol="bitkiTuru")
trainingData = indexer.fit(trainingData).transform(trainingData)
# indexed = indexer.fit(irisDataset).transform(irisDataset)

In [108]:
# We bring together the features required for classification
assembler = VectorAssembler(
    inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    outputCol="features")

# We divide the training (70%) and test (30%) data
(trainingData, testData) = indexed.randomSplit([0.7, 0.3])

In [109]:
# Next, we train a classification model using, for example, DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

In [110]:
# We create the pipeline
pipeline = Pipeline(stages=[assembler, indexer, dt])

In [111]:
# We train the dataset
model = pipeline.fit(trainingData)

In [112]:
# We make predictions using the test dataset
predictions = model.transform(testData)

In [113]:
# We get the accuracy value of the model we use
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9512195121951219
