### 1. Import libraries

In [1]:
import numpy as np
import pandas as pd
import pyspark
import os
import urllib
import sys

from pyspark.sql.functions import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.feature import *


In [2]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    pyspark.sql.SparkSession.builder
    .appName('Iris')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/15 06:44:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Report the Python and Spark versions this notebook is running on.
version_lines = (
    '****************',
    'Python version: {}'.format(sys.version),
    'Spark version: {}'.format(spark.version),
    '****************',
)
print('\n'.join(version_lines))

****************
Python version: 3.9.12 (main, Apr  5 2022, 01:53:17) 
[Clang 12.0.0 ]
Spark version: 3.3.0
****************


### 2. Read Data

In [4]:
# Location of the Iris data set; the file is comma-separated and has no header row.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

from pyspark import SparkFiles
from functools import reduce  # no longer needed here; kept so later cells that use `reduce` still work

# Fetch the file through Spark; SparkFiles.get() then returns its local path.
spark.sparkContext.addFile(url)

# header=None: the first line is data, not column names.
# inferSchema=True: let Spark detect the column types from the values.
iris_raw_df = spark.read.csv("file://" + SparkFiles.get("iris.data"),
                             header=None,
                             inferSchema=True)

newColumns = ["sepallength", "sepalwidth", "petallength", "petalwidth", "species"]
oldColumns = iris_raw_df.schema.names  # auto-generated names, kept for reference

# toDF renames all columns positionally in one call, replacing the previous
# reduce/withColumnRenamed loop. The raw frame keeps its own name so this cell
# never overwrites its own input.
iris_df = iris_raw_df.toDF(*newColumns)
iris_df.printSchema()
iris_df.show()

                                                                                

root
 |-- sepallength: double (nullable = true)
 |-- sepalwidth: double (nullable = true)
 |-- petallength: double (nullable = true)
 |-- petalwidth: double (nullable = true)
 |-- species: string (nullable = true)

+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|    species|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|
|        5.4|       3.9|        1.7|       0.4|Iris-setosa|
|        4.6|       3.4|        1.4|       0.3|Iris-setosa|
|        5.0|       3.4|        1.5|       0.2|Iris-setosa|
|        4.4|       2.9|        1.4|       0.2|Iris-setosa|
|        4.9|       3.1|        1.5|       0.1|Iris-setosa|
|    

### 3. Data preprocessing

In [5]:
# Bind the loaded Iris DataFrame to the name `data`; this is a plain name binding, not a copy.
data = iris_df

In [6]:
# Build the model-ready frame from `iris_df` and give each intermediate result its
# own name. Previously this cell read and overwrote `data`, so running it a second
# time operated on the already-transformed frame instead of the raw columns.

# vectorize all numerical columns into a single feature column
feature_cols = iris_df.columns[:-1]  # every column except the last one ('species')
assembler = VectorAssembler(inputCols=feature_cols,
                            outputCol='features')
assembled_df = assembler.transform(iris_df)


# convert text labels into indices
species_df = assembled_df.select(['features', 'species'])
label_indexer = StringIndexer(inputCol='species',
                              outputCol='label').fit(species_df)
indexed_df = label_indexer.transform(species_df)


# only select the features and label column
data = indexed_df.select(['features', 'label'])
# NOTE(review): the message below probably means "Ready for machine learning";
# left unchanged so it still matches the saved output.
print("Reading for machine learning")
data.show(10)

Reading for machine learning
+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
|[5.4,3.9,1.7,0.4]|  0.0|
|[4.6,3.4,1.4,0.3]|  0.0|
|[5.0,3.4,1.5,0.2]|  0.0|
|[4.4,2.9,1.4,0.2]|  0.0|
|[4.9,3.1,1.5,0.1]|  0.0|
+-----------------+-----+
only showing top 10 rows



### 4. Data splitting

In [7]:


# Randomly split the prepared data into roughly 70% training and 30% test rows.
# The weights are probabilistic, so the resulting counts are approximate.
# A fixed seed keeps the split, and every metric computed from it, repeatable
# across kernel restarts.
SPLIT_SEED = 42
train, test = data.randomSplit([0.70, 0.30], seed=SPLIT_SEED)

# Preview a few rows of each split.
train.show(5)
test.show(5)

# Last expression: row counts of the two splits, displayed as the cell output.
train.count(), test.count()

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[4.3,3.0,1.1,0.1]|  0.0|
|[4.4,2.9,1.4,0.2]|  0.0|
|[4.4,3.0,1.3,0.2]|  0.0|
|[4.5,2.3,1.3,0.3]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
+-----------------+-----+
only showing top 5 rows

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[4.4,3.2,1.3,0.2]|  0.0|
|[4.6,3.2,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.7,3.2,1.6,0.2]|  0.0|
|[4.8,3.0,1.4,0.1]|  0.0|
+-----------------+-----+
only showing top 5 rows



(104, 46)

### 5. Model Training

In [8]:
# Regularization parameter for the classifier; it is printed again with the accuracy.
reg = 0.01

# Fit a logistic-regression classifier on the training split.
# The feature/label column names are written out; they match the estimator's defaults.
lr = LogisticRegression(featuresCol='features', labelCol='label', regParam=reg)
model = lr.fit(train)


# Apply the fitted model to the test split and preview the first rows.
prediction = model.transform(test)
print("Prediction")
prediction.show(n=10)

22/08/15 06:44:33 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/08/15 06:44:33 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
Prediction
+-----------------+-----+--------------------+--------------------+----------+
|         features|label|       rawPrediction|         probability|prediction|
+-----------------+-----+--------------------+--------------------+----------+
|[4.4,3.2,1.3,0.2]|  0.0|[6.20253311437904...|[0.98602204250940...|       0.0|
|[4.6,3.2,1.4,0.2]|  0.0|[5.87454175209752...|[0.97825139070968...|       0.0|
|[4.7,3.2,1.3,0.2]|  0.0|[5.85330807667506...|[0.97579776058443...|       0.0|
|[4.7,3.2,1.6,0.2]|  0.0|[5.56778406523845...|[0.96969418887344...|       0.0|
|[4.8,3.0,1.4,0.1]|  0.0|[5.4165838817044,...|[0.94934618947189...|       0.0|
|[4.8,3.0,1.4,0.3]|  0.0|[4.98634424850693...|[0.93573217827485...|       0.0|
|[4.8,3.1,1.6,0.2]|  0.0|[5.2312

### 6. Model Evaluation

In [9]:
# Compute the accuracy of the test-set predictions.
# The column names are written out; they match the evaluator's defaults.
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='accuracy',
)
accuracy = evaluator.evaluate(prediction)

In [10]:
# Print a short run summary: the regularization setting and the resulting accuracy,
# framed by banner lines and followed by one blank line.
banner = '#####################################'
summary_lines = (
    banner,
    'Regularization rate is {}'.format(reg),
    "Accuracy is {}".format(accuracy),
    banner,
    '',
)
for summary_line in summary_lines:
    print(summary_line)

#####################################
Regularization rate is 0.01
Accuracy is 0.9565217391304348
#####################################

