In [None]:
# -*- coding: utf-8 -*-
# Indentation: Jupyter Notebook

'''
Classification using SparkML

Loads the iris CSV into a Spark DataFrame, assembles the four measurement
columns into a single feature vector, indexes the species column as the
label, and compares Naive Bayes, multilayer perceptron and decision tree
classifiers by accuracy on a 60/40 train/test split.
'''

# NOTE(review): the version is stored as a float; a string such as "1.0" is the
# usual convention for __version__ -- confirm nothing relies on the float.
__version__ = 1.0
__author__ = "Sourav Raj"
__author_email__ = "souravraj.iitbbs@gmail.com"


In [1]:
import findspark
# Must run before any pyspark import: without it pyspark is not importable
# in this environment.
findspark.init()

from pyspark import SparkContext
from pyspark.sql.session import SparkSession

# getOrCreate() reuses an already-running context/session, so this cell can be
# re-run in the same kernel. Calling SparkContext() a second time raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
# The builder attaches to the active SparkContext created above.
spark = SparkSession.builder.getOrCreate()

In [2]:
# Import only the helper that is actually used. A wildcard import of
# pyspark.sql.functions shadows Python builtins such as sum, min, max, abs
# and round for the rest of the notebook.
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [10]:
# Read the header-less iris CSV and let Spark infer the column types;
# the columns arrive with positional names (_c0 .. _c4).
iris_df = spark.read.option('inferSchema', True).csv('../data/iris.csv.txt')

In [4]:
# Peek at the first row to check the inferred types and default column names.
iris_df.head(1)

[Row(_c0=5.1, _c1=3.5, _c2=1.4, _c3=0.2, _c4='Iris-setosa')]

In [11]:
# Give the five positional columns (_c0 .. _c4) descriptive names.
# toDF() assigns names by position, so re-running this cell on the
# already-renamed frame is harmless; selecting col('_c0') a second time would
# fail because that column no longer exists after the first run.
# Requires the frame to have exactly these five columns.
iris_df = iris_df.toDF('sepal_len', 'sepal_width', 'petal_len', 'petal_width', 'species')

In [12]:
# Confirm the columns now carry their descriptive names.
iris_df.head(1)

[Row(sepal_len=5.1, sepal_width=3.5, petal_len=1.4, petal_width=0.2, species='Iris-setosa')]

In [13]:
# The four measurement columns are packed into one vector column, `features`.
feature_cols = ['sepal_len', 'sepal_width', 'petal_len', 'petal_width']
vectorAssembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [14]:
# Append the assembled `features` vector column, then inspect the first row.
viris_df = vectorAssembler.transform(iris_df)
viris_df.head(1)

[Row(sepal_len=5.1, sepal_width=3.5, petal_len=1.4, petal_width=0.2, species='Iris-setosa', features=DenseVector([5.1, 3.5, 1.4, 0.2]))]

In [16]:
# Encode the string `species` column as a numeric `label` column.
indexer = StringIndexer(
    outputCol='label',
    inputCol='species',
)

In [17]:
# Learn the species -> index mapping, then apply it to add the `label` column.
indexer_model = indexer.fit(viris_df)
iviris_df = indexer_model.transform(viris_df)
iviris_df.head(1)

[Row(sepal_len=5.1, sepal_width=3.5, petal_len=1.4, petal_width=0.2, species='Iris-setosa', features=DenseVector([5.1, 3.5, 1.4, 0.2]), label=0.0)]

# Naive Bayes Classification

In [1]:
# Requires the setup cell (findspark.init()) to have run in this kernel first;
# otherwise `pyspark` is not importable and this cell raises ModuleNotFoundError.
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

ModuleNotFoundError: No module named 'pyspark'

In [22]:
# 60/40 train/test split; the fixed seed keeps the split repeatable.
splits = iviris_df.randomSplit([0.6, 0.4], seed=1)
train_df, test_df = splits

In [23]:
# Number of rows that landed in the training split.
train_df.count()

92

In [24]:
# Number of rows that landed in the test split.
test_df.count()

58

In [25]:
# Total row count; the train and test counts should add up to this.
iviris_df.count()

150

In [26]:
# 'multinomial' selects the feature likelihood model; it is not related to the
# number of classes.
# NOTE(review): the iris features are continuous measurements, so a Gaussian
# model (offered by newer Spark releases) may fit better -- confirm.
nb = NaiveBayes(featuresCol='features', labelCol='label', modelType='multinomial')

In [28]:
# Fit the Naive Bayes estimator on the training split.
nb_model=nb.fit(train_df)

In [29]:
# Score the held-out rows; this adds the rawPrediction, probability and
# prediction columns to the test frame.
pred_df=nb_model.transform(test_df)

In [30]:
# Inspect one scored row to see the columns added by the model.
pred_df.head(1)

[Row(sepal_len=4.5, sepal_width=2.3, petal_len=1.3, petal_width=0.3, species='Iris-setosa', features=DenseVector([4.5, 2.3, 1.3, 0.3]), label=0.0, rawPrediction=DenseVector([-10.3605, -11.0141, -11.7112]), probability=DenseVector([0.562, 0.2924, 0.1456]), prediction=0.0)]

In [31]:
# Evaluator that compares the `prediction` column against `label` and reports accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    predictionCol='prediction',
    labelCol='label',
)

In [32]:
# Accuracy of the Naive Bayes predictions on the test split.
nbaccuracy=evaluator.evaluate(pred_df)

In [33]:
# Display the Naive Bayes test accuracy.
nbaccuracy

0.5862068965517241

# Multilayer Perceptron Classification

In [34]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [36]:
# Four-layer network: input, two hidden layers, output.
layers = [
    4,  # input layer: one node per feature column
    5,  # first hidden layer
    5,  # second hidden layer
    3,  # output layer: one node per species class
]

In [37]:
# Perceptron classifier using the layer sizes defined above; fixed seed for repeatable runs.
mlp = MultilayerPerceptronClassifier(
    seed=1,
    layers=layers,
)

In [38]:
# Train the perceptron on the training split.
mlp_model= mlp.fit(train_df)

In [39]:
# Score the held-out test rows with the trained perceptron.
mlp_pred= mlp_model.transform(test_df)

In [40]:
# Accuracy evaluator for the perceptron; the columns are spelled out to match
# the Naive Bayes evaluator (they are the defaults).
mlp_evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

In [41]:
# Accuracy of the perceptron on the test split; the bare name displays the value.
mlp_accuracy=mlp_evaluator.evaluate(mlp_pred)
mlp_accuracy

0.9482758620689655

# Decision Tree Classification

In [42]:
from pyspark.ml.classification import DecisionTreeClassifier

In [43]:
# Decision tree reading the assembled `features` vector and the indexed `label`.
dt = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='label',
)

In [44]:
# Train the tree on the training split, then score the held-out test rows.
dt_model =dt.fit(train_df)
dt_pred=dt_model.transform(test_df)

In [45]:
# Accuracy evaluator for the decision tree; the columns are spelled out to
# match the Naive Bayes evaluator (they are the defaults).
dt_evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

In [46]:
# Accuracy of the decision tree on the test split; the bare name displays the value.
dt_accuracy=dt_evaluator.evaluate(dt_pred)
dt_accuracy

0.9310344827586207