In [39]:
import findspark
findspark.init()


In [40]:
import pyspark 
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("StudyApp").getOrCreate()


In [41]:
df=spark.read.csv("D:\iris.csv",header=True,inferSchema=True)
df.show(5)
df.count

+-----------+----------+-----------+----------+-------+
|sepallength|sepalwidth|petallength|petalwidth|variety|
+-----------+----------+-----------+----------+-------+
|        5.1|       3.5|        1.4|       0.2| Setosa|
|        4.9|       3.0|        1.4|       0.2| Setosa|
|        4.7|       3.2|        1.3|       0.2| Setosa|
|        4.6|       3.1|        1.5|       0.2| Setosa|
|        5.0|       3.6|        1.4|       0.2| Setosa|
+-----------+----------+-----------+----------+-------+
only showing top 5 rows



<bound method DataFrame.count of DataFrame[sepallength: double, sepalwidth: double, petallength: double, petalwidth: double, variety: string]>

In [42]:
df.printSchema()
df.select('variety').distinct().show()
df.columns[:4]

root
 |-- sepallength: double (nullable = true)
 |-- sepalwidth: double (nullable = true)
 |-- petallength: double (nullable = true)
 |-- petalwidth: double (nullable = true)
 |-- variety: string (nullable = true)

+----------+
|   variety|
+----------+
| Virginica|
|    Setosa|
|Versicolor|
+----------+



['sepallength', 'sepalwidth', 'petallength', 'petalwidth']

In [43]:
#formating
from pyspark.ml.feature import VectorAssembler 
from pyspark.ml.feature import StringIndexer


In [44]:
va=VectorAssembler(inputCols=df.columns[:4],outputCol='Input Feature')
indexer=StringIndexer(inputCol='variety',outputCol='variety_num')#conver into integer
df1=indexer.fit(df).transform(df)
df2=va.transform(df1)
df2.show()

+-----------+----------+-----------+----------+-------+-----------+-----------------+
|sepallength|sepalwidth|petallength|petalwidth|variety|variety_num|    Input Feature|
+-----------+----------+-----------+----------+-------+-----------+-----------------+
|        5.1|       3.5|        1.4|       0.2| Setosa|        0.0|[5.1,3.5,1.4,0.2]|
|        4.9|       3.0|        1.4|       0.2| Setosa|        0.0|[4.9,3.0,1.4,0.2]|
|        4.7|       3.2|        1.3|       0.2| Setosa|        0.0|[4.7,3.2,1.3,0.2]|
|        4.6|       3.1|        1.5|       0.2| Setosa|        0.0|[4.6,3.1,1.5,0.2]|
|        5.0|       3.6|        1.4|       0.2| Setosa|        0.0|[5.0,3.6,1.4,0.2]|
|        5.4|       3.9|        1.7|       0.4| Setosa|        0.0|[5.4,3.9,1.7,0.4]|
|        4.6|       3.4|        1.4|       0.3| Setosa|        0.0|[4.6,3.4,1.4,0.3]|
|        5.0|       3.4|        1.5|       0.2| Setosa|        0.0|[5.0,3.4,1.5,0.2]|
|        4.4|       2.9|        1.4|       0.2| Setosa

In [45]:
finaldata=df2.select('Input Feature',"variety_num")
finaldata.show(5)

+-----------------+-----------+
|    Input Feature|variety_num|
+-----------------+-----------+
|[5.1,3.5,1.4,0.2]|        0.0|
|[4.9,3.0,1.4,0.2]|        0.0|
|[4.7,3.2,1.3,0.2]|        0.0|
|[4.6,3.1,1.5,0.2]|        0.0|
|[5.0,3.6,1.4,0.2]|        0.0|
+-----------------+-----------+
only showing top 5 rows



In [46]:
train,test=finaldata.randomSplit([0.70,0.30])

In [47]:
from pyspark.ml.classification import DecisionTreeClassifier
dtcmodel=DecisionTreeClassifier(labelCol="variety_num",featuresCol="Input Feature")
model=dtcmodel.fit(train)

In [48]:
model

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ccef6404e29b, depth=5, numNodes=15, numClasses=3, numFeatures=4

In [49]:
prediction_result=model.transform(test)

In [50]:
prediction_result.show()

+-----------------+-----------+--------------+-------------+----------+
|    Input Feature|variety_num| rawPrediction|  probability|prediction|
+-----------------+-----------+--------------+-------------+----------+
|[4.4,3.2,1.3,0.2]|        0.0|[36.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.6,3.1,1.5,0.2]|        0.0|[36.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.7,3.2,1.3,0.2]|        0.0|[36.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.8,3.4,1.6,0.2]|        0.0|[36.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.9,2.5,4.5,1.7]|        2.0| [0.0,1.0,0.0]|[0.0,1.0,0.0]|       1.0|
|[4.9,3.1,1.5,0.1]|        0.0|[36.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.9,3.6,1.4,0.1]|        0.0|[36.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[5.0,2.3,3.3,1.0]|        1.0|[0.0,30.0,0.0]|[0.0,1.0,0.0]|       1.0|
|[5.0,3.5,1.3,0.3]|        0.0|[36.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[5.1,3.3,1.7,0.5]|        0.0|[36.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[5.1,3.4,1.5,0.2]|        0.0|[36.0,0.0,0.0]|[1.0,0.0,0.0]|    

In [51]:
prediction_result.select('Input Feature','variety_num','Prediction').show()

+-----------------+-----------+----------+
|    Input Feature|variety_num|Prediction|
+-----------------+-----------+----------+
|[4.4,3.2,1.3,0.2]|        0.0|       0.0|
|[4.6,3.1,1.5,0.2]|        0.0|       0.0|
|[4.7,3.2,1.3,0.2]|        0.0|       0.0|
|[4.8,3.4,1.6,0.2]|        0.0|       0.0|
|[4.9,2.5,4.5,1.7]|        2.0|       1.0|
|[4.9,3.1,1.5,0.1]|        0.0|       0.0|
|[4.9,3.6,1.4,0.1]|        0.0|       0.0|
|[5.0,2.3,3.3,1.0]|        1.0|       1.0|
|[5.0,3.5,1.3,0.3]|        0.0|       0.0|
|[5.1,3.3,1.7,0.5]|        0.0|       0.0|
|[5.1,3.4,1.5,0.2]|        0.0|       0.0|
|[5.1,3.5,1.4,0.2]|        0.0|       0.0|
|[5.2,3.5,1.5,0.2]|        0.0|       0.0|
|[5.4,3.4,1.7,0.2]|        0.0|       0.0|
|[5.5,2.4,3.8,1.1]|        1.0|       1.0|
|[5.5,2.5,4.0,1.3]|        1.0|       1.0|
|[5.5,3.5,1.3,0.2]|        0.0|       0.0|
|[5.6,2.5,3.9,1.1]|        1.0|       1.0|
|[5.7,2.8,4.1,1.3]|        1.0|       1.0|
|[5.7,2.8,4.5,1.3]|        1.0|       1.0|
+----------

In [52]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator=MulticlassClassificationEvaluator(labelCol="variety_num",predictionCol='prediction')
accuracy=evaluator.evaluate(prediction_result)
print("Accurancy",accuracy)
print("Test error model",(1-accuracy))

Accurancy 0.924173865350336
Test error model 0.07582613464966403


In [53]:
#from pyspark.ml.feature import IndexString
#itos=IndexToString(inputCol='prediction',outputCol='Variety Category')
#c=itos.transform(df2)
#print(c.getInputCol()," - ",c.getOutputCol())

In [59]:
from pyspark.ml.classification import RandomForestClassifier
dtcmodel=RandomForestClassifier(labelCol="variety_num",featuresCol="Input Feature")
model=dtcmodel.fit(train)


In [58]:
prediction_result=model.transform(test)
prediction_result.show()

+-----------------+-----------+--------------------+--------------------+----------+
|    Input Feature|variety_num|       rawPrediction|         probability|prediction|
+-----------------+-----------+--------------------+--------------------+----------+
|[4.4,3.2,1.3,0.2]|        0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|[4.6,3.1,1.5,0.2]|        0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|[4.7,3.2,1.3,0.2]|        0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|[4.8,3.4,1.6,0.2]|        0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|[4.9,2.5,4.5,1.7]|        2.0|[0.0,16.947368421...|[0.0,0.8473684210...|       1.0|
|[4.9,3.1,1.5,0.1]|        0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|[4.9,3.6,1.4,0.1]|        0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|[5.0,2.3,3.3,1.0]|        1.0|[0.0,19.947368421...|[0.0,0.9973684210...|       1.0|
|[5.0,3.5,1.3,0.3]|        0.0|      [20.0,0.0,0.0]|       [1.0,0

In [60]:
evaluator=MulticlassClassificationEvaluator(labelCol="variety_num",predictionCol='prediction')
accuracy=evaluator.evaluate(prediction_result)
print("Accurancy",accuracy)
print("Test error model",(1-accuracy))

Accurancy 0.924173865350336
Test error model 0.07582613464966403
