# Iris Data Analysis

In [None]:
import sys
sys.path.append("..")
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, IndexToString
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from helpers.path_translation import translate_to_file_string
from helpers.data_prep_and_print import print_df

## Select the Iris File 

In [None]:
# Relative location of the Iris data set, converted by the project helper
# into the path string that is later handed to the Spark CSV reader.
iris_relative_path = "../../data/iris.data"
input_file = translate_to_file_string(iris_relative_path)

## Create the Spark Session 

In [None]:
# Configure the session builder with the application name, then obtain the
# SparkSession used by every following cell.
session_builder = SparkSession.builder.appName("Iris data analysis")
spark = session_builder.getOrCreate()


## Read the Data File

In [None]:
# Load the data file.
# The file has no header row, so the DataFrame is created with an inferred
# schema and Spark's default column names (_c0.._c4) are renamed afterwards.
df = (
    spark.read.option("header", "false")
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .csv(input_file)
    .withColumnRenamed("_c0", "sepal length")
    .withColumnRenamed("_c1", "sepal width")
    .withColumnRenamed("_c2", "petal length")
    .withColumnRenamed("_c3", "petal width")
    .withColumnRenamed("_c4", "class")
)
# printSchema() writes the schema to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
df.printSchema()

## Data Preparation
### Transform labels into indices

In [None]:
# Fit an indexer on the full DataFrame that maps the string column "class"
# to the numeric column "label".
labelIndexer = StringIndexer(inputCol="class", outputCol="label").fit(df)

### Build the feature vector

In [None]:
# The four measurement columns that are combined into one feature vector.
featureCols = [
    "sepal length",
    "sepal width",
    "petal length",
    "petal width",
]
assembler = VectorAssembler(inputCols=list(featureCols), outputCol="features")

# First add the numeric "label" column, then assemble the "features" vector.
indexed_df = labelIndexer.transform(df)
labeled_point_ds = assembler.transform(indexed_df)
print_df(labeled_point_ds, 10)

In [None]:
# Split the prepared data 60/40 into training and test sets; the fixed seed
# keeps the split reproducible between runs.
splits = labeled_point_ds.randomSplit([0.6, 0.4], seed=5756)
train, test = splits

## Build the Decision Tree Model

In [None]:

# Decision tree classifier using entropy as the impurity measure, trained on
# the training split only.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    impurity="entropy",
)
dtModel = dt.fit(train)

## Build an Evaluator

In [None]:
# Evaluator that compares the "prediction" column against "label" and
# reports accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

## Do the Prediction

### Convert Indexed Labels back to Original Labels

In [None]:
# Maps numeric predictions back to the original class names, using the
# label ordering learned by the fitted indexer.
class_names = labelIndexer.labels
predConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=class_names,
)

In [None]:
# Score the held-out test split with the trained model.
predictions = dtModel.transform(test)
# Add the readable "predictedLabel" column alongside the numeric prediction.
predictionsConverted = predConverter.transform(predictions)

## Evaluate the Model

In [None]:
# Accuracy on the test predictions; the test error is its complement.
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Test Error = ", test_error)

In [None]:
# Show numeric and readable predictions next to the true label and features.
report_columns = ["prediction", "label", "predictedLabel", "class", "features"]
print_df(predictionsConverted.select(*report_columns))

In [None]:
# Stop the SparkSession now that the analysis is finished.
spark.stop()