In [19]:
import findspark
findspark.init()

import pyspark;
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier, DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler, IndexToString
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
from pyspark import SparkConf, SparkContext 

In [2]:
spark = SparkSession.builder.master('local').appName('playground').config('spark.driver.memory', '5G').getOrCreate()
spark.builder.config('spark.executor.memory', '16G')
spark.builder.config("spark.executor.cores", "4")

<pyspark.sql.session.SparkSession.Builder at 0x107aa6d30>

In [3]:
#Importing Data 
df = spark.read.csv('/Users/spurushe/Downloads/iris.csv', header='true', inferSchema='true')
df.head(5)

[Row(Id=1, SepalLength=5.1, SepalWidth=3.5, PetalLength=1.4, PetalWidth=0.2, Species='setosa'),
 Row(Id=2, SepalLength=4.9, SepalWidth=3.0, PetalLength=1.4, PetalWidth=0.2, Species='setosa'),
 Row(Id=3, SepalLength=4.7, SepalWidth=3.2, PetalLength=1.3, PetalWidth=0.2, Species='setosa'),
 Row(Id=4, SepalLength=4.6, SepalWidth=3.1, PetalLength=1.5, PetalWidth=0.2, Species='setosa'),
 Row(Id=5, SepalLength=5.0, SepalWidth=3.6, PetalLength=1.4, PetalWidth=0.2, Species='setosa')]

In [4]:
type(df)

pyspark.sql.dataframe.DataFrame

In [5]:
#StringIndexer () is an Estimator which returns a Transformer (labelIndexer)
# Converts label String classes to indices --- for e.g. 'good', 'bad', 'ugly' to 0,1,2
labelIndexer = StringIndexer(inputCol="Species", outputCol="indexedLabel").fit(df)

In [6]:
featureAssembler = VectorAssembler(inputCols= [x for x in df.columns if x != 'Species'], outputCol="features")

featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)

In [7]:
#*******************************************
# SINGLE TRAIN TEST SPLIT
#*******************************************.
(trainingData, testData) = df.randomSplit([0.7, 0.3])

In [8]:
#*******************************************
# TRAINING THE MODEL
#*******************************************.
dec_t = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

In [9]:
# Chain indexers and Decision tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureAssembler, featureIndexer, dec_t])

In [10]:
pipeline.getStages()

[StringIndexer_40519d887ce0b9f43ba3,
 VectorAssembler_45298c653e44d17f956f,
 VectorIndexer_43c6ad263a0207b7b6cb,
 DecisionTreeClassifier_4fb2a7260266e65ef7cf]

In [11]:
# Train model.  This also runs the indexers.
# Estimators' .fit() returns a Transformer (model)
model = pipeline.fit(trainingData)


In [14]:
#*******************************************
# PREDICTION
#*******************************************
predictions = model.transform(testData)

In [18]:
# Quick view at columns of interest
predictions.select('indexedLabel', 'prediction').head(5)

[Row(indexedLabel=2.0, prediction=2.0),
 Row(indexedLabel=2.0, prediction=2.0),
 Row(indexedLabel=2.0, prediction=2.0),
 Row(indexedLabel=2.0, prediction=2.0),
 Row(indexedLabel=2.0, prediction=2.0)]

In [20]:
predictions.columns

['Id',
 'SepalLength',
 'SepalWidth',
 'PetalLength',
 'PetalWidth',
 'Species',
 'indexedLabel',
 'features',
 'indexedFeatures',
 'rawPrediction',
 'probability',
 'prediction']

In [27]:
# Convert indices back to labels
in_to_label = IndexToString(inputCol='indexedLabel', outputCol='Predicted_label').transform(predictions)
in_to_label.select('Species','indexedLabel', 'Predicted_label').head(30)

[Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='versicolor', indexedLabel=0.0, Pre

In [29]:
#*******************************************
# EVALUATION
#
# evaluating the performance of our ML model
#*******************************************

eva = MulticlassClassificationEvaluator(labelCol='indexedLabel', predictionCol='prediction', metricName='accuracy')

accuracy = eva.evaluate(predictions)
print("Accuracy of our DT model in predicting flowers is ", accuracy)

Accuracy of our DT model in predicting flowers is  1.0


In [33]:
## Getting the entire Decision Tree rules.

print(model.stages[3].toDebugString)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4fb2a7260266e65ef7cf) of depth 3 with 7 nodes
  If (feature 3 <= 2.45)
   Predict: 2.0
  Else (feature 3 > 2.45)
   If (feature 0 <= 99.5)
    Predict: 0.0
   Else (feature 0 > 99.5)
    If (feature 3 <= 4.25)
     Predict: 0.0
    Else (feature 3 > 4.25)
     Predict: 1.0

