In [46]:
import findspark
findspark.init()

import pyspark;
from pyspark.ml import Pipeline
from pyspark.ml.feature import *

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
from pyspark import SparkConf, SparkContext 
from pyspark.sql.functions import udf

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.linalg import DenseVector

In [2]:
spark = SparkSession.builder.master('local').appName('playground').config('spark.driver.memory', '5G').getOrCreate()
spark.builder.config('spark.executor.memory', '16G')
spark.builder.config("spark.executor.cores", "4")

<pyspark.sql.session.SparkSession.Builder at 0x1079a7c88>

In [3]:
#Importing Data 
df = spark.read.csv('/Users/spurushe/Downloads/iris_train.csv', header='true', inferSchema='true')

In [4]:
df.show(10)

+------+-----------+----------+-----------+----------+---------------+------+----------+
|Row_id|SepalLength|SepalWidth|PetalLength|PetalWidth|          Class|Smelly|    Status|
+------+-----------+----------+-----------+----------+---------------+------+----------+
|     1|        5.1|       3.5|        1.4|       0.2|    Iris-setosa|   Yes|Developing|
|     2|        4.9|       3.0|        1.4|       0.2|    Iris-setosa|   Yes|Developing|
|     3|        5.0|       3.6|        1.4|       0.2|    Iris-setosa|   Yes|Developing|
|     4|        4.8|       3.0|        1.4|       0.1|    Iris-setosa|   Yes|Developing|
|     5|        4.3|       3.0|        1.1|       0.1|    Iris-setosa|   Yes|Developing|
|     6|        5.8|       4.0|        1.2|       0.2|    Iris-setosa|   Yes|Developing|
|     7|        5.7|       4.4|        1.5|       0.4|    Iris-setosa|   Yes|Developing|
|     8|        5.4|       3.9|        1.3|       0.4|    Iris-setosa|   Yes|Developing|
|     9|        6.4| 

###### The feature transformers are discussed in the order they're usually used in my ML pipelines. (Easier to demonstrate). Urs may differ. 

##### Categorical columns have to be massaged before they become part of the input feature space on which the machine learning model is trained. 

Different algorithms differ in the way they handle categorical columns.  
Some algorithms like Decision Trees and Random forests are naturally better suited to handle categorical data.  
How data is handled (indexed and/ or assembled ) before the training phase depends on the algorithm being used.

### 1. String Indexer (using a single categorical column)

In [85]:
# Lets encode the Class column
target_st_indexer = StringIndexer(inputCol='Class', outputCol='indexed_class_column')
#You can chooose the way you want the String categories indexed.

target_st_indexer_model = target_st_indexer.fit(df)
transformed_df = target_st_indexer_model.transform(df)

# lets see what the encoding was like
transformed_df.filter(transformed_df.Class == 'Iris-versicolor').show(5)
transformed_df.filter(transformed_df.Class == 'Iris-virginica').show(5)
transformed_df.filter(transformed_df.Class == 'Iris-setosa').show(5)

+------+-----------+----------+-----------+----------+---------------+------+----------+--------------------+
|Row_id|SepalLength|SepalWidth|PetalLength|PetalWidth|          Class|Smelly|    Status|indexed_class_column|
+------+-----------+----------+-----------+----------+---------------+------+----------+--------------------+
|     9|        6.4|       3.2|        4.5|       1.5|Iris-versicolor|   Yes|Developing|                 0.0|
|    10|        6.9|       3.1|        4.9|       1.5|Iris-versicolor|   Yes|Developing|                 0.0|
|    11|        5.5|       2.3|        4.0|       1.3|Iris-versicolor|   Yes|Developing|                 0.0|
|    12|        6.5|       2.8|        4.6|       1.5|Iris-versicolor|   Yes|Developing|                 0.0|
|    13|        5.7|       2.8|        4.5|       1.3|Iris-versicolor|   Yes|Developing|                 0.0|
+------+-----------+----------+-----------+----------+---------------+------+----------+--------------------+
only showi

You can see that each category in the 'Class' column now is indexed with a separate number index. 

In [11]:
target_st_indexer_model.labels

['Iris-versicolor', 'Iris-setosa', 'Iris-virginica']

### 1.1. String Indexer (using multiple categorical columns)

But what do you do when you have many categorical columns

In [101]:
features = [item[0] for item in df.dtypes if item[1] =='string' and item[0] != 'Class']
features

['Smelly', 'Status']

In [102]:
# Indexers for each of the categorical columns
st_indexers = list(map(lambda name: StringIndexer(inputCol=name, outputCol="indexed_"+name)
                               , features))

pipeline = Pipeline(stages=st_indexers)

In [103]:
#Two individual indexers for each categorical (predictor) columns, remember Class is the target column 
st_indexers

[StringIndexer_49c79f04a86218c84870, StringIndexer_47a7a84f58c965adf397]

In [104]:
indexer_model = pipeline.fit(df)
transformed_df = indexer_model.transform(df)
transformed_df.show(10)

#String Indexing completed 

+------+-----------+----------+-----------+----------+---------------+------+----------+--------------+--------------+
|Row_id|SepalLength|SepalWidth|PetalLength|PetalWidth|          Class|Smelly|    Status|indexed_Smelly|indexed_Status|
+------+-----------+----------+-----------+----------+---------------+------+----------+--------------+--------------+
|     1|        5.1|       3.5|        1.4|       0.2|    Iris-setosa|   Yes|Developing|           0.0|           0.0|
|     2|        4.9|       3.0|        1.4|       0.2|    Iris-setosa|   Yes|Developing|           0.0|           0.0|
|     3|        5.0|       3.6|        1.4|       0.2|    Iris-setosa|   Yes|Developing|           0.0|           0.0|
|     4|        4.8|       3.0|        1.4|       0.1|    Iris-setosa|   Yes|Developing|           0.0|           0.0|
|     5|        4.3|       3.0|        1.1|       0.1|    Iris-setosa|   Yes|Developing|           0.0|           0.0|
|     6|        5.8|       4.0|        1.2|     

### 2. Vector Assembler

This transformer helps in collecting all the raw features and massaged categorical (encoded/indexed) features into a single feature vector.  
**Vector Assembler HAS to come AFTER an Indexer (StringIndexer etc) if your data has categorical predictor columns**
  
A Vector Assembler cannot take categorical features for assembling.
For eg. it cannot do [0.1, 76, 98, 'Yes', 'Red', 1]  
It accepts boolean, numerical and vector types only, not StringType.

In [105]:
transformed_df.dtypes

[('Row_id', 'int'),
 ('SepalLength', 'double'),
 ('SepalWidth', 'double'),
 ('PetalLength', 'double'),
 ('PetalWidth', 'double'),
 ('Class', 'string'),
 ('Smelly', 'string'),
 ('Status', 'string'),
 ('indexed_Smelly', 'double'),
 ('indexed_Status', 'double')]

In [106]:
# Collecting the numerical (raw and massaged) input features 
#Row id does not provide any information 
vecAss_features = [item[0] for item in transformed_df.dtypes if item[1] != 'string' and item[0] != 'Row_id' ]
vecAss_features

['SepalLength',
 'SepalWidth',
 'PetalLength',
 'PetalWidth',
 'indexed_Smelly',
 'indexed_Status']

In [107]:
vec_assembler = VectorAssembler(inputCols=vecAss_features, outputCol="assembled_features")


#How the assembler works by itself 
assembled_pipeline = Pipeline(stages=[vec_assembler])
assembled_pipeline_model = assembled_pipeline.fit(transformed_df)

asb_transformed_df = assembled_pipeline_model.transform(transformed_df)
asb_transformed_df.select("assembled_features").show()

+--------------------+
|  assembled_features|
+--------------------+
|[5.1,3.5,1.4,0.2,...|
|[4.9,3.0,1.4,0.2,...|
|[5.0,3.6,1.4,0.2,...|
|[4.8,3.0,1.4,0.1,...|
|[4.3,3.0,1.1,0.1,...|
|[5.8,4.0,1.2,0.2,...|
|[5.7,4.4,1.5,0.4,...|
|[5.4,3.9,1.3,0.4,...|
|[6.4,3.2,4.5,1.5,...|
|[6.9,3.1,4.9,1.5,...|
|[5.5,2.3,4.0,1.3,...|
|[6.5,2.8,4.6,1.5,...|
|[5.7,2.8,4.5,1.3,...|
|[6.3,3.3,4.7,1.6,...|
|[5.1,3.5,1.4,0.3,...|
|[5.7,3.8,1.7,0.3,...|
|[5.1,3.8,1.5,0.3,...|
|[6.9,3.1,5.4,2.1,...|
|[6.7,3.1,5.6,2.4,...|
|[5.4,3.4,1.7,0.2,...|
+--------------------+
only showing top 20 rows



You can see that all the numerical data has been assembled into vectors (one vector per record).

### 2. Vector Indexer

In [108]:
vec_indexer = VectorIndexer(inputCol="assembled_features", outputCol="indexed_ml_features", maxCategories=2)

In [109]:
# Final Transformation pipeline (this should create the input features space that goes into your ml algo)
# i.e. your ML algo should use "indexed_ml_features" as its inputColumn

trans_pipeline = Pipeline(stages=[vec_assembler, vec_indexer])
trans_pipeline_model = trans_pipeline.fit(transformed_df)

ml_input_features_df = trans_pipeline_model.transform(transformed_df)
ml_input_features_df.select("indexed_ml_features").show()

+--------------------+
| indexed_ml_features|
+--------------------+
|[5.1,3.5,1.4,0.2,...|
|[4.9,3.0,1.4,0.2,...|
|[5.0,3.6,1.4,0.2,...|
|[4.8,3.0,1.4,0.1,...|
|[4.3,3.0,1.1,0.1,...|
|[5.8,4.0,1.2,0.2,...|
|[5.7,4.4,1.5,0.4,...|
|[5.4,3.9,1.3,0.4,...|
|[6.4,3.2,4.5,1.5,...|
|[6.9,3.1,4.9,1.5,...|
|[5.5,2.3,4.0,1.3,...|
|[6.5,2.8,4.6,1.5,...|
|[5.7,2.8,4.5,1.3,...|
|[6.3,3.3,4.7,1.6,...|
|[5.1,3.5,1.4,0.3,...|
|[5.7,3.8,1.7,0.3,...|
|[5.1,3.8,1.5,0.3,...|
|[6.9,3.1,5.4,2.1,...|
|[6.7,3.1,5.6,2.4,...|
|[5.4,3.4,1.7,0.2,...|
+--------------------+
only showing top 20 rows



In [71]:
#Trying to see the structure of the Vector Indexed columns

#my_Dense = udf(lambda vec: vec.toDense)
#x = ml_input_features_df.withColumn("dense", my_Dense("indexed_ml_features"))

#print(x.select("dense"))

DataFrame[dense: string]


In [110]:
dec_t = DecisionTreeClassifier(labelCol="indexed_class_column", featuresCol="indexed_ml_features")

In [111]:
(trainingData, testData) = ml_input_features_df.randomSplit([0.7, 0.3], seed=2018)

In [120]:
ml_input_features_df.columns

['Row_id',
 'SepalLength',
 'SepalWidth',
 'PetalLength',
 'PetalWidth',
 'Class',
 'Smelly',
 'Status',
 'indexed_Smelly',
 'indexed_Status',
 'assembled_features',
 'indexed_ml_features']

In [88]:
vec_assembler.getInputCols()

['SepalLength',
 'SepalWidth',
 'PetalLength',
 'PetalWidth',
 'indexed_Smelly',
 'indexed_Status']

In [112]:
final_ml_pipiline = Pipeline(stages=[target_st_indexer, dec_t])

In [113]:
#Train final DT
dt_model = final_ml_pipiline.fit(trainingData)

In [114]:
#predictions
predictions = dt_model.transform(testData)

In [115]:
predictions.columns

['Row_id',
 'SepalLength',
 'SepalWidth',
 'PetalLength',
 'PetalWidth',
 'Class',
 'Smelly',
 'Status',
 'indexed_Smelly',
 'indexed_Status',
 'assembled_features',
 'indexed_ml_features',
 'indexed_class_column',
 'rawPrediction',
 'probability',
 'prediction']

In [116]:
# Convert indices back to labels
in_to_label = IndexToString(inputCol='indexed_class_column', outputCol='Predicted_label').transform(predictions)
in_to_label.select('Class','indexed_class_column', 'Predicted_label').head(30)

[Row(Class='Iris-setosa', indexed_class_column=2.0, Predicted_label='Iris-setosa'),
 Row(Class='Iris-setosa', indexed_class_column=2.0, Predicted_label='Iris-setosa'),
 Row(Class='Iris-versicolor', indexed_class_column=0.0, Predicted_label='Iris-versicolor'),
 Row(Class='Iris-setosa', indexed_class_column=2.0, Predicted_label='Iris-setosa'),
 Row(Class='Iris-virginica', indexed_class_column=1.0, Predicted_label='Iris-virginica'),
 Row(Class='Iris-setosa', indexed_class_column=2.0, Predicted_label='Iris-setosa'),
 Row(Class='Iris-versicolor', indexed_class_column=0.0, Predicted_label='Iris-versicolor'),
 Row(Class='Iris-setosa', indexed_class_column=2.0, Predicted_label='Iris-setosa'),
 Row(Class='Iris-versicolor', indexed_class_column=0.0, Predicted_label='Iris-versicolor'),
 Row(Class='Iris-setosa', indexed_class_column=2.0, Predicted_label='Iris-setosa'),
 Row(Class='Iris-setosa', indexed_class_column=2.0, Predicted_label='Iris-setosa'),
 Row(Class='Iris-versicolor', indexed_class_co

In [117]:
#*******************************************
# EVALUATION
#
# evaluating the performance of our ML model
#*******************************************

eva = MulticlassClassificationEvaluator(labelCol='indexed_class_column', predictionCol='prediction', metricName='accuracy')

accuracy = eva.evaluate(predictions)
print("Accuracy of our DT model in predicting flowers is ", accuracy)

Accuracy of our DT model in predicting flowers is  0.9393939393939394


In [None]:
spark.stop()