In [23]:
import findspark
findspark.init()

import pyspark;
from pyspark.ml import Pipeline
from pyspark.ml.feature import *

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
from pyspark import SparkConf, SparkContext 
from pyspark.sql.functions import udf

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.linalg import DenseVector

In [2]:
# Build (or reuse) the Spark session for this notebook.
# All options are set on the builder *before* getOrCreate(): calling
# spark.builder.config(...) afterwards only changes a builder object and
# never reaches the session that has already been created.
# NOTE(review): with master('local') the executor settings may have little or
# no practical effect -- confirm they are needed for this playground.
spark = (
    SparkSession.builder
    .master('local')
    .appName('playground')
    .config('spark.driver.memory', '5G')
    .config('spark.executor.memory', '16G')
    .config('spark.executor.cores', '4')
    .getOrCreate()
)

<pyspark.sql.session.SparkSession.Builder at 0x107ba7eb8>

In [3]:
# Load the training CSV: the first row is read as the header and column
# types are inferred from the data.
df = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('iris_train.csv')
)

In [4]:
# Preview the first five rows of the loaded data.
df.show(n=5)

+------+-----------+----------+-----------+----------+-----------+------+----------+
|Row_id|SepalLength|SepalWidth|PetalLength|PetalWidth|      Class|Smelly|    Status|
+------+-----------+----------+-----------+----------+-----------+------+----------+
|     1|        5.1|       3.5|        1.4|       0.2|Iris-setosa|   Yes|Developing|
|     2|        4.9|       3.0|        1.4|       0.2|Iris-setosa|   Yes|Developing|
|     3|        5.0|       3.6|        1.4|       0.2|Iris-setosa|   Yes|Developing|
|     4|        4.8|       3.0|        1.4|       0.1|Iris-setosa|   Yes|Developing|
|     5|        4.3|       3.0|        1.1|       0.1|Iris-setosa|   Yes|Developing|
+------+-----------+----------+-----------+----------+-----------+------+----------+
only showing top 5 rows



### 1. String Indexing the Target Variable

In [5]:
# # Lets encode the Class column
# target_st_indexer = StringIndexer(inputCol='Class', outputCol='indexed_class_column')
# # You can choose the way you want the string categories indexed.

# target_st_indexer_model = target_st_indexer.fit(df)
# transformed_df = target_st_indexer_model.transform(df)


You can see that each category in the 'Class' column is now mapped to its own numeric index. 

In [24]:
# target_st_indexer_model.labels

### 2. String Indexing multiple predictor categorical columns

In [25]:
# Excluding the target column
# features = [item[0] for item in df.dtypes if item[1] =='string' and item[0] != 'Class']
# features

In [8]:
# Indexers for each of the categorical columns
# st_indexers = list(map(lambda name: StringIndexer(inputCol=name, outputCol="indexed_"+name)
#                                , features))

# pipeline = Pipeline(stages=st_indexers)

In [26]:
#Two individual indexers for each categorical (predictor) columns, remember Class is the target column 
# st_indexers

In [11]:
# indexer_model = pipeline.fit(df)
# transformed_df = indexer_model.transform(df)

#To see what the results might look like
#transformed_df.show(10)

#String Indexing completed 

### 3. Vector Assembler

This transformer helps in collecting all the raw features and massaged categorical (encoded/indexed) features into a single feature vector.  
Vector Assembler HAS to come AFTER an Indexer (StringIndexer etc) if your data has categorical predictor columns
  
A Vector Assembler cannot take string-typed (categorical) columns as input.
For example, it cannot assemble [0.1, 76, 98, 'Yes', 'Red', 1].  
It accepts boolean, numerical and vector types only, not StringType.

In [27]:
# transformed_df.dtypes

In [29]:
# Collecting the numerical (raw and massaged) input features 
#Row id does not provide any information 
# vecAss_features = [item[0] for item in transformed_df.dtypes if item[1] != 'string' and item[0] != 'Row_id' ]
# vecAss_features

In [30]:
# vec_assembler = VectorAssembler(inputCols=vecAss_features, outputCol="assembled_features")


# #How the assembler works by itself 
# assembled_pipeline = Pipeline(stages=[vec_assembler])
# assembled_pipeline_model = assembled_pipeline.fit(transformed_df)

# asb_transformed_df = assembled_pipeline_model.transform(transformed_df)
# asb_transformed_df.select("assembled_features").show()

You can see that all the numerical data has been assembled into vectors (one vector per record).

### 4. Vector Indexer

In [15]:
# vec_indexer = VectorIndexer(inputCol="assembled_features", outputCol="indexed_ml_features", maxCategories=2)

In [31]:
# Final Transformation pipeline (this should create the input features space that goes into your ml algo)
# i.e. your ML algo should use "indexed_ml_features" as its inputColumn

# trans_pipeline = Pipeline(stages=[vec_assembler, vec_indexer])
# trans_pipeline_model = trans_pipeline.fit(transformed_df)

# ml_input_features_df = trans_pipeline_model.transform(transformed_df)
# ml_input_features_df.select("indexed_ml_features").show()

In [110]:
# dec_t = DecisionTreeClassifier(labelCol="indexed_class_column", featuresCol="indexed_ml_features")

In [10]:
# Definitions of every pipeline stage. Nothing is fitted here; the stages are
# only configured. `df` must be the Spark DataFrame loaded from the CSV.

# 1. Encode the categorical target column ('Class') as a numeric label.
target_st_indexer = StringIndexer(inputCol='Class', outputCol='indexed_class_column')


# 2. String-index the categorical predictor columns: every string column
#    except the target gets its own StringIndexer writing to "indexed_<name>".
features = [name for name, dtype in df.dtypes if dtype == 'string' and name != 'Class']
st_indexers = [StringIndexer(inputCol=name, outputCol="indexed_" + name) for name in features]


# 3. Vector Assembler (create the feature vector).
#    Inputs are the raw non-string columns (Row_id is dropped because it
#    carries no information) followed by the indexed categorical columns.
#    The indexed names are derived from `features`, so the assembler always
#    matches the indexers built in step 2 instead of a hard-coded column list.
numeric_features = [name for name, dtype in df.dtypes if dtype != 'string' and name != 'Row_id']
vecAss_features = numeric_features + ["indexed_" + name for name in features]
vec_assembler = VectorAssembler(inputCols=vecAss_features, outputCol="assembled_features")


# 4. Vector Indexer: features with at most `maxCategories` distinct values
#    are treated as categorical, the rest as continuous.
vec_indexer = VectorIndexer(inputCol="assembled_features", outputCol="indexed_ml_features", maxCategories=2)


# 5. ML algorithm (a Decision Tree here), reading the indexed label and the
#    feature vector produced by the stages above.
dec_t = DecisionTreeClassifier(labelCol="indexed_class_column", featuresCol="indexed_ml_features")

['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'indexed_Smelly', 'indexed_Status']


In [15]:
# Transformation pipeline: target indexer first, then one indexer per
# categorical predictor, then the assembler and finally the vector indexer.
trans_pipeline = Pipeline(
    stages=[target_st_indexer, *st_indexers, vec_assembler, vec_indexer]
)
trans_pipeline.getStages()

[StringIndexer_418ea0303ccf89b2aaa7,
 StringIndexer_4139970c7e63b1eb8d0a,
 StringIndexer_4396a20ffa517aa285f2,
 VectorAssembler_48c89eed70c494913edf,
 VectorIndexer_4e559fc1cfc1745c636c]

### Some concerns I have which I need advice for 
1. StringIndexer should be followed by a OneHotEncoder or a VectorIndexer to eliminate the implied ordering of the indices. Using the string indexer's output directly as a feature does not make sense: if apple is indexed as 1 and orange as 2, a model might infer that one fruit ranks higher than the other or that orange is "greater than" apple, which is not the case. 

2. One-hot encoding (OHE) explodes every categorical column into n different features, n being the number of categories in that column. For example: 



In [34]:
# Small pandas illustration of what one-hot encoding does to a single
# categorical column. The result is deliberately NOT called `df`: that name
# holds the Spark DataFrame that the pipeline cells further down fit on, and
# rebinding it here breaks the notebook when it is run from top to bottom.
import pandas as pd

ohe_example_data = {
    'Weather': ['Hot', 'Cold', "Humid"],
    'Weather_Hot': [1, 0, 0],
    'Weather_Cold': [0, 1, 0],
    'Weather_Humid': [0, 0, 1],
}
ohe_example_df = pd.DataFrame(data=ohe_example_data)
ohe_example_df

Unnamed: 0,Weather,Weather_Hot,Weather_Cold,Weather_Humid
0,Hot,1,0,0
1,Cold,0,1,0
2,Humid,0,0,1


With many different categorical columns this will explode the number of features, a.k.a. the curse of dimensionality. 
I've tried to prevent this by using a VectorIndexer with maxCategories = 2, which handles the categorical features in the background.  
**Does VectorIndexer address the concern raised in #1?** 

3. The test/train split should be done on the transformed data.  
**Is this correct?**    
  
**Does the above 'trans_pipeline' pipeline look correct to you?**

In [16]:
# Fit every transformation stage on the full dataset and apply it to that
# same dataset, i.e. BEFORE the train/test split.
# NOTE(review): the indexers therefore see the rows that later become test
# data -- confirm this is acceptable for your use case.
transformed_df = (
    trans_pipeline
    .fit(df)
    .transform(df)
)

In [17]:
# Train/test split: 80% / 20% of the already-transformed rows, with a fixed
# seed so the split is repeatable.
trainData, testData = transformed_df.randomSplit(weights=[0.8, 0.2], seed=2018)


In [18]:
# Train the decision tree on the training portion only.
dec_t_model = dec_t.fit(dataset=trainData)

In [19]:
# Predictions: apply the trained model to the held-out test rows.
predictions = dec_t_model.transform(dataset=testData)

In [22]:

# Model evaluation: compare the 'prediction' column against the indexed class
# label on the test predictions, using the accuracy metric.
eva = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='indexed_class_column',
    predictionCol='prediction',
)

print("Accuracy of our DT model in predicting flowers is ", eva.evaluate(predictions))

Accuracy of our DT model in predicting flowers is  0.9047619047619048


In [17]:
# Shut down the Spark session created earlier in this notebook; run this last,
# since no Spark cell can execute after it.
spark.stop()