In [1]:
import findspark
findspark.init()

import pyspark;

In [2]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages net.snowflake:snowflake-jdbc:3.6.24,net.snowflake:spark-snowflake_2.11:2.4.12-spark_2.3 pyspark-shell'

In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
from pyspark import SparkConf, SparkContext 

#ML
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.classification import GBTClassifier, DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler, IndexToString
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
spark = SparkSession.builder.master('local').appName('playground').config('spark.driver.memory', '5G').getOrCreate()
spark.builder.config('spark.executor.memory', '16G')
spark.builder.config("spark.executor.cores", "4")

<pyspark.sql.session.SparkSession.Builder at 0x11164f9b0>

In [5]:
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"
sfOptions = {"sfURL":"*", "sfAccount":"*", "sfUser":"*", "sfPassword":"*", "sfDatabase":"*", "sfSchema":"*", "sfWarehouse":"*"}

In [6]:
#Importing Data 
df = spark.read.csv('/Users/spurushe/Downloads/iris.csv', header='true', inferSchema='true')
df.show(5)

+---+-----------+----------+-----------+----------+-------+
| Id|SepalLength|SepalWidth|PetalLength|PetalWidth|Species|
+---+-----------+----------+-----------+----------+-------+
|  1|        5.1|       3.5|        1.4|       0.2| setosa|
|  2|        4.9|       3.0|        1.4|       0.2| setosa|
|  3|        4.7|       3.2|        1.3|       0.2| setosa|
|  4|        4.6|       3.1|        1.5|       0.2| setosa|
|  5|        5.0|       3.6|        1.4|       0.2| setosa|
+---+-----------+----------+-----------+----------+-------+
only showing top 5 rows



In [7]:
type(df)

pyspark.sql.dataframe.DataFrame

In [7]:
#StringIndexer () is an Estimator which returns a Transformer (labelIndexer)
# Converts label String classes to indices --- for e.g. 'good', 'bad', 'ugly' to 0,1,2
labelIndexer = StringIndexer(inputCol="Species", outputCol="indexedLabel").fit(df)

In [8]:
featureAssembler = VectorAssembler(inputCols= [x for x in df.columns if x != 'Species' and x != 'Id'], outputCol="features")

featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)

In [11]:
fa = featureAssembler.transform(df)
featureIndexer.fit(fa).transform(fa).show()

+---+-----------+----------+-----------+----------+-------+-----------------+-----------------+
| Id|SepalLength|SepalWidth|PetalLength|PetalWidth|Species|         features|  indexedFeatures|
+---+-----------+----------+-----------+----------+-------+-----------------+-----------------+
|  1|        5.1|       3.5|        1.4|       0.2| setosa|[5.1,3.5,1.4,0.2]|[5.1,3.5,1.4,0.2]|
|  2|        4.9|       3.0|        1.4|       0.2| setosa|[4.9,3.0,1.4,0.2]|[4.9,3.0,1.4,0.2]|
|  3|        4.7|       3.2|        1.3|       0.2| setosa|[4.7,3.2,1.3,0.2]|[4.7,3.2,1.3,0.2]|
|  4|        4.6|       3.1|        1.5|       0.2| setosa|[4.6,3.1,1.5,0.2]|[4.6,3.1,1.5,0.2]|
|  5|        5.0|       3.6|        1.4|       0.2| setosa|[5.0,3.6,1.4,0.2]|[5.0,3.6,1.4,0.2]|
|  6|        5.4|       3.9|        1.7|       0.4| setosa|[5.4,3.9,1.7,0.4]|[5.4,3.9,1.7,0.4]|
|  7|        4.6|       3.4|        1.4|       0.3| setosa|[4.6,3.4,1.4,0.3]|[4.6,3.4,1.4,0.3]|
|  8|        5.0|       3.4|        1.5|

In [9]:
# Chain indexers and Decision tree in a Pipeline
trans_pipeline = Pipeline(stages=[labelIndexer, featureAssembler, featureIndexer])
transformed_df = trans_pipeline.fit(df).transform(df)

In [10]:
#*******************************************
# SINGLE TRAIN TEST SPLIT
#*******************************************.
(trainingData, testData) = transformed_df.randomSplit([0.7, 0.3])

In [11]:
#*******************************************
# TRAINING THE MODEL
#*******************************************.
dec_t = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

In [12]:
# Train model.  This also runs the indexers.
# Estimators' .fit() returns a Transformer (model)
model = dec_t.fit(trainingData)


In [13]:
#*******************************************
# PREDICTION
#*******************************************
predictions = model.transform(testData)

In [14]:
# Quick view at columns of interest
predictions.select('Id', 'indexedLabel', 'prediction', 'probability').show(5)

+---+------------+----------+-------------+
| Id|indexedLabel|prediction|  probability|
+---+------------+----------+-------------+
|  1|         2.0|       2.0|[0.0,0.0,1.0]|
|  6|         2.0|       2.0|[0.0,0.0,1.0]|
|  9|         2.0|       2.0|[0.0,0.0,1.0]|
| 10|         2.0|       2.0|[0.0,0.0,1.0]|
| 11|         2.0|       2.0|[0.0,0.0,1.0]|
+---+------------+----------+-------------+
only showing top 5 rows



In [15]:
predictions.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- SepalLength: double (nullable = true)
 |-- SepalWidth: double (nullable = true)
 |-- PetalLength: double (nullable = true)
 |-- PetalWidth: double (nullable = true)
 |-- Species: string (nullable = true)
 |-- indexedLabel: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- indexedFeatures: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [19]:
# Convert indices back to labels
in_to_label = IndexToString(inputCol='indexedLabel', outputCol='Predicted_label').transform(predictions)
in_to_label.select('Species','indexedLabel', 'Predicted_label').head(30)

[Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='setosa', indexedLabel=2.0, Predicted_label='setosa'),
 Row(Species='versicolor', indexedLabel=0.0, Predicted_label='versicolor'),
 Row(Species='versicolor', indexedLabel=0.0, Predicted_label='versicolor'),
 Row(Species='versicolor', indexedLabel=0.0, Predicted_label='versicolor'),
 Row(Species='versicolor', indexedLabel=0.0, Predicted_label='versicolor'),
 Row(Species='ve

In [16]:
#*******************************************
# EVALUATION
#
# evaluating the performance of our ML model
#*******************************************

eva = MulticlassClassificationEvaluator(labelCol='indexedLabel', predictionCol='prediction', metricName='accuracy')

accuracy = eva.evaluate(predictions)
print("Accuracy of our DT model in predicting flowers is ", accuracy)

Accuracy of our DT model in predicting flowers is  0.9787234042553191


In [21]:
## Getting the entire Decision Tree rules.

print(model.toDebugString)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4b47940cb17f3c92f771) of depth 5 with 17 nodes
  If (feature 2 <= 2.35)
   Predict: 2.0
  Else (feature 2 > 2.35)
   If (feature 3 <= 1.75)
    If (feature 2 <= 5.05)
     If (feature 0 <= 4.95)
      If (feature 1 <= 2.45)
       Predict: 0.0
      Else (feature 1 > 2.45)
       Predict: 1.0
     Else (feature 0 > 4.95)
      Predict: 0.0
    Else (feature 2 > 5.05)
     If (feature 0 <= 6.05)
      Predict: 0.0
     Else (feature 0 > 6.05)
      Predict: 1.0
   Else (feature 3 > 1.75)
    If (feature 2 <= 4.85)
     If (feature 0 <= 5.95)
      Predict: 0.0
     Else (feature 0 > 5.95)
      Predict: 1.0
    Else (feature 2 > 4.85)
     Predict: 1.0



#### Splitting the probability column into the respective number of columns 
(One column per category of the target variable)

In [17]:
subset = predictions.select('Id', 'indexedLabel', 'prediction', 'probability')

In [18]:
subset.count()

47

In [19]:
subset.show(5)

+---+------------+----------+-------------+
| Id|indexedLabel|prediction|  probability|
+---+------------+----------+-------------+
|  1|         2.0|       2.0|[0.0,0.0,1.0]|
|  6|         2.0|       2.0|[0.0,0.0,1.0]|
|  9|         2.0|       2.0|[0.0,0.0,1.0]|
| 10|         2.0|       2.0|[0.0,0.0,1.0]|
| 11|         2.0|       2.0|[0.0,0.0,1.0]|
+---+------------+----------+-------------+
only showing top 5 rows



In [20]:
subset.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- indexedLabel: double (nullable = false)
 |-- prediction: double (nullable = false)
 |-- probability: vector (nullable = true)



### Using a udf here (permisson problems with rdd solution)

In [21]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, DoubleType

def to_array(col):
    def to_array_(v):
        return v.toArray().tolist()
    return udf(to_array_, ArrayType(DoubleType()))(col)

d = predictions\
    .withColumn("xs", to_array(col("probability")))\
    .select(['Id','probability', 'prediction'] + [col("xs")[i] for i in range(3)])



In [22]:
d.columns

['Id', 'probability', 'prediction', 'xs[0]', 'xs[1]', 'xs[2]']

### Clustering

In [23]:
kmeans = KMeans(k = 3, seed=2018, featuresCol="probability", predictionCol='clustered_prediction')

In [24]:
clustered_df = kmeans.fit(d).transform(d)

In [25]:
clustered_df.columns

['Id',
 'probability',
 'prediction',
 'xs[0]',
 'xs[1]',
 'xs[2]',
 'clustered_prediction']

In [26]:
clustered_df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)
 |-- xs[0]: double (nullable = true)
 |-- xs[1]: double (nullable = true)
 |-- xs[2]: double (nullable = true)
 |-- clustered_prediction: integer (nullable = false)



In [27]:
selections = [x[0] for x in clustered_df.dtypes if x[1] != 'vector']
selections

['Id', 'prediction', 'xs[0]', 'xs[1]', 'xs[2]', 'clustered_prediction']

In [28]:
clustered_df.columns

['Id',
 'probability',
 'prediction',
 'xs[0]',
 'xs[1]',
 'xs[2]',
 'clustered_prediction']

In [29]:
clustered_df.show(5)

+---+-------------+----------+-----+-----+-----+--------------------+
| Id|  probability|prediction|xs[0]|xs[1]|xs[2]|clustered_prediction|
+---+-------------+----------+-----+-----+-----+--------------------+
|  1|[0.0,0.0,1.0]|       2.0|  0.0|  0.0|  1.0|                   0|
|  6|[0.0,0.0,1.0]|       2.0|  0.0|  0.0|  1.0|                   0|
|  9|[0.0,0.0,1.0]|       2.0|  0.0|  0.0|  1.0|                   0|
| 10|[0.0,0.0,1.0]|       2.0|  0.0|  0.0|  1.0|                   0|
| 11|[0.0,0.0,1.0]|       2.0|  0.0|  0.0|  1.0|                   0|
+---+-------------+----------+-----+-----+-----+--------------------+
only showing top 5 rows



In [30]:
clustered_df.select(selections)\
        .write.format(SNOWFLAKE_SOURCE_NAME)\
        .mode("overwrite")\
        .options(**sfOptions)\
        .option("dbtable", "dev.zsp.iris_clustered")\
        .save()

In [31]:
spark.stop()