In [None]:
# setup-- 
import os
import pyspark
from splicemachine.spark.context import PySpliceContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Host of the Splice Machine JDBC endpoint. Indexing os.environ (rather than
# .get) makes the cell fail immediately with a KeyError when it is not set.
jdbc_host = os.environ['JDBC_HOST']

# Credentials fall back to the demo defaults, but can be overridden through
# the environment so the notebook does not have to be edited to change them.
splice_user = os.environ.get('SPLICE_USER', 'splice')
splice_password = os.environ.get('SPLICE_PASSWORD', 'admin')

conf = pyspark.SparkConf()
# getOrCreate keeps this cell re-runnable: constructing a second SparkContext
# in the same kernel raises an error, while getOrCreate reuses the live one.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

spark = SparkSession.builder.config(conf=conf).getOrCreate()

splicejdbc = f"jdbc:splice://{jdbc_host}:1527/splicedb;user={splice_user};password={splice_password}"

# Splice Machine context bound to the Spark session and the JDBC URL above
splice = PySpliceContext(spark, splicejdbc)


In [None]:
%%scala
%%spark <> --noUI
// Address of this host, handed to Spark as spark.driver.host below.
import java.net.InetAddress
val driver_host = InetAddress.getLocalHost.getHostAddress
// Builder for a Spark session on the in-cluster Kubernetes API server.
// Each configuration key is set exactly once.
SparkSession.builder()
	.appName("jt1test2")
	.master("k8s://https://kubernetes.default.svc.cluster.local:443")
	.config("spark.kubernetes.container.image", "splicemachine/sm_k8_spark:0.0.4")
	.config("spark.executor.instances", "2")
	.config("spark.submit.deployMode", "cluster")
	.config("spark.driver.extraClassPath", "/opt/spark/conf:/opt/spark/jars/*")
	.config("spark.executor.extraClassPath", "./:/opt/hbase/conf:/opt/splicemachine/lib/*:/opt/spark/jars/*:/opt/hbase/lib/*")
	.config("splice.spark.executor.extraLibraryPath", "/opt/native")
	.config("spark.files", "/opt/spark/conf/hbase-site.xml,/opt/spark/conf/core-site.xml,/opt/spark/conf/hdfs-site.xml,/opt/spark/jars/hbase_sql-2.8.0.1926-cdh5.14.0.jar")
	.config("spark.kubernetes.authenticate.caCertFile", "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt")
	.config("spark.kubernetes.authenticate.oauthTokenFile", "/var/run/secrets/kubernetes.io/serviceaccount/token")
	.config("spark.driver.host", driver_host)
	.config("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")


# Decision Trees

The [Decision Tree](https://spark.apache.org/docs/latest/mllib-decision-tree.html) is a greedy algorithm that performs a recursive binary partitioning of the feature space for predictive modeling; locally optimal decisions are made at each node in hopes of a globally optimal decision.
Because of its greedy nature, it cannot guarantee the globally optimal tree.

At its most simplified core, decision trees are simply a system of if-else statements, always taking the most optimal answer, resulting in what is hopefully the most optimal decision, although optimality is not guaranteed. Here's an illustration:

<img class="fitwidth" src="http://mines.humanoriented.com/classes/2010/fall/csci568/portfolio_exports/lguo/image/decisionTree/classification.jpg">


## A Scala Example

The example in this notebook demonstrates how to:

* Load a CSV dataset from a public S3 bucket.
* Parse that data as an RDD.
* Perform classification using a decision tree.

Our decision tree uses Gini impurity as an impurity measure, and a maximum tree depth of 5. The test error is calculated to measure the algorithm accuracy. 

For more information about Decision Trees, see this [Apache Spark page](https://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-classifier).


In [None]:
%%scala 
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator
import org.apache.spark.sql.Row
import org.apache.spark.ml.feature.{PCA, StandardScaler,StringIndexer,VectorAssembler,IndexToString}
import org.apache.spark.ml.{Pipeline, PipelineModel}

// Create the data schema: four float measurements plus the species name
import org.apache.spark.sql.types.StructType
val schema = new StructType()
    .add("sepal_length", "float")
    .add("sepal_width", "float")
    .add("petal_length", "float")
    .add("petal_width", "float")
    .add("species", "string")


// Grab the data: read the CSV as text, drop the header row and any blank
// lines (a blank line would otherwise fail in toFloat), then build typed Rows.
val rdd = sc.textFile("s3a://splice-demo/iris.csv")
val header = rdd.first()
val rdd2 = rdd
    .filter(line => line != header && line.trim.nonEmpty)
    .map(_.split(","))
    .map(p => Row(p(0).toFloat, p(1).toFloat, p(2).toFloat, p(3).toFloat, p(4)))

val df = spark.createDataFrame(rdd2, schema)


// Feature column names come from the schema, not from the CSV header: the
// DataFrame columns are named by the schema, so the VectorAssembler inputs
// always match them whatever the header text looks like.
val features = schema.fieldNames.filter(_ != "species")

// StringIndex our label (species) because all values (columns) must be numerical
val indexer = new StringIndexer()
   .setInputCol("species")
   .setOutputCol("label")

// Assemble our feature vector
val assembler = new VectorAssembler()
  .setInputCols(features)
  .setOutputCol("features")


// Assemble our Pipeline so both stages are fitted and applied together
val pipeline = new Pipeline()
  .setStages(Array(indexer, assembler))

val df2 = pipeline.fit(df).transform(df)


// Display feature columns. select(head, tail: _*) lists every feature exactly
// once; passing the full array as the varargs would repeat the first column.
df2.select(features.head, features.tail: _*).display(10)

// Inspect the schema
println("Schema:")
df2.schema.foreach(i => println(i))
println()
// Inspect the features vector from the features (VectorAssembler)
println("Features:")
df2.select("features", features:_*).show(10, truncate=false)
// Look at species and label conversion (StringIndexer)
println("Label from Species:")
df2.select("species", "label").distinct().display(3)

In [None]:
%%scala 
import org.apache.spark.ml.classification.{DecisionTreeClassifier,DecisionTreeClassificationModel}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

import org.apache.spark.ml.attribute.{Attribute, NominalAttribute}

// Split the data into training and test sets (30% held out for testing).
// The fixed seed makes the split, and so the reported metrics, reproducible.
val splitSeed = 42L
val splits = df2.randomSplit(Array(0.7, 0.3), splitSeed)
val (trainingData, testData) = (splits(0), splits(1))

// Hyperparameters for the DecisionTree model.
// numClasses is kept for reference only; nothing in this cell reads it.
val numClasses = 3
val impurity = "gini"
val maxDepth = 5
val maxBins = 32

val model = new DecisionTreeClassifier()
            .setFeaturesCol("features")
            .setLabelCol("label")
            .setMaxDepth(maxDepth)
            .setImpurity(impurity)
            .setMaxBins(maxBins)

// Recover the index -> species mapping that StringIndexer actually produced.
// It is stored as nominal-attribute metadata on the "label" column; reading
// it there avoids hardcoding an order that depends on label frequencies.
val speciesLabels = Attribute.fromStructField(df2.schema("label"))
  .asInstanceOf[NominalAttribute]
  .values
  .getOrElse(throw new IllegalStateException("Column 'label' carries no StringIndexer label metadata"))

// Convert indexed labels back to original labels (species values).
val labelConverter = new IndexToString()
  .setInputCol("prediction")
  .setOutputCol("predictedLabel")
  .setLabels(speciesLabels)

val pipeline2 = new Pipeline()
            .setStages(Array(model, labelConverter))

val trainedModel = pipeline2.fit(trainingData)
val predictions = trainedModel.transform(testData)

// Select example rows to display.
predictions.select("predictedLabel", "species", "features").show(5)

// Compare (prediction, true label) to compute accuracy and the test error.
val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")
val accuracy = evaluator.evaluate(predictions)
println(s"Accuracy = ${(accuracy)}")
println(s"Test Error = ${1.0 - accuracy}")

// Stage 0 of the fitted pipeline is the trained decision tree itself.
val treeModel = trainedModel.stages(0).asInstanceOf[DecisionTreeClassificationModel]
println(s"Learned classification tree model:\n ${treeModel.toDebugString}")

## PySpark Example

This section presents a PySpark Decision Tree example.

In [None]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
import plotly.express as px
from beakerx.object import beakerx
beakerx.pandas_display_table()

# Load Plotly's iris sample into a Spark DataFrame, without the species_id column
data = spark.createDataFrame(px.data.iris()).drop('species_id')

# Every column other than the label goes into the feature vector
cols = [name for name in data.columns if name != 'species']

# Stage 1: index the species strings into the 'species_vec' column
si = StringIndexer(inputCol='species', outputCol='species_vec')

# Stage 2: pack the feature columns into a single 'features' vector
va = VectorAssembler(inputCols=cols, outputCol='features')

# Chain both stages into one Spark Pipeline
pipeline = Pipeline(stages=[si, va])

# Fit the pipeline and apply it to the same data
data = pipeline.fit(data).transform(data)

# Preview the first ten rows of the result, ordered by sepal width
display(data.orderBy('sepal_width').toPandas()[:10])

### Visualize our Data 

Let's visualize our data with [Plotly](https://plot.ly/python/)

* The X axis is the `petal_length`.
* The Y axis is `sepal_width`.
* The Z axis is the `sepal_length`.
* The Datapoint size is the `petal_width`.
* The color is the species type: `versicolor, virginica, setosa`.

In the next cell, you can change any of the variables in the plot function to see a new chart layout; trying different combinations can give you new insight into the data.

In [None]:
# Hover over any datapoint to see its exact dimensions
px.scatter_3d(
    data.toPandas(),
    x='petal_length',
    y='sepal_width',
    z='sepal_length',
    size='petal_width',
    color='species',
)

### Create Our Decision Tree

Now we can create our Decision Tree to predict the species, based on its `sepal_length, sepal_width, petal_length,` and `petal_width`.

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
from splicemachine.ml.utilities import SpliceMultiClassificationEvaluator

# The data has already been preprocessed above into a feature vector called "features"
# Create the decision tree
dt = DecisionTreeClassifier(labelCol='species_vec', featuresCol='features', maxDepth=20)

# Split our dataset into training and testing.
# The fixed seed makes the split, and so the reported metrics, reproducible.
train, test = data.randomSplit([0.8, 0.2], seed=42)

# Train on our training data
model = dt.fit(train)
# Make predictions
predictions = model.transform(test)

predictions.select(['features','species','species_vec','prediction']).show()

# Evaluate results against the indexed species column
e = SpliceMultiClassificationEvaluator(spark, label_column='species_vec')
e.input(predictions)
results = e.get_results(dict=True)

That turns out to be quite a good Decision Tree. Run the next cell to visualize it:

In [None]:
from splicemachine.ml.utilities import DecisionTreeVisualizer as dtv
import pprint

# StringIndexer assigns indices by label frequency, so read the actual
# index -> species order from the metadata it attached to 'species_vec'
# rather than hardcoding an order that may not match the fitted indexer.
species_labels = data.schema['species_vec'].metadata['ml_attr']['vals']

print(dtv.visualize(model, ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], species_labels, 'First_Decision_Tree', visual=False))