# Predicting Forest Cover with Decision Trees

In [None]:
import org.apache.spark.sql.{functions => F}

## Preparing the Data

In [None]:
// Relative path to the covtype data file that is read as CSV below.
val data_path = "../data/covtype/covtype.data"

In [None]:
// Read the file as CSV with no header row, letting Spark infer the column types.
val dataWithoutHeader = spark.read.
    options(Map("header" -> "false", "inferSchema" -> "true")).
    csv(data_path)

In [None]:
// Names for the columns of the headerless file, in order: ten individually named
// measurements, Wilderness_Area_0..3, Soil_Type_0..39, and finally Cover_Type.
val colNames = {
    val measurementCols = Seq(
        "Elevation", "Aspect", "Slope",
        "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points"
    )
    val wildernessCols = (0 until 4).map(i => s"Wilderness_Area_$i")
    val soilCols = (0 until 40).map(i => s"Soil_Type_$i")

    measurementCols ++ wildernessCols ++ soilCols ++ Seq("Cover_Type")
}

In [None]:
// Attach the column names and cast Cover_Type to double.
// The column is referenced with its exact name ("Cover_Type"), so the lookup does
// not depend on Spark's case-insensitive column resolution.
val data = dataWithoutHeader.toDF(colNames: _*).
    withColumn("Cover_Type", F.col("Cover_Type").cast("double"))

In [None]:
// Return the first row as a quick look at the loaded data.
data.head

## A First Decision Tree

In [None]:
// Split the rows roughly 90/10 into training and test sets and cache both.
// The split is seeded, like the estimators and validators in this notebook, so that
// reruns over the same input reproduce the same split.
val Array(trainData, testData) = data.randomSplit(Array(0.9, 0.1), seed = 42L)
trainData.cache()
testData.cache()

In [None]:
import org.apache.spark.ml.feature.VectorAssembler

In [None]:
// Every column except the label is a model input.
val inputCols = trainData.columns.filter(_ != "Cover_Type")

// Combines the input columns into a single "featureVector" column.
// The chain uses trailing dots, like the rest of the notebook, so that it is also
// read as one expression when entered line by line in a REPL.
val assembler = new VectorAssembler().
    setInputCols(inputCols).
    setOutputCol("featureVector")

In [None]:
// Apply the assembler to the training rows.
val assembledTrainData = assembler.transform(trainData)

In [None]:
// Display the assembled vectors without truncating the cell text.
assembledTrainData.select("featureVector").show(truncate = false)

In [None]:
import org.apache.spark.ml.classification.DecisionTreeClassifier

In [None]:
// Decision tree that reads features from "featureVector", uses "Cover_Type" as the
// label and writes its output to "prediction"; the seed is fixed at 42.
val classifier = new DecisionTreeClassifier().
    setFeaturesCol("featureVector").
    setLabelCol("Cover_Type").
    setPredictionCol("prediction").
    setSeed(42)


In [None]:
// Fit the decision tree on the assembled training rows.
val model = classifier.fit(assembledTrainData)

In [None]:
// Print the fitted model's debug string.
println(model.toDebugString)

In [None]:
// Pair each importance value with its input column name and print the pairs in
// descending order.
for (scoredColumn <- model.featureImportances.toArray.zip(inputCols).sorted.reverse)
    println(scoredColumn)

In [None]:
// Note: these are predictions on the same rows the model was fitted on
// (assembledTrainData), not on held-out data.
val predictions = model.transform(assembledTrainData)
// Show label, prediction and the probability column side by side, untruncated.
predictions.select("Cover_Type", "prediction", "probability").show(truncate = false)

In [None]:
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

In [None]:
// Evaluator that compares the "prediction" column against the "Cover_Type" label;
// the metric is chosen per call in the cells that follow.
val evaluator = new MulticlassClassificationEvaluator().
    setPredictionCol("prediction").
    setLabelCol("Cover_Type")

In [None]:
// Evaluate the "accuracy" metric on the predictions computed above.
evaluator.setMetricName("accuracy").evaluate(predictions)

In [None]:
// Evaluate the "f1" metric on the same predictions.
evaluator.setMetricName("f1").evaluate(predictions)

In [None]:
import org.apache.spark.mllib.evaluation.MulticlassMetrics

In [None]:
// RDD of (prediction, Cover_Type) pairs, in that order, for the RDD-based metrics class.
val predictionRDD = predictions.select("prediction", "Cover_Type").as[(Double, Double)].rdd

In [None]:
// Build multiclass metrics from the (prediction, Cover_Type) pairs.
val multiclassMetrics = new MulticlassMetrics(predictionRDD)

In [None]:
// Display the confusion matrix computed by MulticlassMetrics.
multiclassMetrics.confusionMatrix

In [None]:
// Confusion matrix as a DataFrame: one row per Cover_Type value, one column per
// distinct prediction value, each cell holding a row count; null cells are filled
// with 0 and rows are ordered by Cover_Type.
// The chain uses trailing dots, like the rest of the notebook, so that it is also
// read as one expression when entered line by line in a REPL.
val confusionMatrix = predictions.select("prediction", "Cover_Type").
    groupBy("Cover_Type").pivot("prediction").count().
    na.fill(0).orderBy("Cover_Type")

In [None]:
// Print the DataFrame-based confusion matrix.
confusionMatrix.show()

In [None]:
import org.apache.spark.sql.DataFrame

In [None]:
/**
 * Computes the share of rows belonging to each Cover_Type value.
 *
 * @param data frame with a "Cover_Type" column
 * @return per-class row count divided by the total row count, ordered by
 *         ascending Cover_Type
 */
def classProbabilities(data: DataFrame): Array[Double] = {
    val rowCount = data.count()
    val countsByClass = data.groupBy("Cover_Type").count().orderBy("Cover_Type")

    countsByClass.
      select("count").as[Double].
      map(classCount => classCount / rowCount).
      collect()
}

In [None]:
// Class proportions in the training and the test set.
val trainPriorProbabilities = classProbabilities(trainData)
val testPriorProbabilities = classProbabilities(testData)

// Sum over classes of (train proportion * test proportion): the probability that
// a class drawn at random with the training proportions matches the class of a
// randomly drawn test row. The two arrays are paired up by position.
(for ((trainProb, testProb) <- trainPriorProbabilities.zip(testPriorProbabilities))
    yield trainProb * testProb).sum

## Tuning Decision Trees

In [None]:
// Model inputs: all columns other than the label.
val inputCols = trainData.columns.filterNot(_ == "Cover_Type")

// Stage 1: gather the inputs into "featureVector".
val assembler = new VectorAssembler().
    setOutputCol("featureVector").
    setInputCols(inputCols)

// Stage 2: decision tree over "featureVector", predicting "Cover_Type" into
// "prediction", with seed 42.
val classifier = new DecisionTreeClassifier().
    setFeaturesCol("featureVector").
    setLabelCol("Cover_Type").
    setPredictionCol("prediction").
    setSeed(42)

In [None]:
import org.apache.spark.ml.{Pipeline, PipelineModel}

In [None]:
// Chain the assembler and the classifier into one estimator.
val pipeline = new Pipeline().setStages(Array(assembler, classifier))

In [None]:
import org.apache.spark.ml.tuning.{ParamGridBuilder, 
                                   TrainValidationSplit}

In [None]:
// Candidate settings: impurity in {gini, entropy} combined with maxDepth in {1, 20}.
val paramGrid = new ParamGridBuilder().
    addGrid(classifier.impurity, Seq("gini", "entropy")).
    addGrid(classifier.maxDepth, Seq(1, 20)).
    build()

// Scores each candidate by accuracy of "prediction" against "Cover_Type".
val multiclassEval = new MulticlassClassificationEvaluator().
    setMetricName("accuracy").
    setPredictionCol("prediction").
    setLabelCol("Cover_Type")

// Fits the pipeline once per candidate, with a 0.9 train ratio and seed 42.
val validator = new TrainValidationSplit().
    setEstimator(pipeline).
    setEstimatorParamMaps(paramGrid).
    setEvaluator(multiclassEval).
    setTrainRatio(0.9).
    setSeed(42)

In [None]:
// Run the train/validation search on the training set.
val validatorModel = validator.fit(trainData)

In [None]:
// Pair every validation metric with the parameter map that produced it,
// highest metric first.
val paramsAndMetrics = validatorModel.validationMetrics.
    zip(validatorModel.getEstimatorParamMaps).
    sortBy { case (metric, _) => -metric }

// Print each metric followed by its parameters and a blank separator line.
for ((metric, params) <- paramsAndMetrics) {
    println(metric)
    println(params)
    println()
}

In [None]:
// Take the best model found by validation and print the parameter map of its
// final pipeline stage.
val bestModel = validatorModel.bestModel
println(bestModel.asInstanceOf[PipelineModel].stages.last.extractParamMap)

In [None]:
// Highest validation metric across all candidates.
println(validatorModel.validationMetrics.max)

In [None]:
// Accuracy of the best model on the held-out test set.
val testAccuracy = multiclassEval.evaluate(bestModel.transform(testData))
println(testAccuracy)

In [None]:
// Accuracy of the best model on the training set, for comparison with testAccuracy.
val trainAccuracy = multiclassEval.evaluate(bestModel.transform(trainData))
println(trainAccuracy)

## Categorical Features Revisited

In [None]:
import org.apache.spark.sql.functions._

In [None]:
/**
 * Collapses the `Wilderness_Area_*` and `Soil_Type_*` column groups into two
 * numeric columns, `wilderness` and `soil`.
 *
 * Each group is assembled into a vector, the source columns are dropped, and the
 * vector is replaced by the index of its first 1.0 entry (-1.0 if it has none).
 *
 * @param data frame containing Wilderness_Area_0..3 and Soil_Type_0..39
 * @return `data` without those 44 columns and with `wilderness` and `soil` added
 */
def unencodeOneHot(data: DataFrame): DataFrame = {
    // Imported locally under an alias: the bare name `Vector` resolves to
    // scala.collection.immutable.Vector, which is not the type of the column that
    // VectorAssembler produces, so a UDF declared on it cannot be applied to it.
    import org.apache.spark.ml.linalg.{Vector => MLVector}

    val unhotUDF = udf((vec: MLVector) => vec.toArray.indexOf(1.0).toDouble)

    // Assembles prefix_0 .. prefix_(n-1) into the vector column `out`, drops the
    // source columns, then overwrites `out` with the index of the 1.0 entry.
    def collapse(df: DataFrame, prefix: String, n: Int, out: String): DataFrame = {
      val groupCols = (0 until n).map(i => s"${prefix}_$i").toArray

      new VectorAssembler().
        setInputCols(groupCols).
        setOutputCol(out).
        transform(df).
        drop(groupCols: _*).
        withColumn(out, unhotUDF(col(out)))
    }

    val withWilderness = collapse(data, "Wilderness_Area", 4, "wilderness")
    collapse(withWilderness, "Soil_Type", 40, "soil")
}

In [None]:
import org.apache.spark.ml.feature.VectorIndexer

In [None]:
// val unencTrainData = unencodeOneHot(trainData)
// val unencTestData = unencodeOneHot(testData)

// val inputCols = unencTrainData.columns.filter(_ != "Cover_Type")
// val assembler = new VectorAssembler().
//     setInputCols(inputCols).
//     setOutputCol("featureVector")

// val indexer = new VectorIndexer().
//     setMaxCategories(40).
//     setInputCol("featureVector").
//     setOutputCol("indexedVector")

// val classifier = new DecisionTreeClassifier().
//     setSeed(42).
//     setLabelCol("Cover_Type").
//     setFeaturesCol("indexedVector").
//     setPredictionCol("prediction")

// val pipeline = new Pipeline().setStages(Array(assembler, indexer, classifier))

## Random Decision Forests

In [None]:
import org.apache.spark.ml.classification.RandomForestClassifier

In [None]:
// Random forest over "featureVector", predicting "Cover_Type" into "prediction",
// with entropy impurity, maximum depth 20, 300 bins and seed 42.
val classifier = new RandomForestClassifier().
    setFeaturesCol("featureVector").
    setLabelCol("Cover_Type").
    setPredictionCol("prediction").
    setImpurity("entropy").
    setMaxDepth(20).
    setMaxBins(300).
    setSeed(42)

In [None]:
// Same two-stage layout as before, now ending in the random forest.
val pipeline = new Pipeline().setStages(Array(assembler, classifier))

// Only minInfoGain is searched: 0.0 versus 0.05.
val paramGrid = new ParamGridBuilder().
    addGrid(classifier.minInfoGain, Seq(0.0, 0.05)).
    build()

// Candidates are compared by accuracy of "prediction" against "Cover_Type".
val multiclassEval = new MulticlassClassificationEvaluator().
    setMetricName("accuracy").
    setPredictionCol("prediction").
    setLabelCol("Cover_Type")

// Train/validation search with a 0.9 train ratio and seed 42.
val validator = new TrainValidationSplit().
    setEstimator(pipeline).
    setEstimatorParamMaps(paramGrid).
    setEvaluator(multiclassEval).
    setTrainRatio(0.9).
    setSeed(42)

// Run the search on the training set.
val validatorModel = validator.fit(trainData)

In [None]:
// The model class is needed for the cast below; the visible cells only import
// RandomForestClassifier, so it is imported here.
import org.apache.spark.ml.classification.RandomForestClassificationModel

val bestModel = validatorModel.bestModel

// The fitted forest is the last stage of the winning pipeline.
val forestModel = bestModel.asInstanceOf[PipelineModel].
    stages.last.asInstanceOf[RandomForestClassificationModel]

println(forestModel.extractParamMap)
println(forestModel.getNumTrees)
// Importance values paired with their input column names, largest first.
forestModel.featureImportances.toArray.zip(inputCols).
    sorted.reverse.foreach(println)

// Accuracy of the best pipeline on the held-out test set.
val testAccuracy = multiclassEval.evaluate(bestModel.transform(testData))
println(testAccuracy)

// Predict on the test rows with the label removed. The pipeline was fitted on
// trainData and its assembler reads the Wilderness_Area_* / Soil_Type_* columns, so
// it is applied to testData, which has the same layout. The un-encoded variant
// (unencTestData) exists only in commented-out code and is never defined.
bestModel.transform(testData.drop("Cover_Type")).select("prediction").show()