In [24]:
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

//Data is in libsvm format: sparse data
val data = MLUtils.loadLibSVMFile(sc, "./data/mllib/sample_libsvm_data.txt")

// Split the data into training and test sets (30% held out for testing)
val splits = data.randomSplit(Array(0.7, 0.3))
val (trainingData, testData) = (splits(0), splits(1))

In [25]:
// Train a RandomForest model.
// Empty categoricalFeaturesInfo indicates all features are continuous.
val numClasses = 2
val categoricalFeaturesInfo = Map[Int, Int]()
val numTrees = 3 // Use more in practice.
val featureSubsetStrategy = "auto" // Let the algorithm choose.
val impurity = "gini"
val maxDepth = 4
val maxBins = 32

In [26]:
val model = RandomForest.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo,
  numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)

In [27]:
val labelAndPredicts = testData.map{ point => 
    val prediction = model.predict(point.features)
    (point.label, prediction)
}

In [28]:
//build the metrics for model evaluation
val metrics = new BinaryClassificationMetrics(labelAndPredicts)

In [29]:
val testErr = labelAndPredicts.filter(x => x._1 != x._2).count.toDouble/labelAndPredicts.count 
println(s"Test Classification error = $testErr")
println(s"Test areaUnderPR = ${metrics.areaUnderPR()}.")
println(s"Test areaUnderROC = ${metrics.areaUnderROC()}.")
 

Test Classification error = 0.02564102564102564
Test areaUnderPR = 0.9943019943019943.
Test areaUnderROC = 0.9814814814814814.
