# Random Forest Example

In [1]:
# Make the local Spark installation importable; this has to run before the
# pyspark imports in the cells that follow.
import findspark
findspark.init()

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
from pyspark.sql import SparkSession

# Obtain the session for this notebook: an existing session is reused if
# there is one, otherwise a new one is started under the app name 'tree'.
spark = (
    SparkSession.builder
    .appName('tree')
    .getOrCreate()
)

In [4]:
# Read the LIBSVM-formatted sample file into a DataFrame with a `label`
# column and a sparse `features` vector column.
data = (
    spark.read
    .format("libsvm")
    .load("sample_libsvm_data.txt")
)

In [5]:
# Preview the loaded rows (show() prints the first 20 and truncates long cells).
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [6]:
# Inspect the first row untruncated: unlike show(), head() returns the Row
# itself, so the full SparseVector of features is visible.
data.head()

Row(label=0.0, features=SparseVector(692, {127: 51.0, 128: 159.0, 129: 253.0, 130: 159.0, 131: 50.0, 154: 48.0, 155: 238.0, 156: 252.0, 157: 252.0, 158: 252.0, 159: 237.0, 181: 54.0, 182: 227.0, 183: 253.0, 184: 252.0, 185: 239.0, 186: 233.0, 187: 252.0, 188: 57.0, 189: 6.0, 207: 10.0, 208: 60.0, 209: 224.0, 210: 252.0, 211: 253.0, 212: 252.0, 213: 202.0, 214: 84.0, 215: 252.0, 216: 253.0, 217: 122.0, 235: 163.0, 236: 252.0, 237: 252.0, 238: 252.0, 239: 253.0, 240: 252.0, 241: 252.0, 242: 96.0, 243: 189.0, 244: 253.0, 245: 167.0, 262: 51.0, 263: 238.0, 264: 253.0, 265: 253.0, 266: 190.0, 267: 114.0, 268: 253.0, 269: 228.0, 270: 47.0, 271: 79.0, 272: 255.0, 273: 168.0, 289: 48.0, 290: 238.0, 291: 252.0, 292: 252.0, 293: 179.0, 294: 12.0, 295: 75.0, 296: 121.0, 297: 21.0, 300: 253.0, 301: 243.0, 302: 50.0, 316: 38.0, 317: 165.0, 318: 253.0, 319: 233.0, 320: 208.0, 321: 84.0, 328: 253.0, 329: 252.0, 330: 165.0, 343: 7.0, 344: 178.0, 345: 252.0, 346: 240.0, 347: 71.0, 348: 19.0, 349: 28.0,

In [7]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split -- and every metric computed from it --
# repeatable across kernel restarts instead of changing on each run.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

In [8]:
# Check the column names and types of the training split.
trainingData.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [19]:
# Configure the three tree-based classifiers compared in this notebook.
# The label/feature column names are spelled out for all three (they match
# the DataFrame's "label" and "features" columns), and each estimator gets
# an explicit seed so that fitting is repeatable for a given training set.
dtc = DecisionTreeClassifier(labelCol="label", featuresCol="features", seed=42)
rfc = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=20, seed=42)
gbc = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10, seed=42)

In [20]:
# Train each classifier on the training split. No indexers or other
# preprocessing stages are involved here: every estimator is fit directly
# on the label/features DataFrame.
dtc_model = dtc.fit(trainingData)
rfc_model = rfc.fit(trainingData)
gbc_model = gbc.fit(trainingData)

In [21]:
# Score the held-out test split with each fitted model, in the same order
# they were trained: decision tree, random forest, gradient-boosted trees.
dtc_preds, rfc_preds, gbc_preds = (
    fitted.transform(testData)
    for fitted in (dtc_model, rfc_model, gbc_model)
)

In [22]:
# transform() keeps the input columns and appends rawPrediction, probability
# and prediction; print the schema to confirm.
rfc_preds.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [23]:
# Preview the random-forest predictions next to the true labels.
rfc_preds.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[95,96,97,12...|   [19.0,1.0]|[0.95,0.05]|       0.0|
|  0.0|(692,[100,101,102...|   [13.0,7.0]|[0.65,0.35]|       0.0|
|  0.0|(692,[122,123,124...|   [19.0,1.0]|[0.95,0.05]|       0.0|
|  0.0|(692,[122,123,148...|   [20.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [20.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [20.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [18.0,2.0]|  [0.9,0.1]|       0.0|
|  0.0|(692,[124,125,126...|   [19.0,1.0]|[0.95,0.05]|       0.0|
|  0.0|(692,[126,127,128...|   [19.0,1.0]|[0.95,0.05]|       0.0|
|  0.0|(692,[126,127,128...|   [18.0,2.0]|  [0.9,0.1]|       0.0|
|  0.0|(692,[126,127,128...|   [19.0,1.0]|[0.95,0.05]|       0.0|
|  0.0|(692,[126,127,128...|   [19.0,1.0]|[0.95,0.05]|       0.0|
|  0.0|(69

In [24]:
# Show five sample predictions next to the true label and input features.
(
    rfc_preds
    .select("prediction", "label", "features")
    .show(n=5)
)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[95,96,97,12...|
|       0.0|  0.0|(692,[100,101,102...|
|       0.0|  0.0|(692,[122,123,124...|
|       0.0|  0.0|(692,[122,123,148...|
|       0.0|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows



In [25]:
# Build an evaluator that scores a predictions DataFrame by accuracy,
# comparing the "prediction" column against the true "label" column.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

In [26]:
# Accuracy of the random-forest predictions on the test split; the reported
# test error is its complement.
accuracy = evaluator.evaluate(rfc_preds)
test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)

Test Error = 0


In [27]:
# Per-feature importance scores from the fitted random forest, returned as a
# sparse vector indexed by feature position.
# Not a very illuminating example: the 692 features are unnamed positions in
# a sparse vector, so the individual scores are hard to interpret.
rfc_model.featureImportances

SparseVector(692, {216: 0.0407, 235: 0.0333, 238: 0.0048, 264: 0.0036, 273: 0.0146, 287: 0.0048, 300: 0.0324, 302: 0.0136, 319: 0.0368, 327: 0.0069, 330: 0.0137, 341: 0.0098, 350: 0.0401, 352: 0.0035, 357: 0.0326, 358: 0.0099, 371: 0.0062, 379: 0.0365, 385: 0.0401, 407: 0.0128, 413: 0.0296, 429: 0.0209, 430: 0.0038, 433: 0.0447, 435: 0.0034, 436: 0.0039, 442: 0.0291, 457: 0.0059, 461: 0.0372, 462: 0.1388, 463: 0.0182, 471: 0.007, 483: 0.043, 489: 0.0884, 490: 0.004, 496: 0.0051, 497: 0.0354, 511: 0.0038, 512: 0.05, 540: 0.0204, 610: 0.0059, 628: 0.0044})