# Decision Trees & Random Forests - code along

In [1]:
import findspark
findspark.init("/home/rodolfo/spark-3.3.1-bin-hadoop3")
from pyspark.sql import SparkSession

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier

In [2]:
spark = SparkSession.builder.appName("trees").getOrCreate()

22/12/11 15:40:25 WARN Utils: Your hostname, rodolfo-300E5M-300E5L resolves to a loopback address: 127.0.1.1; using 192.168.15.11 instead (on interface wlp3s0)
22/12/11 15:40:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/11 15:40:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/12/11 15:40:31 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/12/11 15:40:31 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
df = spark.read.format("libsvm").load("../../data/sample_libsvm_data.txt")

22/12/11 15:42:17 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


                                                                                

In [5]:
df.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
+-----+--------------------+
only showing top 5 rows



In [6]:
train, test = df.randomSplit([0.7, 0.3])

In [7]:
decision_tree_classifier = DecisionTreeClassifier().fit(train)
random_forest_classifier = RandomForestClassifier(numTrees=100).fit(train)
gradient_boosting_classifier = GBTClassifier().fit(train)

                                                                                

In [8]:
decision_tree_test_pred = decision_tree_classifier.transform(test)
random_forest_test_pred = random_forest_classifier.transform(test)
gradient_boosting_test_pred = gradient_boosting_classifier.transform(test)

In [9]:
decision_tree_test_pred.show(2)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[121,122,123...|   [32.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [32.0,0.0]|  [1.0,0.0]|       0.0|
+-----+--------------------+-------------+-----------+----------+
only showing top 2 rows



In [10]:
random_forest_test_pred.show(2)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[121,122,123...|  [100.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [99.0,1.0]|[0.99,0.01]|       0.0|
+-----+--------------------+-------------+-----------+----------+
only showing top 2 rows



In [12]:
gradient_boosting_test_pred.show(2)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[121,122,123...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 2 rows



In [17]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [18]:
accuracy_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

In [20]:
print("Decision tree accuracy", accuracy_evaluator.evaluate(decision_tree_test_pred))
print("Random forest accuracy", accuracy_evaluator.evaluate(random_forest_test_pred))
print("Gradient boosting accuracy", accuracy_evaluator.evaluate(gradient_boosting_test_pred))

Decision tree accuracy 0.9696969696969697
Random forest accuracy 1.0
Gradient boosting accuracy 0.9696969696969697


In [21]:
random_forest_classifier.featureImportances

SparseVector(692, {148: 0.0006, 149: 0.0012, 154: 0.0004, 174: 0.0009, 177: 0.0003, 179: 0.0005, 180: 0.0004, 210: 0.0004, 215: 0.0005, 216: 0.0065, 217: 0.0023, 234: 0.0138, 235: 0.0056, 242: 0.0005, 243: 0.0027, 244: 0.0241, 261: 0.0005, 272: 0.0161, 285: 0.0005, 290: 0.0021, 294: 0.001, 295: 0.0029, 300: 0.0366, 314: 0.001, 319: 0.0005, 322: 0.0033, 323: 0.0108, 324: 0.0005, 325: 0.0005, 328: 0.0122, 329: 0.0066, 342: 0.0037, 343: 0.001, 345: 0.0041, 346: 0.0022, 350: 0.0162, 351: 0.019, 354: 0.0007, 355: 0.0004, 356: 0.001, 357: 0.0075, 370: 0.0011, 372: 0.0001, 373: 0.0191, 374: 0.0006, 377: 0.0145, 378: 0.0429, 379: 0.0076, 380: 0.0067, 381: 0.0015, 382: 0.0025, 383: 0.0005, 384: 0.0025, 385: 0.0064, 387: 0.0028, 399: 0.0073, 400: 0.0063, 401: 0.0082, 403: 0.0006, 405: 0.0115, 406: 0.0337, 407: 0.0115, 408: 0.001, 413: 0.0064, 414: 0.0006, 415: 0.0012, 427: 0.0063, 428: 0.0083, 429: 0.0004, 431: 0.0009, 432: 0.0101, 433: 0.0382, 434: 0.0349, 435: 0.0099, 438: 0.002, 442: 0.0014, 