# 08. Tree-based methods

In [1]:
# -> Create (or reuse) the Spark session shared by every cell below

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('tree based methods')
    .getOrCreate()
)

# Project-local helpers (provides prepare_data, used in the labs below).
import utils

In [2]:
# -> Load modules

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor, GBTRegressor

from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import RegressionEvaluator

import pyspark.sql.functions as F
import pandas as pd

## 8.3 Lab: Decision Trees

### *8.3.1 Fitting Classification Trees*

In [3]:
# -> Load Carseats dataset

Carseats = spark.read.csv('data/Carseats.csv',header=True,inferSchema=True)
Carseats = Carseats.na.drop(how='any')
# Binary response: 'No' when Sales <= 8, 'Yes' otherwise.
Carseats = Carseats.withColumn('High', F.when(F.col('Sales')<=8,'No').otherwise('Yes'))

print('\nCarseats dataset:'); Carseats.show(5)
print('\nData types:'); Carseats.printSchema()

# -> Prepare data:
# 'High' is computed directly from 'Sales', so neither column may be used as a
# predictor: either one would leak the answer and make the tree trivially perfect.
# NOTE(review): assumes utils.prepare_data indexes the label through
# labelCol / label_is_categorical and does not need it in categoricalCols - confirm.
# NOTE(review): 'Education' is not passed as a predictor; confirm this is intended.

categoricalCols = ['ShelveLoc', 'Urban', 'US']
continuousCols = ['CompPrice', 'Income', 'Advertising', 'Population', 'Price', 'Age']

data = utils.prepare_data(df = Carseats,
                    labelCol = 'High',
                    label_is_categorical = True,
                    categoricalCols = categoricalCols,
                    continuousCols = continuousCols
                   )

# -> Split the dataset into train and test samples:

train, test = data.randomSplit([0.5, 0.5], seed=11)

# -> Define a Decision tree classification model:

model = DecisionTreeClassifier(featuresCol="features", labelCol='label')

# -> Fit the model:

model_fit = model.fit(train)

# -> Make predictions using the test data:

predictions = model_fit.transform(test)

print('\nPredictions:')
predictions.select('label', 'prediction').show(5)

# -> Evaluate the model using the test data:
# MulticlassMetrics expects an RDD of (prediction, label) pairs, in that order.
# Passing (label, prediction) would transpose the confusion matrix.

metrics = MulticlassMetrics(predictions.select('prediction', 'label').rdd)

# Rows are actual classes, columns are predicted classes (ascending label order).
print('\nConfusion Matrix:')
confusion_matrix = pd.DataFrame(metrics.confusionMatrix().toArray())

print(confusion_matrix)
print('\nAccuracy = {:.3f}'.format(metrics.accuracy))


Carseats dataset:
+---+-----+---------+------+-----------+----------+-----+---------+---+---------+-----+---+----+
|_c0|Sales|CompPrice|Income|Advertising|Population|Price|ShelveLoc|Age|Education|Urban| US|High|
+---+-----+---------+------+-----------+----------+-----+---------+---+---------+-----+---+----+
|  1|  9.5|      138|    73|         11|       276|  120|      Bad| 42|       17|  Yes|Yes| Yes|
|  2|11.22|      111|    48|         16|       260|   83|     Good| 65|       10|  Yes|Yes| Yes|
|  3|10.06|      113|    35|         10|       269|   80|   Medium| 59|       12|  Yes|Yes| Yes|
|  4|  7.4|      117|   100|          4|       466|   97|   Medium| 55|       14|  Yes|Yes|  No|
|  5| 4.15|      141|    64|          3|       340|  128|      Bad| 38|       13|  Yes| No|  No|
+---+-----+---------+------+-----------+----------+-----+---------+---+---------+-----+---+----+
only showing top 5 rows


Data types:
root
 |-- _c0: integer (nullable = true)
 |-- Sales: double (nullable 

### *8.3.2 Fitting Regression Trees*

In [4]:
# -> Load Boston dataset:

Boston = spark.read.csv('data/Boston.csv',header=True,inferSchema=True)

print('\nBoston data:'); Boston.show(5)
print('\nData types:'); Boston.printSchema()

# -> Prepare data
# Every predictor is passed as continuous; 'medv' is the regression response.

continuousCols = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat']


data = utils.prepare_data(df = Boston,
                    labelCol = 'medv',
                    label_is_categorical = False,
                    categoricalCols = [],
                    continuousCols = continuousCols
                   )

# -> Split the data into train and test samples:
# Equal halves, matching the Random Forest and Boosting cells so the test MSEs
# are computed on the same split. randomSplit normalizes weights that do not
# sum to 1, so weights such as [0.6, 0.5] would silently give a ~55/45 split.

train, test = data.randomSplit([0.5, 0.5], seed=11)

# -> Define the model:
# Train on the prepared 'label' column, the same column that the predictions
# display and the evaluator below read.

model = DecisionTreeRegressor(featuresCol="features", labelCol='label')

# -> Fit the model
model_fit = model.fit(train)

# -> Feature importances:
# NOTE(review): pairing names with importances assumes the assembled feature
# vector follows the order of continuousCols - confirm in utils.prepare_data.

importance_values = dict(zip(continuousCols, model_fit.featureImportances))
sorted_importance_values = sorted(importance_values.items(), key=lambda kv: kv[1], reverse=True)

print('Feature Importances:')
print(pd.DataFrame(sorted_importance_values, columns=['Feature', 'Importance']).rename_axis('Rank', axis=1))

# -> Make predictions on the test sample:

predictions = model_fit.transform(test)
print('\nPredictions:'); predictions.select('label', 'prediction').show(5)

# -> Compute test error on the test sample: 

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mse")
mse = evaluator.evaluate(predictions)
print("\nMean Squared Error (MSE) on test data = {:.3f}".format(mse))


Boston data:
+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
|   crim|  zn|indus|chas|  nox|   rm| age|   dis|rad|tax|ptratio| black|lstat|medv|
+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
|0.00632|18.0| 2.31|   0|0.538|6.575|65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|0.02731| 0.0| 7.07|   0|0.469|6.421|78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|0.02729| 0.0| 7.07|   0|0.469|7.185|61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|
|0.03237| 0.0| 2.18|   0|0.458|6.998|45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
|0.06905| 0.0| 2.18|   0|0.458|7.147|54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|
+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
only showing top 5 rows


Data types:
root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm:

### *8.3.3 Random Forest*

In [6]:
# -> Assemble the modelling frame from the Boston data:

continuousCols = [
    'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
    'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat',
]

data = utils.prepare_data(
    df=Boston,
    labelCol='medv',
    label_is_categorical=False,
    categoricalCols=[],
    continuousCols=continuousCols,
)

# -> Hold out half of the rows for testing (seeded for reproducibility):

train, test = data.randomSplit([0.5, 0.5], seed=11)

# -> Random forest regressor, then fit it on the training half:

model = RandomForestRegressor(featuresCol="features", labelCol='label', seed=42)
model_fit = model.fit(train)

# -> Feature importances, largest first:
# NOTE(review): assumes the feature vector follows the order of continuousCols.

importance_values = {
    feature: importance
    for feature, importance in zip(continuousCols, model_fit.featureImportances)
}
sorted_importance_values = sorted(
    importance_values.items(),
    key=lambda item: item[1],
    reverse=True,
)

print('\nFeature Importances:')
print(
    pd.DataFrame(sorted_importance_values, columns=['Feature', 'Importance'])
    .rename_axis('Rank', axis=1)
)

# -> Score the held-out half:

predictions = model_fit.transform(test)
print('\nPredictions:')
predictions.select('label', 'prediction').show(5)

# -> Test-set mean squared error:

evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="mse",
)
mse = evaluator.evaluate(predictions)
print("\nMean Squared Error (MSE) on test data = {:.3f}".format(mse))


Feature Importances:
Rank  Feature  Importance
0       lstat    0.381374
1          rm    0.249550
2       indus    0.082887
3         dis    0.051316
4         age    0.042730
5         tax    0.042486
6       black    0.036187
7     ptratio    0.034370
8        crim    0.031308
9         nox    0.024283
10        rad    0.017777
11         zn    0.003881
12       chas    0.001851

Predictions:
+-----+------------------+
|label|        prediction|
+-----+------------------+
| 22.0| 23.20617572110621|
| 32.7|33.602401264953684|
| 24.5|  26.1613587722465|
| 50.0|  42.3506891025641|
| 44.0| 39.98073275335775|
+-----+------------------+
only showing top 5 rows


Mean Squared Error (MSE) on test data = 13.365


### *8.3.4 Boosting*

In [7]:
# -> Assemble the modelling frame from the Boston data:

continuousCols = [
    'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
    'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat',
]

data = utils.prepare_data(
    df=Boston,
    labelCol='medv',
    label_is_categorical=False,
    categoricalCols=[],
    continuousCols=continuousCols,
)

# -> Hold out half of the rows for testing (seeded for reproducibility):

train, test = data.randomSplit([0.5, 0.5], seed=11)

# -> Gradient-boosted trees regressor (10 boosting iterations), then fit:

model = GBTRegressor(
    featuresCol="features",
    labelCol='label',
    seed=42,
    maxIter=10,
)
model_fit = model.fit(train)

# -> Feature importances, largest first:
# NOTE(review): assumes the feature vector follows the order of continuousCols.

importance_values = {
    feature: importance
    for feature, importance in zip(continuousCols, model_fit.featureImportances)
}
sorted_importance_values = sorted(
    importance_values.items(),
    key=lambda item: item[1],
    reverse=True,
)

print('\nFeature Importances:')
print(
    pd.DataFrame(sorted_importance_values, columns=['Feature', 'Importance'])
    .rename_axis('Rank', axis=1)
)

# -> Score the held-out half:

predictions = model_fit.transform(test)
print('\nPredictions:')
predictions.select('label', 'prediction').show(5)

# -> Test-set mean squared error:

evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="mse",
)
mse = evaluator.evaluate(predictions)
print("\nMean Squared Error (MSE) on test data = {:.3f}".format(mse))


Feature Importances:
Rank  Feature  Importance
0       lstat    0.175391
1          rm    0.164404
2        crim    0.122764
3         dis    0.105787
4         age    0.105508
5       black    0.067354
6          zn    0.057196
7         nox    0.056076
8       indus    0.044667
9         tax    0.036804
10        rad    0.032230
11    ptratio    0.026436
12       chas    0.005384

Predictions:
+-----+------------------+
|label|        prediction|
+-----+------------------+
| 22.0|22.621810393953808|
| 32.7| 34.38035620375161|
| 24.5| 27.99043304384952|
| 50.0| 43.64635230730424|
| 44.0|  43.7587746385424|
+-----+------------------+
only showing top 5 rows


Mean Squared Error (MSE) on test data = 22.922
