In [None]:
import pyspark

from pyspark.sql import SparkSession

In [None]:
# Create (or reuse) the Spark session for this notebook, with 8g of driver memory.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "8g")
    .appName("DecisionTree")
    .getOrCreate()
)

# Preparing the Data

In [None]:
# Read the header-less CSV and let Spark infer the column types.
data_without_header = (
    spark.read
    .option("inferSchema", True)
    .option("header", False)
    .csv("data/covtype.data")
)

data_without_header.printSchema()

In [None]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col


# Column names in file order: ten numeric features, four wilderness-area flags,
# forty soil-type flags, and finally the label.
numeric_features = [
    "Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways",
    "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]
wilderness_flags = [f"Wilderness_Area_{idx}" for idx in range(4)]
soil_flags = [f"Soil_Type_{idx}" for idx in range(40)]

colnames = [*numeric_features, *wilderness_flags, *soil_flags, "Cover_Type"]

# Attach the column names, then cast the label column to double.
named_data = data_without_header.toDF(*colnames)
data = named_data.withColumn("Cover_Type", col("Cover_Type").cast(DoubleType()))

data.head()

# Our First Decision Tree

In [None]:
# Hold out 10% of the rows for testing. The split is seeded with the same value
# the classifiers and the validator in this notebook use, so re-running the
# notebook puts the same rows in each set instead of reshuffling them.
(train_data, test_data) = data.randomSplit([0.9, 0.1], seed=1234)

# Both splits are reused many times below, so keep them cached.
train_data.cache()
test_data.cache()

In [None]:
from pyspark.ml.feature import VectorAssembler

# Every column except the trailing label is a model input.
input_cols = colnames[:-1]
vector_assembler = VectorAssembler(). \
    setInputCols(input_cols). \
    setOutputCol("featureVector")

assembled_train_data = vector_assembler.transform(train_data)

# Show the assembled vectors in full, without truncating the cell contents.
feature_vectors = assembled_train_data.select("featureVector")
feature_vectors.show(truncate=False)

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree reading the assembled vectors and predicting Cover_Type.
classifier = DecisionTreeClassifier(). \
    setSeed(1234). \
    setLabelCol("Cover_Type"). \
    setFeaturesCol("featureVector"). \
    setPredictionCol("prediction")

model = classifier.fit(assembled_train_data)

# Print the fitted tree's debug string.
print(model.toDebugString)

In [None]:
import pandas as pd

# One row per input column, ordered from most to least important.
importance_frame = pd.DataFrame(
    model.featureImportances.toArray(),
    index=input_cols,
    columns=["importance"],
)
importance_frame.sort_values(by="importance", ascending=False)

In [None]:
# Score the assembled training rows (the same rows the tree was fitted on).
predictions = model.transform(assembled_train_data)

# Preview label, predicted label and class probabilities for ten rows.
prediction_preview = predictions.select("Cover_Type", "prediction", "probability")
prediction_preview.show(10, truncate=False)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="Cover_Type",
                                              predictionCol="prediction")

# Only the last expression of a cell is displayed, so a bare accuracy expression
# followed by the F1 expression would show F1 alone. Capture both metrics and
# print them explicitly. The evaluator is left configured for "f1" afterwards.
accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)
f1 = evaluator.setMetricName("f1").evaluate(predictions)

print(f"accuracy: {accuracy}")
print(f"f1: {f1}")

In [None]:
# One row per actual Cover_Type, one column per predicted label 1 through 7.
counts_by_label = predictions.groupBy("Cover_Type"). \
    pivot("prediction", range(1,8)). \
    count()

# Label/prediction pairs that never occur come back as nulls; show them as zero.
confusion_matrix = counts_by_label.na.fill(0.0).orderBy("Cover_Type")

confusion_matrix.show()

In [None]:
from pyspark.sql import DataFrame

def class_probabilities(data):
    """Return the share of rows in `data` that belongs to each Cover_Type.

    The result is the collected list of single-field rows (field
    `count_proportion`), ordered by Cover_Type.
    """
    n_rows = data.count()

    per_class = data.groupBy("Cover_Type").count().orderBy("Cover_Type")
    # Work with the counts as doubles before dividing by the row total.
    double_counts = per_class.select(col("count").cast(DoubleType()))
    proportions = double_counts.withColumn("count_proportion", col("count") / n_rows)

    return proportions.select("count_proportion").collect()


# Class proportions of the training split and of the test split, in that order.
train_prior_probabilities, test_prior_probabilities = [
    class_probabilities(split) for split in (train_data, test_data)
]

train_prior_probabilities

In [None]:
# Unwrap the single-field rows into plain floats under new names. Rebinding the
# original variables would make this cell fail when run a second time, because
# it would then try to index into floats.
train_probs = [row[0] for row in train_prior_probabilities]
test_probs = [row[0] for row in test_prior_probabilities]

# Sum over classes of (training share * test share). The two lists are paired
# by position, which assumes both splits contain the same set of classes.
sum(train_p * test_p for train_p, test_p in zip(train_probs, test_probs))

# Tuning Decision Trees

In [None]:
from pyspark.ml import Pipeline

# Stage 1: pack the input columns into a single feature vector.
assembler = VectorAssembler(). \
    setInputCols(input_cols). \
    setOutputCol("featureVector")

# Stage 2: the decision tree that consumes that vector.
classifier = DecisionTreeClassifier(). \
    setSeed(1234). \
    setLabelCol("Cover_Type"). \
    setFeaturesCol("featureVector"). \
    setPredictionCol("prediction")

pipeline = Pipeline().setStages([assembler, classifier])

In [None]:
from pyspark.ml.tuning import ParamGridBuilder

# Two candidate values for each of four hyperparameters: 16 combinations in all.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(classifier.impurity, ["gini", "entropy"])
grid_builder.addGrid(classifier.maxDepth, [1, 20])
grid_builder.addGrid(classifier.maxBins, [40, 300])
grid_builder.addGrid(classifier.minInfoGain, [0.0, 0.05])
paramGrid = grid_builder.build()

# Score each combination by accuracy.
multiclassEval = MulticlassClassificationEvaluator(labelCol="Cover_Type",
                                                   predictionCol="prediction",
                                                   metricName="accuracy")

In [None]:
from pyspark.ml.tuning import TrainValidationSplit

# Fit every grid combination on 90% of the rows it is given and score it on the rest.
validator_settings = dict(
    seed=1234,
    estimator=pipeline,
    evaluator=multiclassEval,
    estimatorParamMaps=paramGrid,
    trainRatio=0.9,
)
validator = TrainValidationSplit(**validator_settings)

validator_model = validator.fit(train_data)

In [None]:
from pprint import pprint

# The winning pipeline; index 1 is its second stage.
best_model = validator_model.bestModel
best_second_stage = best_model.stages[1]
pprint(best_second_stage.extractParamMap())

In [None]:
# Reuse the validator model fitted earlier in the notebook instead of calling
# validator.fit(train_data) again: a second fit retrains every grid combination
# and leaves these metrics detached from the model that `best_model` came from.
metrics = validator_model.validationMetrics
params = validator_model.getEstimatorParamMaps()

# Pair each validation metric with its parameter map, best score first.
metrics_and_params = list(zip(metrics, params))
metrics_and_params.sort(key=lambda x: x[0], reverse=True)
metrics_and_params

In [None]:
# Take the maximum instead of sorting in place: `metrics` is the model's own
# validationMetrics list, and reordering it would break its positional pairing
# with the estimator parameter maps.
print(max(metrics))

In [None]:
# Score the held-out test split with the best pipeline found by the validator.
test_predictions = best_model.transform(test_data)
multiclassEval.evaluate(test_predictions)

# Categorical Features Revisited

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def _collapse_one_hot(data, one_hot_cols, output_col):
    """Replace a group of one-hot columns with one integer index column.

    Args:
        data: DataFrame containing every column named in `one_hot_cols`.
        one_hot_cols: Names of the 0/1 flag columns, in index order.
        output_col: Name of the integer column that replaces them.

    Returns:
        `data` without `one_hot_cols` and with `output_col` holding the position
        of the flag equal to 1. The lookup raises ValueError for a row in which
        no flag equals 1.
    """
    # Declaring the return type makes the UDF yield integers directly, instead
    # of an untyped (string) result that has to be cast back to an integer.
    hot_index_udf = udf(lambda v: v.toArray().tolist().index(1), IntegerType())

    assembler = VectorAssembler(). \
        setInputCols(one_hot_cols). \
        setOutputCol(output_col)

    # Assemble the flags into a vector, drop the flags, then overwrite the
    # vector column with the index of its active entry.
    return assembler.transform(data). \
        drop(*one_hot_cols). \
        withColumn(output_col, hot_index_udf(col(output_col)))


def unencode_one_hot(data):
    """Collapse the one-hot wilderness and soil flags into two integer columns.

    Args:
        data: DataFrame with columns Wilderness_Area_0..3 and Soil_Type_0..39.

    Returns:
        DataFrame in which those 44 columns are replaced by integer columns
        "wilderness" and "soil", appended in that order.
    """
    wilderness_cols = ['Wilderness_Area_' + str(i) for i in range(4)]
    soil_cols = ['Soil_Type_' + str(i) for i in range(40)]

    with_wilderness = _collapse_one_hot(data, wilderness_cols, "wilderness")
    with_soil = _collapse_one_hot(with_wilderness, soil_cols, "soil")

    return with_soil

In [None]:
# Replace the one-hot wilderness and soil flags of the training split with the
# integer "wilderness" and "soil" columns produced by unencode_one_hot.
unenc_train_data = unencode_one_hot(train_data)
# Print the schema of the result.
unenc_train_data.printSchema()

In [None]:
# Number of training rows for each value of the "wilderness" column.
wilderness_counts = unenc_train_data.groupBy('wilderness').count()
wilderness_counts.show()

In [None]:
from pyspark.ml.feature import VectorIndexer

# Every column of the unencoded data except the label is a model input.
cols = unenc_train_data.columns
input_cols = [name for name in cols if name != 'Cover_Type']

assembler = VectorAssembler(inputCols=input_cols, outputCol="featureVector")

# Index the assembled vector with a category limit of 40.
indexer = VectorIndexer(maxCategories=40,
                        inputCol="featureVector",
                        outputCol="indexedVector")

# The tree reads the indexed vector rather than the raw one.
classifier = DecisionTreeClassifier(labelCol="Cover_Type",
                                    featuresCol="indexedVector",
                                    predictionCol="prediction")

pipeline = Pipeline(stages=[assembler, indexer, classifier])

# Random Forests

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest reading the indexed feature vector.
classifier = RandomForestClassifier(). \
    setSeed(1234). \
    setLabelCol("Cover_Type"). \
    setFeaturesCol("indexedVector"). \
    setPredictionCol("prediction")

In [None]:
# Display the column names of the unencoded training data.
unenc_train_data.columns

In [None]:
# Skipped in book

# Inputs: every column of the unencoded data except the label.
cols = unenc_train_data.columns
input_cols = [name for name in cols if name != 'Cover_Type']

assembler = VectorAssembler(inputCols=input_cols, outputCol="featureVector")

indexer = VectorIndexer(maxCategories=40,
                        inputCol="featureVector",
                        outputCol="indexedVector")

# Three stages: assemble, index, then the current `classifier`.
pipeline = Pipeline(stages=[assembler, indexer, classifier])

# Two candidate values for each of four hyperparameters.
paramGrid = (
    ParamGridBuilder()
    .addGrid(classifier.impurity, ["gini", "entropy"])
    .addGrid(classifier.maxDepth, [1, 20])
    .addGrid(classifier.maxBins, [40, 300])
    .addGrid(classifier.minInfoGain, [0.0, 0.05])
    .build()
)

multiclassEval = MulticlassClassificationEvaluator(labelCol="Cover_Type",
                                                   predictionCol="prediction",
                                                   metricName="accuracy")

validator_settings = dict(
    seed=1234,
    estimator=pipeline,
    evaluator=multiclassEval,
    estimatorParamMaps=paramGrid,
    trainRatio=0.9,
)
validator = TrainValidationSplit(**validator_settings)

validator_model = validator.fit(unenc_train_data)

best_model = validator_model.bestModel

In [None]:
# Index 2 is the third stage of the best pipeline.
forest_model = best_model.stages[2]

# Pair every input column with its importance, most important first.
importance_values = forest_model.featureImportances.toArray()
feature_importance_list = sorted(zip(input_cols, importance_values),
                                 key=lambda pair: pair[1],
                                 reverse=True)

pprint(feature_importance_list)

# Making Predictions

In [None]:
unenc_test_data = unencode_one_hot(test_data)

# Predict on the test rows with the label column removed, and show one prediction.
unlabeled_test_data = unenc_test_data.drop("Cover_Type")
test_predictions_only = best_model.transform(unlabeled_test_data).select("prediction")
test_predictions_only.show(1)