In [56]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np

In [2]:
spark = SparkSession.builder.appName('ForestCoverTypeClassifier') \
.config('spark.warehouse.dir','/apps/hive/warehouse') \
.config('spark.driver.memory', '4G') \
.config('spark.sql.shuffle.partitions', 4) \
.enableHiveSupport().getOrCreate()

In [3]:
# Having set the driver and driver options we should have spark representing spark session 
# available straight away
spark.version

'3.1.3'

In [4]:
sc = spark.sparkContext

In [5]:
sc.setLogLevel('ERROR')

In [6]:
#  Covtype dataset publicly available dataset provides information on
# types of forest-covering parcels of land in Colorado, USA
fileloc = 'C:/Users/Administrator/Downloads/covtype.data'
dataWithoutHeader = spark.read  \
.option("inferSchema", True) \
.option("header", False) \
.csv(fileloc) 

In [27]:
# columns 10 to 14 are for wilderness_area and next 40 columns for soil type
colNames = [
    "Elevation", "Aspect", "Slope",
    "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points"] 
for x in range(4):
    colNames.append("Wilderness_Area_$" + str(x))
for x in range(40):
    colNames.append("Soil_type_$" + str(x))
colNames.append('Cover_Type')
print(colNames)

['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area_$0', 'Wilderness_Area_$1', 'Wilderness_Area_$2', 'Wilderness_Area_$3', 'Soil_type_$0', 'Soil_type_$1', 'Soil_type_$2', 'Soil_type_$3', 'Soil_type_$4', 'Soil_type_$5', 'Soil_type_$6', 'Soil_type_$7', 'Soil_type_$8', 'Soil_type_$9', 'Soil_type_$10', 'Soil_type_$11', 'Soil_type_$12', 'Soil_type_$13', 'Soil_type_$14', 'Soil_type_$15', 'Soil_type_$16', 'Soil_type_$17', 'Soil_type_$18', 'Soil_type_$19', 'Soil_type_$20', 'Soil_type_$21', 'Soil_type_$22', 'Soil_type_$23', 'Soil_type_$24', 'Soil_type_$25', 'Soil_type_$26', 'Soil_type_$27', 'Soil_type_$28', 'Soil_type_$29', 'Soil_type_$30', 'Soil_type_$31', 'Soil_type_$32', 'Soil_type_$33', 'Soil_type_$34', 'Soil_type_$35', 'Soil_type_$36', 'Soil_type_$37', 'Soil_type_$38', 'Soil_type_$39', 'Cover_Type']


In [28]:
coldict = dict(zip(['_c' + str(x) for x in range(55)], colNames))

In [29]:
coldict

{'_c0': 'Elevation',
 '_c1': 'Aspect',
 '_c2': 'Slope',
 '_c3': 'Horizontal_Distance_To_Hydrology',
 '_c4': 'Vertical_Distance_To_Hydrology',
 '_c5': 'Horizontal_Distance_To_Roadways',
 '_c6': 'Hillshade_9am',
 '_c7': 'Hillshade_Noon',
 '_c8': 'Hillshade_3pm',
 '_c9': 'Horizontal_Distance_To_Fire_Points',
 '_c10': 'Wilderness_Area_$0',
 '_c11': 'Wilderness_Area_$1',
 '_c12': 'Wilderness_Area_$2',
 '_c13': 'Wilderness_Area_$3',
 '_c14': 'Soil_type_$0',
 '_c15': 'Soil_type_$1',
 '_c16': 'Soil_type_$2',
 '_c17': 'Soil_type_$3',
 '_c18': 'Soil_type_$4',
 '_c19': 'Soil_type_$5',
 '_c20': 'Soil_type_$6',
 '_c21': 'Soil_type_$7',
 '_c22': 'Soil_type_$8',
 '_c23': 'Soil_type_$9',
 '_c24': 'Soil_type_$10',
 '_c25': 'Soil_type_$11',
 '_c26': 'Soil_type_$12',
 '_c27': 'Soil_type_$13',
 '_c28': 'Soil_type_$14',
 '_c29': 'Soil_type_$15',
 '_c30': 'Soil_type_$16',
 '_c31': 'Soil_type_$17',
 '_c32': 'Soil_type_$18',
 '_c33': 'Soil_type_$19',
 '_c34': 'Soil_type_$20',
 '_c35': 'Soil_type_$21',
 '_c36'

In [30]:
from pyspark.sql.functions import *

In [31]:
# lets create the data frame wiht column names
# and cast the label that we have to forecast to double
data = dataWithoutHeader.select([col(c).alias(coldict.get(c, c)) for c in dataWithoutHeader.columns])

In [32]:
data = data.withColumn('Cover_Type', data.Cover_Type.cast('double'))

In [33]:
# Split into 90% train (+ CV), 10% test
trainData, testData = data.randomSplit([0.9, 0.1])

In [53]:
trainData.show(1,False)

+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+------------------+------------------+------------------+------------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+----------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|Hillshade_Noon|Hillshade_3pm|H

In [57]:
# inputCols = list(filter(lambda x: x != 'Cover_Type', trainData.columns))
# type(trainData.columns)
len(trainData.columns)
type(trainData.columns[54])
# filter(lambda x : str(x) != 'Cover_Type', trainData.columns)
col_list = trainData.columns
col_list
col_list.remove('Cover_Type')
col_list

['Elevation',
 'Aspect',
 'Slope',
 'Horizontal_Distance_To_Hydrology',
 'Vertical_Distance_To_Hydrology',
 'Horizontal_Distance_To_Roadways',
 'Hillshade_9am',
 'Hillshade_Noon',
 'Hillshade_3pm',
 'Horizontal_Distance_To_Fire_Points',
 'Wilderness_Area_$0',
 'Wilderness_Area_$1',
 'Wilderness_Area_$2',
 'Wilderness_Area_$3',
 'Soil_type_$0',
 'Soil_type_$1',
 'Soil_type_$2',
 'Soil_type_$3',
 'Soil_type_$4',
 'Soil_type_$5',
 'Soil_type_$6',
 'Soil_type_$7',
 'Soil_type_$8',
 'Soil_type_$9',
 'Soil_type_$10',
 'Soil_type_$11',
 'Soil_type_$12',
 'Soil_type_$13',
 'Soil_type_$14',
 'Soil_type_$15',
 'Soil_type_$16',
 'Soil_type_$17',
 'Soil_type_$18',
 'Soil_type_$19',
 'Soil_type_$20',
 'Soil_type_$21',
 'Soil_type_$22',
 'Soil_type_$23',
 'Soil_type_$24',
 'Soil_type_$25',
 'Soil_type_$26',
 'Soil_type_$27',
 'Soil_type_$28',
 'Soil_type_$29',
 'Soil_type_$30',
 'Soil_type_$31',
 'Soil_type_$32',
 'Soil_type_$33',
 'Soil_type_$34',
 'Soil_type_$35',
 'Soil_type_$36',
 'Soil_type_$37

In [59]:
# Simple Decision Tree
# inputCols = list(filter(lambda x: x != 'Cover_Type', trainData.columns))
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.feature import VectorIndexer
assembler = VectorAssembler().setInputCols(col_list).setOutputCol('featureVector')
#  and use the vector assembler transformer to transform the training data
assembledTrainData = assembler.transform(trainData)
print("\nThe feature vector produced by the assembler")
assembledTrainData.select("featureVector").show(truncate = False)


The feature vector produced by the assembler
+-----------------------------------------------------------------------------------------------------+
|featureVector                                                                                        |
+-----------------------------------------------------------------------------------------------------+
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1863.0,37.0,17.0,120.0,18.0,90.0,217.0,202.0,115.0,769.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1879.0,28.0,19.0,30.0,12.0,95.0,209.0,196.0,117.0,778.0,1.0,1.0])   |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1888.0,33.0,22.0,150.0,46.0,108.0,209.0,185.0,103.0,735.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,14],[1889.0,28.0,22.0,150.0,23.0,120.0,205.0,185.0,108.0,759.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1889.0,353.0,30.0,95.0,39.0,67.0,153.0,172.0,146.0,600.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1896.0,337.0,12.0,30.0,6.0,175.0,195.0,224.0,168.0,732.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,

In [60]:
# the classifier is an estimator whose parameters we set
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
DecisionTreeClassifier
classifier = DecisionTreeClassifier(featuresCol='featureVector', labelCol='Cover_Type', 
                                    predictionCol='prediction', seed=100)
model = classifier.fit(assembledTrainData)

In [61]:
print("\nThe model printed out")
print(model.toDebugString)


The model printed out
DecisionTreeClassificationModel: uid=DecisionTreeClassifier_35647b828831, depth=5, numNodes=49, numClasses=8, numFeatures=54
  If (feature 0 <= 3049.5)
   If (feature 0 <= 2564.5)
    If (feature 10 <= 0.5)
     If (feature 0 <= 2459.5)
      If (feature 3 <= 15.0)
       Predict: 4.0
      Else (feature 3 > 15.0)
       Predict: 3.0
     Else (feature 0 > 2459.5)
      If (feature 17 <= 0.5)
       Predict: 2.0
      Else (feature 17 > 0.5)
       Predict: 3.0
    Else (feature 10 > 0.5)
     If (feature 9 <= 5433.0)
      Predict: 2.0
     Else (feature 9 > 5433.0)
      If (feature 5 <= 572.0)
       Predict: 2.0
      Else (feature 5 > 572.0)
       Predict: 5.0
   Else (feature 0 > 2564.5)
    If (feature 0 <= 2956.5)
     If (feature 15 <= 0.5)
      If (feature 17 <= 0.5)
       Predict: 2.0
      Else (feature 17 > 0.5)
       Predict: 3.0
     Else (feature 15 > 0.5)
      Predict: 3.0
    Else (feature 0 > 2956.5)
     If (feature 3 <= 191.0)
      If (

In [62]:
print("\nThe feature importances that are provided by the model in descending order")
# model provides column numbers - lets zip with column names to put comprehensible stuff
sorted([x for x  in zip(colNames, model.featureImportances.toArray())], key=lambda x: -x[1])


The feature importances that are provided by the model in descending order


[('Elevation', 0.7924945051788969),
 ('Horizontal_Distance_To_Hydrology', 0.041689463182791195),
 ('Wilderness_Area_$0', 0.03255111826682026),
 ('Soil_type_$3', 0.030317682186528125),
 ('Soil_type_$31', 0.02809401736308813),
 ('Hillshade_Noon', 0.025565383800904024),
 ('Soil_type_$1', 0.023019547300731476),
 ('Wilderness_Area_$2', 0.011499187322143518),
 ('Horizontal_Distance_To_Roadways', 0.006202649430273973),
 ('Soil_type_$22', 0.00564310239829267),
 ('Hillshade_9am', 0.002505064671203571),
 ('Horizontal_Distance_To_Fire_Points', 0.0004182788983263319),
 ('Aspect', 0.0),
 ('Slope', 0.0),
 ('Vertical_Distance_To_Hydrology', 0.0),
 ('Hillshade_3pm', 0.0),
 ('Wilderness_Area_$1', 0.0),
 ('Wilderness_Area_$3', 0.0),
 ('Soil_type_$0', 0.0),
 ('Soil_type_$2', 0.0),
 ('Soil_type_$4', 0.0),
 ('Soil_type_$5', 0.0),
 ('Soil_type_$6', 0.0),
 ('Soil_type_$7', 0.0),
 ('Soil_type_$8', 0.0),
 ('Soil_type_$9', 0.0),
 ('Soil_type_$10', 0.0),
 ('Soil_type_$11', 0.0),
 ('Soil_type_$12', 0.0),
 ('Soil_

In [63]:
#  we use the model transformer to get predictions from training data
predictions = model.transform(assembledTrainData)
print("\nTake a look at the predictions")
predictions.select("Cover_Type", "prediction", "probability").show(truncate = False)


Take a look at the predictions
+----------+----------+------------------------------------------------------------------------------------------------+
|Cover_Type|prediction|probability                                                                                     |
+----------+----------+------------------------------------------------------------------------------------------------+
|6.0       |3.0       |[0.0,0.0,0.03750835003340013,0.6285905143620575,0.04849699398797595,0.0,0.28540414161656646,0.0]|
|6.0       |3.0       |[0.0,0.0,0.03750835003340013,0.6285905143620575,0.04849699398797595,0.0,0.28540414161656646,0.0]|
|6.0       |3.0       |[0.0,0.0,0.03750835003340013,0.6285905143620575,0.04849699398797595,0.0,0.28540414161656646,0.0]|
|6.0       |3.0       |[0.0,0.0,0.03750835003340013,0.6285905143620575,0.04849699398797595,0.0,0.28540414161656646,0.0]|
|6.0       |3.0       |[0.0,0.0,0.03750835003340013,0.6285905143620575,0.04849699398797595,0.0,0.28540414161656646,0.0]|


In [64]:
# now lets evaluate - initialize an evaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol='Cover_Type', predictionCol= 'prediction')

In [68]:
#  lets get the classification metrics
accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)
f1 = evaluator.setMetricName("f1").evaluate(predictions)
print("\nPrinting classification metrics")
print("accuracy: " , accuracy)
print("f1: " , f1)


Printing classification metrics
accuracy:  0.70070871789967
f1:  0.6829352420240993


In [69]:
predictions.rdd.take(2)

[Row(Elevation=1863, Aspect=37, Slope=17, Horizontal_Distance_To_Hydrology=120, Vertical_Distance_To_Hydrology=18, Horizontal_Distance_To_Roadways=90, Hillshade_9am=217, Hillshade_Noon=202, Hillshade_3pm=115, Horizontal_Distance_To_Fire_Points=769, Wilderness_Area_$0=0, Wilderness_Area_$1=0, Wilderness_Area_$2=0, Wilderness_Area_$3=1, Soil_type_$0=0, Soil_type_$1=1, Soil_type_$2=0, Soil_type_$3=0, Soil_type_$4=0, Soil_type_$5=0, Soil_type_$6=0, Soil_type_$7=0, Soil_type_$8=0, Soil_type_$9=0, Soil_type_$10=0, Soil_type_$11=0, Soil_type_$12=0, Soil_type_$13=0, Soil_type_$14=0, Soil_type_$15=0, Soil_type_$16=0, Soil_type_$17=0, Soil_type_$18=0, Soil_type_$19=0, Soil_type_$20=0, Soil_type_$21=0, Soil_type_$22=0, Soil_type_$23=0, Soil_type_$24=0, Soil_type_$25=0, Soil_type_$26=0, Soil_type_$27=0, Soil_type_$28=0, Soil_type_$29=0, Soil_type_$30=0, Soil_type_$31=0, Soil_type_$32=0, Soil_type_$33=0, Soil_type_$34=0, Soil_type_$35=0, Soil_type_$36=0, Soil_type_$37=0, Soil_type_$38=0, Soil_type_

In [70]:
#  for getting the confusion matrix out 
#  we have multiclassmetrics available from mllib  which expects RDDs
#  we can always get the underlying RDD
from pyspark.mllib.evaluation import MulticlassMetrics
predictionRDD = predictions.select('prediction', 'Cover_Type').rdd
multiclassMetrics = MulticlassMetrics(predictionRDD)
print("\nPrinting the confusion matrix obtaned from multiclass metrics")

print(multiclassMetrics.confusionMatrix().toArray())




Printing the confusion matrix obtaned from multiclass metrics
[[1.25887e+05 5.90460e+04 1.59000e+02 0.00000e+00 0.00000e+00 0.00000e+00
  5.36400e+03]
 [4.67860e+04 2.02939e+05 4.42200e+03 1.16000e+02 5.50000e+01 0.00000e+00
  8.02000e+02]
 [0.00000e+00 5.43600e+03 2.60830e+04 6.60000e+02 0.00000e+00 0.00000e+00
  0.00000e+00]
 [0.00000e+00 1.80000e+01 1.45200e+03 1.00600e+03 0.00000e+00 0.00000e+00
  0.00000e+00]
 [3.00000e+00 7.75900e+03 6.91000e+02 0.00000e+00 7.90000e+01 0.00000e+00
  0.00000e+00]
 [0.00000e+00 5.76600e+03 9.21800e+03 5.88000e+02 0.00000e+00 0.00000e+00
  0.00000e+00]
 [7.97200e+03 9.10000e+01 5.80000e+01 0.00000e+00 0.00000e+00 0.00000e+00
  1.03190e+04]]


In [71]:
print(assembledTrainData.count())
print(multiclassMetrics.confusionMatrix().toArray().sum())

522775
522775.0


In [72]:
 data.groupBy("Cover_Type").count().orderBy('Cover_Type').select('count').rdd.map(lambda x: x[0]/522775).collect()

[0.4052221318923055,
 0.5419176509970829,
 0.06839271196977667,
 0.00525465066233083,
 0.018158863755917937,
 0.03322079288412797,
 0.03923293960116685]

In [73]:
def classProbabilities(data):
    '''
    to calcluate distribution of result classes in any part dataframe that we obtain using splits
    '''
    total = data.count()
    return data.groupBy("Cover_Type").count() \
      .orderBy("Cover_Type") \
      .select("count") \
      .rdd.map(lambda x: x[0]/522775).collect()  

In [74]:
from functools import reduce
print("------RANDOM CLASSIFIER PROBABILITIES--------")
trainProbabilities = classProbabilities(trainData)
testProbabilities = classProbabilities(testData)
randomClassificationProbability = reduce(lambda x, y: x + y , [ a[0] * a[1] 
                                    for a  in zip(trainProbabilities, testProbabilities)])
print("\nRandom classification prbability - " , randomClassificationProbability)

------RANDOM CLASSIFIER PROBABILITIES--------

Random classification prbability -  0.041904663410863796


In [75]:
print("\nNow using pipelines and grids to find the best fit")
#   evaluate(trainData, testData)

# inputCols = list(filter(lambda x: x != 'Cover_Type', trainData.columns))
assembler = VectorAssembler().setInputCols(col_list).setOutputCol('featureVector')
classifier = DecisionTreeClassifier(featuresCol='featureVector', labelCol='Cover_Type', 
                                    predictionCol='prediction', seed=100)
from pyspark.ml import PipelineModel, Pipeline 
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit 

pipeline = Pipeline(stages=[assembler, classifier])

paramGrid = ParamGridBuilder()  \
.addGrid(classifier.impurity, ["gini", "entropy"]) \
.addGrid(classifier.maxDepth, [1, 20]) \
.addGrid(classifier.maxBins, [40, 300]) \
.addGrid(classifier.minInfoGain, [0.0, 0.05]) \
.build()

multiClassEval = MulticlassClassificationEvaluator(labelCol='Cover_Type', 
                                        predictionCol= 'prediction', metricName='accuracy')

# we employ the traihihg validation split to run the pipeline
#   and setting train ratio we can hold out a cross vaildatoin set from within the training data
#     val validator = new TrainValidationSplit().
#       setSeed(Random.nextLong()).
#       setEstimator(pipeline).
#       setEvaluator(multiclassEval).
#       setEstimatorParamMaps(paramGrid).
#       setTrainRatio(0.9)
validator = TrainValidationSplit(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=multiClassEval,
    trainRatio=0.9,
    seed= 100
)

#  we fit it to the train data
validatorModel = validator.fit(trainData)



Now using pipelines and grids to find the best fit


In [76]:
paramsAndMetrics =  sorted(
    zip(validatorModel.validationMetrics, validatorModel.getEstimatorParamMaps()), 
    key = lambda x: -x[0])

In [77]:
paramsAndMetrics

[(0.9095193573622851,
  {Param(parent='DecisionTreeClassifier_abad881777f8', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'entropy',
   Param(parent='DecisionTreeClassifier_abad881777f8', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 20,
   Param(parent='DecisionTreeClassifier_abad881777f8', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 40,
   Param(parent='DecisionTreeClassifier_abad881777f8', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0}),
 (0.9085462420576619,
  {Param(parent='DecisionTreeClassifier_abad881777f8', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy

In [78]:
bestModel = validatorModel.bestModel
ppmodel = bestModel.stages[-1]

In [86]:
len(bestModel.stages)

2

In [80]:
print("Printing the max validation metrics achieved by any of the runs")
print(reduce(lambda x, y: x if x > y else y, validatorModel.validationMetrics))

#  find out the performance for the test data
testAccuracy = multiClassEval.evaluate(bestModel.transform(testData))
print("\nAccuracy on the test data: " , testAccuracy)

trainAccuracy = multiClassEval.evaluate(bestModel.transform(trainData))
print("\nAccuracy on the train data: " , trainAccuracy)

Printing the max validation metrics achieved by any of the runs
0.9095193573622851

Accuracy on the test data:  0.9130106289815753

Accuracy on the train data:  0.9506384199703505


In [81]:
import numpy as np
tarr = np.array([0, 0, 1.0])
tarr.tolist()

[0.0, 0.0, 1.0]

In [82]:
# UnOneHotEncode
def unencodeOneHot(data):
    wildernessCols = list(map(lambda x: "Wilderness_Area_$" + str(x), range(4)))
    wildernessAssembler = VectorAssembler(
          inputCols = wildernessCols, 
          outputCol = "wilderness"
    )

    #   we have a udf here which is going to find the item which is 1.0 in the 4 columns
    #  for wilderness and the 40 for the soil
    unhotUDF = udf(lambda vec: vec.toArray().tolist().index(1.0))

    withWilderness = wildernessAssembler.transform(data)  
    withWilderness = withWilderness.select([x for x in withWilderness.columns 
                                            if x not in wildernessCols])
    withWilderness = withWilderness.withColumn('wilderness', unhotUDF('wilderness').cast('double'))

    soilCols = list(map(lambda x: "Soil_type_$" + str(x), range(40)))
    soilAssembler = VectorAssembler(
          inputCols = soilCols, 
          outputCol = "soil"
    )

    soilDF = soilAssembler.transform(withWilderness)
    soilDF = soilDF.select([x for x in soilDF.columns 
                                            if x not in soilCols])
    soilDF = soilDF.withColumn('soil', unhotUDF('soil').cast('double'))

    return soilDF


In [83]:
unencTrainData = unencodeOneHot(trainData)

In [84]:
unencTrainData.groupBy('soil').count().orderBy('soil').show()

+----+-----+
|soil|count|
+----+-----+
| 0.0| 2714|
| 1.0| 6753|
| 2.0| 4349|
| 3.0|11176|
| 4.0| 1437|
| 5.0| 5920|
| 6.0|   95|
| 7.0|  160|
| 8.0| 1012|
| 9.0|29329|
|10.0|11161|
|11.0|26954|
|12.0|15668|
|13.0|  540|
|14.0|    2|
|15.0| 2576|
|16.0| 3088|
|17.0| 1724|
|18.0| 3622|
|19.0| 8287|
+----+-----+
only showing top 20 rows



# Random Forest

In [89]:
unencode_col_list = unencTrainData.columns
unencode_col_list.remove('Cover_Type')
unencode_col_list

['Elevation',
 'Aspect',
 'Slope',
 'Horizontal_Distance_To_Hydrology',
 'Vertical_Distance_To_Hydrology',
 'Horizontal_Distance_To_Roadways',
 'Hillshade_9am',
 'Hillshade_Noon',
 'Hillshade_3pm',
 'Horizontal_Distance_To_Fire_Points',
 'wilderness',
 'soil']

In [90]:
unencTrainData = unencodeOneHot(trainData)
unencTestData = unencodeOneHot(testData)
# inputCols = list(filter(lambda x: x != 'Cover_Type', unencTrainData.columns))

assemblerForest = VectorAssembler().setInputCols(
    unencode_col_list).setOutputCol('featureVector')

indexerForest = VectorIndexer(
      maxCategories = 40,
      inputCol = "featureVector", 
      outputCol = "indexedVector")

classifierForest = RandomForestClassifier(
      seed = 100, 
      labelCol = "Cover_Type", 
      featuresCol = "indexedVector", 
      predictionCol = "prediction", 
      impurity = "entropy",
      maxDepth = 20,
      maxBins = 300)

pipelineForest = Pipeline(stages=[assemblerForest, 
                                  indexerForest, classifierForest])

paramGridForest = ParamGridBuilder()  \
.addGrid(classifierForest.minInfoGain, [0.0, 0.05]) \
.addGrid(classifierForest.numTrees, [1, 10]) \
.build()

multiClassEvalForest = MulticlassClassificationEvaluator(labelCol='Cover_Type', 
                                        predictionCol= 'prediction',
                                        metricName='accuracy')

validatorForest = TrainValidationSplit(
    estimator=pipelineForest, 
    estimatorParamMaps=paramGridForest, 
    evaluator=multiClassEvalForest,
    trainRatio=0.9,
    seed= 100
)

#  we fit it to the train data
validatorForest = validatorForest.fit(unencTrainData)


In [91]:
bestModel = validatorForest.bestModel
print("\nThe best forest model parameters")
print(bestModel.stages[-1].extractParamMap())
print("\nThe nubmer of trees used by the forestModel")
print(bestModel.stages[-1].getNumTrees)


The best forest model parameters
{Param(parent='RandomForestClassifier_7d5d75932eb9', name='bootstrap', doc='Whether bootstrap samples are used when building trees.'): True, Param(parent='RandomForestClassifier_7d5d75932eb9', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False, Param(parent='RandomForestClassifier_7d5d75932eb9', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10, Param(parent='RandomForestClassifier_7d5d75932eb9', name='featureSubsetStrategy', doc="The number of features to consider for

In [92]:
print("\nFeature importances obtained from random forest")
sorted([x for x  in zip(unencTrainData.columns, 
                        bestModel.stages[-1].featureImportances.toArray())], 
       key=lambda x: -x[1])

testAccuracy = multiClassEvalForest.evaluate(bestModel.transform(unencTestData))
print(testAccuracy)

#     bestModel.transform(unencTestData.drop("Cover_Type")).select("prediction").show()


Feature importances obtained from random forest
0.948280302900218
