In [1]:
import os.path
import urllib.request
import gzip
import shutil

# Base location of the UCI forest cover type dataset files.
covtype_base_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/covtype/'

# Download each remote file only when it is not already on disk, so that
# re-running this cell does not hit the network again.
for remote_name in ('covtype.data.gz', 'covtype.info', 'old_covtype.info'):
    if not os.path.exists(remote_name):
        urllib.request.urlretrieve(covtype_base_url + remote_name, remote_name)

# Decompress independently of the download: the archive may already be present
# while the extracted file is missing (e.g. it was deleted, or a previous run
# was interrupted before extraction).
if not os.path.exists('covtype.data'):
    # Extract to a temporary name and rename at the end so an interrupted
    # extraction never leaves a truncated 'covtype.data' that later runs
    # would mistake for a complete file.
    with gzip.open('covtype.data.gz', 'rb') as f_in, open('covtype.data.part', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.replace('covtype.data.part', 'covtype.data')


In [2]:
%matplotlib inline
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start a new one
# registered under the application name "forest-cover-type".
spark = (
    SparkSession.builder
    .appName("forest-cover-type")
    .getOrCreate()
)

In [3]:
# Display the address of the Spark web UI for this session's SparkContext.
spark.sparkContext.uiWebUrl

'http://60335d42cda3:4040'

In [4]:
# covtype.data has no header row, so the column names are built here.
# Wilderness_Area and Soil_Type are one-hot encoded across 4 and 40 columns.
Wilderness_Area_cols = ['Wilderness_Area_%d' % i for i in range(4)]
Soil_Type_cols = ['Soil_Type_%d' % i for i in range(40)]

headers = (
    [
        'Elevation',
        'Aspect',
        'Slope',
        'Horizontal_Distance_To_Hydrology',
        'Vertical_Distance_To_Hydrology',
        'Horizontal_Distance_To_Roadways',
        'Hillshade_9am',
        'Hillshade_Noon',
        'Hillshade_3pm',
        'Horizontal_Distance_To_Fire_Points',
    ]
    + Wilderness_Area_cols
    + Soil_Type_cols
    + ['Cover_Type']
)

# Read the CSV with inferred column types, then apply the names positionally.
data = spark.read.csv('covtype.data', inferSchema='true').toDF(*headers)
data.printSchema()

root
 |-- Elevation: integer (nullable = true)
 |-- Aspect: integer (nullable = true)
 |-- Slope: integer (nullable = true)
 |-- Horizontal_Distance_To_Hydrology: integer (nullable = true)
 |-- Vertical_Distance_To_Hydrology: integer (nullable = true)
 |-- Horizontal_Distance_To_Roadways: integer (nullable = true)
 |-- Hillshade_9am: integer (nullable = true)
 |-- Hillshade_Noon: integer (nullable = true)
 |-- Hillshade_3pm: integer (nullable = true)
 |-- Horizontal_Distance_To_Fire_Points: integer (nullable = true)
 |-- Wilderness_Area_0: integer (nullable = true)
 |-- Wilderness_Area_1: integer (nullable = true)
 |-- Wilderness_Area_2: integer (nullable = true)
 |-- Wilderness_Area_3: integer (nullable = true)
 |-- Soil_Type_0: integer (nullable = true)
 |-- Soil_Type_1: integer (nullable = true)
 |-- Soil_Type_2: integer (nullable = true)
 |-- Soil_Type_3: integer (nullable = true)
 |-- Soil_Type_4: integer (nullable = true)
 |-- Soil_Type_5: integer (nullable = true)
 |-- Soil_Type

In [5]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

# Pack each group of one-hot columns into a single vector column.
wildernessAssembler = VectorAssembler(
    inputCols=Wilderness_Area_cols,
    outputCol='Wilderness_Area_features',
)
soilTypeAssembler = VectorAssembler(
    inputCols=Soil_Type_cols,
    outputCol='Soil_Type_features',
)
pipeline = Pipeline(stages=[wildernessAssembler, soilTypeAssembler])

# Apply both assemblers, then remove the individual one-hot columns that
# are now represented by the two vector columns.
onehot_cols = Wilderness_Area_cols + Soil_Type_cols
data = pipeline.fit(data).transform(data).drop(*onehot_cols)
data.printSchema()

root
 |-- Elevation: integer (nullable = true)
 |-- Aspect: integer (nullable = true)
 |-- Slope: integer (nullable = true)
 |-- Horizontal_Distance_To_Hydrology: integer (nullable = true)
 |-- Vertical_Distance_To_Hydrology: integer (nullable = true)
 |-- Horizontal_Distance_To_Roadways: integer (nullable = true)
 |-- Hillshade_9am: integer (nullable = true)
 |-- Hillshade_Noon: integer (nullable = true)
 |-- Hillshade_3pm: integer (nullable = true)
 |-- Horizontal_Distance_To_Fire_Points: integer (nullable = true)
 |-- Cover_Type: integer (nullable = true)
 |-- Wilderness_Area_features: vector (nullable = true)
 |-- Soil_Type_features: vector (nullable = true)



In [6]:
# 80/20 train/test split. A fixed seed keeps the split, and therefore the
# class counts below and every downstream metric, stable across re-runs.
train, test = data.randomSplit([0.8, 0.2], seed=42)
labelCol = 'Cover_Type'
# Per-class row counts in the training split, to show the class imbalance.
train.groupby(labelCol).count().show()

+----------+------+
|Cover_Type| count|
+----------+------+
|         1|170048|
|         6| 13911|
|         3| 28741|
|         5|  7587|
|         4|  2214|
|         7| 16563|
|         2|226654|
+----------+------+



In [None]:
from pyspark.ml.feature import PCA
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Every column except the label is a feature.
cols = [c for c in data.columns if c != labelCol]

# Stage 0 assembles the features into one vector, stage 1 projects it onto
# 48 principal components, stage 2 trains a random forest on the projection.
pcaAssembler = VectorAssembler(inputCols=cols, outputCol='pcaFeatures')
pca = PCA(k=48, inputCol='pcaFeatures', outputCol='pcaOutFeatures')
classifier = RandomForestClassifier(featuresCol='pcaOutFeatures',
                                    labelCol=labelCol,
                                    numTrees=50,
                                    maxDepth=25,
                                    seed=42)  # fixed seed for reproducible forests
pcaPipeline = Pipeline(stages=[pcaAssembler, pca, classifier])
pcaPipelineModel = pcaPipeline.fit(train)
# `df` keeps holding the training-set predictions, as before.
df = pcaPipelineModel.transform(train)

# Score with the evaluator's default metric on both splits: the training score
# alone overstates quality for trees this deep, so the held-out score is
# reported next to it.
evaluator = MulticlassClassificationEvaluator(labelCol=labelCol)
{
    'train': evaluator.evaluate(df),
    'test': evaluator.evaluate(pcaPipelineModel.transform(test)),
}

In [None]:
from pyspark.ml.feature import PCA
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Every column except the label is a feature.
cols = [c for c in data.columns if c != labelCol]

# Baseline without PCA: assemble the raw features and train a random forest
# directly on them.
assembler = VectorAssembler(inputCols=cols, outputCol='features')
classifier = RandomForestClassifier(featuresCol='features',
                                    labelCol=labelCol,
                                    numTrees=20,
                                    maxDepth=25,
                                    seed=42)  # fixed seed for reproducible forests
pipeline = Pipeline(stages=[assembler, classifier])
pipelineModel = pipeline.fit(train)
# `df` keeps holding the training-set predictions, as before.
df = pipelineModel.transform(train)

# Score with the evaluator's default metric on both splits: the training score
# alone overstates quality for trees this deep, so the held-out score is
# reported next to it.
evaluator = MulticlassClassificationEvaluator(labelCol=labelCol)
{
    'train': evaluator.evaluate(df),
    'test': evaluator.evaluate(pipelineModel.transform(test)),
}

In [None]:
# df.select('pcaOutFeatures').show()
# df.describe('pcaOutFeatures').show()

# The fitted pipeline's stages mirror [pcaAssembler, pca, classifier], so the
# fitted PCA model is at index 1. The last stage is the random forest model,
# which has no `explainedVariance`.
pcaModel = pcaPipelineModel.stages[1]
pcaModel.explainedVariance

In [None]:
from pyspark.sql.functions import col, udf, max
from pyspark.sql.types import ArrayType, DoubleType
from scipy.spatial import distance
import numpy as np

def l2norm(v):
    """Return the Euclidean (L2) norm of a vector as a built-in float.

    Args:
        v: An object exposing ``toArray()`` that yields a numeric array.

    Returns:
        float: The L2 norm of the array. It is converted from the numpy
        scalar returned by ``np.linalg.norm`` so that callers needing a
        native Python float (e.g. a Spark UDF declared as DoubleType)
        receive one.
    """
    x = v.toArray()
    return float(np.linalg.norm(x))

# l2norm(df.select('pcaOutFeatures').head()[0])
# `df` is rebound by the plain random-forest cell and then has no
# 'pcaOutFeatures' column, so take the PCA pipeline's output directly instead
# of relying on whichever cell assigned `df` last.
pcaDf = pcaPipelineModel.transform(train)

# A UDF declared as DoubleType must return a built-in float; coerce here so
# the numpy scalar produced by the norm computation is never handed to Spark.
l2_norm_udf = udf(lambda v: float(l2norm(v)), DoubleType())
dfDist = pcaDf.withColumn('dist', l2_norm_udf(pcaDf['pcaOutFeatures']))

# `max` is the Spark aggregate imported from pyspark.sql.functions in this
# cell, not the Python builtin. The result is the largest L2 norm among the
# PCA-projected training rows.
dfDist.agg(max(col('dist'))).first()[0]