In [2]:
from __future__ import division, print_function, unicode_literals # For the compatibility with Python 2

from pyspark.sql import SparkSession
# Create (or reuse, via getOrCreate) a Hive-enabled Spark session named
# "sparksql" that runs locally with 4 worker threads.
spark_session = (
    SparkSession.builder
    .enableHiveSupport()
    .appName("sparksql")
    .master("local[4]")
    .getOrCreate()
)
                



In [3]:
# Keep a handle on the SparkContext that backs the session.
sc = spark_session.sparkContext

# Configure the CSV reader: the first row is a header, column types are
# inferred, and malformed rows are dropped (mode=DROPMALFORMED).
csv_reader = (
    spark_session.read.format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferschema", "true")
    .option("mode", "DROPMALFORMED")
)

# Load the training set and spread it over 60 partitions.
df = csv_reader.load("/data/covertype2/train.csv").repartition(60)


In [None]:
# pyspark.ml imports used by this cell and by the evaluation cell further down.
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Names of the categorical columns and of the index / one-hot columns
# derived from them.
cat_cols = ['Soil_Type', 'Wild_Type']
cat_cols_index = {'Soil_Type': 'Soil_Index', 'Wild_Type': 'Wild_Index'}
cat_cols_encoder = {'Soil_Index': 'SoilEncoder', 'Wild_Index': 'WildEncoder'}

# --- Step 1: map each categorical string column to a numeric index ---------
stringIndexer = StringIndexer(inputCol="Soil_Type", outputCol="Soil_Index")
model1 = stringIndexer.fit(df)
indexedDF = model1.transform(df)

stringIndexer2 = StringIndexer(inputCol="Wild_Type", outputCol="Wild_Index")
model2 = stringIndexer2.fit(indexedDF)
indexedDF2 = model2.transform(indexedDF)

# --- Step 2: one-hot encode the indices, keeping every category ------------
# dropLast=False keeps a slot for the last category instead of dropping it.
encoder = OneHotEncoder(inputCol="Soil_Index", outputCol="SoilEncoder", dropLast=False)
encodedDF = encoder.transform(indexedDF2)

encoder2 = OneHotEncoder(inputCol="Wild_Index", outputCol="WildEncoder", dropLast=False)
encodedDF2 = encoder2.transform(encodedDF)

# --- Step 3: assemble encoded + numeric columns into one 'features' vector -
feature_columns = [
    'SoilEncoder',
    'WildEncoder',
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
finalDF = vector_assembler.transform(encodedDF2)

# --- Step 4: 80/20 split (fixed seed) and random-forest training -----------
rf = RandomForestClassifier(labelCol='Target', featuresCol="features", numTrees=100, maxDepth=9)

trainingData, testData = finalDF.randomSplit([0.8, 0.2], seed=123)

# Only the 80% training portion is used to fit the forest.
model = rf.fit(trainingData)




In [None]:
# Load the evaluation data with the same CSV options as the training load:
# header row, inferred column types, malformed rows dropped.
# NOTE(review): this path is the directory that also contains train.csv, so
# the load may pull the training rows into the evaluation set -- confirm
# whether a dedicated test file was intended.
dfTest = (
    spark_session.read.format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferschema", "true")
    .option("mode", "DROPMALFORMED")
    .load("/data/covertype2")
    .repartition(60)
)
# A statement `prediction = model.transform(test)` used to be fused onto the
# end of the chain above, which made the cell a SyntaxError. It also referred
# to an undefined name `test` and to a frame without a 'features' column, so
# it has been removed; scoring happens after feature assembly instead.



In [None]:
# Apply the feature pipeline fitted on the training data to the test frame.
#
# model1 / model2 are the StringIndexer models fitted on the training frame.
# They must NOT be refitted on dfTest: StringIndexer assigns indices by label
# frequency, so a test-fitted mapping can differ from the training one and the
# one-hot positions would no longer match what the forest was trained on.
indexedDFTest = model1.transform(dfTest)
indexedDF2Test = model2.transform(indexedDFTest)

# One-hot encode the indexed columns with the encoders configured for training.
encodedDFTest = encoder.transform(indexedDF2Test)
encodedDF2Test = encoder2.transform(encodedDFTest)

# Assemble the 'features' vector and score it with the trained random forest.
finalDFTest = vector_assembler.transform(encodedDF2Test)
predictions = model.transform(finalDFTest)

# Accuracy of the predicted class against the 'Target' label.
evaluator = MulticlassClassificationEvaluator(labelCol="Target", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(accuracy)