# Boston Housing Classification Random Forest

In [None]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Relative path to the Boston housing CSV file that is read into a DataFrame.
inputFile = "../data/Boston_Housing_Data.csv"

Spark session creation 

In [None]:
# Reuse the active SparkSession if one exists; otherwise start a new one
# registered under this notebook's application name.
spark = SparkSession.builder.appName("BostonHousingClassificationRandomForest").getOrCreate()

DataFrame creation using an inferred schema

In [None]:
# Load the semicolon-delimited CSV (first row is the header), let Spark infer
# the column types, and add CATBOOL: the CAT column cast to a boolean.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
    .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
)
# printSchema() writes the schema to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.printSchema()

Prepare training and test data.

In [None]:
# Every column is a feature except the regression target (MEDV) and the two
# encodings of the class label (CAT, CATBOOL).
featureCols = list(df.columns)
for excluded in ("MEDV", "CAT", "CATBOOL"):
    # list.remove raises ValueError if an expected column is missing.
    featureCols.remove(excluded)
print(featureCols)

# Packs the feature columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

In [None]:
# Attach the assembled "features" vector column to every row.
labledPointDataSet = assembler.transform(df)

# 90 % / 10 % random split; the fixed seed keeps the split reproducible.
splits = labledPointDataSet.randomSplit([0.9, 0.1], seed=12345)
training, test = splits

Random Forest Classifier

In [None]:
# TODO Find out and optimize parameters of Random Forest
# Only the label and feature columns are set here; every other
# hyperparameter is left at the library default.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="CAT",
)

Train the model 

In [None]:
# Fit the random forest on the training split only; the test split stays unseen.
rfModel = rf.fit(training)

Test the model

In [None]:
# Apply the fitted model to the held-out test split.
predictions = rfModel.transform(test)
# Print the resulting rows for a quick visual check.
predictions.show()

In [None]:
# Score the test predictions by area under the ROC curve. The evaluator needs
# the model's continuous scores ("rawPrediction"); feeding it the hard 0/1
# "prediction" column collapses the ROC curve to a single operating point.
evaluator = BinaryClassificationEvaluator(
    labelCol="CAT",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
# NOTE: despite its name, `accuracy` holds the AUC, so the value printed below
# is 1 - AUC rather than a misclassification rate.
accuracy = evaluator.evaluate(predictions)
print("Test Error", (1.0 - accuracy))