# Boston Housing KNN

In [None]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler, BucketedRandomProjectionLSH, MinHashLSH
from pyspark.ml.classification import LinearSVC
from pyspark.sql.session import SparkSession, Row
from pyspark.sql.functions import desc, expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from helpers.helper_functions import translate_to_file_string

In [None]:
# Relative path to the Boston housing CSV that is loaded into a DataFrame below.
# NOTE(review): translate_to_file_string is a project helper; presumably it turns
# the relative path into a form Spark can load — confirm in helpers.helper_functions.
inputFile = translate_to_file_string("../data/Boston_Housing_Data.csv")

Spark session creation 

In [None]:
# Obtain the session for this notebook: getOrCreate() hands back an already
# running SparkSession if there is one, otherwise it starts a new one under
# the application name given here.
spark = SparkSession.builder.appName("BostonHousingKNN").getOrCreate()

DataFrame creation using an inferred schema

In [None]:
# Load the semicolon-delimited CSV with a header row and let Spark infer the
# column types, then add CATBOOL, a boolean cast of the CAT column.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
    .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
)
# printSchema() writes the schema to stdout itself and returns None, so it must
# not be wrapped in print() (that would emit a stray "None" line).
df.printSchema()

Prepare training and test data.

In [None]:
# Every column is a feature except the target (MEDV), the class label (CAT)
# and its boolean copy (CATBOOL). Work on a copy so df.columns is untouched.
featureCols = list(df.columns)
for excluded in ("MEDV", "CAT", "CATBOOL"):
    featureCols.remove(excluded)
print(featureCols)

# Packs the selected feature columns into a single vector column "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

In [None]:
# Attach the assembled "features" vector to every row, then split the rows
# 90% / 10% into training and test sets with a fixed seed.
labledPointDataSet = assembler.transform(df)
splits = labledPointDataSet.randomSplit([0.9, 0.1], seed=12345)
training, test = splits

LSH (locality-sensitive hashing) with Euclidean distance

In [None]:
# LSH estimator for Euclidean distance: hashes the "features" vector into the
# "hashes" column using 3 hash tables and a bucket length of 2.0.
# TODO optimize the params to minimize the test error
# TODO try the MinHashLSH too
lhsED = BucketedRandomProjectionLSH(
    inputCol="features",
    outputCol="hashes",
    bucketLength=2.0,
    numHashTables=3,
)

Train the model 

In [None]:
# Fit the LSH estimator on the training split; the resulting model is the one
# queried for approximate nearest neighbours when the test rows are classified.
modelED = lhsED.fit(training)

Test the model

In [None]:
# k-nearest-neighbour classification on top of the fitted LSH model: for every
# test row, look up its approximate nearest training rows and predict the CAT
# value that occurs most often among them.
# TODO add other aggregation methods
numNeighbors = 5  # k: how many approximate neighbours vote on each prediction

resultList = []
for row in test.collect():
    neighbors = modelED.approxNearestNeighbors(training, row.features, numNeighbors)
    # Group by column name so the reference is resolved against `neighbors`
    # itself, and fetch only the top group: first() returns None when no
    # neighbours were found, which avoids a separate count() job per test row.
    majority = (
        neighbors.groupBy("CAT")
        .count()
        .orderBy(desc("count"))
        .first()
    )
    if majority is None:
        # The approximate search found no candidates; skip this row.
        continue
    resultList.append(
        Row(CAT=row.CAT, features=row.features, prediction=float(majority.CAT))
    )

# Collect the per-row predictions into a DataFrame, register it as the temp
# view "resultList", and display it.
predictions = spark.createDataFrame(resultList)
predictions.createOrReplaceTempView("resultList")
predictions.show()

In [None]:
# Score the predictions against the true CAT labels. The metric requested is
# the area under the ROC curve, so `accuracy` actually holds an AUC value and
# the printed figure is 1 - AUC.
evaluator = BinaryClassificationEvaluator(
    labelCol="CAT",
    rawPredictionCol="prediction",
    metricName="areaUnderROC",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error", 1.0 - accuracy)