# Boston Housing KNN

In [None]:
import sys
sys.path.append("..")
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StandardScaler, VectorAssembler, BucketedRandomProjectionLSH, MinHashLSH
from pyspark.ml.classification import LinearSVC
from pyspark.sql import Row
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import desc, expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from helpers.path_translation import translate_to_file_string


In [None]:
# Location of the semicolon-delimited Boston housing CSV, relative to this notebook.
# NOTE(review): translate_to_file_string is a project helper; presumably it turns
# the relative path into a form Spark can read — confirm in helpers.path_translation.
inputFile = translate_to_file_string("../data/Boston_Housing_Data.csv")

Spark session creation 

In [None]:
# Obtain the session for this notebook under the application name
# "boston_housing_knn"; getOrCreate() reuses an active session when one exists.
spark = SparkSession.builder.appName("boston_housing_knn").getOrCreate()

DataFrame creation using an inferred schema

In [None]:
# Load the CSV (header row, ';' as delimiter) and let Spark infer the column types.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
    # Add a boolean copy of the CAT column next to the original one.
    .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
)
# printSchema() writes the schema to stdout itself and returns None, so wrapping
# it in print() would emit a stray "None" line after the schema.
df.printSchema()

Prepare training and test data.

In [None]:
# Feature columns are all columns of df except MEDV and the two CAT label columns.
featureCols = list(df.columns)
for excluded in ("MEDV", "CAT", "CATBOOL"):
    # list.remove raises ValueError if an expected column is missing.
    featureCols.remove(excluded)
print(featureCols)

# Pack the feature columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")
# Scale by standard deviation only (no mean centering); output goes to "scaledFeatures".
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=False,
    withStd=True,
)

In [None]:
# Build the feature vector column, then fit the scaler and apply it to the same data.
# NOTE(review): the scaler is fitted on the full data set before the split, so
# the test rows contribute to the scaling statistics — confirm this is intended.
labledPointDataSet = assembler.transform(df)
scaledDataSet = scaler.fit(labledPointDataSet).transform(labledPointDataSet)

# 90/10 random split with a fixed seed so the partition is reproducible.
splits = scaledDataSet.randomSplit([0.9, 0.1], seed=12345)
training, test = splits

## LSH Euclidean Distance

In [None]:
# Random-projection LSH estimator reading "scaledFeatures" and writing its
# hash values to "hashes", with 3 hash tables and a bucket length of 2.0.
lhsED = BucketedRandomProjectionLSH(
    inputCol="scaledFeatures",
    outputCol="hashes",
    numHashTables=3,
    bucketLength=2.0,
)

Train the model 

In [None]:
# Fit the Euclidean-distance LSH estimator on the training split only.
modelED = lhsED.fit(training)

Apply LSH and make predictions

In [None]:
# k-nearest-neighbour classification with the Euclidean-distance LSH model:
# each test row is labelled with the most frequent CAT value among (up to) 5
# approximate nearest training rows.
resultList = []
# collect() brings the test split to the driver; one neighbour query runs per row.
for row in test.collect():
    neighbors = modelED.approxNearestNeighbors(training, row.scaledFeatures, 5)
    # Refer to the column by name so the grouping does not depend on
    # `neighbors` sharing lineage with the separate `df` frame.
    grouped = neighbors.groupBy("CAT").count()
    # Majority vote: take the CAT value with the highest neighbour count.
    # NOTE(review): the search is approximate and may return fewer than 5 rows;
    # with zero rows first() returns None and the attribute access fails.
    result = grouped.orderBy(desc("count")).first().CAT
    newRow = Row(CAT=row.CAT, scaledFeatures=row.scaledFeatures, prediction=float(result))
    resultList.append(newRow)

# Turn the per-row results back into a DataFrame for evaluation.
predictions = spark.createDataFrame(resultList)
predictions.show()

In [None]:
# Compare predicted and actual classes and report the hit rate on the test rows.
predictionAndLabels = predictions.select("prediction", "CAT")
countall = predictionAndLabels.count()
countcorrect = predictionAndLabels.where("CAT == prediction").count()
countincorrect = predictionAndLabels.where("CAT != prediction").count()
# Fraction of test rows whose predicted class equals the actual class.
accuracy = countcorrect / countall
print(
    f"countcorrect: {countcorrect}",
    f"countincorrect: {countincorrect}",
    f"countall: {countall}",
    f"accuracy: {accuracy}",
    f"Test Error {1-accuracy}",
    sep="\n",
)

## Minhash

In [None]:
# MinHash LSH estimator reading "scaledFeatures" and writing its hash values
# to "hashes", with 3 hash tables.
# NOTE(review): Spark's MinHashLSH documentation describes its input as binary
# set vectors (non-zero entries count as present) — confirm that feeding it
# continuous scaled features is intended.
lsh_minhash = MinHashLSH(
    inputCol="scaledFeatures",
    outputCol="hashes",
    numHashTables=3,
)

Train the model

In [None]:
# Fit the MinHash LSH estimator on the training split only.
model_minhash = lsh_minhash.fit(training)

Apply LSH and make predictions

In [None]:
# k-nearest-neighbour classification with the MinHash LSH model: each test row
# is labelled with the most frequent CAT value among (up to) 5 approximate
# nearest training rows.
resultList = []
# collect() brings the test split to the driver; one neighbour query runs per row.
for row in test.collect():
    neighbors = model_minhash.approxNearestNeighbors(training, row.scaledFeatures, 5)
    # Refer to the column by name so the grouping does not depend on
    # `neighbors` sharing lineage with the separate `df` frame.
    grouped = neighbors.groupBy("CAT").count()
    # Majority vote: take the CAT value with the highest neighbour count.
    # NOTE(review): the search is approximate and may return fewer than 5 rows;
    # with zero rows first() returns None and the attribute access fails.
    result = grouped.orderBy(desc("count")).first().CAT
    newRow = Row(CAT=row.CAT, scaledFeatures=row.scaledFeatures, prediction=float(result))
    resultList.append(newRow)

# Turn the per-row results back into a DataFrame for evaluation.
predictions = spark.createDataFrame(resultList)
predictions.show()

In [None]:
# Compare predicted and actual classes and report the hit rate for the MinHash run.
predictionAndLabels = predictions.select("prediction", "CAT")
countall = predictionAndLabels.count()
countcorrect = predictionAndLabels.where("CAT == prediction").count()
countincorrect = predictionAndLabels.where("CAT != prediction").count()
# Fraction of test rows whose predicted class equals the actual class.
accuracy = countcorrect / countall
print(
    f"countcorrect: {countcorrect}",
    f"countincorrect: {countincorrect}",
    f"countall: {countall}",
    f"accuracy: {accuracy}",
    f"Test Error {1-accuracy}",
    sep="\n",
)

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()