# Boston Housing KNN

In [1]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler, BucketedRandomProjectionLSH, MinHashLSH
from pyspark.ml.classification import LinearSVC
from pyspark.sql.session import SparkSession, Row
from pyspark.sql.functions import desc, expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from helpers.helper_functions import translate_to_file_string
from helpers.helper_functions import translate_to_file_string

In [2]:
# Location of the semicolon-delimited Boston housing CSV, relative to this notebook.
# NOTE(review): translate_to_file_string is a project helper; presumably it turns
# the relative path into a form Spark can read — confirm against helpers.helper_functions.
inputFile = translate_to_file_string("../data/Boston_Housing_Data.csv")

Spark session creation 

In [3]:
# Obtain the SparkSession used by every later cell; the application is
# registered under the name "boston_housing_knn".
spark = SparkSession.builder.appName("boston_housing_knn").getOrCreate()

DataFrame creation using an inferred schema

In [4]:
# Read the CSV (header row, ';' as delimiter) and let Spark infer the column
# types, then add CATBOOL as a boolean cast of the integer CAT column.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
    .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
)

# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
df.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)
 |-- CAT: integer (nullable = true)
 |-- CATBOOL: boolean (nullable = true)

None


Prepare training and test data.

In [5]:
# Use every column of df as a feature except MEDV, CAT and CATBOOL.
# list.remove raises ValueError if one of those columns is missing from df.
featureCols = list(df.columns)
for excludedCol in ("MEDV", "CAT", "CATBOOL"):
    featureCols.remove(excludedCol)
print(featureCols)

# Combines the selected columns into one vector column named "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


In [6]:
# Append the assembled "features" vector column to df, then split the rows
# randomly with weights 0.9 / 0.1 (fixed seed 12345) into training and test sets.
labledPointDataSet = assembler.transform(df)
splits = labledPointDataSet.randomSplit(weights=[0.9, 0.1], seed=12345)
training, test = splits

LSH (locality-sensitive hashing) for Euclidean distance

In [7]:
# Bucketed random projection LSH estimator: reads the "features" vector column
# and writes its hash values to "hashes", using 3 hash tables and a bucket
# length of 2.0.
lhsED = BucketedRandomProjectionLSH(
    inputCol="features",
    outputCol="hashes",
    numHashTables=3,
    bucketLength=2.0,
)

Train the model 

In [8]:
# Fit the LSH estimator on the training split; the resulting model is used
# below for approximate nearest-neighbour queries.
modelED = lhsED.fit(training)

Apply LSH and make predictions

In [9]:
# Number of approximate nearest neighbours that vote on each test row's class.
numNeighbors = 5

# Nearest-neighbour classification: every test row is assigned the CAT value
# that occurs most often among its approximate nearest neighbours in the
# training set. test.collect() brings all test rows to the driver and each
# iteration issues its own Spark query against the training set.
resultList = []
for row in test.collect():
    # Training rows closest to this test row's feature vector.
    neighbors = modelED.approxNearestNeighbors(training, row.features, numNeighbors)
    # Group by column *name*: `neighbors` is a different DataFrame than `df`,
    # so the column is resolved on `neighbors` itself rather than via df.CAT.
    grouped = neighbors.groupBy("CAT").count()
    # Highest vote count wins; the secondary sort on CAT makes the outcome
    # deterministic if two classes receive the same number of votes.
    result = grouped.orderBy(desc("count"), "CAT").first()["CAT"]
    resultList.append(
        Row(CAT=row.CAT, features=row.features, prediction=float(result))
    )

# Turn the collected rows back into a DataFrame, expose it to Spark SQL under
# the view name "resultList", and display the first rows.
predictions = spark.createDataFrame(resultList)
predictions.createOrReplaceTempView("resultList")
predictions.show()

+---+--------------------+----------+
|CAT|            features|prediction|
+---+--------------------+----------+
|  0|[0.02731,0.0,7.07...|       0.0|
|  1|[0.02763,75.0,2.9...|       0.0|
|  0|[0.03427,0.0,5.19...|       0.0|
|  0|[0.03659,25.0,4.8...|       0.0|
|  1|[0.03768,80.0,1.5...|       0.0|
|  0|[0.05789,12.5,6.0...|       0.0|
|  1|[0.06127,40.0,6.4...|       0.0|
|  0|[0.06162,0.0,4.39...|       0.0|
|  1|[0.07875,45.0,3.4...|       1.0|
|  0|[0.0795,60.0,1.69...|       0.0|
|  0|[0.08265,0.0,13.9...|       0.0|
|  0|[0.12802,0.0,8.56...|       0.0|
|  0|[0.14476,0.0,10.0...|       0.0|
|  0|[0.19539,0.0,10.8...|       0.0|
|  0|[0.19657,22.0,5.8...|       0.0|
|  0|[0.22438,0.0,9.69...|       0.0|
|  0|[0.22876,0.0,8.56...|       0.0|
|  0|[0.37578,0.0,10.5...|       0.0|
|  0|[0.52058,0.0,6.2,...|       0.0|
|  0|[0.62976,0.0,8.14...|       0.0|
+---+--------------------+----------+
only showing top 20 rows



In [11]:
# Evaluate the predictions: count the rows where the predicted class equals /
# differs from the true CAT label and derive accuracy and test error.
predictionAndLabels = predictions.select("prediction", "CAT")
countall = predictionAndLabels.count()
countcorrect = predictionAndLabels.where("CAT == prediction").count()
countincorrect = predictionAndLabels.where("CAT != prediction").count()
accuracy = countcorrect / countall

print(f"countcorrect: {countcorrect}")
print(f"countincorrect: {countincorrect}")
print(f"countall: {countall}")
print(f"accuracy: {accuracy}")
print(f"Test Error {1-accuracy}")

countcorrect: 30
countincorrect: 5
countall: 35
accuracy: 0.8571428571428571
Test Error 0.1428571428571429


In [12]:
# Shut down the Spark session created at the top of the notebook; cells that
# use `spark` cannot be re-run afterwards without recreating the session.
spark.stop()