# Boston Housing KNN

In [61]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler, BucketedRandomProjectionLSH, MinHashLSH
from pyspark.ml.classification import LinearSVC
from pyspark.sql.session import SparkSession, Row
from pyspark.sql.functions import desc, expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [62]:
inputFile = "../data/Boston_Housing_Data.csv"

Spark session creation 

In [63]:
spark = (SparkSession
       .builder
       .appName("ChurnDataPreprocessing")
       .getOrCreate())

DataFrame creation using an ifered Schema 

In [64]:
df = spark.read.option("header", "true") \
       .option("inferSchema", "true") \
       .option("delimiter", ";") \
       .csv(inputFile) \
       .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
print(df.printSchema())

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)
 |-- CAT: integer (nullable = true)
 |-- CATBOOL: boolean (nullable = true)

None


Prepare training and test data.

In [65]:
featureCols = df.columns.copy()
featureCols.remove("MEDV")
featureCols.remove("CAT")
featureCols.remove("CATBOOL") 
print(featureCols)

assembler =  VectorAssembler(outputCol="features", inputCols=featureCols)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


In [66]:
labledPointDataSet = assembler.transform(df)
splits = labledPointDataSet.randomSplit([0.9, 0.1 ], 12345)
training = splits[0]
test = splits[1]

LHS Euclidean Distance

In [67]:
lhsED = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes", bucketLength =2.0, numHashTables=3)

Train the model 

In [68]:
modelED = lhsED.fit(training)

Test the model

In [69]:
resultList = []
# The Nearest neighbor testing
for row in test.collect() :
     neighbors = modelED.approxNearestNeighbors(training, row.features, 5)
     grouped = neighbors.groupBy(df.CAT).count()
     result = grouped.orderBy(desc("count")).first().CAT
     newRow = Row(CAT=row.CAT, features=row.features, prediction=float (result))
     resultList.append(newRow)	

predictions = spark.createDataFrame(resultList)
predictions.createOrReplaceTempView("resultList")
predictions.show()

+---+--------------------+----------+
|CAT|            features|prediction|
+---+--------------------+----------+
|  1|[0.01311,90.0,1.2...|       1.0|
|  0|[0.01439,60.0,2.9...|       1.0|
|  0|[0.03871,52.5,5.3...|       0.0|
|  0|[0.0456,0.0,13.89...|       0.0|
|  0|[0.04932,33.0,2.1...|       1.0|
|  0|[0.05023,35.0,6.0...|       0.0|
|  0|[0.05372,0.0,13.9...|       0.0|
|  0|[0.06417,0.0,5.96...|       0.0|
|  0|[0.06466,70.0,2.2...|       0.0|
|  0|[0.06617,0.0,3.24...|       0.0|
|  0|[0.08244,30.0,4.9...|       0.0|
|  0|[0.09849,0.0,25.6...|       0.0|
|  1|[0.1,34.0,6.09,0....|       0.0|
|  0|[0.10153,0.0,12.8...|       0.0|
|  0|[0.11504,0.0,2.89...|       0.0|
|  0|[0.11747,12.5,7.8...|       0.0|
|  0|[0.14455,12.5,7.8...|       0.0|
|  0|[0.15038,0.0,25.6...|       0.0|
|  0|[0.17004,12.5,7.8...|       0.0|
|  0|[0.19186,0.0,7.38...|       0.0|
+---+--------------------+----------+
only showing top 20 rows



In [70]:
evaluator = BinaryClassificationEvaluator(labelCol="CAT",rawPredictionCol="prediction", metricName="areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print("Test Error",(1.0 - accuracy))

Test Error 0.2455357142857143
