# Boston Housing Classification Logistic Regression

In [None]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler, ChiSqSelector
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from helpers.helper_functions import translate_to_file_string

In [None]:
# Location of the Boston housing CSV, passed through the project helper
# translate_to_file_string (presumably turns the relative path into a form
# Spark can read -- confirm in helpers.helper_functions).
inputFile = translate_to_file_string("../data/Boston_Housing_Data.csv")

Spark session creation 

In [None]:
# Reuse an already running Spark session if there is one, otherwise start a
# new one registered under this application name.
sessionBuilder = SparkSession.builder.appName("BostonHoudingClass")
spark = sessionBuilder.getOrCreate()

DataFrame creation using an inferred schema

In [None]:
# Read the semicolon-separated CSV (first row is the header), let Spark infer
# the column types, and add CATBOOL as a boolean cast of the CAT column.
df = (spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .option("delimiter", ";")
      .csv(inputFile)
      .withColumn("CATBOOL", expr("CAT").cast(BooleanType())))
# printSchema() writes the schema to stdout itself and returns None, so
# wrapping it in print() would only append a stray "None" line.
df.printSchema()

Prepare training and test data.

In [None]:
# Start from all DataFrame columns and drop the ones that must not be used as
# features: MEDV, the label column CAT and CATBOOL (the boolean cast of CAT).
featureCols = list(df.columns)
for nonFeature in ("MEDV", "CAT", "CATBOOL"):
    # list.remove raises ValueError if an expected column is missing.
    featureCols.remove(nonFeature)
print(featureCols)

# Combine the remaining columns into one vector column called "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")
labledPointDataSet = assembler.transform(df)

In [None]:
# Chi-squared feature selection against the CAT label.
# Set the number of features (numTopFeatures) between 1 and 13.
selector = ChiSqSelector(
    featuresCol="features",
    labelCol="CAT",
    outputCol="selectedFeatures",
    numTopFeatures=13,
)

In [None]:

# Fit the selector and add the "selectedFeatures" column to every row.
# NOTE(review): the selector is fitted on the full data set before the split,
# so rows that end up in the test set also take part in feature selection.
selectorModel = selector.fit(labledPointDataSet)
selectedData = selectorModel.transform(labledPointDataSet)

# Split with weights 0.9 / 0.1; the fixed seed keeps the split reproducible.
splits = selectedData.randomSplit([0.9, 0.1], 12345)
training, test = splits

Logistic regression

In [None]:
# Logistic regression estimator: CAT is the label, the selector output is the
# feature vector, and regParam=0 switches regularisation off.
lr = LogisticRegression(
    featuresCol="selectedFeatures",
    labelCol="CAT",
    maxIter=100,
    regParam=0,
    standardization=True,
    aggregationDepth=2,
)

Train the model 

In [None]:
# Fit the logistic regression estimator on the training split only.
lrModel = lr.fit(training)

Test the model

In [None]:
# Apply the fitted model to the held-out test split and print the leading
# rows of the resulting DataFrame for a quick visual check.
predictions = lrModel.transform(test)
predictions.show()

In [None]:
# Score the test predictions by the area under the ROC curve.
evaluator = BinaryClassificationEvaluator(labelCol="CAT", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
# The variable name is kept so that any later cell using it keeps working,
# but the value is the area under the ROC curve, not a classification accuracy.
accuracy = evaluator.evaluate(predictions)
# 1 - AUC is not a misclassification rate, so the output is labelled as what
# it really is; the printed number itself is unchanged.
print("Test Error (1 - areaUnderROC)", (1.0 - accuracy))