# Boston Housing Classification Logistic Regression

In [None]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from socket import gethostname, gethostbyname
from helpers.helper_functions import translate_to_file_string

In [None]:
# HDFS location of the Boston housing CSV that is loaded into a DataFrame below.
inputFile = "hdfs:///data/Boston_Housing_Data.csv"

Spark session creation 

In [None]:
# Obtain the Spark session for this notebook: the master is set to "yarn" and
# the application is registered under the name "BostonHousingClass".
# getOrCreate() reuses an already running session if there is one.
spark = SparkSession.builder.master("yarn").appName("BostonHousingClass").getOrCreate()

DataFrame creation using an inferred schema

In [None]:
# Load the semicolon-separated CSV (first line is the header) and let Spark
# infer the column types. CATBOOL is an additional boolean copy of the CAT
# column; the original CAT column is kept as well.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
    .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
)
# printSchema() writes the schema to stdout itself and returns None, so it must
# not be wrapped in print() (that would emit a stray "None" line).
df.printSchema()

Prepare training and test data.

In [None]:
# Columns that must not be fed to the model as features: MEDV, the CAT label
# and CATBOOL (the boolean copy of CAT added when the DataFrame was created).
nonFeatureCols = ("MEDV", "CAT", "CATBOOL")

# Work on a copy of the column list so df.columns itself is never mutated.
# list.remove raises ValueError if one of the expected columns is missing.
featureCols = list(df.columns)
for colName in nonFeatureCols:
    featureCols.remove(colName)
print(featureCols)

# Combines all remaining columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

In [None]:
# Append the assembled "features" vector column to the DataFrame.
labledPointDataSet = assembler.transform(df)

# Random split with weights 0.9 / 0.1; the seed (12345) is fixed so the split
# can be repeated. The first part is used for training, the second for testing.
splits = labledPointDataSet.randomSplit([0.9, 0.1], 12345)
training, test = splits

Logistic regression

In [None]:
# TODO: optimize the parameters (e.g. maxIter, regParam).
# Logistic regression estimator: the label is read from the "CAT" column and
# the inputs from the assembled "features" vector column. At most 100
# iterations, and regParam=0 (no regularization term).
lr = LogisticRegression(
    featuresCol="features",
    labelCol="CAT",
    maxIter=100,
    regParam=0,
)

Train the model 

In [None]:
# Fit the logistic regression estimator on the training split; the result is
# the fitted model that is applied to the test split afterwards.
lrModel = lr.fit(training)

Test the model

In [None]:
# Apply the fitted model to the held-out test split. The resulting DataFrame
# carries the model's output columns (the evaluation step reads "rawPrediction").
predictions = lrModel.transform(test)
# Print the first rows of the predictions for a quick visual check.
predictions.show()

In [None]:
# Score the test-set predictions against the "CAT" label using the raw
# prediction column produced by the model.
evaluator = BinaryClassificationEvaluator(
    labelCol="CAT",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

# With metricName="areaUnderROC" the evaluator returns the area under the ROC
# curve, not the classification accuracy, and 1 - AUC is not an error rate.
# Report the metric under its real name instead of as "Test Error".
areaUnderROC = evaluator.evaluate(predictions)
# Legacy alias: kept only so code that still reads `accuracy` keeps working.
accuracy = areaUnderROC
print("Area under ROC", areaUnderROC)