# Boston Housing Linear Regression

In [None]:
import sys
sys.path.append("..")
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from helpers.data_prep_and_print import print_df
from helpers.path_translation import translate_to_file_string

Select the Input File

In [None]:
# Location of the Boston housing CSV, relative to this notebook, passed through
# the project's translate_to_file_string helper.
# NOTE(review): presumably the helper turns the relative path into a form Spark
# can read — confirm in helpers.path_translation.
inputFile = translate_to_file_string("../data/Boston_Housing_Data.csv")

SparkSession creation

In [None]:
# Configure the session builder with the application name first, then obtain
# the session: an already-running one is reused, otherwise a new one is started.
session_builder = SparkSession.builder.appName("BostonHousingRegression")
spark = session_builder.getOrCreate()

Create a DataFrame using an inferred schema

In [None]:
# Load the semicolon-delimited CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
)
# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted an extra "None" line.
df.printSchema()

Select the features

In [None]:
#TODO remove additional features
# Start from every column of the DataFrame and drop the ones that must not be
# used as model inputs: "MEDV" is the regression label, "CAT" is excluded too.
# list.remove() is used on purpose so a missing column fails loudly.
featureCols = list(df.columns)
for excluded_col in ("MEDV", "CAT"):
    featureCols.remove(excluded_col)

# Combine the remaining columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

Data Preparation

In [None]:
# Apply the assembler to the raw DataFrame; the assembler was configured with
# outputCol="features", so the result carries that vector column in addition
# to the original columns (including the "MEDV" label).
labeledPointDataSet = assembler.transform(df)

Prepare training and test data.

In [None]:
# 90 % of the rows go to training, 10 % to testing; the fixed seed keeps the
# split reproducible between runs.
split_weights = [0.9, 0.1]
splits = labeledPointDataSet.randomSplit(split_weights, seed=12345)
training, test = splits

Build and train the Model

In [None]:
# TODO Optimize the LR Model
# Squared-error linear regression on the assembled "features" column with
# "MEDV" as the label; regParam=0.0 means no regularisation penalty is applied.
lr = LinearRegression(
    featuresCol="features",
    labelCol="MEDV",
    loss='squaredError',
    maxIter=1000,
    regParam=0.0,
    elasticNetParam=0.0,
)
lrModel = lr.fit(training)

# Report the fitted parameters of the linear model.
print(f"Coefficients: {lrModel.coefficients!s}")
print(f"Intercept: {lrModel.intercept!s}")

Test the Model

In [None]:
# Score the held-out test split with the trained model.
predictionsLR = lrModel.transform(test)
# Show the scored rows via the project helper.
# NOTE(review): print_df's exact output format lives in
# helpers.data_prep_and_print — confirm there.
print_df(predictionsLR)

In [None]:
# Evaluator that compares the "MEDV" label with the "prediction" column and
# reports the root mean squared error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="MEDV",
    predictionCol="prediction",
)

In [None]:
# Compute the metric on the test predictions, report it, then shut the Spark
# session down because nothing else runs after this cell.
rmse = evaluator.evaluate(predictionsLR)
print("root mean square error = ", rmse)
spark.stop()