# Boston Housing Linear Regression

In [None]:
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from helpers.helper_functions import translate_to_file_string

# for pretty printing
def printDf(sprkDF):
    """Render a Spark DataFrame as an HTML table for notebook display.

    The DataFrame is converted to pandas in full via ``toPandas()``, so this
    helper is meant for small results that fit comfortably in memory.

    Args:
        sprkDF: object exposing ``toPandas()`` (a Spark DataFrame).

    Returns:
        An ``IPython.display.HTML`` object wrapping the pandas ``to_html()``
        markup; a notebook renders it when it is the last expression of a cell.
    """
    # Imported locally so IPython is only needed when the helper is called.
    # Only HTML is used; the previously imported `display` was dead code.
    from IPython.display import HTML

    pandasDf = sprkDF.toPandas()
    return HTML(pandasDf.to_html())

Select the Input File

In [None]:
# Location of the Boston housing CSV, relative to this notebook.
# NOTE(review): translate_to_file_string presumably turns it into a path/URI
# string that Spark can read — confirm against the helper module.
csvRelativePath = "../data/Boston_Housing_Data.csv"
inputFile = translate_to_file_string(csvRelativePath)

SparkSession creation

In [None]:
# Configure the application name first, then obtain the session
# (getOrCreate hands back an existing session when one is already active).
sessionBuilder = SparkSession.builder.appName("BostonHousingRegression")
spark = sessionBuilder.getOrCreate()

Create a DataFrame using an inferred schema

In [None]:
# Load the semicolon-delimited CSV: the first row supplies the column names
# and Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
)
# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.printSchema()

Select the features

In [None]:
# Columns that must not become model inputs: MEDV is the label column, the
# others are deliberately left out of the feature set.
# PTRATIO is currently kept as a feature; add it to the tuple to drop it too.
excludedCols = ("MEDV", "CAT", "RM", "CHAS")

# Work on a copy of the column list; remove() raises ValueError if an
# expected column is missing from the CSV.
featureCols = list(df.columns)
for excludedCol in excludedCols:
    featureCols.remove(excludedCol)

# Pack the remaining columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

Data Preparation

In [None]:
# Append the assembled "features" vector column to the loaded data.
labeledPointDataSet = assembler.transform(dataset=df)

Prepare training and test data.

In [None]:
# 90 % / 10 % random split; the fixed seed keeps the partition reproducible.
splits = labeledPointDataSet.randomSplit(weights=[0.9, 0.1], seed=12345)
training, test = splits

Build and train the Model

In [None]:
# Regularised linear regression on the assembled features with MEDV as label.
# elasticNetParam is the ElasticNet mixing parameter in [0, 1]:
# 0 gives a pure L2 penalty, 1 gives a pure L1 penalty.
lr = LinearRegression(
    featuresCol="features",
    labelCol="MEDV",
    maxIter=100,
    regParam=0.3,
    elasticNetParam=0,
    standardization=True,
)
lrModel = lr.fit(training)

Test the Model

In [None]:
# Score the held-out rows with the fitted model.
predictionsLR = lrModel.transform(dataset=test)
# Plain-text alternative to the HTML table: predictionsLR.show()
# Must stay the last expression of the cell so the notebook renders the table.
printDf(predictionsLR)

In [None]:
# Compares the "prediction" column against the MEDV label using RMSE.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="MEDV",
    predictionCol="prediction",
)

In [None]:
# Compute the metric on the test predictions, report it, then shut Spark down.
rmse = evaluator.evaluate(predictionsLR)
print("root mean square error = " , rmse)
spark.stop()