# Boston Housing Linear Regression

In [78]:
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

Select the Input File

In [79]:
# Relative path to the Boston housing CSV that the DataFrame cell below reads.
inputFile = "../data/Boston_Housing_Data.csv"

SparkSession creation

In [80]:
# Reuse the active SparkSession if one exists; otherwise create one under
# the application name given here.
spark = SparkSession.builder.appName("BostonHousingRegression").getOrCreate()

Create a DataFrame using an inferred schema

In [81]:
# Load the CSV: the first row holds the column names, fields are separated
# by ';', and the column types are inferred by Spark from the data.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
)
# printSchema() writes the schema to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)
 |-- CAT: integer (nullable = true)

None


Select the features

In [82]:
# Use every column as a predictor except MEDV (the label the model is
# trained on) and CAT.
featureCols = list(df.columns)
for excluded in ("MEDV", "CAT"):
    featureCols.remove(excluded)
# Optional experiments: additionally remove "B", "RAD" or "PTRATIO" here.
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

Data Preparation

In [83]:
# Run the assembler over the loaded frame, producing the "features" column.
labeledPointDataSet = assembler.transform(dataset=df)

Prepare training and test data.

In [84]:
# Random 90 % / 10 % split into training and test rows, using a fixed seed.
splits = labeledPointDataSet.randomSplit(weights=[0.9, 0.1], seed=12345)
training, test = splits

Build and train the Model

In [85]:
# elasticNetParam is the ElasticNet mixing parameter in the range [0, 1]:
# 0 selects a pure L2 penalty, 1 selects a pure L1 penalty.
lr = LinearRegression(
    featuresCol="features",
    labelCol="MEDV",
    maxIter=100,
    regParam=0.3,
    elasticNetParam=0,
    standardization=True,
)
lrModel = lr.fit(training)

Test the Model

In [86]:
# Score the test rows; the result carries an extra "prediction" column.
predictionsLR = lrModel.transform(dataset=test)
# Display the first 20 rows with long cell values truncated (the defaults).
predictionsLR.show(n=20, truncate=True)

+-------+----+-----+----+------+-----+----+------+---+---+-------+------+-----+----+---+--------------------+------------------+
|   CRIM|  ZN|INDUS|CHAS|   NOX|   RM| AGE|   DIS|RAD|TAX|PTRATIO|     B|LSTAT|MEDV|CAT|            features|        prediction|
+-------+----+-----+----+------+-----+----+------+---+---+-------+------+-----+----+---+--------------------+------------------+
|0.01311|90.0| 1.22|   0| 0.403|7.249|21.9|8.6966|  5|226|   17.9|395.93| 4.81|35.4|  1|[0.01311,90.0,1.2...|31.089823178644174|
|0.01439|60.0| 2.93|   0| 0.401|6.604|18.8|6.2196|  1|265|   15.6| 376.7| 4.38|29.1|  0|[0.01439,60.0,2.9...|31.556648692863845|
|0.03871|52.5| 5.32|   0| 0.405|6.209|31.3|7.3172|  6|293|   16.6| 396.9| 7.14|23.2|  0|[0.03871,52.5,5.3...| 26.78174033602274|
| 0.0456| 0.0|13.89|   1|  0.55|5.888|56.0|3.1121|  5|276|   16.4| 392.8|13.51|23.3|  0|[0.0456,0.0,13.89...|25.900449635345783|
|0.04932|33.0| 2.18|   0| 0.472|6.849|70.3|3.1827|  7|222|   18.4| 396.9| 7.53|28.2|  0|[0.04932,

In [87]:
# Evaluator computing RMSE between the MEDV label and the "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="MEDV",
    predictionCol="prediction",
)

In [88]:
# Evaluate the metric on the test predictions, report it, then shut the
# Spark session down.
rmse = evaluator.evaluate(predictionsLR)
print("root mean square error = " , rmse)
spark.stop()

root mean square error =  4.563994666307601
