## PySpark example

In [1]:
# after: https://www.geeksforgeeks.org/pyspark-linear-regression-using-apache-mllib/

import pyspark
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName('housing_price_model')
    .getOrCreate()
)

# Load the real-estate CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv('Real estate.csv', header=True, inferSchema=True)

# Preview the first 10 rows.
df.show(10)

+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
| No|X1 transaction date|X2 house age|X3 distance to the nearest MRT station|X4 number of convenience stores|X5 latitude|X6 longitude|Y house price of unit area|
+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
|  1|           2012.917|        32.0|                              84.87882|                             10|   24.98298|   121.54024|                      37.9|
|  2|           2012.917|        19.5|                              306.5947|                              9|   24.98034|   121.53951|                      42.2|
|  3|           2013.583|        13.3|                              561.9845|                              5|   24.98746|   121.54391|                      47.3|
|  4|             2013.5|   

In [2]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Source columns that get packed into a single vector column.
feature_cols = [
    'X1 transaction date',
    'X2 house age',
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
    'X5 latitude',
    'X6 longitude',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Append the assembled 'features' column to the loaded frame.
output = assembler.transform(df)

# Preview the feature vector next to the target column.
output.select('features', 'Y house price of unit area').show(5)

+--------------------+--------------------------+
|            features|Y house price of unit area|
+--------------------+--------------------------+
|[2012.917,32.0,84...|                      37.9|
|[2012.917,19.5,30...|                      42.2|
|[2013.583,13.3,56...|                      47.3|
|[2013.5,13.3,561....|                      54.8|
|[2012.833,5.0,390...|                      43.1|
+--------------------+--------------------------+
only showing top 5 rows



In [3]:
# Keep only the assembled feature vector and the target column.
final_data = output.select('features', 'Y house price of unit area')

# Split roughly 70% / 30% into train and test sets. The fixed seed keeps the
# split (and therefore the fitted model and its metrics) the same across
# re-runs, provided the input is read with the same partitioning.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

# Summary statistics for each split.
train_data.describe().show()
test_data.describe().show()

+-------+--------------------------+
|summary|Y house price of unit area|
+-------+--------------------------+
|  count|                       291|
|   mean|         38.48762886597939|
| stddev|        13.644649761608282|
|    min|                      11.6|
|    max|                     117.5|
+-------+--------------------------+

+-------+--------------------------+
|summary|Y house price of unit area|
+-------+--------------------------+
|  count|                       123|
|   mean|         36.77967479674797|
| stddev|        13.494989715978283|
|    min|                       7.6|
|    max|                      78.0|
+-------+--------------------------+



In [4]:
from pyspark.ml.regression import LinearRegression
# Linear regression estimator reading the assembled features and the price
# column as label; feature standardization is turned off.
model = LinearRegression(
    featuresCol='features',
    labelCol='Y house price of unit area',
    maxIter=5000,
    standardization=False,
)

# Fit on the training split.
trained_model = model.fit(train_data)

# Evaluate on the held-out split and report its mean squared error.
results = trained_model.evaluate(test_data)
print('Mean Squared Error :', results.meanSquaredError)

Mean Squared Error : 73.20686458681352


In [5]:
# Keep only the feature vectors of the test split (label column dropped),
# then let the fitted model append a 'prediction' column.
unlabeled_data = test_data.select('features')
predictions = trained_model.transform(unlabeled_data)

# Show the model inputs first (5 rows), then the predictions.
unlabeled_data.show(n=5)
predictions.show()

+--------------------+
|            features|
+--------------------+
|[2012.667,0.0,185...|
|[2012.667,1.5,23....|
|[2012.667,3.1,383...|
|[2012.667,12.4,17...|
|[2012.667,15.5,81...|
+--------------------+
only showing top 5 rows

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[2012.667,0.0,185...|39.789802555940696|
|[2012.667,1.5,23....| 46.08676524455586|
|[2012.667,3.1,383...| 44.89126791759554|
|[2012.667,12.4,17...| 30.39431888310719|
|[2012.667,15.5,81...|38.926020330080064|
|[2012.667,15.6,28...| 42.97455856810484|
|[2012.667,29.4,45...| 9.425031192917231|
|[2012.667,30.4,17...|26.611945166983787|
|[2012.667,33.4,18...|37.257660160896194|
|[2012.75,13.5,419...| 11.55182338708255|
|[2012.75,14.1,261...|21.835341941819934|
|[2012.75,29.3,529...| 42.20486882581463|
|[2012.75,34.4,512...|40.620087025712564|
|[2012.75,37.9,488...| 32.33556090537604|
|[2012.833,0.0,185...| 40.89586924658943|
|[2012.833,0

In [6]:
# Two hand-made rows that differ only in the first feature (1.0 vs 100.0).
# NOTE: these values are far from those in the data preview above (e.g.
# transaction dates around 2013), so the predictions are extrapolations.
probe_columns = [
    'X1 transaction date',
    'X2 house age',
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
    'X5 latitude',
    'X6 longitude',
]
probe_rows = [
    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
    [100.0, 1.0, 1.0, 1.0, 1.0, 1.0],
]
probe_df = spark.createDataFrame(probe_rows, probe_columns)

# Assemble the feature vector with the same assembler used for training and
# keep only that column before scoring.
probe_features = assembler.transform(probe_df).select("features")

# Rebinds `predictions`, replacing any earlier value of that name.
predictions = trained_model.transform(probe_features)
predictions.show()

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|[1.0,1.0,1.0,1.0,...|-17099.237975643595|
|[100.0,1.0,1.0,1....|-16439.595792668075|
+--------------------+-------------------+

