In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 51 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 71.8 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=bc199ee024543198116bdb2fff823b36b25f84729cd411f307510b1009c75bf8
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('LinearRegression')
    .getOrCreate()
)

In [6]:
# Load the CSV: the first row supplies the column names and Spark infers
# each column's type from the data.
real_estate = spark.read.csv(
    path='/Real estate.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Print the column names together with the types inferred while reading the CSV.
real_estate.printSchema()

root
 |-- No: integer (nullable = true)
 |-- X1 transaction date: double (nullable = true)
 |-- X2 house age: double (nullable = true)
 |-- X3 distance to the nearest MRT station: double (nullable = true)
 |-- X4 number of convenience stores: integer (nullable = true)
 |-- X5 latitude: double (nullable = true)
 |-- X6 longitude: double (nullable = true)
 |-- Y house price of unit area: double (nullable = true)



In [None]:
# Peek at the first two rows to sanity-check the load.
real_estate.show(n=2)

+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
| No|X1 transaction date|X2 house age|X3 distance to the nearest MRT station|X4 number of convenience stores|X5 latitude|X6 longitude|Y house price of unit area|
+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
|  1|           2012.917|        32.0|                              84.87882|                             10|   24.98298|   121.54024|                      37.9|
|  2|           2012.917|        19.5|                              306.5947|                              9|   24.98034|   121.53951|                      42.2|
+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
only showing top 2 rows



In [None]:
# describe() produces one row per summary statistic; only the first two
# rows (count and mean in the recorded output) are displayed here.
real_estate.describe().show(n=2)

+-------+-----+-------------------+-----------------+--------------------------------------+-------------------------------+------------------+------------------+--------------------------+
|summary|   No|X1 transaction date|     X2 house age|X3 distance to the nearest MRT station|X4 number of convenience stores|       X5 latitude|      X6 longitude|Y house price of unit area|
+-------+-----+-------------------+-----------------+--------------------------------------+-------------------------------+------------------+------------------+--------------------------+
|  count|  414|                414|              414|                                   414|                            414|               414|               414|                       414|
|   mean|207.5| 2013.1489710144933|17.71256038647343|                    1083.8856889130436|              4.094202898550725|24.969030072463745|121.53336108695667|         37.98019323671498|
+-------+-----+-------------------+---------------

In [7]:
from pyspark.ml.feature import VectorAssembler
# Pack the six predictor columns (X1..X6) into a single vector column named
# "features". The "No" column and the target column are not included.
assemble = VectorAssembler(
    outputCol="features",
    inputCols=[
        'X1 transaction date',
        'X2 house age',
        'X3 distance to the nearest MRT station',
        'X4 number of convenience stores',
        'X5 latitude',
        'X6 longitude',
    ],
)

In [8]:
# Apply the assembler: every original column is kept and the "features"
# vector column is appended.
data_set = assemble.transform(real_estate)
data_set  # bare expression so the cell displays the resulting DataFrame's columns

DataFrame[No: int, X1 transaction date: double, X2 house age: double, X3 distance to the nearest MRT station: double, X4 number of convenience stores: int, X5 latitude: double, X6 longitude: double, Y house price of unit area: double, features: vector]

In [9]:
# Preview the model inputs: the assembled feature vector next to the target.
# The target is spelled exactly as in the schema ("Y house ...", lower-case
# "h") so the lookup does not rely on case-insensitive column resolution and
# stays consistent with the label column used when fitting the model.
data_set.select(["features", "Y house price of unit area"]).show(2)

+--------------------+--------------------------+
|            features|Y House price of unit area|
+--------------------+--------------------------+
|[2012.917,32.0,84...|                      37.9|
|[2012.917,19.5,30...|                      42.2|
+--------------------+--------------------------+
only showing top 2 rows



In [11]:
# Split the rows roughly 70/30 into training and test sets. The weights are
# sampling probabilities, so the actual sizes are approximate. A fixed seed
# keeps the split (and therefore the reported metrics) repeatable across
# re-runs on the same input.
train_data, test_data = data_set.randomSplit([0.7, 0.3], seed=42)
train_data.show(truncate=False)

+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+------------------------------------------------+
|No |X1 transaction date|X2 house age|X3 distance to the nearest MRT station|X4 number of convenience stores|X5 latitude|X6 longitude|Y house price of unit area|features                                        |
+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+------------------------------------------------+
|1  |2012.917           |32.0        |84.87882                              |10                             |24.98298   |121.54024   |37.9                      |[2012.917,32.0,84.87882,10.0,24.98298,121.54024]|
|2  |2012.917           |19.5        |306.5947                              |9                              |24.98034   |121.53951   |42.2                  

In [12]:
# Display rows of the test split without truncating wide cell values.
test_data.show(truncate=False)

+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+-----------------------------------------------+
|No |X1 transaction date|X2 house age|X3 distance to the nearest MRT station|X4 number of convenience stores|X5 latitude|X6 longitude|Y house price of unit area|features                                       |
+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+-----------------------------------------------+
|5  |2012.833           |5.0         |390.5684                              |5                              |24.97937   |121.54245   |43.1                      |[2012.833,5.0,390.5684,5.0,24.97937,121.54245] |
|6  |2012.667           |7.1         |2175.03                               |3                              |24.96305   |121.51254   |32.1                      

In [13]:
from pyspark.ml.regression import LinearRegression
# Fit a linear regression of the house-price target on the assembled feature
# vector, using the training split only. `featuresCol` is spelled out for
# readability; 'features' is also the estimator's default.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Y house price of unit area',
)
lrModel = lr.fit(train_data)


In [15]:
# Score the test split; the result keeps every input column and adds a
# `prediction` column.
# NOTE(review): this rebinds `lr`, which held the LinearRegression estimator
# in the fitting cell, to a DataFrame of predictions. Re-running the fitting
# cell restores the estimator; consider a separate name such as `predictions`.
lr = lrModel.transform(test_data)
lr.show()

+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+--------------------+------------------+
| No|X1 transaction date|X2 house age|X3 distance to the nearest MRT station|X4 number of convenience stores|X5 latitude|X6 longitude|Y house price of unit area|            features|        prediction|
+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+--------------------+------------------+
|  5|           2012.833|         5.0|                              390.5684|                              5|   24.97937|   121.54245|                      43.1|[2012.833,5.0,390...| 46.09524114710257|
|  6|           2012.667|         7.1|                               2175.03|                              3|   24.96305|   121.51254|                      32.1|[2012.667,7.1,217...|30.9988931

In [None]:
# Evaluate the fitted model on the test split and report each metric under
# its own name (previously all three lines were labelled "RMSE").
test_stats = lrModel.evaluate(test_data)
print(f"RMSE:{test_stats.rootMeanSquaredError}")
print(f"R2:{test_stats.r2}")
print(f"MSE:{test_stats.meanSquaredError}")

RMSE:10.226417275769185
RMSE:0.5751173947531265
RMSE:104.57961029815044
