In [None]:
from pyspark.sql import SparkSession


# Start (or reuse) a Spark session named "HousePricePrediction" on the local[4] master.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("HousePricePrediction")
    .getOrCreate()
)

# Read the CSV with its first row as the header and let Spark infer column types.
file_path = "house-prices.csv"
data = spark.read.options(header=True, inferSchema=True).csv(file_path)

# Preview the loaded rows as a text table.
data.show()


+----+------+----+--------+---------+------+-----+------------+
|Home| Price|SqFt|Bedrooms|Bathrooms|Offers|Brick|Neighborhood|
+----+------+----+--------+---------+------+-----+------------+
|   1|114300|1790|       2|        2|     2|   No|        East|
|   2|114200|2030|       4|        2|     3|   No|        East|
|   3|114800|1740|       3|        2|     1|   No|        East|
|   4| 94700|1980|       3|        2|     3|   No|        East|
|   5|119800|2130|       3|        3|     3|   No|        East|
|   6|114600|1780|       3|        2|     2|   No|       North|
|   7|151600|1830|       3|        3|     3|  Yes|        West|
|   8|150700|2160|       4|        2|     2|   No|        West|
|   9|119200|2110|       4|        2|     3|   No|        East|
|  10|104000|1730|       3|        3|     3|   No|        East|
|  11|132500|2030|       3|        2|     3|  Yes|        East|
|  12|123000|1870|       2|        2|     2|  Yes|        East|
|  13|102600|1910|       3|        2|   

In [None]:
from pyspark.ml.feature import VectorAssembler


# Pack the three numeric predictor columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=["SqFt", "Bedrooms", "Bathrooms"], outputCol="features")

# `data` is rebound to the assembled frame (later cells read it under this name), which
# makes a plain re-run of this cell fail: the select below drops the input columns and
# "features" would already exist. Only transform while `data` is still the raw frame.
if "features" not in data.columns:
    data = assembler.transform(data).select("features", "Price")

# Preview the (features, Price) pairs that the model will be trained on.
data.show()


+----------------+------+
|        features| Price|
+----------------+------+
|[1790.0,2.0,2.0]|114300|
|[2030.0,4.0,2.0]|114200|
|[1740.0,3.0,2.0]|114800|
|[1980.0,3.0,2.0]| 94700|
|[2130.0,3.0,3.0]|119800|
|[1780.0,3.0,2.0]|114600|
|[1830.0,3.0,3.0]|151600|
|[2160.0,4.0,2.0]|150700|
|[2110.0,4.0,2.0]|119200|
|[1730.0,3.0,3.0]|104000|
|[2030.0,3.0,2.0]|132500|
|[1870.0,2.0,2.0]|123000|
|[1910.0,3.0,2.0]|102600|
|[2150.0,3.0,3.0]|126300|
|[2590.0,4.0,3.0]|176800|
|[1780.0,4.0,2.0]|145800|
|[2190.0,3.0,3.0]|147100|
|[1990.0,3.0,3.0]| 83600|
|[1700.0,2.0,2.0]|111400|
|[1920.0,3.0,3.0]|167200|
+----------------+------+
only showing top 20 rows



In [None]:

# Randomly partition the rows with 0.8 / 0.2 weights; the seed is fixed at 42.
partitions = data.randomSplit(weights=[0.8, 0.2], seed=42)
train_data, test_data = partitions

# Report how many rows ended up in each partition.
print(f"Training Data Count: {train_data.count()}")
print(f"Test Data Count: {test_data.count()}")


Training Data Count: 106
Test Data Count: 22


In [None]:
from pyspark.ml.regression import LinearRegression


# Linear regression of "Price" on the "features" vector, all other settings left at defaults.
lr = LinearRegression(labelCol="Price", featuresCol="features")

# Fit on the training partition only.
lr_model = lr.fit(train_data)

# Print the fitted parameters, coefficients first and intercept second.
for label, value in (
    ("Coefficients:", lr_model.coefficients),
    ("Intercept:", lr_model.intercept),
):
    print(label, value)


Coefficients: [34.90751731139586,11056.499492136118,17474.42629513101]
Intercept: -14926.095898035921


In [None]:

# Score the held-out rows with the fitted model.
predictions = lr_model.transform(test_data)

# Show the inputs, the actual price and the predicted price side by side.
comparison_cols = ["features", "Price", "prediction"]
predictions.select(*comparison_cols).show()


+----------------+------+------------------+
|        features| Price|        prediction|
+----------------+------+------------------+
|[1560.0,2.0,2.0]|106600| 96591.48268227588|
|[1650.0,3.0,2.0]|107300|110789.65873243762|
|[1700.0,2.0,2.0]|111400| 101478.5351058713|
|[1720.0,3.0,2.0]|131300|113233.18494423531|
|[1780.0,4.0,2.0]|143600| 126384.1354750552|
|[1810.0,3.0,2.0]|103200|116374.86150226096|
|[1860.0,3.0,2.0]|130300|118120.23736783076|
|[1900.0,3.0,3.0]|102500| 136990.9643554176|
|[1920.0,4.0,2.0]|143100|131271.18789865062|
|[1930.0,2.0,2.0]|112300|109507.26408749234|
|[1930.0,2.0,3.0]|110400|126981.69038262335|
|[1930.0,3.0,3.0]|105600|138038.18987475947|
|[1940.0,2.0,2.0]|123600| 109856.3392606063|
|[1970.0,2.0,2.0]|152500|110903.56477994818|
|[2000.0,2.0,2.0]|117800|111950.79029929006|
|[2010.0,4.0,3.0]|124500|151887.29075180728|
|[2050.0,3.0,2.0]| 90300|124752.66565699596|
|[2150.0,4.0,3.0]|160600|156774.34317540267|
|[2250.0,3.0,3.0]|124600|149208.59541440615|
|[2260.0,3

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator


# Compare the "prediction" column against the "Price" label using the RMSE metric.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Price",
    predictionCol="prediction",
)

# Evaluate on the scored test rows and report the result.
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")


Root Mean Squared Error (RMSE): 21674.389241589663
