In [23]:
from pyspark.sql import SparkSession

# Build the Spark session used by every cell below (getOrCreate returns the
# already-running session if there is one).
spark = (
    SparkSession.builder
    .appName('House price prediction')
    .getOrCreate()
)



In [24]:
# Read the house-sales CSV: the first row supplies the column names and
# column types are inferred from the values.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('data/kc_house_data.csv')
)

# Show the inferred column names and types.
data.printSchema()

root
 |-- id: long (nullable = true)
 |-- date: string (nullable = true)
 |-- price: double (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: double (nullable = true)
 |-- sqft_living: integer (nullable = true)
 |-- sqft_lot: integer (nullable = true)
 |-- floors: double (nullable = true)
 |-- waterfront: integer (nullable = true)
 |-- view: integer (nullable = true)
 |-- condition: integer (nullable = true)
 |-- grade: integer (nullable = true)
 |-- sqft_above: integer (nullable = true)
 |-- sqft_basement: integer (nullable = true)
 |-- yr_built: integer (nullable = true)
 |-- yr_renovated: integer (nullable = true)
 |-- zipcode: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- sqft_living15: integer (nullable = true)
 |-- sqft_lot15: integer (nullable = true)



In [25]:
# Preview the first 20 rows with long cell values truncated (the defaults, spelled out).
data.show(n=20, truncate=True)

+----------+---------------+---------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+-------+--------+-------------+----------+
|        id|           date|    price|bedrooms|bathrooms|sqft_living|sqft_lot|floors|waterfront|view|condition|grade|sqft_above|sqft_basement|yr_built|yr_renovated|zipcode|    lat|    long|sqft_living15|sqft_lot15|
+----------+---------------+---------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+-------+--------+-------------+----------+
|7129300520|20141013T000000| 221900.0|       3|      1.0|       1180|    5650|   1.0|         0|   0|        3|    7|      1180|            0|    1955|           0|  98178|47.5112|-122.257|         1340|      5650|
|6414100192|20141209T000000| 538000.0|       3|     2.25|       2570|    7242|   2.0|         0|   0|        3|    7|      2170|          40

In [26]:
# Train/test split
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

In [27]:
# Select numerical columns (excluding the price)
numerical_columns = [col for col, dtype in data.dtypes if dtype in ('int', 'double') and col != 'price'] 
numerical_columns

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15']

In [34]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Create feature vectors
assembler = VectorAssembler(inputCols=numerical_columns, outputCol='features')
assembled_data = assembler.transform(data)

# Prepare the dataset with features and label
final_data = assembled_data.select('features', 'price')

# Train a model
lr = LinearRegression(featuresCol='features', labelCol='price')
lr_model = lr.fit(train_data)

24/11/20 23:56:59 WARN Instrumentation: [db756f68] regParam is zero, which might cause numerical instability and overfitting.


In [35]:
# Score the held-out rows with the fitted model; the evaluation cell reads the
# resulting 'prediction' column.
predictions = lr_model.transform(dataset=test_data)

In [36]:
from pyspark.ml.evaluation import RegressionEvaluator

# Mean absolute error between the true `price` and the model's `prediction`
# on the test split.
evaluator_mae = RegressionEvaluator(
    metricName='mae',
    labelCol='price',
    predictionCol='prediction',
)
mae = evaluator_mae.evaluate(predictions)
print(f"Mean Absolute Error (MAE): {mae}")

Mean Absolute Error (MAE): 128226.71131498019
