In [108]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.sql.functions import isnan, when, count, col
from pyspark.ml.evaluation import RegressionEvaluator

In [109]:
# Create the Spark session for this notebook, or reuse one that is already active.
spark = (
    SparkSession.builder
    .appName("CaliforniaHousingPrices")
    .getOrCreate()
)

In [110]:
# Read the housing CSV: the first row supplies column names and Spark infers
# each column's type from the file contents.
csv_reader = spark.read.options(header=True, inferSchema=True)
data = csv_reader.csv("data/housing.csv")

# Show the inferred column names and types.
data.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)
 |-- ocean_proximity: string (nullable = true)



In [111]:
# Number of rows in the DataFrame as loaded, before any rows are dropped.
data.count()

20640

In [112]:
# One aggregate per column: `when` yields a non-null value only for rows where
# the column is null, and `count` tallies those non-null values.
null_count_exprs = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in data.columns
]
data.select(null_count_exprs).show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|        0|       0|                 0|          0|           207|         0|         0|            0|                 0|              0|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+



In [113]:
# Discard the rows whose total_bedrooms is missing, then report how many remain.
filtered_data = data.dropna(subset=['total_bedrooms'])
filtered_data.count()

20433

In [114]:
# 80/20 random split of the filtered rows; the fixed seed keeps it repeatable.
split_frames = filtered_data.randomSplit([0.8, 0.2], seed=42)
train_data, test_data = split_frames

print("Train size: ", train_data.count())
print("Test size: ", test_data.count())

Train size:  16395
Test size:  4038


In [116]:

# Categorical feature: map ocean_proximity strings to indices, then one-hot
# encode those indices into a vector column.
indexer = StringIndexer(
    inputCol='ocean_proximity',
    outputCol='ocean_proximity_index',
)
encoder = OneHotEncoder(
    inputCol='ocean_proximity_index',
    outputCol='ocean_proximity_vec',
)

# Numeric columns plus the encoded category are assembled into one vector.
feature_cols = [
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity_vec',
]
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='unscaled_features',
)

# Standardize the assembled vector (centered and scaled to unit std).
scaler = StandardScaler(
    inputCol='unscaled_features',
    outputCol='features',
    withMean=True,
    withStd=True,
)

# Linear regression on the scaled features, predicting median_house_value.
lr = LinearRegression(
    featuresCol='features',
    labelCol='median_house_value',
    regParam=0.001,
)

# Chain every stage and fit the whole pipeline on the training split only.
pipeline = Pipeline(stages=[indexer, encoder, assembler, scaler, lr])
pipeline_model = pipeline.fit(train_data)

In [118]:
# Apply the fitted pipeline to the held-out test split and keep the resulting DataFrame.
predictions = pipeline_model.transform(test_data)

In [119]:
# Score the test-split predictions with RMSE against the true median_house_value.
# The original cell called `evaluator_rmse.evaluate(test_predictions)`, but neither
# name is defined in this notebook, so it raised NameError on a fresh kernel.
# Use the evaluator created here and the `predictions` DataFrame produced by
# pipeline_model.transform(test_data).
test_evaluator_rmse = RegressionEvaluator(
    labelCol='median_house_value',
    predictionCol='prediction',
    metricName='rmse',
)
test_rmse = test_evaluator_rmse.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {test_rmse}")

Root Mean Squared Error (RMSE): 69825.72320493394
