In [1]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session, creating one named "Feature engineering"
# if none is active yet.
spark = (
    SparkSession.builder
    .appName("Feature engineering")
    .getOrCreate()
)

24/10/09 09:03:19 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
 # Load the housing CSV: the first row supplies the column names and the
 # column types are inferred from the values.
 data = spark.read.csv(
     "data/housing.csv",
     inferSchema=True,
     header=True,
 )

 # Display the inferred schema so the column types can be checked by eye.
 data.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)
 |-- ocean_proximity: string (nullable = true)



In [3]:
# Row count of the frame exactly as loaded from the CSV (no rows removed yet).
data.count()

20640

In [4]:
# Discard every row whose total_bedrooms value is missing; all other columns
# are left as they are.
filtered_data = data.dropna(subset=["total_bedrooms"])

# Remaining row count after the null filter.
filtered_data.count()

20433

In [5]:
# 80/20 random split of the null-filtered rows; the fixed seed keeps the
# partition repeatable between runs.
train_data, test_data = filtered_data.randomSplit(weights=[0.8, 0.2], seed=42)

# Report how many rows landed in each split.
for caption, split in (("Train size: ", train_data), ("Test size: ", test_data)):
    print(caption, split.count())

Train size:  16395
Test size:  4038


In [6]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler

# Map the ocean_proximity strings to numeric indices. The indexer is fitted on
# the training split only; the fitted model is reused for the test split later.
indexer = StringIndexer(inputCol='ocean_proximity', outputCol='ocean_proximity_index')

# train_data is reassigned at the end of this cell, so on a re-run it already
# carries the indexer's output column and Spark refuses to run a stage whose
# output column exists. drop() is a no-op when the column is absent, so the
# first run is unchanged and later re-runs no longer fail.
train_data = train_data.drop(indexer.getOutputCol())

indexer_model = indexer.fit(train_data)
train_data = indexer_model.transform(train_data)
train_data.show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|ocean_proximity_index|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+
|  -124.35|   40.54|              52.0|     1820.0|         300.0|     806.0|     270.0|       3.0147|           94600.0|     NEAR OCEAN|                  2.0|
|   -124.3|    41.8|              19.0|     2672.0|         552.0|    1298.0|     478.0|       1.9797|           85800.0|     NEAR OCEAN|                  2.0|
|  -124.27|   40.69|              36.0|     2349.0|         528.0|    1194.0|     465.0|       2.5179|           79000.0|     NEAR OCEAN|                  2.0|
|  -124.26|   40.58|              52.0| 

In [7]:
# One-hot encode the numeric category index into a sparse vector column.
# As with the indexer, the encoder is fitted on the training split only.
encoder = OneHotEncoder(inputCol='ocean_proximity_index', outputCol='ocean_proximity_vec')

# Make the cell safe to re-run: train_data is overwritten below, so a second
# execution would otherwise hit an already-existing output column.
# drop() does nothing when the column is not present.
train_data = train_data.drop(encoder.getOutputCol())

encoder_model = encoder.fit(train_data)
train_data = encoder_model.transform(train_data)
train_data.show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+-------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|ocean_proximity_index|ocean_proximity_vec|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+-------------------+
|  -124.35|   40.54|              52.0|     1820.0|         300.0|     806.0|     270.0|       3.0147|           94600.0|     NEAR OCEAN|                  2.0|      (4,[2],[1.0])|
|   -124.3|    41.8|              19.0|     2672.0|         552.0|    1298.0|     478.0|       1.9797|           85800.0|     NEAR OCEAN|                  2.0|      (4,[2],[1.0])|
|  -124.27|   40.69|              36.0|     2349.0|         528.0|    1194.0|     465.0|       2.517

In [8]:
from pyspark.ml.feature import VectorAssembler

# Inputs to the model: six numeric columns plus the one-hot ocean_proximity vector.
# NOTE(review): longitude and latitude are not part of the feature list - confirm
# that this is intentional.
feature_columns = [
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity_vec',
]

# Combine the listed columns into one vector column for the scaler to consume.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='unscaled_features')

# train_data is overwritten with its own transformed version, so drop any
# previous 'unscaled_features' column first (no-op on the first run); without
# this a re-run of the cell fails because the output column already exists.
train_data = assembler.transform(train_data.drop(assembler.getOutputCol()))
train_data.show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+-------------------+--------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|ocean_proximity_index|ocean_proximity_vec|   unscaled_features|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+-------------------+--------------------+
|  -124.35|   40.54|              52.0|     1820.0|         300.0|     806.0|     270.0|       3.0147|           94600.0|     NEAR OCEAN|                  2.0|      (4,[2],[1.0])|[52.0,1820.0,300....|
|   -124.3|    41.8|              19.0|     2672.0|         552.0|    1298.0|     478.0|       1.9797|           85800.0|     NEAR OCEAN|                  2.0|      (4,[2],[1.0])|[19.0,2672.0,552.

In [9]:
# Standardise the assembled feature vector (both withMean and withStd enabled).
# The scaling statistics are fitted on the training frame and stored in
# scaler_model so the same statistics can be applied to the test split.
scaler = StandardScaler(
    inputCol='unscaled_features',
    outputCol='features',
    withStd=True,
    withMean=True,
)
scaler_model = scaler.fit(train_data)

# The result gets its own name, leaving train_data untouched by this cell.
transformed_train_data = scaler_model.transform(train_data)
transformed_train_data.show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+-------------------+--------------------+--------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|ocean_proximity_index|ocean_proximity_vec|   unscaled_features|            features|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+-------------------+--------------------+--------------------+
|  -124.35|   40.54|              52.0|     1820.0|         300.0|     806.0|     270.0|       3.0147|           94600.0|     NEAR OCEAN|                  2.0|      (4,[2],[1.0])|[52.0,1820.0,300....|[1.85210106716407...|
|   -124.3|    41.8|              19.0|     2672.0|         552.0|    1298.0|     478.0|       1.9797|          

In [10]:
from pyspark.ml.regression import LinearRegression

# Linear regression of median_house_value on the scaled feature vector.
# No regParam is passed, which is why Spark logs the "regParam is zero"
# warning shown in this cell's output.
lr = LinearRegression(labelCol='median_house_value', featuresCol='features')

# Fit on the fully transformed training frame.
model = lr.fit(transformed_train_data)

24/10/09 09:03:51 WARN Instrumentation: [b0124472] regParam is zero, which might cause numerical instability and overfitting.
24/10/09 09:03:51 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/10/09 09:03:51 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/10/09 09:03:51 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [11]:
# Score the training frame with the fitted model; transform() appends a
# 'prediction' column next to the existing ones.
train_predictions = model.transform(dataset=transformed_train_data)

# Preview the first rows, including the new prediction column.
train_predictions.show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+-------------------+--------------------+--------------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|ocean_proximity_index|ocean_proximity_vec|   unscaled_features|            features|        prediction|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+-------------------+--------------------+--------------------+------------------+
|  -124.35|   40.54|              52.0|     1820.0|         300.0|     806.0|     270.0|       3.0147|           94600.0|     NEAR OCEAN|                  2.0|      (4,[2],[1.0])|[52.0,1820.0,300....|[1.85210106716407...|226114.99700043933|
|   -124.3|    41.8|              19

In [14]:
# Apply the stages that were fitted on the training split to the test split,
# in the same order: index -> one-hot -> assemble -> scale. Nothing is re-fitted
# here, so no statistics are learned from the test rows.
fitted_stages = (indexer_model, encoder_model, assembler, scaler_model)

# test_data is overwritten in place, so a re-run of this cell would find the
# stages' output columns already present and fail. Dropping them first is a
# no-op on the first run (drop() ignores missing columns) and restores the raw
# columns on later runs.
test_data = test_data.drop(*(stage.getOutputCol() for stage in fitted_stages))

for stage in fitted_stages:
    test_data = stage.transform(test_data)

In [15]:
# Score the transformed test split with the model fitted on the training data;
# the result carries an added 'prediction' column.
test_predictions = model.transform(dataset=test_data)

# Preview the first rows of the scored test split.
test_predictions.show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+-------------------+--------------------+--------------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|ocean_proximity_index|ocean_proximity_vec|   unscaled_features|            features|        prediction|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+-------------------+--------------------+--------------------+------------------+
|   -124.3|   41.84|              17.0|     2677.0|         531.0|    1244.0|     456.0|       3.0313|          103600.0|     NEAR OCEAN|                  2.0|      (4,[2],[1.0])|[17.0,2677.0,531....|[-0.9217434791825...|195064.51529543975|
|  -124.23|   40.54|              52

In [17]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator comparing the 'prediction' column with the true median_house_value
# using the 'rmse' metric.
test_evaluator_rmse = RegressionEvaluator(
    metricName='rmse',
    labelCol='median_house_value',
    predictionCol='prediction',
)

# Compute the metric on the held-out test predictions and report it.
test_rmse = test_evaluator_rmse.evaluate(test_predictions)
print(f"Root Mean Squared Error (RMSE): {test_rmse}")

Root Mean Squared Error (RMSE): 69825.72320493394
