In [1]:
from pyspark.sql import SparkSession

# Get the SparkSession for this notebook. getOrCreate() hands back an already
# running session when there is one, otherwise it starts a new one under the
# application name given here.
spark = (
    SparkSession.builder
    .appName("Predicting housing prices")
    .getOrCreate()
)

24/10/09 23:10:23 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
# Load the housing CSV. The first line supplies the column names and the
# column types are inferred from the values instead of being declared.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data/housing.csv")
)

# Print the inferred column names and types.
data.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)
 |-- ocean_proximity: string (nullable = true)



In [3]:
# Number of rows in the dataset as loaded, before any rows are dropped.
total_rows = data.count()
total_rows

20640

In [4]:
import pyspark.sql.functions as F

# Count the rows where ocean_proximity is null. F.when(...) produces 1 for a
# null row and null otherwise, and F.count only tallies non-null values, so
# the aggregate equals the number of null rows.
ocean_is_null = F.col('ocean_proximity').isNull()
ocean_null_count = F.count(F.when(ocean_is_null, 1)).alias('ocean_proximity')

data.select(ocean_null_count).show()

+---------------+
|ocean_proximity|
+---------------+
|              0|
+---------------+



In [5]:
# Same null tally as for a single column, built once per column of `data`.
# Each aggregate is aliased to the column it measures, so the one-row result
# reads like the original schema with null counts as values.
null_counts = [
    F.count(F.when(F.col(column_name).isNull(), 1)).alias(column_name)
    for column_name in data.columns
]

data.select(null_counts).show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|        0|       0|                 0|          0|           207|         0|         0|            0|                 0|              0|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+



In [6]:
# Drop the rows that have no total_bedrooms value. The result gets its own
# name so the unfiltered `data` frame stays available to other cells.
filtered_data = data.na.drop(subset=['total_bedrooms'])

# Row count after the drop.
filtered_data.count()

20433

In [7]:
from pyspark.ml.feature import VectorAssembler

# Numeric predictors used by the model. longitude, latitude, ocean_proximity
# and the target median_house_value are deliberately not in this list.
# The assembled vector follows the order of this list.
feature_columns = [
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]

# Packs the columns above into a single vector column named 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=feature_columns)

# NOTE(review): the output saved with this cell is
# "ModuleNotFoundError: No module named 'numpy'" - presumably numpy was not
# installed in the kernel environment at the time; confirm it is available
# before re-running.
transformed_data = assembler.transform(filtered_data)
transformed_data.show()

ModuleNotFoundError: No module named 'numpy'

In [14]:
from pyspark.ml.regression import LinearRegression

# Linear regression from the assembled 'features' vector to
# median_house_value. No other hyperparameters are set; the fit log warns
# that regParam is zero (no regularisation).
lr = LinearRegression(labelCol='median_house_value', featuresCol='features')

# Fit on every row of transformed_data; no hold-out split is made here.
model = lr.fit(transformed_data)

24/10/09 08:55:21 WARN Instrumentation: [418eee96] regParam is zero, which might cause numerical instability and overfitting.
24/10/09 08:55:21 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/10/09 08:55:21 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/10/09 08:55:21 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [15]:
# One hand-built record to score with the fitted model. The six feature
# values are taken from the pasted row below; its last two fields
# (452600.0, NEAR BAY) are not features - presumably the row's
# median_house_value and ocean_proximity.
# 41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY
single_record = [
    dict(
        housing_median_age=41.0,
        total_rooms=880.0,
        total_bedrooms=129.0,
        population=322.0,
        households=126.0,
        median_income=8.3252,
    )
]

# Column names and types are inferred from the dict keys and values.
single_record_df = spark.createDataFrame(single_record)


In [16]:
# Run the single record through the same assembler used for the training
# data, so its 'features' vector is built from the same columns in the same
# order.
new_data = assembler.transform(single_record_df)
# Display the record together with its assembled feature vector.
new_data.show()

+----------+------------------+-------------+----------+--------------+-----------+--------------------+
|households|housing_median_age|median_income|population|total_bedrooms|total_rooms|            features|
+----------+------------------+-------------+----------+--------------+-----------+--------------------+
|     126.0|              41.0|       8.3252|     322.0|         129.0|      880.0|[41.0,880.0,129.0...|
+----------+------------------+-------------+----------+--------------+-----------+--------------------+



In [17]:
# Score the single record: transform() appends a 'prediction' column holding
# the model's estimate of median_house_value.
predicted = model.transform(new_data)
predicted.show()

+----------+------------------+-------------+----------+--------------+-----------+--------------------+------------------+
|households|housing_median_age|median_income|population|total_bedrooms|total_rooms|            features|        prediction|
+----------+------------------+-------------+----------+--------------+-----------+--------------------+------------------+
|     126.0|              41.0|       8.3252|     322.0|         129.0|      880.0|[41.0,880.0,129.0...|428551.23479640554|
+----------+------------------+-------------+----------+--------------+-----------+--------------------+------------------+

