# House Price Prediction with PySpark

Here, the house price prediction task is solved using the big data framework Apache Spark instead of scikit-learn. The house price data set is rather small, so using Spark for this task is not efficient at all. However, this model scales much better to bigger data sets than a model using scikit-learn.

In [1]:
import numpy as np

import os

from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import log, exp
from pyspark.ml.feature import Imputer, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

import pyspark.ml.tuning as tune

In [2]:
# obtain a Spark session through the builder (getOrCreate reuses an existing one)
spark = (
    SparkSession.builder
    .getOrCreate()
)

## Dataset

In [3]:
# load the train and test CSV files from the data directory
data_loc = './data'

# both files share the same reader settings: first row is the header, column
# types are inferred, and the literal string 'NA' is read as a null value
csv_options = dict(inferSchema=True, header=True, nullValue='NA')

train_data_base = spark.read.csv(os.path.join(data_loc, 'train.csv'), **csv_options)
test_data_base = spark.read.csv(os.path.join(data_loc, 'test.csv'), **csv_options)

## Preprocessing

In [4]:
# every column whose inferred type is "int" is treated as a numerical feature
num_features = [
    name
    for name, spark_type in train_data_base.dtypes
    if spark_type == "int"
]

# drop the target (SalePrice) and the Id column from the feature list
for non_feature in ("SalePrice", "Id"):
    num_features.remove(non_feature)

In [5]:
# split off a validation set (weights 0.7 train / 0.3 validation); the fixed
# seed keeps the split, and therefore the scores computed from it, repeatable
# between runs instead of changing on every execution
train, val = train_data_base.randomSplit([.7, .3], seed=42)

# copy the test set
test = test_data_base.select("*")

In [6]:
def _cast_to_double(df, columns):
    """Return `df` with the given columns cast to double, all others untouched.

    All casts are applied in one select() rather than one withColumn() call per
    column: Spark's documentation advises against calling withColumn() in a
    loop because every call adds another projection to the query plan.
    Column names and column order are preserved.
    """
    to_cast = set(columns)
    return df.select([
        df[name].cast(DoubleType()).alias(name) if name in to_cast else df[name]
        for name in df.columns
    ])


# cast all numerical features to double (necessary for imputation)
train = _cast_to_double(train, num_features)
val = _cast_to_double(val, num_features)
test = _cast_to_double(test, num_features)

In [7]:
# add the log of SalePrice as "SalePriceLog" to both labelled splits
train, val = (
    split.withColumn("SalePriceLog", log("SalePrice"))
    for split in (train, val)
)

## ML Pipeline

In [8]:
# names of the features after imputation: original name plus an "_imp" suffix
num_features_imp = [f"{name}_imp" for name in num_features]

In [9]:
# set up the ML pipeline: impute -> assemble -> scale -> regress

# fill missing values of each numerical feature into its "_imp" column
imputer = Imputer(
    inputCols=num_features,
    outputCols=num_features_imp,
)

# gather all imputed columns into a single "features" vector column
vec_assembler = VectorAssembler(
    inputCols=num_features_imp,
    outputCol="features",
)

# scale the feature vector, using both the mean and the standard deviation
scaler = StandardScaler(
    inputCol="features",
    outputCol="features_scaled",
    withMean=True,
    withStd=True,
)

# ridge regression on the scaled features with the log sale price as label
regression = LinearRegression(
    featuresCol="features_scaled",
    labelCol="SalePriceLog",
    elasticNetParam=0,
)

# pipeline combining all the steps in the order they are applied
pipe = Pipeline(
    stages=[
        imputer,
        vec_assembler,
        scaler,
        regression,
    ],
)

In [10]:
# create a parameter grid for hyperparameter tuning: every combination of
# regularisation strength and imputation strategy is tried
grid = (
    tune.ParamGridBuilder()
    .addGrid(regression.regParam, [0.001, 0.01, 0.1, 1, 10, 50, 100, 500, 1000])
    .addGrid(imputer.strategy, ["mean", "median"])
    .build()
)

In [11]:
# define the evaluation criterion: RMSE between "prediction" and "SalePriceLog"
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="SalePriceLog", metricName='rmse')

# create the CrossValidator over the whole pipeline and the parameter grid;
# the fixed seed makes the fold assignment repeatable, so for a given training
# split the selected hyperparameters do not vary from run to run
cv = tune.CrossValidator(estimator=pipe, estimatorParamMaps=grid, evaluator=evaluator, seed=42)

# fit cross validation models
cv_models = cv.fit(train)

# extract the best model
bestPipeline = cv_models.bestModel

## Evaluate the Model

In [12]:
# predict the validation set with the best pipeline and compute the evaluator's
# metric (RMSE on SalePriceLog); the bare expression on the last line is what
# the notebook shows as the cell output
val_prediction = bestPipeline.transform(val)
evaluator.evaluate(val_prediction)

0.19154988085065985

In [13]:
# predict the test set; the model was trained on SalePriceLog, so the
# "prediction" column is on the log scale
test_prediction = bestPipeline.transform(test)

In [14]:
# build the submission: Id plus SalePrice, where SalePrice is exp() of the
# log-scale prediction
submission = test_prediction.select("Id", exp("prediction").alias("SalePrice"))

# write the result as CSV with a header row below the data directory
submission_path = os.path.join(data_loc, "submission_spark")
submission.write.csv(submission_path, header=True)

The submission scored 0.14608 on the Kaggle public leaderboard.