# House Price Prediction with PySpark

In [67]:
import os

from pyspark.ml.feature import Imputer

from pyspark.ml import Pipeline

from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType

from pyspark.mllib.regression import RidgeRegressionWithSGD

from pyspark.ml.feature import VectorAssembler

In [None]:
spark = SparkSession.builder.getOrCreate()

In [2]:
spark.catalog.listTables()

[]

In [3]:
# get train and test data set
data_loc = './data'

train_data_base = spark.read.csv(os.path.join(data_loc,'train.csv'), inferSchema=True, header=True)
test_data_base = spark.read.csv(os.path.join(data_loc,'test.csv'), inferSchema=True, header=True)

## some examples

In [16]:
train_data_base.describe().toPandas()

Unnamed: 0,summary,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,count,1460.0,1460.0,1460,1460.0,1460.0,1460,1460,1460,1460,...,1460.0,1460,1460,1460,1460.0,1460.0,1460.0,1460,1460,1460.0
1,mean,730.5,56.897260273972606,,70.04995836802665,10516.828082191782,,,,,...,2.758904109589041,,,,43.489041095890414,6.321917808219178,2007.8157534246573,,,180921.19589041092
2,stddev,421.6100093688479,42.30057099381045,,24.28475177448321,9981.26493237915,,,,,...,40.17730694453021,,,,496.1230244579441,2.7036262083595117,1.3280951205521143,,,79442.50288288663
3,min,1.0,20.0,C (all),100.0,1300.0,Grvl,Grvl,IR1,Bnk,...,0.0,Ex,GdPrv,Gar2,0.0,1.0,2006.0,COD,Abnorml,34900.0
4,max,1460.0,190.0,RM,,215245.0,Pave,Pave,Reg,Lvl,...,738.0,,,TenC,15500.0,12.0,2010.0,WD,Partial,755000.0


In [4]:
# add data to the catalog
train_data_base.createOrReplaceTempView("temp")

In [5]:
spark.catalog.listTables()

[Table(name='temp', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [7]:
# run a simple query
spark.sql("SELECT Id, LotFrontage, Alley FROM temp LIMIT 10").show()

+---+-----------+-----+
| Id|LotFrontage|Alley|
+---+-----------+-----+
|  1|         65|   NA|
|  2|         80|   NA|
|  3|         68|   NA|
|  4|         60|   NA|
|  5|         84|   NA|
|  6|         85|   NA|
|  7|         75|   NA|
|  8|         NA|   NA|
|  9|         51|   NA|
| 10|         50|   NA|
+---+-----------+-----+



In [20]:
expensive_houses = train_data_base.select("Id", "LotFrontage", "SalePrice", "YearBuilt", "YrSold")\
    .filter(expensive_houses.SalePrice > 500000)

In [21]:
expensive_houses.show(10)

+----+-----------+---------+---------+------+
|  Id|LotFrontage|SalePrice|YearBuilt|YrSold|
+----+-----------+---------+---------+------+
| 179|         63|   501837|     2008|  2009|
| 441|        105|   555000|     2008|  2009|
| 692|        104|   755000|     1994|  2007|
| 770|         47|   538000|     2003|  2010|
| 804|        107|   582933|     2008|  2009|
| 899|        100|   611657|     2009|  2010|
|1047|         85|   556581|     2005|  2006|
|1170|        118|   625000|     1995|  2006|
|1183|        160|   745000|     1996|  2007|
+----+-----------+---------+---------+------+



In [24]:
expensive_houses = expensive_houses.withColumn("AgeSold", expensive_houses.YrSold-expensive_houses.YearBuilt)
expensive_houses.show()

+----+-----------+---------+---------+------+-------+
|  Id|LotFrontage|SalePrice|YearBuilt|YrSold|AgeSold|
+----+-----------+---------+---------+------+-------+
| 179|         63|   501837|     2008|  2009|      1|
| 441|        105|   555000|     2008|  2009|      1|
| 692|        104|   755000|     1994|  2007|     13|
| 770|         47|   538000|     2003|  2010|      7|
| 804|        107|   582933|     2008|  2009|      1|
| 899|        100|   611657|     2009|  2010|      1|
|1047|         85|   556581|     2005|  2006|      1|
|1170|        118|   625000|     1995|  2006|     11|
|1183|        160|   745000|     1996|  2007|     11|
+----+-----------+---------+---------+------+-------+



## ML Model

In [23]:
# find all numerical columns
num_features = [col_name for col_name, dtype in train_data_base.dtypes if dtype == "int"]

# remove SalePrice, which is the target
num_features.remove("SalePrice")

In [27]:
# split validation data
training, validation = train_data_base.randomSplit([.7, .3])

In [51]:
train = training.select(*num_features, "SalePrice")
val = validation.select(*num_features, "SalePrice")
test = test_data_base.select(*num_features)

In [78]:
for feat in num_features:
    train = train.withColumn(feat, train[feat].cast(DoubleType()))
    val = val.withColumn(feat, val[feat].cast(DoubleType()))
    test = test.withColumn(feat, test[feat].cast(DoubleType()))

In [77]:
train.printSchema()

root
 |-- Id: double (nullable = true)
 |-- MSSubClass: double (nullable = true)
 |-- LotArea: double (nullable = true)
 |-- OverallQual: double (nullable = true)
 |-- OverallCond: double (nullable = true)
 |-- YearBuilt: double (nullable = true)
 |-- YearRemodAdd: double (nullable = true)
 |-- BsmtFinSF1: double (nullable = true)
 |-- BsmtFinSF2: double (nullable = true)
 |-- BsmtUnfSF: double (nullable = true)
 |-- TotalBsmtSF: double (nullable = true)
 |-- 1stFlrSF: double (nullable = true)
 |-- 2ndFlrSF: double (nullable = true)
 |-- LowQualFinSF: double (nullable = true)
 |-- GrLivArea: double (nullable = true)
 |-- BsmtFullBath: double (nullable = true)
 |-- BsmtHalfBath: double (nullable = true)
 |-- FullBath: double (nullable = true)
 |-- HalfBath: double (nullable = true)
 |-- BedroomAbvGr: double (nullable = true)
 |-- KitchenAbvGr: double (nullable = true)
 |-- TotRmsAbvGrd: double (nullable = true)
 |-- Fireplaces: double (nullable = true)
 |-- GarageCars: double (nullable 

In [79]:
num_features_imp = [feat+"_imp" for feat in num_features]

In [80]:
imputer = Imputer(inputCols=num_features, outputCols=num_features_imp)

In [81]:
vec_assembler = VectorAssembler(inputCols=num_features_imp, outputCol="features")

In [82]:
pipe = Pipeline(stages=[imputer, vec_assembler])

In [83]:
train_transf = pipe.fit(train).transform(train)

pyspark.sql.dataframe.DataFrame

In [None]:
regression = RidgeRegressionWithSGD()