In [1]:
# Entry point for the DataFrame API
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new one
spark = SparkSession.builder.getOrCreate()

# Load the 2017 St. Paul, MN real-estate listings: the first row supplies the
# column names and column types are inferred from the data
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("2017_StPaul_MN_Real_Estate.csv")
)

In [2]:
from pyspark.sql import functions as F

# Convert the 'offmarketdate' strings to timestamps (string -> epoch seconds
# -> timestamp), replacing the column in place.
# NOTE(review): the values displayed further down come out with year 0017, so
# the file presumably stores two-digit years and "yy" may be the intended
# pattern -- confirm. The split date used later is written with that same year,
# so both must be changed together.
df = df.withColumn(
    "offmarketdate",
    F.unix_timestamp(F.col("offmarketdate"), "MM/dd/yyyy H:mm").cast("timestamp"),
)

In [18]:
# Convert the 'LISTDATE' strings to timestamps (string -> epoch seconds ->
# timestamp), replacing the column in place.
# NOTE(review): parsed values display with year 0017; the pattern probably
# needs a two-digit year ("yy") -- confirm against the raw file before changing,
# since the later split date relies on the current output.
df = df.withColumn(
    "LISTDATE",
    F.unix_timestamp(F.col("LISTDATE"), "MM/dd/yyyy H:mm").cast("timestamp"),
)

In [19]:
# Eyeball the parsed off-market timestamps (first 20 rows)
df.select('offmarketdate').show()

+-------------------+
|      offmarketdate|
+-------------------+
|0017-07-30 00:00:00|
|0017-10-13 00:00:00|
|0017-07-24 00:00:00|
|0017-09-13 00:00:00|
|0017-10-03 00:00:00|
|0017-04-27 00:00:00|
|0017-07-10 00:00:00|
|0017-11-10 00:00:00|
|0017-11-11 00:00:00|
|0017-11-20 00:00:00|
|0017-08-07 00:00:00|
|0017-07-29 00:00:00|
|0017-03-24 00:00:00|
|0017-06-11 00:00:00|
|0017-06-05 00:00:00|
|0017-10-17 00:00:00|
|0017-09-22 00:00:00|
|0017-10-16 00:00:00|
|0017-06-06 00:00:00|
|0017-08-27 00:00:00|
+-------------------+
only showing top 20 rows



In [5]:
# Eyeball the parsed listing timestamps (first 20 rows)
df.select('LISTDATE').show()

+-------------------+
|           LISTDATE|
+-------------------+
|0017-07-15 00:00:00|
|0017-10-09 00:00:00|
|0017-06-26 00:00:00|
|0017-08-25 00:00:00|
|0017-09-12 00:00:00|
|0017-04-10 00:00:00|
|0017-06-08 00:00:00|
|0017-11-05 00:00:00|
|0017-10-12 00:00:00|
|0017-09-02 00:00:00|
|0017-05-19 00:00:00|
|0017-05-11 00:00:00|
|0017-03-12 00:00:00|
|0017-03-06 00:00:00|
|0017-05-18 00:00:00|
|0017-10-15 00:00:00|
|0017-09-10 00:00:00|
|0017-09-01 00:00:00|
|0017-05-06 00:00:00|
|0017-08-11 00:00:00|
+-------------------+
only showing top 20 rows



In [6]:
# Keep only the label, the two date columns and the candidate feature columns
df = df.select([
    'SalesClosePrice',
    'LISTDATE',
    'LISTPRICE',
    'OriginalListPrice',
    'SchoolDistrictNumber',
    'offmarketdate',
    'LivingArea',
    'FOUNDATIONSIZE',
    'DAYSONMARKET',
    'Fireplaces',
    'PDOM',
    'SQFTABOVEGROUND',
    'YEARBUILT',
    'ACRES',
    'BathsFull',
    'BathsHalf',
    'BATHQUARTER',
    'BATHSTHREEQUARTER',
    'BATHSTOTAL',
    'SQFTBELOWGROUND',
    'AssociationFee',
    'AssessedValuation',
])

In [7]:
# Print column names and inferred types of the reduced DataFrame
df.printSchema()

root
 |-- SalesClosePrice: integer (nullable = true)
 |-- LISTDATE: timestamp (nullable = true)
 |-- LISTPRICE: integer (nullable = true)
 |-- OriginalListPrice: integer (nullable = true)
 |-- SchoolDistrictNumber: string (nullable = true)
 |-- offmarketdate: timestamp (nullable = true)
 |-- LivingArea: integer (nullable = true)
 |-- FOUNDATIONSIZE: integer (nullable = true)
 |-- DAYSONMARKET: integer (nullable = true)
 |-- Fireplaces: integer (nullable = true)
 |-- PDOM: integer (nullable = true)
 |-- SQFTABOVEGROUND: integer (nullable = true)
 |-- YEARBUILT: integer (nullable = true)
 |-- ACRES: double (nullable = true)
 |-- BathsFull: integer (nullable = true)
 |-- BathsHalf: integer (nullable = true)
 |-- BATHQUARTER: integer (nullable = true)
 |-- BATHSTHREEQUARTER: integer (nullable = true)
 |-- BATHSTOTAL: integer (nullable = true)
 |-- SQFTBELOWGROUND: integer (nullable = true)
 |-- AssociationFee: integer (nullable = true)
 |-- AssessedValuation: double (nullable = true)



In [8]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Map each distinct school-district string to a numeric index
string_indexer = StringIndexer(inputCol='SchoolDistrictNumber', outputCol='School_Index')
indexed_df = string_indexer.fit(df).transform(df)

# One-hot encode the indexed values into a vector column
encoder = OneHotEncoder(inputCol='School_Index', outputCol='School_Vec')
# OneHotEncoder is a plain Transformer on Spark 2.x but an Estimator on
# Spark 3.x, where it has no transform() and must be fitted first. Fit it when
# a fit() method exists so the cell runs on both.
encoder_model = encoder.fit(indexed_df) if hasattr(encoder, 'fit') else encoder
encoded_df = encoder_model.transform(indexed_df)

# Inspect the transformation steps side by side.
# The upper-case column name relies on Spark's default case-insensitive
# column resolution.
encoded_df[['SCHOOLDISTRICTNUMBER', 'School_Index', 'School_Vec']].show(truncate=100)

+-----------------------------+------------+-------------+
|         SCHOOLDISTRICTNUMBER|School_Index|   School_Vec|
+-----------------------------+------------+-------------+
|             834 - Stillwater|         3.0|(7,[3],[1.0])|
|             834 - Stillwater|         3.0|(7,[3],[1.0])|
|622 - North St Paul-Maplewood|         1.0|(7,[1],[1.0])|
|             834 - Stillwater|         3.0|(7,[3],[1.0])|
|622 - North St Paul-Maplewood|         1.0|(7,[1],[1.0])|
|             834 - Stillwater|         3.0|(7,[3],[1.0])|
|             834 - Stillwater|         3.0|(7,[3],[1.0])|
|             834 - Stillwater|         3.0|(7,[3],[1.0])|
|             834 - Stillwater|         3.0|(7,[3],[1.0])|
|             834 - Stillwater|         3.0|(7,[3],[1.0])|
|             834 - Stillwater|         3.0|(7,[3],[1.0])|
|             834 - Stillwater|         3.0|(7,[3],[1.0])|
|             834 - Stillwater|         3.0|(7,[3],[1.0])|
|             834 - Stillwater|         3.0|(7,[3],[1.0]

In [9]:
from pyspark.ml.feature import VectorAssembler

# Substitute -1 for missing values before assembling the feature vector
encoded_df = encoded_df.na.fill(-1)

# Columns that are packed into the model's feature vector
features_cols = [
    'LISTPRICE',
    'OriginalListPrice',
    'School_Vec',
    'FOUNDATIONSIZE',
    'DAYSONMARKET',
    'Fireplaces',
    'PDOM',
    'SQFTABOVEGROUND',
    'LivingArea',
    'YEARBUILT',
    'ACRES',
    'BathsFull',
    'BathsHalf',
    'BATHQUARTER',
    'BATHSTHREEQUARTER',
    'BATHSTOTAL',
    'SQFTBELOWGROUND',
    'AssociationFee',
    'AssessedValuation',
]

# Build the assembler and append the combined 'features' column
vec = VectorAssembler(inputCols=features_cols, outputCol='features')
encoded_df = vec.transform(encoded_df)

# Keep the label, the feature vector and the two dates needed for the split
ml_ready_df = encoded_df.select('SalesClosePrice', 'features', 'offmarketdate', 'LISTDATE')

# Inspect the first few rows
ml_ready_df.show(5)

+---------------+--------------------+-------------------+-------------------+
|SalesClosePrice|            features|      offmarketdate|           LISTDATE|
+---------------+--------------------+-------------------+-------------------+
|         143000|(25,[0,1,5,9,10,1...|0017-07-30 00:00:00|0017-07-15 00:00:00|
|         190000|(25,[0,1,5,9,10,1...|0017-10-13 00:00:00|0017-10-09 00:00:00|
|         225000|(25,[0,1,3,9,10,1...|0017-07-24 00:00:00|0017-06-26 00:00:00|
|         265000|(25,[0,1,5,9,10,1...|0017-09-13 00:00:00|0017-08-25 00:00:00|
|         249900|(25,[0,1,3,9,10,1...|0017-10-03 00:00:00|0017-09-12 00:00:00|
+---------------+--------------------+-------------------+-------------------+
only showing top 5 rows



In [20]:
from pyspark.sql.functions import datediff, to_date, lit

# Cut-off for the time-based split. The year is written as 0017 because that
# is how the parsed timestamps in this notebook display.
split_date = to_date(lit('0017-09-01'))

off_market = ml_ready_df['offmarketdate']
listed = ml_ready_df['LISTDATE']

# Training set: homes that went off the market before the cut-off
train_df = ml_ready_df.filter(off_market < split_date)

# Test set: homes listed on or before the cut-off that went off the market on
# or after it
test_df = ml_ready_df.filter((off_market >= split_date) & (listed <= split_date))

In [26]:
from pyspark.ml.regression import GBTRegressor

# Gradient Boosted Trees regressor predicting the closing price from the
# assembled feature vector; the seed is fixed for repeatable runs
gbt = GBTRegressor(
    featuresCol='features',
    labelCol='SalesClosePrice',
    predictionCol="Prediction_Price",
    seed=42,
)

# Fit on the training split
gbt_model = gbt.fit(train_df)

# Score the test split; adds the 'Prediction_Price' column
gbt_predictions = gbt_model.transform(test_df)

In [27]:
# Preview the GBT predictions next to the actual closing prices
gbt_predictions.show()

+---------------+--------------------+-------------------+-------------------+------------------+
|SalesClosePrice|            features|      offmarketdate|           LISTDATE|  Prediction_Price|
+---------------+--------------------+-------------------+-------------------+------------------+
|         265000|(25,[0,1,5,9,10,1...|0017-09-13 00:00:00|0017-08-25 00:00:00| 233188.2078303583|
|         274000|(25,[0,1,5,9,10,1...|0017-10-16 00:00:00|0017-09-01 00:00:00|285099.13697210734|
|         289900|[299900.0,299900....|0017-09-30 00:00:00|0017-06-26 00:00:00| 302697.0272815767|
|         375000|[383000.0,400000....|0017-09-18 00:00:00|0017-07-09 00:00:00| 366223.4141945253|
|         394900|(25,[0,1,5,9,10,1...|0017-09-18 00:00:00|0017-07-14 00:00:00| 409191.4678542085|
|         394900|[394900.0,424900....|0017-11-09 00:00:00|0017-06-16 00:00:00| 395128.1138736871|
|         404005|[409990.0,436255....|0017-11-02 00:00:00|0017-06-12 00:00:00|424292.19977619435|
|         405000|(25

In [28]:
from pyspark.ml.regression import RandomForestRegressor

# Random forest regressor configured with the same feature, label and
# prediction columns (and seed) as the GBT model
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="SalesClosePrice",
    predictionCol="Prediction_Price",
    seed=42,
)

# Fit on the training split, then score the test split
rf_model = rf.fit(train_df)
rf_predictions = rf_model.transform(test_df)

In [30]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator comparing the actual closing price with the predicted price
evaluator = RegressionEvaluator(
    labelCol='SalesClosePrice',
    predictionCol='Prediction_Price',
)

# Prediction DataFrames to score, keyed by a display name
models = {
    'Gradient Boosted Trees': gbt_predictions,
    'Random Forest Regression': rf_predictions,
}

# For each model report RMSE first, then R^2, on the test predictions
for model_name, predictions in models.items():
    for metric, label in (('rmse', 'RMSE'), ('r2', 'R^2')):
        score = evaluator.evaluate(predictions, {evaluator.metricName: metric})
        print(f"{model_name} {label}: {score}")

Gradient Boosted Trees RMSE: 41242.35692384228
Gradient Boosted Trees R^2: 0.9248955220962971
Random Forest Regression RMSE: 39255.218653369346
Random Forest Regression R^2: 0.9319585312092161
