In [1]:
import os

# Single source of truth for the Spark master URL. Previously the submit args
# requested local[2] while the builder requested local[*]; a master set
# explicitly on the builder is expected to take precedence over the submit
# flag, so local[*] is kept as the effective value and used in both places.
SPARK_MASTER = "local[*]"

# Must be in the environment before the Spark JVM is launched by getOrCreate().
os.environ["PYSPARK_SUBMIT_ARGS"] = f"--master {SPARK_MASTER} pyspark-shell"

import findspark
findspark.init()  # make the pyspark package importable before the imports below

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

# Reuse an already-running session if there is one; otherwise start a local one.
spark = (
    SparkSession.builder
    .master(SPARK_MASTER)
    .getOrCreate()
)

# Baseline Model

Let's compute the average `price` on the training dataset, use it as the constant prediction for every row of the test dataset, and then evaluate the result with RMSE.

In [2]:
from pyspark.sql.functions import avg, lit

# Read the Parquet file and split it 80/20 into train/test with a fixed seed.
filePath = "data/sf-airbnb-clean.parquet"
airbnbDF = spark.read.parquet(filePath)
trainDF, testDF = airbnbDF.randomSplit(weights=[.8, .2], seed=42)

# Baseline "model": the mean training price, pulled back to the driver as a
# Python float and attached to every test row as a literal column.
avgPriceRow = trainDF.select(avg("price")).first()
avgPrice = float(avgPriceRow[0])
predDF = testDF.withColumn("avgPrediction", lit(avgPrice))

In [3]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the constant-average predictions against the true price using RMSE.
regressionMeanEvaluator = RegressionEvaluator(
    labelCol="price",
    predictionCol="avgPrediction",
    metricName="rmse",
)

# evaluate() runs on predDF, which carries both the label and prediction columns.
print(f"The RMSE for predicting the average price is: {regressionMeanEvaluator.evaluate(predDF):.2f}")


The RMSE for predicting the average price is: 240.71
