# AirBnB Price Predictions

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
spark = (SparkSession
        .builder
        .appName("sf-airbnb")
        .getOrCreate())

In [4]:
filePath = "C:/Users/sean.cornillie/Education/LearningSparkV2/Spark_Dev/datasets/sf-airbnb-clean.parquet/"

In [6]:
airbnbDF = spark.read.parquet(filePath)

airbnbDF.select("neighbourhood_cleansed", "room_type", "bedrooms", "bathrooms", "number_of_reviews", "price").show(5)

+----------------------+---------------+--------+---------+-----------------+-----+
|neighbourhood_cleansed|      room_type|bedrooms|bathrooms|number_of_reviews|price|
+----------------------+---------------+--------+---------+-----------------+-----+
|      Western Addition|Entire home/apt|     1.0|      1.0|            180.0|170.0|
|        Bernal Heights|Entire home/apt|     2.0|      1.0|            111.0|235.0|
|        Haight Ashbury|   Private room|     1.0|      4.0|             17.0| 65.0|
|        Haight Ashbury|   Private room|     1.0|      4.0|              8.0| 65.0|
|      Western Addition|Entire home/apt|     2.0|      1.5|             27.0|785.0|
+----------------------+---------------+--------+---------+-----------------+-----+
only showing top 5 rows



In [7]:
### We'll keep 80% of our data for the training set, and 20% for our test set.
### Set random seed for reproducibility (if re-run with same random seed we should get same results)

trainDF, testDF = airbnbDF.randomSplit([.8, .2], seed=42)

print(f"""There are {trainDF.count()} rows in the training set, and {testDF.count()} in the test set""")

There are 5780 rows in the training set, and 1366 in the test set


In [17]:
from pyspark.ml.feature import VectorAssembler

#### Prepping features with Transformers
Here we'll prep the data to build a linear regression model predicting price given a # of bedrooms. V simple example, classic y=mx+b

In [10]:
### Using VectorAssembler, we are prepping/transforming our data into a format that our linear regression model expects

vecAssembler = VectorAssembler(inputCols=["bedrooms"], outputCol="features")
vecTrainDF = vecAssembler.transform(trainDF)
vecTrainDF.select("bedrooms", "features", "price").show(10)

+--------+--------+-----+
|bedrooms|features|price|
+--------+--------+-----+
|     1.0|   [1.0]|200.0|
|     1.0|   [1.0]|130.0|
|     1.0|   [1.0]| 95.0|
|     1.0|   [1.0]|250.0|
|     3.0|   [3.0]|250.0|
|     1.0|   [1.0]|115.0|
|     1.0|   [1.0]|105.0|
|     1.0|   [1.0]| 86.0|
|     1.0|   [1.0]|100.0|
|     2.0|   [2.0]|220.0|
+--------+--------+-----+
only showing top 10 rows



#### Using Estimators to build models
Here we actually build our model to estimated price per bedroom.

In [18]:
from pyspark.ml.regression import LinearRegression

In [19]:
lr = LinearRegression(featuresCol="features", labelCol="price")
lrModel = lr.fit(vecTrainDF)

In [20]:
### Output our coefficients and organize locially below

m = lrModel.coefficients[0]
b = lrModel.intercept

print(f"""The formula for the linear regression line is price = {m}*bedrooms + {b}""")

The formula for the linear regression line is price = 123.6757463819947*bedrooms + 47.51023373378815


#### Creating a Pipeline

In [21]:
from pyspark.ml import Pipeline

In [22]:
### Prep a pipeline that we can simply just apply to our test data
pipeline = Pipeline(stages=[vecAssembler, lr])
pipelineModel = pipeline.fit(trainDF)

In [24]:
### Apply pipeline to our test data
### Very simple model that doesn't provide great predictions against results

predDF = pipelineModel.transform(testDF)
predDF.select("bedrooms", "features", "price", "prediction").show(10)

+--------+--------+------+------------------+
|bedrooms|features| price|        prediction|
+--------+--------+------+------------------+
|     1.0|   [1.0]|  85.0|171.18598011578285|
|     1.0|   [1.0]|  45.0|171.18598011578285|
|     1.0|   [1.0]|  70.0|171.18598011578285|
|     1.0|   [1.0]| 128.0|171.18598011578285|
|     1.0|   [1.0]| 159.0|171.18598011578285|
|     2.0|   [2.0]| 250.0|294.86172649777757|
|     1.0|   [1.0]|  99.0|171.18598011578285|
|     1.0|   [1.0]|  95.0|171.18598011578285|
|     1.0|   [1.0]| 100.0|171.18598011578285|
|     1.0|   [1.0]|2010.0|171.18598011578285|
+--------+--------+------+------------------+
only showing top 10 rows



#### One-hot encoding
Let's build a slightly more complex pipeline that incorporates all of the dataset's numeric & categorical features. <br/>
Most machine learning models in MLib expect numerical values as input, represented as vectors. <br/>
So, we'll use one-hot encoding (OHE) to convert categoricals into numerics. <br/>

In [25]:
# Suppose we have a column called Animal and have three types
### We can't pass string types into our ML model so need at assign numeric mapping, such as:
#### Animal = {"Dog", "Cat", "Fish"}
#### "Dog" = 1, "Cat" = 2, "Fish" = 3

# However this approach could introduce some relationships that don't exist. E.g. is cat worth 2 dogs, etc?
### So, we'll use OHE to vectorize existance of dog/cat/fish in our animal column.
#### "Dog" =  [ 1, 0, 0]
#### "Cat" =  [ 0, 1, 0]
#### "Fish" = [ 0, 0, 1]

In [26]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

In [27]:
# Define categorical columns
categoricalCols = [field for (field, dataType) in trainDF.dtypes if dataType == "string"]


indexOutputCols = [x + "Index" for x in categoricalCols]
oheOutputCols = [x + "OHE" for x in categoricalCols]

# Create indexes of all strings in each categorical column
stringIndexer = StringIndexer(inputCols=categoricalCols,
                              outputCols=indexOutputCols,
                              handleInvalid="skip")

# Encodes those indexes into column of binary vectors
oheEncoder = OneHotEncoder(inputCols=indexOutputCols,
                           outputCols=oheOutputCols)

#Define numeric columns 
numericCols = [field for (field, dataType) in trainDF.dtypes if ((dataType == "double") & (field != "price"))]

#Re-assemble converted OHE & Numeric columns
assemblerInputs = oheOutputCols + numericCols

vecAssembler = VectorAssembler(inputCols=assemblerInputs,
                               outputCol="features")

In [32]:
# Another (less complicated) way to do this is to use RFormula.
# RFormula automatically StringIndex & OHE all of your string columns, convert numerics to double type, and combine into single vector using VectorAssembler under the hood.
# All we need to do is provide it with a formula = "y ~ bedrooms + bathrooms"
### In the case below "price ~ .", we're predicting price based on all (.) features.
# Thus, we could replace all of the above code with the snippet below

from pyspark.ml.feature import RFormula

In [33]:
rFormula = RFormula(formula = "price ~ .",
                    featuresCol="features",
                    labelCol="price",
                    handleInvalid="skip")

In [34]:
lr = LinearRegression(labelCol="price", featuresCol="features")
pipeline = Pipeline(stages = [stringIndexer, oheEncoder, vecAssembler, lr])
### Alt pipeline: use RFormula ###
#pipeline = Pipeline(stages = [rFormula, lr])

pipelineModel = pipeline.fit(trainDF)
predDF = pipelineModel.transform(testDF)
predDF.select("features", "price", "prediction").show(5)

+--------------------+-----+------------------+
|            features|price|        prediction|
+--------------------+-----+------------------+
|(98,[0,3,6,22,43,...| 85.0| 55.24365707389188|
|(98,[0,3,6,22,43,...| 45.0|23.357685914717877|
|(98,[0,3,6,22,43,...| 70.0|28.474464479034395|
|(98,[0,3,6,12,42,...|128.0| -91.6079079594947|
|(98,[0,3,6,12,43,...|159.0| 95.05688229945372|
+--------------------+-----+------------------+
only showing top 5 rows



### Evaluating Models

#### RMSE

In [35]:
from pyspark.ml.evaluation import RegressionEvaluator

In [37]:
regressionEvaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="price",
    metricName="rmse")

rmse = regressionEvaluator.evaluate(predDF)
print(f"RMSE is {rmse:.1f}")

RMSE is 220.6


In [38]:
print(rmse)

220.56321700343753


#### R-Squared

In [39]:
r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
print(r2)

0.16043316698848087


### Lab/HW: Re-train & fit model with Log prices
This R2 very bad. Chiefly b/c ML models will fare better with a log-normal distribution, and we did not actually convert to begin with.

In [45]:
filePath = "C:/Users/sean.cornillie/Education/LearningSparkV2/Spark_Dev/datasets/sf-airbnb-clean.parquet/"
airbnbDF = spark.read.parquet(filePath)
trainDF, testDF = airbnbDF.randomSplit([.8, .2], seed=42)

print(f"""There are {trainDF.count()} rows in the training set, and {testDF.count()} in the test set""")

There are 5780 rows in the training set, and 1366 in the test set


In [53]:
logTrainDF = trainDF.withColumn("log_price", log(col("price")))
logTestDF = trainDF.withColumn("log_price", log(col("price")))

In [54]:
rFormula = RFormula(formula = "log_price ~ . - price", ## log_price based on all features except price
                    featuresCol="features",
                    labelCol="log_price",
                    handleInvalid="skip")

In [58]:
lr = LinearRegression(labelCol="log_price", predictionCol="log_pred")
pipeline = Pipeline(stages = [rFormula, lr])

pipelineModel = pipeline.fit(logTrainDF)
predDF = pipelineModel.transform(logTestDF)

In [59]:
expDF = predDF.withColumn("prediction", exp(col("log_pred")))

In [60]:
regressionEvaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction")
rmse = regressionEvaluator.setMetricName("rmse").evaluate(expDF)
r2 = regressionEvaluator.setMetricName("r2").evaluate(expDF)

print(f"RMSE is {rmse}")
print(f"R2 is {r2}")

RMSE is 288.05105439893686
R2 is 0.2184102251246367


#### Now let's save our pipeline model for future re-use

In [61]:
pipelinePath = "/tmp/lr-pipeline-model"
pipelineModel.write().overwrite().save(pipelinePath)

In [62]:
spark.stop()