<a href="https://colab.research.google.com/github/stephenfrein/csc8491/blob/main/HomePricePredictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# load the home sales data set from its hosted CSV file
homes_raw <- read.csv(file = "https://csc8491.s3.amazonaws.com/home_data.csv")
# show the first few rows as a quick sanity check
head(homes_raw)

In [None]:
# inspect the structure of the raw data: each column with its type and leading values
utils::str(homes_raw)

In [None]:
# per-column summary statistics - use these to spot odd ranges, outliers, and NA counts
summary(object = homes_raw)

In [None]:
# drop every row that has at least one missing value
# (many model-building algorithms cannot cope with NAs)
homes_clean <- na.omit(object = homes_raw)
# row counts before and after, to see how many rows were dropped
nrow(homes_raw)
nrow(homes_clean)
# re-check the column summaries on the cleaned data
summary(object = homes_clean)

In [None]:
# split into training and test data sets
# training rows are used to build the model - test rows are used to try it on unseen data
# fix the random seed so the split (and every number computed from it) is reproducible
set.seed(8491)
# fraction of the cleaned rows that goes into the training set
train_proportion <- 0.70
# number of training rows - rounded down explicitly because sample() needs a whole number
n_train <- floor(train_proportion * nrow(homes_clean))
# randomly draw the training row indexes (without replacement)
# seq_len() stays empty for a zero-row data frame, where 1:nrow() would yield c(1, 0)
homes_train_rows <- sample(seq_len(nrow(homes_clean)), n_train)
# preview the sampled indexes instead of printing every one of them
head(homes_train_rows)
# create training data frame
homes_train <- homes_clean[homes_train_rows, ]
sprintf("Training data set has %s rows", nrow(homes_train))
# create test data frame from every row that is NOT in the training set
# setdiff() is used instead of negative indexing because homes_clean[-integer(0), ]
# would return zero rows rather than all rows if the training sample were empty
homes_test_rows <- setdiff(seq_len(nrow(homes_clean)), homes_train_rows)
homes_test <- homes_clean[homes_test_rows, ]
sprintf("Test data set has %s rows", nrow(homes_test))

In [None]:
# fit a linear model of price on every other column of the training data
lm.allpredictors <- lm(formula = price ~ ., data = homes_train)
# coefficient table and fit statistics
summary(lm.allpredictors)
# predict prices for the held-out test rows
all_preds <- predict(lm.allpredictors, newdata = homes_test)
# MAE (mean absolute error): average size of the gap between actual and predicted price
all_preds_avg_error <- mean(abs(homes_test$price - all_preds))
sprintf("MAE (mean absolute error) is %s", all_preds_avg_error)


In [None]:
# single-predictor model (living-area square feet only) - simple enough to draw in 2D
lm.sqft <- lm(formula = price ~ sqft_living, data = homes_train)
summary(lm.sqft)
# scatter plot of the training data with the fitted regression line on top
plot(homes_train$sqft_living, homes_train$price)
abline(reg = lm.sqft)

In [None]:
# Cook's Distance quantifies how much a single observation influences the fitted model
# compute it for every observation used by the all-predictors model
cooksd <- cooks.distance(lm.allpredictors)
# plot the distances, one point per observation
#   pch = plot character used for each point
#   cex = character expansion ratio (point size multiplier)
plot(cooksd, main = "Influential Observations - Cook's Distance", pch = "*", cex = 2)

In [None]:
# how many of the largest Cook's Distance values to treat as influential
top_n <- 20
# sort once (largest first) and keep the top N values
top_cooksd <- head(sort(cooksd, decreasing = TRUE), n = top_n)
top_cooksd
# the smallest of the top N values becomes the cutoff
cutoff_threshold <- min(top_cooksd)
cutoff_threshold

In [None]:
# redraw the Cook's Distance plot, this time with the cutoff marked
#   pch = plot character used for each point
#   cex = character expansion ratio (point size multiplier)
plot(cooksd, main = "Influential Observations - Cook's Distance", pch = "*", cex = 2)
# horizontal red line at the cutoff - points on or above it are the top values
abline(h = cutoff_threshold, lwd = 3, col = "red")

In [None]:
# row names of the influential observations (Cook's Distance at or above the cutoff)
# keep them as character strings: rownames() are character too, and a round trip
# through as.numeric() is fragile because %in% converts the numbers back to text
# (e.g. 100000 becomes "1e+05") and non-numeric row names would become NA
influential <- names(cooksd)[cooksd >= cutoff_threshold]
length(influential)
# remove those rows from the training data
homes_rm_outliers <- homes_train[!(rownames(homes_train) %in% influential), ]
# build new square-feet model on the reduced training data
lm.sqft_outrm <- lm(price ~ sqft_living, data = homes_rm_outliers)
summary(lm.sqft_outrm)
# let's see the difference with outliers removed
plot(homes_rm_outliers$sqft_living, homes_rm_outliers$price)
abline(lm.sqft, col = "red") # old model (fit with the influential rows)
abline(lm.sqft_outrm, col = "blue") # new model (fit without them)

In [None]:
# compare the two square-feet models on the same held-out test data
# first: the model that was fit with the influential rows still included
sqft_preds <- predict(lm.sqft, newdata = homes_test)
sqft_preds_avg_error <- mean(abs(homes_test$price - sqft_preds))
sprintf("Average error with outliers: %s", sqft_preds_avg_error)
# second: the model that was fit after removing the influential rows
outrm_sqft_preds <- predict(lm.sqft_outrm, newdata = homes_test)
outrm_sqft_preds_avg_error <- mean(abs(homes_test$price - outrm_sqft_preds))
sprintf("Average error with outliers removed: %s", outrm_sqft_preds_avg_error)

In [None]:
# refit the all-predictors model on the training data without the influential rows
lm.all_pred_outrm <- lm(formula = price ~ ., data = homes_rm_outliers)
# score it on the test set with the same mean-absolute-error measure
outrm_all_preds <- predict(lm.all_pred_outrm, newdata = homes_test)
outrm_all_preds_avg_error <- mean(abs(homes_test$price - outrm_all_preds))
# report it next to the error of the earlier all-predictors model
sprintf("Average error all predictors with outliers removed: %s", outrm_all_preds_avg_error)
sprintf("Average error all predictors with outliers left in: %s", all_preds_avg_error)

In [None]:
# describe one home by its predictor values
home_1 <- data.frame(
  bedrooms = 4, bathrooms = 3, sqft_living = 2794, sqft_lot = 16553,
  floors = 2, condition = 3, yrs_old = 70, assess_cd = 9
)
# confidence interval: the range for the *average* price of homes with these
# properties (predict() uses a 95% level unless told otherwise)
predict(lm.all_pred_outrm, newdata = home_1, interval = "confidence")

In [None]:
# describe one home by its predictor values
home_1 <- data.frame(
  bedrooms = 4, bathrooms = 3, sqft_living = 2794, sqft_lot = 16553,
  floors = 2, condition = 3, yrs_old = 70, assess_cd = 9
)
# prediction interval: the range for the price of one *specific* home
# same center as the confidence interval, but a wider range
predict(lm.all_pred_outrm, newdata = home_1, interval = "prediction")

In [None]:
# repeat both interval estimates for a second house
home_2 <- data.frame(
  bedrooms = 3, bathrooms = 1.5, sqft_living = 1499, sqft_lot = 5271,
  floors = 2, condition = 4, yrs_old = 60, assess_cd = 8
)
# interval for the average price of homes like this one
predict(lm.all_pred_outrm, newdata = home_2, interval = "confidence")
# interval for the price of this one specific home
predict(lm.all_pred_outrm, newdata = home_2, interval = "prediction")