In [23]:
# import necessary libraries
import pandas as pd
from sklearn.metrics import mean_absolute_error

In [24]:
# Bring in past sales data.
# The CSV was written with its row index as the first (unnamed) column;
# index_col=0 restores it as the index instead of loading it as a
# spurious 'Unnamed: 0' data column.
past_sales = pd.read_csv('cleaned_past.csv', index_col=0)
past_sales.head()

Unnamed: 0,SALE TYPE,PROPERTY TYPE,CITY,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,LOCATION,SQUARE FEET,LOT SIZE,YEAR BUILT,DAYS ON MARKET,$/SQUARE FEET,HOA/MONTH,LATITUDE,LONGITUDE
0,PAST SALE,Single Family Residential,Canyon Country,91387.0,570000,3.0,2.5,CAN2 - Canyon Country 2,1464.0,10789.0,1983.0,5,389.0,0.0,34.449485,-118.400176
1,PAST SALE,Single Family Residential,Canyon Country,91387.0,535000,3.0,2.0,CAN2 - Canyon Country 2,1548.0,24285.0,1974.0,69,346.0,0.0,34.457457,-118.389404
2,PAST SALE,Single Family Residential,Canyon Country,91387.0,525000,4.0,2.0,CAN2 - Canyon Country 2,1326.0,7336.0,1975.0,37,396.0,0.0,34.454962,-118.391801
3,PAST SALE,Single Family Residential,Canyon Country,91387.0,560000,4.0,2.0,CAN2 - Canyon Country 2,1854.0,6628.0,1991.0,31,302.0,0.0,34.440028,-118.388719
4,PAST SALE,Single Family Residential,Canyon Country,91387.0,560000,3.0,1.75,CAN2 - Canyon Country 2,1479.0,8662.0,1965.0,18,379.0,45.0,34.427945,-118.425415


Let's create a dictionary of the median price per square foot for each zip code, keyed by zip code:
{zipcode: median $/sqft}


In [25]:
# Collect the distinct zip codes that appear in the past-sales data
# (in order of first appearance).
zipcodes = pd.unique(past_sales['ZIP OR POSTAL CODE'])
print(zipcodes)

[91387. 91351. 91381. 91342. 91355. 91321. 91350. 91390. 91354. 91384.]


In [26]:
# Create dictionary of {zipcode: median $/sqft} pairs.
# A single groupby pass replaces re-filtering the whole frame once per
# zip code. sort=False keeps the keys in order of first appearance in the
# data. Rows with a missing zip code are left out of the grouping, so no
# NaN key ends up in the dictionary.
median_price_per_zipcode = (
    past_sales
    .groupby('ZIP OR POSTAL CODE', sort=False)['$/SQUARE FEET']
    .median()
    .to_dict()
)

print(median_price_per_zipcode)

{91387.0: 332.0, 91351.0: 321.0, 91381.0: 324.0, 91342.0: 289.0, 91355.0: 360.5, 91321.0: 331.5, 91350.0: 308.5, 91390.0: 300.0, 91354.0: 341.0, 91384.0: 301.0}


In [34]:
# Benchmark estimate: look up the median $/sqft for each property's zip
# code, then scale it by that property's own square footage.
area_rate = past_sales['ZIP OR POSTAL CODE'].map(median_price_per_zipcode)
past_sales['AREA_PRICE/SQFT'] = area_rate
past_sales['EST_VALUE'] = area_rate * past_sales['SQUARE FEET']

past_sales['EST_VALUE'].head()

0    486048.0
1    513936.0
2    440232.0
3    615528.0
4    491028.0
Name: EST_VALUE, dtype: float64

In [35]:
# Find MAE for estimated price ['EST_VALUE'] compared to actual sale price ['PRICE'].
# mean_absolute_error is imported in the notebook's top import cell.
# NOTE(review): the per-zip medians were computed from this same
# past_sales frame, so this is an in-sample error for the benchmark.
mae = mean_absolute_error(
    y_true=past_sales['PRICE'],
    y_pred=past_sales['EST_VALUE'],
)

In [36]:
# Report the benchmark's mean absolute error.
mae_label = 'Mean Absolute Error (MAE) for the benchmark method is: '
print(mae_label, mae)

Mean Absolute Error (MAE) for the benchmark method is:  83028.8726076555
