# Housing Price Predictor

I will analyze median housing prices, both overall and in relation to square footage, to predict how much a house in a given state will cost in the future.

In [32]:
# Install the required scikit-learn package
!pip install scikit-learn



In [33]:
# Necessary libraries to perform predictive task
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression, Lasso
from statsmodels.tsa.ar_model import AutoReg
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from datetime import date

In [34]:
# Sanity check: every name brought in by the import cell is bound and truthy.
assert all((
    pd, np, linear_model,
    LinearRegression, Lasso, AutoReg,
    train_test_split,
    r2_score, mean_absolute_error, mean_squared_error,
    date,
))

In [35]:
# Constants
# Two-letter postal abbreviation -> full state name (the 50 U.S. states).
ABBREV_TO_STATES = {
    "AL": "Alabama",
    "AK": "Alaska",
    "AZ": "Arizona",
    "AR": "Arkansas",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland",
    "MA": "Massachusetts",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MS": "Mississippi",
    "MO": "Missouri",
    "MT": "Montana",
    "NE": "Nebraska",
    "NV": "Nevada",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NY": "New York",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VT": "Vermont",
    "VA": "Virginia",
    "WA": "Washington",
    "WV": "West Virginia",
    "WI": "Wisconsin",
    "WY": "Wyoming",
}
# Inverse lookup: full state name -> two-letter abbreviation.
STATES_TO_ABBREV = dict(zip(ABBREV_TO_STATES.values(), ABBREV_TO_STATES.keys()))
# Location of the housing CSV, relative to the notebook's working directory.
DATASET = 'data/housing_data.csv'

## Step 1: Obtain and Preprocess Data

For this task, we obtain our data from Realtor.com’s Data Library, which can be found [here](https://www.realtor.com/research/data/). The data used in this project consists of historical median listing prices, broken down by ZIP code.

In [36]:
def preprocess_data(url=DATASET) -> pd.DataFrame:
    '''Loads the housing CSV and reduces it to the columns used in this notebook.

    Parameters
    ----------
    url: string
        Path or URL of the CSV file to read; it is passed straight to
        ``pd.read_csv``. Defaults to the local ``DATASET`` file.

    Returns
    -------
    data: DataFrame
        Independent DataFrame holding exactly the columns 'Date', 'Location',
        'Median Price' and 'Median PPSF', in that order.

    Raises
    ------
    KeyError
        If any of the four expected columns is absent after renaming.
    '''
    # Raw CSV column name -> column name used throughout the notebook.
    renamed_columns = {
        'month_date_yyyymm': 'Date',
        'zip_name': 'Location',
        'median_listing_price': 'Median Price',
        'median_listing_price_per_square_foot': 'Median PPSF',
    }
    columns = list(renamed_columns.values())

    raw = pd.read_csv(url)
    # rename() returns a new frame, so the raw frame is never mutated in place.
    # .copy() detaches the column subset from the raw frame so that later
    # column assignments on the result cannot raise SettingWithCopyWarning.
    data = raw.rename(columns=renamed_columns)[columns].copy()
    return data

In [None]:
# Load the housing data from the default DATASET location.
data = preprocess_data()
# Bare trailing expression: Jupyter renders the DataFrame as the cell output.
data