# Housing Price Predictor

I will be analyzing median housing prices, both overall and in relation to square footage (price per square foot), to predict how much a house will cost in a given state in the future.

In [1]:
# Install the required scikit-learn package
!pip install scikit-learn




[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: C:\Users\ryand\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
# Necessary libraries to perform predictive task
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression, Lasso
from statsmodels.tsa.ar_model import AutoReg
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from datetime import date

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Smoke test: every name pulled in by the import cell must be bound to a truthy object.
# A missing import surfaces here as a NameError instead of deep inside a later cell.
assert all((
    pd,
    np,
    linear_model,
    LinearRegression,
    Lasso,
    AutoReg,
    train_test_split,
    r2_score,
    mean_absolute_error,
    mean_squared_error,
    date,
))

In [4]:
# Constants

# Two-letter postal abbreviation -> full state name (50 states).
ABBREV_TO_STATES = {
    "AL": "Alabama",
    "AK": "Alaska",
    "AZ": "Arizona",
    "AR": "Arkansas",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland",
    "MA": "Massachusetts",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MS": "Mississippi",
    "MO": "Missouri",
    "MT": "Montana",
    "NE": "Nebraska",
    "NV": "Nevada",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NY": "New York",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VT": "Vermont",
    "VA": "Virginia",
    "WA": "Washington",
    "WV": "West Virginia",
    "WI": "Wisconsin",
    "WY": "Wyoming",
}

# Reverse lookup: full state name -> two-letter abbreviation.
STATES_TO_ABBREV = {name: code for code, name in ABBREV_TO_STATES.items()}

# Relative path of the CSV file read by default.
DATASET = 'data/housing_data.csv'

## Step 1: Obtain and Preprocess Data

For this task, we obtain our data from Realtor.com’s Data Library, which can be found [here](https://www.realtor.com/research/data/). The data used in this project is historical data on median housing prices at the county level.

After selecting the columns we need from the data, we filter the rows by a state of the user's choice to prepare for the next step!

In [40]:
def format_date(date):
    '''Reorders a YYYYMM date into MM-YYYY; applied to the Date column for ease of plotting later.

    Parameters
    ----------
    date: str
        Date in the format YYYYMM (the value is converted with str() first,
        so an integer such as 202405 is accepted as well)
    Returns
    -------
    str
        Date in the format MM-YYYY
    '''
    yyyymm = str(date)
    # First four characters are the year; everything after them is the month.
    return '-'.join((yyyymm[4:], yyyymm[:4]))

def preprocess_data(url=DATASET) -> pd.DataFrame:
    '''Loads the housing CSV and reduces it to the columns used in this notebook.

    Parameters
    ----------
    url: string
        Path or URL of the CSV file to read (anything pd.read_csv accepts).
        Defaults to DATASET.

    Returns
    -------
    data: DataFrame
        DataFrame with the columns Date (reformatted to MM-YYYY by format_date),
        Location, Median Price and Median PPSF
    '''
    columns = ['Date', 'Location', 'Median Price', 'Median PPSF']
    # rename() returns a new frame; no in-place mutation is needed.
    renamed = pd.read_csv(url).rename(columns={
        'month_date_yyyymm': 'Date',
        'county_name': 'Location',
        'median_listing_price': 'Median Price',
        'median_listing_price_per_square_foot': 'Median PPSF'
        })
    # Note: the last line is removed as it is a quality flag.
    # .copy() makes the selection an independent frame, so the Date assignment
    # below never writes into a slice of another DataFrame.
    data = renamed[columns].iloc[:-1].copy()
    data['Date'] = data['Date'].apply(format_date)
    return data

In [41]:
# Load the dataset from the default DATASET path and display the resulting frame.
data = preprocess_data()
data

  data = pd.read_csv(url)


Unnamed: 0,Date,Location,Median Price,Median PPSF
0,05-2024,"oglethorpe, ga",363500.0,177.0
1,05-2024,"anderson, tx",292500.0,143.0
2,05-2024,"san jacinto, tx",285000.0,180.0
3,05-2024,"storey, nv",727500.0,304.0
4,05-2024,"chelan, wa",749500.0,409.0
...,...,...,...,...
294597,07-2016,"weber, ut",268500.0,106.0
294598,07-2016,"rutherford, nc",220000.0,115.0
294599,07-2016,"monroe, tn",175000.0,101.0
294600,07-2016,"pennington, sd",230000.0,110.0


In [42]:
def filter_data(data=data, state='CA'):
    '''Filters the data within the dataset by the state for model training purposes.

    Parameters
    ----------
    data: DataFrame
        Housing data with a 'Location' column whose last two characters are the
        lower-case state abbreviation. The default is the `data` frame that
        existed when this cell was run (default arguments are bound at
        definition time), so re-run this cell after reloading `data`.
    state: str
        Two-letter abbreviation or full name of a state (case-insensitive)

    Returns
    -------
    filtered_data: DataFrame
        DataFrame consisting of data from a singular state, or None (after
        printing a message) when `state` is not a recognized state
    '''
    query = ' '.join(state.split())  # trim and collapse whitespace ("new  york " -> "new york")

    abbreviation = None
    if query.upper() in ABBREV_TO_STATES:
        abbreviation = query.upper()
    else:
        # STATES_TO_ABBREV is keyed by title-case names ("New York"), so compare
        # on a lower-cased copy to accept any capitalization of the full name.
        names_lower = {name.lower(): code for name, code in STATES_TO_ABBREV.items()}
        abbreviation = names_lower.get(query.lower())

    if abbreviation is None:
        print('Not a valid state. Please try again.')
        return None

    # Keep only the rows whose Location ends with the lower-case abbreviation.
    filtered_data = data[data['Location'].str[-2:] == abbreviation.lower()]
    return filtered_data

In [47]:
# Called without arguments this uses filter_data's defaults (state='CA').
# Pass a different `state` argument here to check another state's information.
filtered_data = filter_data()
filtered_data

Unnamed: 0,Date,Location,Median Price,Median PPSF
129,05-2024,"san bernardino, ca",541005.0,345.0
151,05-2024,"lake, ca",434500.0,254.0
189,05-2024,"riverside, ca",669648.0,351.0
223,05-2024,"sacramento, ca",570712.0,338.0
278,05-2024,"kings, ca",373125.0,231.0
...,...,...,...,...
294448,07-2016,"nevada, ca",499000.0,241.0
294455,07-2016,"riverside, ca",399000.0,191.0
294500,07-2016,"inyo, ca",334900.0,189.0
294560,07-2016,"tehama, ca",249000.0,144.0


## Step 2: Splitting the Data

After we have the data that we want, we split it into training and testing sets with an 80/20 split, where the targets are the Median Price and the Median PPSF of homes, predicted from the date information.

In [44]:
# Feature: the listing month; targets: the two median price columns.
X = filtered_data['Date']
y = filtered_data[['Median Price', 'Median PPSF']]

# 80/20 split; the fixed random_state keeps the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test

(98511     10-2021
 91247     12-2021
 50971     01-2023
 70926     07-2022
 158645    02-2020
            ...   
 202004    12-2018
 277868    12-2016
 279517    11-2016
 288224    09-2016
 45884     03-2023
 Name: Date, Length: 4408, dtype: object,
 76718     05-2022
 64218     09-2022
 240901    12-2017
 243550    11-2017
 99878     09-2021
            ...   
 156941    03-2020
 257993    06-2017
 53520     12-2022
 77993     04-2022
 231770    03-2018
 Name: Date, Length: 1102, dtype: object,
         Median Price  Median PPSF
 98511       482000.0        249.0
 91247       599900.0        308.0
 50971       599000.0        331.0
 70926      1290000.0       1052.0
 158645      299900.0        187.0
 ...              ...          ...
 202004     1399000.0        870.0
 277868      415000.0        228.0
 279517      584000.0        284.0
 288224      237200.0        125.0
 45884       365000.0        271.0
 
 [4408 rows x 2 columns],
         Median Price  Median PPSF
 76718       66

In [48]:
def get_next_five_years(data=filtered_data) -> pd.Series:
    '''Get the month and year of all future months up to five years in the future.

    Parameters
    ----------
    data: DataFrame
        Filtered data with a 'Date' column of MM-YYYY strings. The most recent
        month is found by parsing the whole column, so row order does not matter.

    Returns
    -------
    pd.Series
        pandas Series of the next five years (60 entries) of month-year
        combinations in MM-YYYY format, starting with the month after the
        latest month found in `data`

    Raises
    ------
    ValueError
        If `data` contains no dates to start from
    '''
    observed = pd.to_datetime(data['Date'], format='%m-%Y')
    if observed.empty or observed.isna().all():
        raise ValueError('Cannot build future months: no dates found in data.')
    first_future_month = observed.max() + pd.DateOffset(months=1)
    # 'MS' = month start; 60 periods = 5 years of consecutive months.
    future_months = pd.date_range(start=first_future_month, periods=60, freq='MS')
    return pd.Series(future_months.strftime('%m-%Y'), name='Date')


def _dates_to_features(dates) -> np.ndarray:
    '''Encodes MM-YYYY strings as one numeric feature column (a running month count).'''
    parsed = pd.to_datetime(pd.Series(dates), format='%m-%Y')
    month_index = parsed.dt.year * 12 + (parsed.dt.month - 1)
    # scikit-learn estimators expect a 2-D (n_samples, n_features) numeric matrix.
    return month_index.to_numpy(dtype=float).reshape(-1, 1)


# Generic fit / evaluate / forecast routine for scikit-learn style regressors
def prediction_model(model, X_train, X_test, y_train, y_test):
    '''Fits the given model, reports its test error, and predicts five years into the future.

    Parameters
    ----------
    model: Object
        Model passed into the predictive method, should be initialized prior to method call.
        It must provide fit(X, y) and predict(X) (e.g. LinearRegression, Lasso).
    X_train: Series
        Training features for the model, as MM-YYYY date strings
    X_test: Series
        Testing features for the model, as MM-YYYY date strings
    y_train: Series or DataFrame
        Training targets for the model
    y_test: Series or DataFrame
        Testing targets for the model

    Returns
    -------
    future_predictions: array-like
        Output of model.predict for each of the 60 months following the most
        recent month present in X_train / X_test.
    '''
    model.fit(_dates_to_features(X_train), y_train)
    testing_predictions = model.predict(_dates_to_features(X_test))
    # mean_squared_error expects (y_true, y_pred) in that order.
    print(f'MSE of the Current Model: {mean_squared_error(y_test, testing_predictions)}')

    # Derive the forecast horizon from the dates actually passed in rather than
    # from a notebook global, so the result does not depend on hidden kernel state.
    observed_dates = pd.concat([pd.Series(X_train), pd.Series(X_test)], ignore_index=True)
    future_dates = get_next_five_years(pd.DataFrame({'Date': observed_dates}))
    future_predictions = model.predict(_dates_to_features(future_dates))
    return future_predictions

In [None]:
# AutoReg is deliberately not in this list: statsmodels' AutoReg requires the
# series (endog) and lags at construction time, so a bare `AutoReg()` raises
# TypeError, and it does not follow the fit(X, y) / predict(X) interface that
# prediction_model relies on.
models = [LinearRegression(), Lasso()]
for model in models:
    prediction_model(model, X_train, X_test, y_train, y_test)