In [1]:
import os
import pandas as pd
import seaborn as sns
import random
from datetime import date
from dateutil.relativedelta import relativedelta

from sklearn.metrics import mean_squared_error

In [2]:
data_folder = '../03.clean-data/'
positive_monthly_return_cutoff = 0.002

## Load Data

In [3]:
hpi = pd.read_csv(os.path.join(data_folder, 'hpi.csv'), parse_dates=['Date']) # Load
hpi.sort_values(['Area', 'Type', 'Date'], inplace=True) # Sort
hpi.reset_index(inplace=True)

In [4]:
hpi_returns = hpi.copy()

In [5]:
hpi_returns['HPI.L1'] = hpi_returns.groupby(['Area', 'Type'])['HPI'].shift(1)
hpi_returns['HPI.L12'] = hpi_returns.groupby(['Area', 'Type'])['HPI'].shift(12)

In [6]:
hpi_returns['MonthlyChangeInHPI'] = (hpi_returns['HPI'] - hpi_returns['HPI.L1']) / hpi_returns['HPI.L1']
hpi_returns['YearlyChangeInHPI'] = (hpi_returns['HPI'] - hpi_returns['HPI.L12']) / hpi_returns['HPI.L12']

## Use the Rolling Average to Make a Forecast

In [7]:
window_length = 36  # months before the date we run the forecast on.


### Example with one date

In [8]:
start_date = pd.to_datetime(date(2012, 2, 1))
end_date = start_date + relativedelta(months=window_length)
train = hpi_returns[hpi_returns['Date'].between(start_date, end_date)].copy()
date_of_forecast_being_made = train['Date'].max()
date_of_forecasted_value = end_date + relativedelta(months=1)
start_date, end_date, date_of_forecast_being_made, date_of_forecasted_value

(Timestamp('2012-02-01 00:00:00'),
 Timestamp('2015-02-01 00:00:00'),
 Timestamp('2015-02-01 00:00:00'),
 Timestamp('2015-03-01 00:00:00'))

In [9]:
forecasted_monthly_return = train.groupby(['Area', 'Type'])['MonthlyChangeInHPI'].mean()
forecasted_monthly_return.rename('ForecastedMonthlyChangeInHPI', inplace=True)
forecasted_monthly_return = forecasted_monthly_return.to_frame()

forecasted_monthly_return['ForecastedPositiveReturn'] = (
    train
        [train['MonthlyChangeInHPI'].notnull()]
        .groupby(['Area', 'Type'])
        ['MonthlyChangeInHPI']
        .apply(lambda x: (x > positive_monthly_return_cutoff).sum() / len(x))
)

forecasted_monthly_return['Date'] = date_of_forecasted_value
forecasted_monthly_return.reset_index(inplace=True)
forecasted_monthly_return

Unnamed: 0,Area,Type,ForecastedMonthlyChangeInHPI,ForecastedPositiveReturn,Date
0,TREB Total,Apartment,0.003106,0.482759,2015-03-01
1,TREB Total,Composite,0.006060,0.620690,2015-03-01
2,TREB Total,Single-Family Attached,0.006839,0.655172,2015-03-01
3,TREB Total,Single-Family Detached,0.006819,0.655172,2015-03-01
4,TREB Total,Townhouse,0.005879,0.724138,2015-03-01
...,...,...,...,...,...
175,Toronto W10,Apartment,0.003104,0.517241,2015-03-01
176,Toronto W10,Composite,0.007192,0.655172,2015-03-01
177,Toronto W10,Single-Family Attached,0.007635,0.655172,2015-03-01
178,Toronto W10,Single-Family Detached,0.008944,0.724138,2015-03-01


### Rolling predictions on a loop

In [10]:
possible_start_dates = pd.date_range(
    start=hpi_returns['Date'].min(),
    end=hpi_returns['Date'].max() - relativedelta(months=window_length - 1),
    freq='MS',
)
len(possible_start_dates)

76

In [11]:
%%time
forecast_dfs = []
for start_date in possible_start_dates:
    end_date = start_date + relativedelta(months=window_length)
    train = hpi_returns[hpi_returns['Date'].between(start_date, end_date)].copy()
    date_of_forecast_being_made = train['Date'].max()
    date_of_forecasted_value = end_date + relativedelta(months=1)
    
    forecast_df = train.groupby(['Area', 'Type'])['MonthlyChangeInHPI'].mean()
    forecast_df.rename('ForecastedMonthlyChangeInHPI', inplace=True)
    forecast_df = forecast_df.to_frame()
    
    forecast_df['ForecastedPositiveReturn'] = (
        train
            [train['MonthlyChangeInHPI'].notnull()]
            .groupby(['Area', 'Type'])
            ['MonthlyChangeInHPI']
            .apply(lambda x: (x > positive_monthly_return_cutoff).sum() / len(x))
        )
    
    forecast_df['Date'] = date_of_forecasted_value
    forecast_df.reset_index(inplace=True)
    
    forecast_dfs.append(forecast_df)

forecasted_monthly_return = pd.concat(forecast_dfs)
forecasted_monthly_return

Wall time: 4.29 s


Unnamed: 0,Area,Type,ForecastedMonthlyChangeInHPI,ForecastedPositiveReturn,Date
0,TREB Total,Apartment,0.003106,0.482759,2015-03-01
1,TREB Total,Composite,0.006060,0.620690,2015-03-01
2,TREB Total,Single-Family Attached,0.006839,0.655172,2015-03-01
3,TREB Total,Single-Family Detached,0.006819,0.655172,2015-03-01
4,TREB Total,Townhouse,0.005879,0.724138,2015-03-01
...,...,...,...,...,...
300,Whitchurch-Stouffville,Apartment,0.001982,0.485714,2021-06-01
301,Whitchurch-Stouffville,Composite,0.009300,0.685714,2021-06-01
302,Whitchurch-Stouffville,Single-Family Attached,0.013590,0.657143,2021-06-01
303,Whitchurch-Stouffville,Single-Family Detached,0.009514,0.628571,2021-06-01


## Validate

In [12]:
validation = hpi_returns.merge(
    forecasted_monthly_return,
    how='outer',
    on=['Area', 'Type', 'Date'],
)
print(len(validation), len(validation.drop_duplicates(['Area', 'Type', 'Date'])))  # Check if the merge (join) impacted the granularity of the data set, i.e. if we introduced duplicates
validation = validation[validation['ForecastedMonthlyChangeInHPI'].notnull() & validation['MonthlyChangeInHPI'].notnull()]

28080 28080


### MSE

In [13]:
benchmark_mse = mean_squared_error(validation['MonthlyChangeInHPI'], validation['ForecastedMonthlyChangeInHPI'])
benchmark_mse

0.0008836380148939758

### Normalize the MSE

In [14]:
# MSE is 10% of the average value.
benchmark_mse / hpi_returns['MonthlyChangeInHPI'].mean()

0.10512084173517866

In [15]:
benchmark_mse / hpi_returns.query('Area == "TREB Total" & Type == "Composite"')['MonthlyChangeInHPI'].mean()

0.12102376819224779

### MSE by Area & Housing Type

In [16]:
benchmark_mse_by_area_and_type = (
    validation
        .groupby(['Area', 'Type'])
        .apply(lambda x: mean_squared_error(x['MonthlyChangeInHPI'], x['ForecastedMonthlyChangeInHPI']))
)
benchmark_mse_by_area_and_type

Area                    Type                  
Adjala-Tosorontio       Composite                 0.001136
                        Single-Family Detached    0.001134
Ajax                    Apartment                 0.000575
                        Composite                 0.000451
                        Single-Family Attached    0.000469
                                                    ...   
Whitchurch-Stouffville  Apartment                 0.000761
                        Composite                 0.000702
                        Single-Family Attached    0.001138
                        Single-Family Detached    0.000734
                        Townhouse                 0.000569
Length: 286, dtype: float64

## Probability of a Positive Return

In [17]:
validation['ForecastedPositiveReturn'].describe()

count    19833.000000
mean         0.622139
std          0.123345
min          0.000000
25%          0.540541
50%          0.620690
75%          0.694444
max          1.000000
Name: ForecastedPositiveReturn, dtype: float64