### Import Packages

In [1]:
import pandas as pd 
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from pprint import pprint
from datetime import datetime

from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Append the entire repo parent director so files therein can be accessed in notebook
import os
import sys
import pathlib
sys.path.append(str(pathlib.Path().absolute().parent))

from src import helper
%load_ext autoreload

### Load Data

In [2]:
CA_time_interpolated = pd.read_pickle('../data/processed/interpolated_fillnaTime_df.pickle')
interp_preds = pd.read_csv('../data/predictions/rent_buy_zip_interp.csv')

In [3]:
interp_preds.rename(columns={'Unnamed: 0': 'zip_code'}, inplace=True)

In [4]:
CA_time_interpolated.head()

Unnamed: 0,ds,ZipCode,ZHVI_SingleFamilyResidence,Zri_MultiFamilyResidenceRental
0,1996-04-30,90001,114100.0,1552.0
1,1996-04-30,90002,105700.0,1552.0
2,1996-04-30,90003,103800.0,1552.0
3,1996-04-30,90004,248500.0,1552.0
4,1996-04-30,90005,328800.0,1552.0


In [5]:
CA_time_interpolated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 332663 entries, 0 to 332662
Data columns (total 4 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   ds                              332663 non-null  datetime64[ns]
 1   ZipCode                         332663 non-null  int32         
 2   ZHVI_SingleFamilyResidence      332663 non-null  float64       
 3   Zri_MultiFamilyResidenceRental  332663 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int32(1)
memory usage: 8.9 MB


### ZipCode MetaData for Appreciation Rate Calculation Setup

In [6]:
cali_zips = CA_time_interpolated['ZipCode'].unique().tolist()
len(cali_zips)

1311

In [7]:
zip_metadata = dict()

for zipcode in cali_zips:
    # filter on the zipcode
    sub_df = CA_time_interpolated[CA_time_interpolated['ZipCode'] == zipcode].sort_values('ds', ascending=False)
    sub_df.reset_index(drop=True, inplace=True)
    
    # convert the 'ds' column to datetime
    n_recent = sub_df.loc[0, 'ds'].to_pydatetime()
    zhvi_0 = sub_df.loc[0, 'ZHVI_SingleFamilyResidence']
    zri_0 = sub_df.loc[0, 'Zri_MultiFamilyResidenceRental']
    
    zip_metadata[zipcode] = {
        'n_recent' : n_recent,
        'zhvi_0' : zhvi_0,
        'zri_0' : zri_0,
        'n' : helper.months_til_today(sub_df)
    }
    
len(zip_metadata)    

1311

In [8]:
zip_meta_df = pd.DataFrame(zip_metadata).T.reset_index()
zip_meta_df.rename(columns={'index':'zip_code'}, inplace=True)
zip_meta_df.head()

Unnamed: 0,zip_code,n_recent,zhvi_0,zri_0,n
0,90001,2017-12-31,367200.0,2193,27
1,90002,2017-12-31,355800.0,2201,27
2,90003,2017-12-31,381400.0,2206,27
3,90004,2017-12-31,1591700.0,2826,27
4,90005,2017-12-31,1470900.0,2751,27


In [9]:
zip_meta_df['n'].unique()

array([27, 212, 55, 159, 129], dtype=object)

In [10]:
zip_meta_df['n_recent'].unique()

array(['2017-12-31T00:00:00.000000000', '2002-07-31T00:00:00.000000000',
       '2015-08-31T00:00:00.000000000', '2006-12-31T00:00:00.000000000',
       '2009-06-30T00:00:00.000000000'], dtype='datetime64[ns]')

In [11]:
zip_meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1311 entries, 0 to 1310
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   zip_code  1311 non-null   int64         
 1   n_recent  1311 non-null   datetime64[ns]
 2   zhvi_0    1311 non-null   object        
 3   zri_0     1311 non-null   object        
 4   n         1311 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 51.3+ KB


In [12]:
interp_preds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1311 entries, 0 to 1310
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   zip_code  1311 non-null   int64  
 1   buy       1311 non-null   float64
 2   rent      1311 non-null   float64
dtypes: float64(2), int64(1)
memory usage: 30.9 KB


In [13]:
interp_preds.rename(columns={
    'buy':'zhvi_n',
    'rent':'zri_n'
}, inplace=True)

In [14]:
merged_for_appr = interp_preds.merge(zip_meta_df, on='zip_code', how='left')
merged_for_appr.reset_index(drop=True, inplace=True)

In [15]:
merged_for_appr['y_zhvi'] = [np.log(zhvi_n/zhvi_0) for zhvi_n, zhvi_0 in zip(merged_for_appr['zhvi_n'], merged_for_appr['zhvi_0'])]
merged_for_appr['y_zri'] = [np.log(zri_n/zri_0) for zri_n, zri_0 in zip(merged_for_appr['zri_n'], merged_for_appr['zri_0'])]

In [16]:
merged_for_appr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1311 entries, 0 to 1310
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   zip_code  1311 non-null   int64         
 1   zhvi_n    1311 non-null   float64       
 2   zri_n     1311 non-null   float64       
 3   n_recent  1311 non-null   datetime64[ns]
 4   zhvi_0    1311 non-null   object        
 5   zri_0     1311 non-null   object        
 6   n         1311 non-null   object        
 7   y_zhvi    1309 non-null   float64       
 8   y_zri     1311 non-null   float64       
dtypes: datetime64[ns](1), float64(4), int64(1), object(3)
memory usage: 92.3+ KB


In [17]:
merged_for_appr[merged_for_appr['y_zhvi'].isnull()]

Unnamed: 0,zip_code,zhvi_n,zri_n,n_recent,zhvi_0,zri_0,n,y_zhvi,y_zri
1231,93562,-66689.89,2663.971102,2017-12-31,80200,1965,27,,0.304326
1270,95113,-2685721.0,5951.109136,2017-12-31,606100,1965,27,,1.108085


### Generate Predictions for Appreciation Rate

In [18]:
# Skip the above 2 zipcodes because there are negative home values weirdly
clean_merged = merged_for_appr.dropna()

In [19]:
zhvi_X, zhvi_y = clean_merged['n'], clean_merged['y_zhvi']
zri_X, zri_y = clean_merged['n'], clean_merged['y_zri']

h_X_train, h_X_test, h_y_train, h_y_test = train_test_split(zhvi_X, zhvi_y, random_state=42)
r_X_train, r_X_test, r_y_train, r_y_test = train_test_split(zri_X, zri_y, random_state=42)

In [20]:
np.array(h_X_train).reshape(-1,1).shape

(981, 1)

In [21]:
h_y_train.shape

(981,)

In [24]:
lr_h = LinearRegression()

lr_h.fit(np.array(h_X_train).reshape(-1,1), h_y_train)
h_preds = lr_h.predict(np.array(h_X_test).reshape(-1,1))
h_score = lr_h.score(np.array(h_X_test).reshape(-1,1), h_y_test)

print(h_score)

-0.0008028067671514982


In [27]:
lr_r = LinearRegression()

lr_r.fit(np.array(r_X_train).reshape(-1,1), r_X_train)
r_preds = lr_r.predict(np.array(r_X_test).reshape(-1,1))
r_score = lr_r.score(np.array(r_X_test).reshape(-1,1), r_y_test)

print(r_score)

-48833.44755782352
