### Import Packages

In [52]:
import pandas as pd 
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from pprint import pprint
from datetime import datetime

from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Append the entire repo parent director so files therein can be accessed in notebook
import os
import sys
import pathlib
sys.path.append(str(pathlib.Path().absolute().parent))

from src import helper
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Data

In [2]:
CA_time_interpolated = pd.read_pickle('../data/processed/interpolated_fillnaTime_df.pickle')
interp_preds = pd.read_csv('../data/predictions/rent_buy_zip_interp.csv')

In [7]:
CA_time_interpolated.head()

Unnamed: 0,ds,ZipCode,ZHVI_SingleFamilyResidence,Zri_MultiFamilyResidenceRental
0,1996-04-30,90001,114100.0,1552.0
1,1996-04-30,90002,105700.0,1552.0
2,1996-04-30,90003,103800.0,1552.0
3,1996-04-30,90004,248500.0,1552.0
4,1996-04-30,90005,328800.0,1552.0


In [55]:
CA_time_interpolated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 332663 entries, 0 to 332662
Data columns (total 4 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   ds                              332663 non-null  datetime64[ns]
 1   ZipCode                         332663 non-null  int32         
 2   ZHVI_SingleFamilyResidence      332663 non-null  float64       
 3   Zri_MultiFamilyResidenceRental  332663 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int32(1)
memory usage: 8.9 MB


### ZipCode MetaData for Appreciation Rate Calculation Setup

In [21]:
cali_zips = CA_time_interpolated['ZipCode'].unique().tolist()
len(cali_zips)

1311

In [38]:
zip_metadata = dict()

for zipcode in cali_zips:
    sub_df = CA_time_interpolated[CA_time_interpolated['ZipCode'] == zipcode].sort_values('ds', ascending=False)
    sub_df.reset_index(drop=True, inplace=True)
    n_recent = sub_df.loc[0, 'ds'].to_pydatetime()
    zhvi_0 = sub_df.loc[0, 'ZHVI_SingleFamilyResidence']
    zri_0 = sub_df.loc[0, 'Zri_MultiFamilyResidenceRental']
    
    zip_metadata[zipcode] = {
        'n_recent' : n_recent,
        'zhvi_0' : zhvi_0,
        'zri_0' : zri_0,
        'n' : helper.months_til_today(sub_df)
    }
    
len(zip_metadata)    

1311

In [39]:
zip_meta_df = pd.DataFrame(zip_metadata).T.reset_index()
zip_meta_df.rename(columns={'index':'zip_code'}, inplace=True)
zip_meta_df.head()

Unnamed: 0,zip_code,n_recent,zhvi_0,zri_0,n
0,90001,2017-12-31,367200.0,2193,27
1,90002,2017-12-31,355800.0,2201,27
2,90003,2017-12-31,381400.0,2206,27
3,90004,2017-12-31,1591700.0,2826,27
4,90005,2017-12-31,1470900.0,2751,27


In [54]:
zip_meta_df['n_recent'].unique()

array(['2017-12-31T00:00:00.000000000', '2002-07-31T00:00:00.000000000',
       '2015-08-31T00:00:00.000000000', '2006-12-31T00:00:00.000000000',
       '2009-06-30T00:00:00.000000000'], dtype='datetime64[ns]')

In [41]:
zip_meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1311 entries, 0 to 1310
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   zip_code  1311 non-null   int64         
 1   n_recent  1311 non-null   datetime64[ns]
 2   zhvi_0    1311 non-null   object        
 3   zri_0     1311 non-null   object        
 4   n         1311 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 51.3+ KB


In [43]:
interp_preds.rename(columns={
    'buy':'zhvi_n',
    'rent':'zri_n'
}, inplace=True)

In [47]:
merged_for_appr = interp_preds.merge(zip_meta_df, on='zip_code', how='left')
merged_for_appr.reset_index(drop=True, inplace=True)

In [48]:
merged_for_appr['y_zhvi'] = [np.log(zhvi_n/zhvi_0) for zhvi_n, zhvi_0 in zip(merged_for_appr['zhvi_n'], merged_for_appr['zhvi_0'])]
merged_for_appr['y_zri'] = [np.log(zri_n/zri_0) for zri_n, zri_0 in zip(merged_for_appr['zri_n'], merged_for_appr['zri_0'])]

In [49]:
merged_for_appr.head()

Unnamed: 0,zip_code,zhvi_n,zri_n,n_recent,zhvi_0,zri_0,n,y_zhvi,y_zri
0,90001,290106.7,2107.267212,2017-12-31,367200.0,2193,27,-0.235658,-0.039879
1,90002,273803.7,2073.61767,2017-12-31,355800.0,2201,27,-0.261957,-0.059617
2,90003,292014.1,2111.681483,2017-12-31,381400.0,2206,27,-0.267047,-0.043696
3,90004,1668105.0,3066.411287,2017-12-31,1591700.0,2826,27,0.046886,0.081646
4,90005,1510442.0,2871.4835,2017-12-31,1470900.0,2751,27,0.026528,0.042864


In [50]:
merged_for_appr['n'].unique()

array([27, 212, 55, 159, 129], dtype=object)

In [None]:
zhvi_X, zhvi_y = merged_for_appr['n'], merged_for_appr['y_zhvi']
zri_X, zri_y = merged_for_appr['n'], merged_for_appr['y_zri']


In [None]:

lr_zhvi = LinearRegression(random_state=42)
lr_zvi = LinearRegression(random_state=42)

lr_zhvi.fit(zhvi_X, zhvi_y)
lr_zhvi