In [110]:
from datetime import timedelta
import pickle
from os import listdir
from os.path import (
join, isfile, splitext, basename
)
import pandas as pd
import numpy as np
import torch
from utils.data.CountyDataset import (
    CountyDataset, COUNTY_DROP_COLS, COUNTY_PCT_COLS,
    FEDERAL_DROP_COLS, STATE_DROP_COLS, COUNTY_DROP_COLS
)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [112]:
# Collect the usable state/county files and the numeric id that prefixes each
# file name (the text before the first underscore).
def _leading_id(path):
    """Return the integer found before the first '_' in the file name of `path`."""
    prefix, _, _ = basename(path).partition('_')
    return int(prefix)


state_paths = CountyDataset.get_valid_paths(path_type='state', verbose=False)
county_paths = CountyDataset.get_valid_paths(path_type='county', verbose=False)
county_ids = list(map(_leading_id, county_paths))
state_ids = list(map(_leading_id, state_paths))

In [113]:
# get master reference
# Load the (county_id, state_id) pairs and keep only the rows for which both a
# county file and a state file were found by the cell above.
master_path = 'data/master_county_state_reference.csv'
# np.integer is an abstract scalar class, not a concrete dtype: NumPy deprecates
# using it as one and the width it resolves to is platform dependent. Ask for
# 64-bit integers explicitly instead.
df_master = pd.read_csv(master_path, usecols=['county_id', 'state_id'], dtype=np.int64)
has_county_file = df_master.county_id.isin(county_ids)
has_state_file = df_master.state_id.isin(state_ids)
df_master = df_master[has_county_file & has_state_file].reset_index(drop=True)

In [114]:
# split into train, test, val
# Rows of df_master (county_id, state_id) are shuffled once and cut into
# 70% train / 15% val / remainder test.
SPLIT_SEED = 0  # fixed so the partition is identical on every run
train_pct = 0.7
val_pct = 0.15

# Work on a copy: shuffling the array returned by `.values` in place can also
# reorder df_master itself, which made re-running this cell non-idempotent.
# A seeded generator replaces the unseeded global np.random.shuffle.
data_refs = df_master.to_numpy(copy=True)
np.random.default_rng(SPLIT_SEED).shuffle(data_refs)  # shuffles rows (axis 0)

train_idx = int(np.floor(len(data_refs)*train_pct))
val_idx = train_idx+int(np.floor(len(data_refs)*val_pct))
train_master = data_refs[:train_idx]
val_master = data_refs[train_idx:val_idx]
test_master = data_refs[val_idx:]

In [116]:
def get_stats(DIN):
    """Compute mean and standard deviation of the inputs and targets in `DIN`.

    Args:
        DIN: iterable of 3-tuples ``(X, _, Y)`` of tensors; the middle element
            is ignored.

    Returns:
        ``((xmu, xstd), (ymu, ystd))`` where each pair is the mean and the
        ``torch.std`` (default settings) taken along dim 0 after concatenating
        all ``X`` (respectively all ``Y``) along dim 0.
    """
    def _moments(chunks):
        # Stack every sample end to end, then reduce over the stacked axis.
        stacked = torch.cat(chunks, dim=0)
        return stacked.mean(dim=0), stacked.std(dim=0)

    features, _, targets = zip(*DIN)
    return _moments(features), _moments(targets)

In [152]:
# Build sliding-window samples for every county in each split, standardize them
# with statistics fitted on the train split only, and pickle one file per split.
Tback = 24  # rows of history per sample
Tfwd = 6    # rows of 'Zillow Price Index' to predict per sample
T = Tback+Tfwd
TARGET_COL = 'Zillow Price Index'


def _read_zillow_csv(csv_path, drop_cols):
    """Read one county/state CSV, fill gaps, index it by date and drop `drop_cols`."""
    df = pd.read_csv(csv_path, dtype=np.float32, parse_dates=[0], na_values='.').ffill().bfill()
    df = df.rename(columns={'Unnamed: 0': 'date'}).set_index('date')
    return df.drop(labels=drop_cols, axis=1)


def _merged_frame(county_id, state_id, df_country):
    """Outer-join country, county and state frames on date, gaps filled, sorted by date."""
    df_local = _read_zillow_csv(join(CountyDataset.COUNTY_DIR, f'{county_id}_zillow.csv'), COUNTY_DROP_COLS)
    # For each (k, v) pair, column v is replaced by v divided by column k.
    for k, v in COUNTY_PCT_COLS.items():
        df_local[v] = df_local[v].div(df_local[k], axis=0)
    df_out = pd.merge(df_country, df_local, on='date', how='outer', suffixes=('_country', '_local')).ffill().bfill()
    df_state = _read_zillow_csv(join(CountyDataset.STATE_DIR, f'{state_id}_zillow.csv'), STATE_DROP_COLS)
    # Windows below are cut by row position, so make the date ordering explicit.
    return pd.merge(df_out, df_state, on='date', how='outer').sort_index().ffill().bfill()


def _sliding_windows(df_out, n_back, n_fwd):
    """Return every (X, Xlog, Y) sample of exactly n_back + n_fwd consecutive rows.

    X    -- the first n_back rows of all columns except TARGET_COL
    Xlog -- the first n_back rows of TARGET_COL
    Y    -- the n_fwd rows of TARGET_COL that immediately follow

    Windows are taken by row position. The previous date-based bounds used
    np.timedelta64(T, 'M') (an ambiguous month unit that recent pandas rejects),
    produced 30 or 31 rows depending on the start date, and kept truncated
    windows near the end of the series in which X was short and overlapped Y.
    NOTE(review): this assumes the merged frame has one row per time step
    (e.g. monthly) -- confirm against the input files.
    """
    feature_values = df_out.loc[:, df_out.columns != TARGET_COL].values
    target_values = df_out[TARGET_COL].values
    samples = []
    for start in range(max(len(df_out)-(n_back+n_fwd)+1, 0)):
        mid = start+n_back
        X = torch.tensor(feature_values[start:mid], dtype=torch.float)
        Xlog = torch.tensor(target_values[start:mid], dtype=torch.float)
        Y = torch.tensor(target_values[mid:mid+n_fwd], dtype=torch.float)
        samples.append((X, Xlog, Y))
    return samples


df_country = pd.read_csv(join(CountyDataset.COUNTRY_DIR, 'usa.csv'), dtype=np.float32, index_col='date', parse_dates=['date'])
df_country = df_country.drop(labels=FEDERAL_DROP_COLS, axis=1)
split_types = ['train', 'val', 'test']
datasets = [train_master, val_master, test_master]
xstats, ystats = None, None
for i, dataset in enumerate(datasets):
    print(f'getting data for {split_types[i]}')
    data = []
    for county_id, state_id in dataset:
        data.extend(_sliding_windows(_merged_frame(county_id, state_id, df_country), Tback, Tfwd))

    if i == 0:
        # Statistics come from the train split only and are reused for val/test.
        if not data:
            raise ValueError('no training windows were produced; cannot fit standardization stats')
        print('getting stats')
        xstats, ystats = get_stats(data)
    print(f'standardizing data for {split_types[i]}')
    data_standard = []
    for X, Xlog, Y in data:
        # NOTE(review): the last column of X is dropped here and replaced by the
        # log of the target history; X already excludes TARGET_COL, so confirm
        # that discarding that last feature column is intended.
        x_standard = (X[:, :-1]-xstats[0][:-1])/xstats[1][:-1]
        xlog = np.log(Xlog)
        x_standard = np.concatenate((x_standard, xlog[:, None]), axis=1)
        y_log = np.log(Y)
        data_standard.append((x_standard, y_log))
    print(f'saving {split_types[i]} data for {Tback} {Tfwd}')
    # Context manager closes the file even if pickling raises.
    with open(f'data/output/{split_types[i]}_{Tback}_{Tfwd}.pickle', 'wb') as target_file:
        pickle.dump(data_standard, target_file)

getting data for train
getting stats
standardizing data for train
saving train data for 24 6
getting data for val
standardizing data for val
saving val data for 24 6
getting data for test
standardizing data for test
saving test data for 24 6


In [142]:
# Read-only spot check: show the log-target of the last sample standardized by
# the split loop above.
#
# This cell used to re-fit the standardization statistics on whatever was left
# in `data` and write the result to val_{Tback}_{Tfwd}.pickle. When the notebook
# is run top to bottom, `data` holds the *test* split at this point (the loop
# visits train, val, test in that order), so the val pickle was overwritten with
# test samples scaled by test statistics, and the train-fitted `xstats` was
# clobbered. The split loop already writes the correct val file, so nothing is
# recomputed or written here.
_, y_log_last = data_standard[-1]
y_log_last

tensor([11.1678, 11.1677, 11.1668, 11.1665, 11.1662, 11.1664])