In [1]:
import numpy as np
import random
import os, errno
import sys
from tqdm import trange
from copy import deepcopy

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import matplotlib.pyplot as plt

import pandas as pd
import math

In [2]:
# Root folder that every CSV / .npy path in this notebook is built from.
# Every backslash is doubled: the previous literal contained "\T", which is not a
# valid escape sequence and triggers a SyntaxWarning on recent Python versions.
# The resulting path string is unchanged.
data_folder = "C:\\Users\\spars\\Documents\\Master\\JHU\\TML\\HomePriceBeastNew\\"

In [3]:
def convert_frame_to_numpy(df, remove_cols, target_prefix, related_prefixes, J,H):
    """Convert every row of a lag/lead feature frame into numpy feature/target/label blocks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample. It must contain ``{target_prefix}_lag_1..J``,
        ``target_prefix``, ``{target_prefix}_lead_1..H``, the columns derived
        from ``related_prefixes`` (see below) and every name in ``remove_cols``.
    remove_cols : list of str
        Columns kept out of the features and returned as the per-row label.
    target_prefix : str
        Name of the target column; its lag/lead columns share this prefix.
    related_prefixes : list of str
        Prefixes of additional per-step series. ``"inventory"`` contributes
        ``inventory_lag_1``, ``inventory`` and ``inventory_lead_1..H`` (H + 2
        values); any other prefix contributes ``{prefix}_lag_J..lag_1``.
    J : int
        Number of lag steps, i.e. the width of every feature row.
    H : int
        Number of lead steps.

    Returns
    -------
    X, y, label : numpy.ndarray
        ``np.array(df.apply(..., axis=1))`` for each of the three row builders,
        so one entry per row of ``df``:
        X entry     -> 2-D block of shape (1 + len(related_prefixes) + n_other, J)
        y entry     -> block of shape (1, H + 1): current target then its leads
        label entry -> block of shape (1, len(remove_cols))

    Raises
    ------
    ValueError
        If a related prefix does not contribute exactly ``J`` values per row
        (for ``"inventory"`` this means ``H + 2 != J``); the feature rows could
        not be stacked in that case.

    Notes
    -----
    Prints the number and names of the remaining ("other") columns, which are
    treated as static features and repeated across all ``J`` steps.
    """
    # Target history ordered oldest -> newest (lag_J ... lag_1).
    y_lag_cols = [f'{target_prefix}_lag_{j}' for j in range(J,0,-1)]
    # Current target value followed by its H lead values.
    y_lead_cols = [target_prefix] + [f'{target_prefix}_lead_{h}' for h in range(1,H+1,1)]

    x_rel_cols = []
    for related_prefix in related_prefixes:
        if related_prefix == "inventory":
            curr_prefix = [f'{related_prefix}_lag_1'] + [related_prefix] + [f'{related_prefix}_lead_{h}' for h in range(1,H+1,1)]
        else:
            curr_prefix = [f'{related_prefix}_lag_{j}' for j in range(J,0,-1)]
        x_rel_cols.append(curr_prefix)

    # Build the exclusion set once instead of re-concatenating lists per column.
    excluded = set(y_lag_cols) | set(y_lead_cols) | set(remove_cols) | set(related_prefixes)
    for rel_cols in x_rel_cols:
        excluded.update(rel_cols)
    other_cols = [col for col in df.columns if col not in excluded]

    print(f"Length of other columns = {len(other_cols)}")
    print(other_cols)

    # All feature rows are stacked along axis 0, so each must be exactly J wide.
    for related_prefix, rel_cols in zip(related_prefixes, x_rel_cols):
        if len(rel_cols) != J:
            raise ValueError(
                f"related prefix {related_prefix!r} contributes {len(rel_cols)} values per row "
                f"but every feature row must have J={J} values"
            )

    def get_label_row(row):
        # Identifier values of this row as a (1, len(remove_cols)) block.
        return np.array([row[remove_cols].values])

    def get_xvec_row(row):
        # Row 0: target lags; next rows: one per related prefix.
        blocks = [np.array([row[y_lag_cols].values])]
        blocks.extend(np.array([row[rel_cols].values]) for rel_cols in x_rel_cols)

        # Static features: repeat each value J times -> (len(other_cols), J).
        static_block = np.transpose(np.tile(row[other_cols].values, [J, 1]))
        blocks.append(static_block)

        # Single concatenate instead of one array copy per appended row.
        return np.concatenate(blocks, axis=0)

    def get_yvec_row(row):
        # Current target and its leads as a (1, H + 1) block.
        return np.array([row[y_lead_cols].values])

    X = np.array(df.apply(get_xvec_row, axis = 1))
    y = np.array(df.apply(get_yvec_row, axis = 1))
    label = np.array(df.apply(get_label_row, axis = 1))

    return X,y, label
    
# Identifier columns: excluded from the feature matrix and returned as labels.
remove_cols = ["county_name", "period_begin", "state_code"]
# Target column; its *_lag_* / *_lead_* columns share this prefix.
target_prefix = 'median_sale_price'
# Prefixes handed to convert_frame_to_numpy as `related_prefixes`.
related_prefix = ['inventory', 'week_num','month','week_offset']
# Window sizes: J lag steps, H lead steps.
J, H = 5, 3

In [4]:
# Predicted inventory for the post-COVID frame: parse the period start as a
# datetime and discard the "Unnamed: 0" column (presumably a saved index).
post_covid_predicted_df_inv = (
    pd.read_csv(f"{data_folder}post_covid_predicted_df_inv.csv")
    .assign(period_begin=lambda frame: pd.to_datetime(frame['period_begin']))
    .drop(columns=["Unnamed: 0"])
)

# NOTE(review): the file name starts with "ptest_" and, unlike the frame above,
# "Unnamed: 0" is not dropped here — confirm both are intentional.
test_predicted_df_inv = (
    pd.read_csv(f"{data_folder}ptest_predicted_df_inv.csv")
    .assign(period_begin=lambda frame: pd.to_datetime(frame['period_begin']))
)

In [5]:
# The inventory column is read as text: strip a trailing "]" and a leading "[",
# turn the text "nan" into "0", then convert the cleaned text to numbers.
# NOTE: this expects string values, so re-running the cell after the column has
# become numeric fails.
post_covid_predicted_df_inv['inventory'] = pd.to_numeric(
    post_covid_predicted_df_inv['inventory'].map(
        lambda raw: raw.rstrip("]").lstrip("[").replace("nan","0")
    )
)

In [6]:
def get_lead_lag_features(feature_df,L,colname,time_feature,lag=True):
    """Append L shifted copies of `colname`, computed within each group.

    Every column other than `colname` and `time_feature` is used as a grouping
    key (and printed). The frame is sorted by those keys and then by
    `time_feature`, re-indexed from 0, and for each step 1..L a column named
    ``{colname}_lag_{step}`` (or ``{colname}_lead_{step}`` when ``lag`` is
    False) is added holding the value `step` rows earlier (later) in the same
    group. Positions with no such row are filled with 0.

    Returns the sorted frame with the new columns concatenated on the right.
    """
    # Lags look backwards (positive shift); leads look forwards (negative shift).
    direction, sign = ("lag", 1) if lag else ("lead", -1)

    group_keys = [name for name in feature_df.columns if name != colname and name != time_feature]
    print(group_keys)

    ordered = feature_df.sort_values(group_keys + [time_feature]).reset_index(drop=True)

    shifted = pd.DataFrame({
        f"{colname}_{direction}_{step}": ordered.groupby(group_keys)[colname].shift(step * sign)
        for step in range(1, L + 1)
    })
    shifted = shifted.fillna(0)

    return pd.concat([ordered, shifted], axis=1)

In [7]:
def get_lag_features(feature_df, J, colname, time_feature):
    """Add J lag columns of `colname` by delegating to get_lead_lag_features (lag=True)."""
    return get_lead_lag_features(
        feature_df, L=J, colname=colname, time_feature=time_feature, lag=True
    )

def get_lead_features(feature_df, H, colname, time_feature):
    """Add H lead columns of `colname` by delegating to get_lead_lag_features (lag=False)."""
    return get_lead_lag_features(
        feature_df, L=H, colname=colname, time_feature=time_feature, lag=False
    )

In [8]:
# Column roles and window sizes for the inventory lag/lead expansion.
dt_col = "period_begin"
y = "inventory"
H = 3  # lead steps
J = 5  # lag steps

# Both helpers sort and re-index the frame the same way, so their rows line up.
y_feature_df_lag = get_lag_features(post_covid_predicted_df_inv, J, y, dt_col)
y_feature_df_lead = get_lead_features(post_covid_predicted_df_inv, H, y, dt_col)

# Keep only the columns the lead frame adds on top of the lag frame.
y_feature_df_lead = y_feature_df_lead.loc[
    :, ~y_feature_df_lead.columns.isin(y_feature_df_lag.columns)
]

y_feature_df = pd.concat([y_feature_df_lag, y_feature_df_lead], axis=1)

# NOTE: this rebinds the input name. Re-running the cell would feed the new
# lag/lead columns back in as grouping keys, so run it once per kernel session.
post_covid_predicted_df_inv = y_feature_df

['state_code', 'county_name']
['state_code', 'county_name']


In [9]:
def chunks(a, size):
    """Yield consecutive lists of up to `size` items taken from iterable `a`.

    The last list may be shorter. Each list always holds at least one item, so
    a `size` of 1 or less yields single-item lists.
    """
    stream = iter(a)
    for head in stream:
        # zip() asks range() first, so once size - 1 extra items were taken it
        # stops without pulling (and losing) another item from the stream.
        rest = [item for _, item in zip(range(size - 1), stream)]
        yield [head] + rest

In [10]:
# Load the two sale-model frames from CSV files under data_folder.
post_covid_frame = pd.read_csv(f"{data_folder}post_covid_frame_df_sale_model.csv")
# NOTE(review): test_frame is not referenced again in the cells visible here — confirm it is still needed.
test_frame = pd.read_csv(f"{data_folder}test_frame_df_sale_model.csv")

In [11]:
# Names of all columns of the sale-model frame that contain "inventory".
del_cols = list(filter(lambda name: "inventory" in name, post_covid_frame.columns))
del_cols

['inventory',
 'inventory_lag_1',
 'inventory_lag_2',
 'inventory_lag_3',
 'inventory_lag_4',
 'inventory_lag_5',
 'inventory_lead_1',
 'inventory_lead_2',
 'inventory_lead_3']

In [12]:
# Counterfactual base frame: the sale-model frame without any inventory
# columns, with period_begin parsed as datetime and "Unnamed: 0" removed.
post_covid_counterfactual_frame = (
    post_covid_frame
    .drop(columns=del_cols)
    .assign(period_begin=lambda frame: pd.to_datetime(frame['period_begin']))
    .drop(columns=["Unnamed: 0"])
)

In [13]:
# Attach the predicted inventory columns. No `on=` is given, so pandas joins on
# every column name the two frames share; rows without a match are dropped.
post_covid_counterfactual_frame = post_covid_counterfactual_frame.merge(
    post_covid_predicted_df_inv, how="inner"
)

In [14]:
# Per-row feature blocks, target blocks and identifier labels for the merged frame.
(
    X_post_covid_counterfactual,
    y_post_covid_expected,
    post_covid_label_counterfactual,
) = convert_frame_to_numpy(
    df=post_covid_counterfactual_frame,
    remove_cols=remove_cols,
    target_prefix=target_prefix,
    related_prefixes=related_prefix,
    J=J,
    H=H,
)

Length of other columns = 86
['R_INTERNATIONAL_MIG_2019', 'Unemployment_rate_2020', 'PCT_COLL_4_2015_19', 'PCT_COLL_1TO3_2000', 'PCT_HSD_Only_2000', 'R_NET_MIG_2019', 'Med_HH_Income_Percent_of_State_Total_2019', 'GQ_ESTIMATES_2019', 'N_POP_CHG_2019', 'INTERNATIONAL_MIG_2019', 'NET_MIG_2019', 'HSD_Only_2000', 'DOMESTIC_MIG_2019', 'RESIDUAL_2019', 'Deaths_2019', 'COLL_4_2000', 'POP_ESTIMATE_2019', 'LT_HSD_2015_19', 'COLL_1TO3_2000', 'Unemployed_2020', 'NATURAL_INC_2019', 'GQ_ESTIMATES_BASE_2010', 'Employed_2020', 'LT_HSD_2000', 'COLL_4_2015_19', 'HSD_Only_2015_19', 'COLL_1TO3_2015_19', 'Civilian_labor_force_2020', 'CENSUS_2010_POP', 'PCT_LT_HSD_2000', 'R_birth_2019', 'PCT_COLL_1TO3_2015_19', 'PCT_COLL_4_2000', 'Economic_typology_2015', 'R_death_2019', 'state_code_dummy_AK', 'state_code_dummy_AL', 'state_code_dummy_AR', 'state_code_dummy_AZ', 'state_code_dummy_CA', 'state_code_dummy_CO', 'state_code_dummy_CT', 'state_code_dummy_DC', 'state_code_dummy_DE', 'state_code_dummy_FL', 'state_cod

In [15]:
# Number of rows in one sample's feature block, read off the first sample.
stack_range = np.array(X_post_covid_counterfactual[0]).shape[0]

# Stack all samples' rows into one tall array, then regroup them into
# per-sample lists of `stack_range` rows.
X_post_covid_counterfactual_stack = list(
    chunks(np.vstack(X_post_covid_counterfactual), stack_range)
)
# Labels / targets: stack the per-row blocks and add a trailing unit axis.
post_covid_label_counterfactual_stack = np.vstack(post_covid_label_counterfactual)[:, :, np.newaxis]
y_post_covid_expected_stack = np.vstack(y_post_covid_expected)[:, :, np.newaxis]

# Features: axes (0, 1, 2) -> (2, 0, 1), i.e. the last axis moves to the front.
X_post_covid_counterfactual_swap = np.transpose(
    np.array(X_post_covid_counterfactual_stack), (2, 0, 1)
)
# Labels / targets: exchange the first two axes.
post_covid_label_counterfactual_swap = np.swapaxes(
    np.array(post_covid_label_counterfactual_stack), 0, 1
)
y_post_covid_expected_swap = np.swapaxes(np.array(y_post_covid_expected_stack), 0, 1)

In [16]:
# Features and targets share one file: two consecutive np.save calls, so a
# reader has to call np.load twice on the same handle (features first).
with open(f"{data_folder}post_covid_counterfactual_swap.npy", 'wb') as f:
    np.save(f, X_post_covid_counterfactual_swap.astype(float))
    # Targets are written in their axis-swapped form, like the features and the
    # labels below. NOTE(review): a loader written against the un-swapped
    # y_post_covid_expected_stack layout must be updated accordingly.
    np.save(f, y_post_covid_expected_swap.astype(float))

# Labels are saved without a float cast.
# NOTE(review): they presumably hold strings/timestamps (object dtype), in which
# case np.load needs allow_pickle=True — confirm against the loader.
with open(f"{data_folder}post_covid_label_counterfactual_swap.npy", 'wb') as f:
    np.save(f, post_covid_label_counterfactual_swap)

In [None]:
# (left empty on purpose: the bare name `s` that was here is not defined by any
# cell shown above and would raise NameError on "Restart & Run All")