In [1]:
import pandas as pd
import numpy as np

In [2]:
# Folder that holds the input CSV and receives the exported arrays.
# The trailing separator is required: file names are appended to this string directly.
# Every backslash is escaped; the previous "\T" was an invalid escape sequence.
data_folder = "C:\\Users\\spars\\Documents\\Master\\JHU\\TML\\HomePriceBeastNew\\"

In [3]:
# Read the merged home-data time series; thousands="," makes values such as "1,234" parse as numbers.
merged_home_data_time_series = pd.read_csv(
    f"{data_folder}merged_home_data_time_series.csv",
    low_memory=False,
    thousands=',',
)

In [4]:
def transform_types_and_chunk(data, non_float_cols, date_col, dummy_cols,
                              test_start="2019-12-01", post_covid_start="2020-03-01"):
    """Coerce column types, one-hot encode, and split the frame into three date windows.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is copied first, so the caller's object is left untouched.
    non_float_cols : collection of str
        Column names that are NOT passed through ``pd.to_numeric``.
    date_col : str
        Column converted with ``pd.to_datetime`` and used for the split.
    dummy_cols : list of str
        Columns expanded with ``pd.get_dummies``.
    test_start : str or datetime-like, default "2019-12-01"
        First date (inclusive) of the test window; everything earlier is training data.
    post_covid_start : str or datetime-like, default "2020-03-01"
        First date (inclusive) of the post-covid window; end (exclusive) of the test window.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_frame, test_frame, post_covid_frame)``. Rows whose date is missing
        satisfy none of the comparisons and therefore appear in no frame.
    """
    # Work on a copy: assigning converted columns must not alter the caller's frame.
    data = data.copy()

    numeric_cols = [col for col in data.columns if col not in non_float_cols]
    for col in numeric_cols:
        data[col] = pd.to_numeric(data[col])

    data = pd.get_dummies(data, columns=dummy_cols)
    data[date_col] = pd.to_datetime(data[date_col])

    test_start = pd.Timestamp(test_start)
    post_covid_start = pd.Timestamp(post_covid_start)
    dates = data[date_col]

    train_frame = data[dates < test_start]
    test_frame = data[(dates >= test_start) & (dates < post_covid_start)]
    post_covid_frame = data[dates >= post_covid_start]

    return train_frame, test_frame, post_covid_frame

In [5]:
# Columns that are left out of the numeric conversion.
non_float_cols = ["state_code", "county_name", "period_begin"]

# Columns that are turned into indicator (dummy) columns.
dummy_cols = ["state_code"]

train_frame, test_frame, post_covid_frame = transform_types_and_chunk(
    data=merged_home_data_time_series,
    non_float_cols=non_float_cols,
    date_col="period_begin",
    dummy_cols=dummy_cols,
)

In [6]:
# Show the column index of the training frame (the cell's last expression is rendered as output).
train_frame.columns

Index(['county_name', 'period_begin', 'inventory', 'inventory_lag_1',
       'inventory_lag_2', 'inventory_lag_3', 'inventory_lag_4',
       'inventory_lag_5', 'inventory_lead_1', 'inventory_lead_2',
       'inventory_lead_3', 'median_sale_price', 'median_sale_price_lag_1',
       'median_sale_price_lag_2', 'median_sale_price_lag_3',
       'median_sale_price_lag_4', 'median_sale_price_lag_5',
       'R_INTERNATIONAL_MIG_2019', 'Unemployment_rate_2020',
       'PCT_COLL_4_2015_19', 'PCT_COLL_1TO3_2000', 'PCT_HSD_Only_2000',
       'R_NET_MIG_2019', 'Med_HH_Income_Percent_of_State_Total_2019',
       'GQ_ESTIMATES_2019', 'N_POP_CHG_2019', 'INTERNATIONAL_MIG_2019',
       'NET_MIG_2019', 'HSD_Only_2000', 'DOMESTIC_MIG_2019', 'RESIDUAL_2019',
       'Deaths_2019', 'COLL_4_2000', 'POP_ESTIMATE_2019', 'LT_HSD_2015_19',
       'COLL_1TO3_2000', 'Unemployed_2020', 'NATURAL_INC_2019',
       'GQ_ESTIMATES_BASE_2010', 'Employed_2020', 'LT_HSD_2000',
       'COLL_4_2015_19', 'HSD_Only_2015_19', 

In [7]:
def convert_frame_to_numpy(df, remove_cols, target_prefix, related_prefix, J, H):
    """Turn every row of ``df`` into one feature block and one target block.

    Column names are derived from the prefixes:
      * ``{target_prefix}_lag_J .. _lag_1``   -> first feature row
      * ``{related_prefix}_lag_J .. _lag_1``  -> second feature row
      * every remaining column not listed in ``remove_cols`` -> one feature row
        each, with the single value repeated ``J`` times
      * ``{target_prefix}`` followed by ``{target_prefix}_lead_1 .. _lead_H`` -> target

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; all derived column names must exist in it.
    remove_cols : list of str
        Columns excluded from the features.
    target_prefix, related_prefix : str
        Prefixes used to build the lag/lead column names.
    J : int
        Number of lag steps.
    H : int
        Number of lead steps.

    Returns
    -------
    (X, y)
        Both are ``np.array`` wrappers around the result of a row-wise ``df.apply``.
        Each row contributes a ``(2 + n_other_cols, J)`` feature block and a
        ``(1, H + 1)`` target block.
    """
    # Lags are ordered from the oldest (J) to the most recent (1).
    lag_steps = range(J, 0, -1)
    y_lag_cols = [f'{target_prefix}_lag_{step}' for step in lag_steps]
    x_rel_cols = [f'{related_prefix}_lag_{step}' for step in lag_steps]

    y_lead_cols = [target_prefix]
    y_lead_cols.extend(f'{target_prefix}_lead_{step}' for step in range(1, H + 1))

    # Anything that is neither a lag, a lead, nor explicitly removed is a per-row constant.
    excluded = y_lag_cols + y_lead_cols + x_rel_cols + remove_cols
    other_cols = [col for col in df.columns if col not in excluded]

    def build_features(row):
        target_lags = row[y_lag_cols].values
        related_lags = row[x_rel_cols].values
        # Repeat each constant value across the J lag positions -> (n_other_cols, J).
        constants = np.tile(row[other_cols].values, (J, 1)).T
        return np.vstack([target_lags, related_lags, constants])

    def build_target(row):
        # Single-row 2-D block: (1, H + 1).
        return row[y_lead_cols].values.reshape(1, -1)

    features = df.apply(build_features, axis=1)
    targets = df.apply(build_target, axis=1)

    return np.array(features), np.array(targets)
    
# Arguments passed to convert_frame_to_numpy.
remove_cols = ["county_name", "period_begin"]
target_prefix, related_prefix = 'inventory', 'median_sale_price'
# J: number of lag steps, H: number of lead steps.
J, H = 5, 3

In [8]:
# Per-row feature and target blocks for the training window.
X_train, y_train = convert_frame_to_numpy(
    df=train_frame,
    remove_cols=remove_cols,
    target_prefix=target_prefix,
    related_prefix=related_prefix,
    J=J,
    H=H,
)

In [9]:
# Per-row feature and target blocks for the test window.
X_test, y_test = convert_frame_to_numpy(
    df=test_frame,
    remove_cols=remove_cols,
    target_prefix=target_prefix,
    related_prefix=related_prefix,
    J=J,
    H=H,
)

In [10]:
def chunks(a, size):
    """Yield consecutive lists of up to ``size`` items taken from the iterable ``a``.

    The final list may be shorter when the items run out. Every yielded list
    holds at least one item, so a ``size`` below 1 produces single-item lists.
    """
    source = iter(a)
    while True:
        try:
            batch = [next(source)]
        except StopIteration:
            return
        # range() comes first in zip so that no item is pulled from `source`
        # (and lost) once the batch is already full.
        batch.extend(item for _, item in zip(range(size - 1), source))
        yield batch

In [11]:
# Every entry of X_train / X_test is one 2-D feature block. Stack the blocks along a
# new leading axis -> (n_rows, n_feature_rows, n_lags).
# np.stack takes the block height from the data itself, so the result no longer
# depends on a hard-coded chunk size matching the number of feature rows.
X_train_stack = np.stack(list(X_train))
X_test_stack = np.stack(list(X_test))

# Targets: one (1, n_targets) block per row -> (n_rows, n_targets, 1).
y_train_stack = np.expand_dims(np.vstack(y_train), axis=2)
y_test_stack = np.expand_dims(np.vstack(y_test), axis=2)

In [12]:
# Reorder axes: features go from (a0, a1, a2) to (a2, a0, a1); targets swap their first two axes.
X_train_swap = np.array(X_train_stack).transpose(2, 0, 1)
X_test_swap = np.array(X_test_stack).transpose(2, 0, 1)
y_train_swap = np.array(y_train_stack).transpose(1, 0, 2)
y_test_swap = np.array(y_test_stack).transpose(1, 0, 2)

In [14]:
# Write the four arrays back-to-back into one file. They have to be read back with
# successive np.load calls on the same open file handle, in this order:
# X_train, y_train, X_test, y_test.
arrays_in_order = (X_train_swap, y_train_swap, X_test_swap, y_test_swap)
with open(f"{data_folder}all_model_data.npy", 'wb') as f:
    for array in arrays_in_order:
        np.save(f, array.astype(float))