In [30]:
import os
import pickle

import numpy as np
import pandas as pd

In [2]:
# List the contents of the DATA directory to confirm the input file is present.
ls DATA

Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_mon.csv


# Data Import

In [31]:
# Load the ZIP-level home-value table: one row per region, one column per month.
zhvi_csv = 'DATA/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_mon.csv'
df = pd.read_csv(zhvi_csv)
df.head(3)

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,1996-01-31,...,2019-11-30,2019-12-31,2020-01-31,2020-02-29,2020-03-31,2020-04-30,2020-05-31,2020-06-30,2020-07-31,2020-08-31
0,61639,0,10025,Zip,NY,NY,New York,New York-Newark-Jersey City,New York County,245762.0,...,1292776.0,1288753.0,1269532.0,1243884.0,1211977.0,1197322.0,1185428.0,1179938.0,1175379.0,1173231.0
1,84654,1,60657,Zip,IL,IL,Chicago,Chicago-Naperville-Elgin,Cook County,209547.0,...,487111.0,486300.0,486154.0,487283.0,488823.0,489789.0,489865.0,490118.0,491195.0,493022.0
2,61637,2,10023,Zip,NY,NY,New York,New York-Newark-Jersey City,New York County,230594.0,...,1080810.0,1099111.0,1117633.0,1130101.0,1129983.0,1138594.0,1143043.0,1147409.0,1149477.0,1155724.0


The dataframe is in wide format: one row per region and one column per month. I want it to have a column per region and a row per timestamp. I will also filter the locations down to include only New York. First, let's check whether our regions are unique.

## Export Metadata
Let's separate out the region meta info so we can reference it later.

In [32]:
# The nine leading columns describe each region; the monthly values start after them.
meta = df.iloc[:, :9]

In [33]:
# Make sure the pickle output directory exists before anything is written to it.
# exist_ok=True turns this into a no-op when the directory is already there, so
# the cell is safe on a fresh checkout and on every re-run.
os.makedirs('PKL', exist_ok=True)

In [34]:
# Persist the region metadata so it can be reloaded later.
with open('PKL/meta.pkl', mode='wb') as meta_file:
    pickle.dump(meta, meta_file)

## Subset
Subsetting to only Queens for now.

In [35]:
# Keep only the rows whose county is Queens County.
queens = df.loc[df['CountyName'] == 'Queens County']

In [36]:
# Names of the nine leading descriptive columns.
meta_cols = df.columns[:9].tolist()

In [37]:
# RegionName must survive the upcoming drop, so take it out of the drop list.
# The list is rebuilt rather than mutated with .remove(), which raised
# ValueError when this cell was executed a second time.
meta_cols = [col for col in meta_cols if col != 'RegionName']

In [38]:
# Remove every descriptive column listed in meta_cols.
queens = queens.drop(columns=meta_cols)

In [39]:
# (rows, columns) of the Queens subset after the descriptive columns were dropped.
queens.shape

(54, 297)

In [40]:
# Label of the right-most column that still contains at least one missing value.
queens.columns[queens.isna().any(axis=0)][-1]

'2003-08-31'

It seems we have complete data for all Queens ZIP codes starting from September 2003. Let's cap it at that.

In [41]:
# Discard every column that has a missing value in any row.
queens = queens.dropna(axis='columns')

## Percentage Increase
We are trying to find the best neighborhood to invest in. We can approach this in different ways: I can predict the housing price for the coming year and then calculate the difference, OR I can predict the percent increase for each time point. I'll try the percent increase first.

In [54]:
def calculate_percent_increase(x1, x2):
    """Return the change from x1 to x2 expressed as a percentage of x1.

    Works on anything that supports subtraction, division and multiplication,
    so scalars and pandas objects are handled element-wise. A negative result
    means x2 is smaller than x1.
    """
    change = x2 - x1
    return change / x1 * 100

In [55]:
# Month-over-month percent change for every month, computed in one vectorized
# step instead of overwriting columns one at a time in reverse order.
# Column 0 is RegionName and column 1 is the first month (the baseline, which
# has no earlier month to compare with); both are carried over unchanged, so
# the result has exactly the same columns as `queens`.
prices = queens.iloc[:, 1:]
previous_month = prices.iloc[:, :-1].values  # plain array: paired by position, not by label
current_month = prices.iloc[:, 1:]
pct_change = calculate_percent_increase(previous_month, current_month)
queens_p = pd.concat([queens.iloc[:, :2], pct_change], axis=1)

In [56]:
# Drop the baseline month: it still holds raw prices because there is no
# earlier month to compare it with. The label is read from `queens` (column 0
# is RegionName, column 1 is the first month) instead of being hard-coded, so
# the cell keeps working when the source CSV is refreshed.
baseline_month = queens.columns[1]
queens_p = queens_p.drop(columns=[baseline_month])

In [57]:
# Preview the first two regions to sanity-check the percent-change values.
queens_p.head(2)

Unnamed: 0,RegionName,2003-10-31,2003-11-30,2003-12-31,2004-01-31,2004-02-29,2004-03-31,2004-04-30,2004-05-31,2004-06-30,...,2019-11-30,2019-12-31,2020-01-31,2020-02-29,2020-03-31,2020-04-30,2020-05-31,2020-06-30,2020-07-31,2020-08-31
20,11375,0.641106,0.913096,0.393119,0.631139,0.435425,0.625393,0.455833,0.666596,1.167856,...,-0.235229,0.303563,-0.195293,0.024489,-0.426299,-0.05689,-0.313316,0.178322,0.533047,0.64746
108,11377,0.025764,0.023655,0.921259,1.147181,1.159911,0.756265,0.799079,0.995691,1.538103,...,0.310192,0.464618,1.262602,1.144025,1.365576,1.264332,0.898478,0.54954,0.489825,0.827001


## Transpose
Now I'll transpose the dataframe so that each region becomes a column and each timestamp becomes a row.

In [59]:
# Key the rows by ZIP code (RegionName), then flip the axes so that dates
# become the row index and ZIP codes become the columns.
queens_p = queens_p.set_index('RegionName').T

## Fix Datetime
Let's parse the index as datetimes and then reformat the labels as `MM/YYYY` strings.

In [60]:
# Parse the date labels in the index into a DatetimeIndex.
queens_p.index = pd.to_datetime(queens_p.index)

In [61]:
# Reformat each timestamp as 'MM/YYYY' (e.g. '10/2003').
# NOTE(review): strftime yields string labels, so after this cell the index is
# no longer a DatetimeIndex and sorts lexicographically rather than by date -
# confirm that downstream consumers expect strings.
queens_p.index = queens_p.index.strftime('%m/%Y')

In [62]:
# Clear the name attached to the columns axis.
queens_p = queens_p.rename_axis(None, axis='columns')

## Exporting 
Now let's export the dataframe.

In [63]:
# Persist the percent-change time series as a pickle.
with open('PKL/timeseries_queens_p.pkl', mode='wb') as series_file:
    pickle.dump(queens_p, series_file)

In [64]:
# Also write the series as CSV; column headers are written by default.
queens_p.to_csv('DATA/timeseries_queens_p.csv')

In [65]:
# Summary statistics (count, mean, std, min, quartiles, max) of the monthly
# percent changes, one column per ZIP code.
queens_p.describe()

Unnamed: 0,11375,11377,11355,11385,11373,11372,11101,11368,11354,11374,...,11411,11426,11428,11693,11004,11416,11436,11366,11363,11430
count,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,...,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0
mean,0.346812,0.397997,0.377397,0.310279,0.316201,0.381431,0.533002,0.39144,0.360144,0.353884,...,0.273891,0.288595,0.231554,0.308075,0.197474,0.254911,0.311145,0.345299,0.196537,0.19916
std,0.669167,0.771704,0.665156,0.662708,0.782095,0.85691,0.785882,0.769664,0.670916,0.852044,...,0.736335,0.68583,0.734456,1.031377,0.705749,0.836916,0.816727,0.649757,0.620341,0.6596
min,-1.990846,-1.625696,-1.990776,-1.285888,-2.007884,-1.962546,-1.727259,-1.893513,-1.934911,-2.347053,...,-1.466769,-1.8406,-1.688152,-2.107444,-1.71441,-2.208515,-1.618457,-1.945184,-1.875439,-1.513223
25%,-0.054836,-0.131548,-0.038391,-0.110817,-0.182179,-0.212565,-0.004947,-0.036683,0.062209,-0.276091,...,-0.103728,-0.060181,-0.191193,-0.470629,-0.231815,-0.119193,-0.195303,-0.080522,-0.189808,-0.192352
50%,0.448977,0.440358,0.404875,0.341031,0.453339,0.501822,0.497912,0.438233,0.375084,0.4402,...,0.32074,0.372918,0.340239,0.329711,0.220565,0.357698,0.435287,0.421789,0.237238,0.299354
75%,0.761018,0.931774,0.806497,0.835186,0.808302,0.949512,0.99899,0.867516,0.804775,0.885384,...,0.757962,0.703217,0.642857,1.05963,0.668469,0.788138,0.865981,0.827332,0.580623,0.594032
max,1.910349,2.621683,1.974653,1.749658,2.06073,2.241417,2.578972,2.030631,2.003723,2.102992,...,2.192268,1.978977,2.031019,2.975201,1.837059,2.238624,2.114471,1.867921,1.884116,1.994809
