In [None]:
import os
import re

import pandas as pd
import numpy as np
import altair as alt
import datetime as dt

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose, DecomposeResult

In [None]:
# Collect the input files for both data sources.
# os.listdir returns entries in arbitrary order, so the listings are sorted to keep
# positional lookups such as msa_dfs[0] stable from one run to the next.
msa_files = [os.path.join("msa", file_path) for file_path in sorted(os.listdir('msa'))]
ZHVI_files = [os.path.join('zillow', file_path) for file_path in sorted(os.listdir('zillow'))]

In [None]:
# Load every permit workbook: skip the first seven rows of each sheet, then drop
# the first row that remains after parsing.
msa_dfs = []
for workbook_path in msa_files:
    workbook_df = pd.read_excel(workbook_path, skiprows=7)
    msa_dfs.append(workbook_df[1:])

print(msa_files)

In [None]:
# Show the raw column names before narrowing them down.
print(msa_dfs[0].columns)

# Columns carried forward from every permit workbook.
msa_cols = [
    'CSA',
    'CBSA',
    'Name',
    'Total',
    '1 Unit',
    '2 Units',
    '3 and 4 Units',
    '5 Units or More',
    'Num of Structures With 5 Units or More',
]

selected_frames = []
for permit_frame in msa_dfs:
    selected_frames.append(permit_frame[msa_cols])
msa_dfs = selected_frames

msa_dfs[0].head()

In [None]:
# Re-check the first permit frame's columns after the column selection in the previous cell.
print(msa_dfs[0].columns)

In [None]:
# Tag each permit frame with the token found between the last underscore and the
# extension of its file name (it is parsed as year + month further down).
# - assign() builds new frames instead of writing into the column subsets created
#   earlier, which avoids pandas' SettingWithCopyWarning.
# - The token is taken from the base name, so an underscore or dot in a directory
#   name cannot shift the slice.
msa_dfs = [
    df.assign(Date=os.path.splitext(os.path.basename(name))[0].rsplit('_', 1)[-1])
    for df, name in zip(msa_dfs, msa_files)
]

# Preview the last frame explicitly rather than through a leaked loop variable.
msa_dfs[-1].head()

In [None]:
# Stack all permit frames and drop every column that contains a missing value.
permits_df = pd.concat(msa_dfs).dropna(axis='columns')

# Trim the MSA names and split the date token into integer year / month parts
# (first four characters -> Year, remainder -> Month).
permits_df = permits_df.assign(
    Name=permits_df['Name'].map(lambda raw_name: raw_name.strip()),
    Year=permits_df['Date'].map(lambda stamp: int(stamp[:4])),
    Month=permits_df['Date'].map(lambda stamp: int(stamp[4:])),
)

permits_df.head()

In [None]:
# Read each Zillow CSV and keep only the rows whose RegionType is 'msa'.
ZHVI_dfs = []
for zillow_path in ZHVI_files:
    zillow_frame = pd.read_csv(zillow_path)
    is_msa_row = zillow_frame['RegionType'] == 'msa'
    ZHVI_dfs.append(zillow_frame[is_msa_row])

ZHVI_dfs[0].head()

In [None]:
# Record which file each Zillow frame came from: the base file name without its
# extension.
# - os.path is used instead of searching for '/', because os.path.join produces
#   backslash-separated paths on Windows.
# - assign() builds new frames instead of writing into the filtered slices created
#   earlier, which avoids pandas' SettingWithCopyWarning.
ZHVI_dfs = [
    df.assign(filename=os.path.splitext(os.path.basename(f))[0])
    for df, f in zip(ZHVI_dfs, ZHVI_files)
]

# Preview the last frame explicitly rather than through a leaked loop variable.
ZHVI_dfs[-1].head()

In [None]:
# Stack the per-file Zillow frames (already filtered to RegionType == 'msa') into one wide frame.
ZHVI_df = pd.concat(ZHVI_dfs)

In [None]:
# List the columns of the combined Zillow frame.
print(ZHVI_df.columns)

In [None]:
# Preview the first rows of the combined Zillow frame.
ZHVI_df.head()

In [None]:
# Rebuild the wide frame from the per-file frames so this cell can be re-run on its own.
ZHVI_df = pd.concat(ZHVI_dfs)

# Partition the columns: names that start with an ISO date hold prices, the rest are identifiers.
date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}')
date_columns = []
other_columns = []
for column in ZHVI_df.columns:
    if date_pattern.match(column):
        date_columns.append(column)
    else:
        other_columns.append(column)

# Wide -> long: one row per identifier combination and date, with the value in 'Price'.
ZHVI_df = pd.melt(
    ZHVI_df,
    id_vars=other_columns,
    value_vars=date_columns,
    var_name='Date',
    value_name='Price',
)

# Rows with a missing Price are kept; nothing is dropped here.
date_parts = ZHVI_df['Date'].map(lambda stamp: stamp.split("-"))
ZHVI_df['Year'] = date_parts.map(lambda parts: int(parts[0]))
ZHVI_df['Month'] = date_parts.map(lambda parts: int(parts[1]))

ZHVI_df.head(10)

In [None]:
# Distinct MSA names on each side.
zhvi_msa_set = set(ZHVI_df['RegionName'].unique())
permit_msa_set = set(permits_df['Name'].unique())

# Names spelled identically in both sources, and the leftovers on each side.
msa_intersection = zhvi_msa_set & permit_msa_set
unmatched_zillow_msas = zhvi_msa_set - permit_msa_set
unmatched_permit_msas = permit_msa_set - zhvi_msa_set

# Set of tuples of form (ZHVI msa, Census MSA)
msa_matches = {(shared_name, shared_name) for shared_name in msa_intersection}

match_summary = (
    f"Matched: {len(msa_matches)}, Unmatched Zillow: {len(unmatched_zillow_msas)}, "
    f"Unmatched Census {len(unmatched_permit_msas)}"
)
print(match_summary)

In [None]:
# Second pass: pair each still-unmatched Zillow name with the first unmatched
# Census name that contains both its city part and its state part as substrings.
# Both collections are walked in sorted order so the "first" hit -- and therefore
# the saved match table -- is identical on every run; iteration order over a set
# of strings is not.
# NOTE(review): substring matching is loose (a short city name can occur inside a
# longer Census name) -- review the resulting pairs by hand.
left_matches = set()
permit_candidates = sorted(unmatched_permit_msas)
for uzm in sorted(unmatched_zillow_msas):
    # Split on the last ", " so an extra comma in the name cannot break unpacking.
    city, separator, state = uzm.rpartition(", ")
    if not separator:
        # The name has no ", " to split on; leave it unmatched.
        continue
    for permit_msa in permit_candidates:
        if city in permit_msa and state in permit_msa:
            left_matches.add((uzm, permit_msa))
            break

# Build the frames from sorted lists so their row order is reproducible as well.
left_match_df = pd.DataFrame(sorted(left_matches), columns=['Zillow', 'MSA'])
auto_match_df = pd.DataFrame(sorted(msa_matches), columns=['Zillow', 'MSA'])

msa_match_df = pd.concat([left_match_df, auto_match_df]).sort_values(['MSA', 'Zillow'])
msa_match_df.head()

In [None]:
# Save the Zillow-to-Census name pairs to msa_match.csv without the row index.
msa_match_df.to_csv('msa_match.csv', index=False)

In [None]:
# Names that made it into the match table, on each side.
zillow_msa_matches = set(msa_match_df['Zillow'].unique())
permit_msa_matches = set(msa_match_df['MSA'].unique())

# Whatever is still missing from the match table after both matching passes.
unmatched_zillow_msas = zhvi_msa_set - zillow_msa_matches
unmatched_permit_msas = permit_msa_set - permit_msa_matches

remaining_summary = (
    f"Matched: {len(msa_match_df)}, Unmatched Zillow: {len(unmatched_zillow_msas)}, "
    f"Unmatched Census {len(unmatched_permit_msas)}"
)
print(remaining_summary)

In [None]:
# Step 1: attach the permit rows to every matched name pair (pairs without permits are kept).
permits_matched = msa_match_df.merge(permits_df, how='left', left_on='MSA', right_on='Name')

# Step 2: keep only the rows that also have a Zillow record for the same region, year and month.
big_df = permits_matched.merge(
    ZHVI_df,
    how='inner',
    left_on=['Zillow', 'Year', 'Month'],
    right_on=['RegionName', 'Year', 'Month'],
)

# Parse the Zillow date string (Date_y) into a datetime column, then discard the
# join helpers and duplicated identifiers.
drop_columns = ['Name', 'RegionID', 'RegionName', 'RegionType',  'SizeRank', 'Date_x', 'Date_y', 'StateName']
big_df = big_df.assign(Date=pd.to_datetime(big_df['Date_y'])).drop(columns=drop_columns)

big_df.head()

In [None]:
# Inspect the column dtypes of the merged frame (the shape check is left disabled).
# big_df.shape
big_df.dtypes

In [None]:
# Preview the first rows of the merged permit/price frame.
big_df.head()

In [None]:
# Decompose each (filename, Zillow) price series into seasonal / trend / residual
# parts and add month-over-month changes.
# The per-group frames are collected in a list and concatenated once at the end;
# concatenating inside the loop re-copies the accumulated frame on every pass.
group_frames = []

for _group_key, seasonal_df in big_df.set_index('Date').sort_index().groupby(['filename', 'Zillow']):
    # Work on a copy so the columns added below never write into groupby's view of big_df.
    seasonal_df = seasonal_df.copy()

    try:
        decompose_result = seasonal_decompose(seasonal_df.Price.dropna())
    except ValueError:
        # The series could not be decomposed. Fill with NaN rather than None so the
        # columns stay numeric and line up with the successfully decomposed groups.
        seasonal_df['Seasonal'] = np.nan
        seasonal_df['Trend'] = np.nan
        seasonal_df['Residual'] = np.nan
    else:
        # Components are indexed by Date; rows whose Price was missing get NaN.
        seasonal_df = (
            seasonal_df
            .join(decompose_result.seasonal)
            .join(decompose_result.trend)
            .join(decompose_result.resid)
            .rename(columns={'seasonal': 'Seasonal', 'trend': 'Trend', 'resid': 'Residual'})
        )

    # Row-over-row differences within the group (rows are sorted by Date).
    seasonal_df['Price Change'] = seasonal_df['Price'].diff(periods=1)
    seasonal_df['Trend Change'] = seasonal_df['Trend'].diff(periods=1)

    # Turn Date back into a regular column; seasonal_df keeps the last group after the loop.
    seasonal_df = seasonal_df.reset_index()
    group_frames.append(seasonal_df)

# pd.concat rejects an empty list, so fall back to an empty frame when there are no groups.
trend_df = pd.concat(group_frames) if group_frames else pd.DataFrame()

trend_df.head()

In [None]:
# Last ten rows of seasonal_df, i.e. the final group handled by the loop in the
# previous cell (this relies on the loop variable persisting after the loop).
seasonal_df[-10:]

In [None]:
# Spot-check the first ten trend rows whose MSA is 'Abilene, TX'.
trend_df[trend_df.MSA == 'Abilene, TX'].head(10)

In [None]:
def get_pop(x):
    """Reconcile the two population estimates of one row into a single value.

    Parameters
    ----------
    x : mapping or pandas.Series
        Must provide the numeric entries 'Population 1' and 'Population 5'.

    Returns
    -------
    float or None
        None when both estimates are NaN, the available estimate when only one
        is NaN, and the mean of the two estimates otherwise.
    """
    pop_1 = x['Population 1']
    pop_5 = x['Population 5']
    missing_1 = np.isnan(pop_1)
    missing_5 = np.isnan(pop_5)

    if missing_1 and missing_5:
        return None
    if missing_1:
        return pop_5
    if missing_5:
        return pop_1

    # Average the two different estimates (previously 'Population 5' was added to itself).
    return (pop_1 + pop_5) / 2

In [None]:
def _load_acs_population(path_template, years, population_column):
    """Read one ACS DP05 extract per year and stack them into a single frame.

    Parameters
    ----------
    path_template : str
        File path containing a ``{year}`` placeholder.
    years : iterable of int
        Years to load; each one is substituted into ``path_template``.
    population_column : str
        Name given to the 'Estimate!!SEX AND AGE!!Total population' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'MSA', ``population_column`` and 'Year'; empty when ``years`` is empty.
    """
    yearly_frames = []
    for year in years:
        # Skip the first line of the file, then drop the first parsed row.
        acs_year_df = pd.read_csv(path_template.format(year=year), skiprows=1)[1:].copy()
        # Strip the area-type suffix to obtain the bare MSA name.
        acs_year_df['MSA'] = acs_year_df['Geographic Area Name'].apply(
            lambda x: x.replace(" Micro Area", "").replace(" Metro Area", "")
        )

        acs_year_df = acs_year_df.rename(
            columns={'Estimate!!SEX AND AGE!!Total population': population_column}
        )
        acs_year_df = acs_year_df[["MSA", population_column]].copy()
        acs_year_df['Year'] = year

        yearly_frames.append(acs_year_df)

    # Concatenate once; pd.concat rejects an empty list, hence the fallback.
    return pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()


# Add ACS 1-year estimates (2020 is not in this list)
acs_1_df = _load_acs_population(
    "acs-1-year/ACSDP1Y{year}.DP05-Data.csv",
    (2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2021),
    'Population 1',
)

# Add ACS 5-year estimates (2021 is not in this list)
acs_5_df = _load_acs_population(
    "acs-5-year/ACSDP5Y{year}.DP05-Data.csv",
    (2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020),
    'Population 5',
)

# Merge our datasets and reconcile our estimates
acs_df = pd.merge(acs_1_df, acs_5_df, on=['MSA', 'Year'], how='outer')
# Relative gap between the two estimates, measured against the 1-year figure.
acs_df['Population Diff'] = 1 - acs_df['Population 5'] / acs_df['Population 1']
acs_df['Population'] = acs_df.apply(get_pop, axis=1)


# Add population growth as features
def add_pop_growth(df, diff):
    """Add population-growth features computed over a lag of ``diff`` rows per MSA.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'MSA', 'Year' (integer) and 'Population'. It is not
        modified; a new frame is returned.
    diff : int
        Number of rows to look back within each MSA after sorting by year.
        This equals a number of years only when every MSA has one row per
        consecutive year.

    Returns
    -------
    pandas.DataFrame
        The input rows sorted by MSA and year, with 'Year as Date' plus three new
        columns: 'Pop Growth {diff} Year' (absolute change), 'Pop -{diff} Years'
        (lagged population) and 'Pop Percent {diff} Year' (change / lagged value).
    """
    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    df = df.assign(
        **{'Year as Date': df.Year.apply(lambda year: dt.datetime(year=year, month=1, day=1))}
    )
    df = df.set_index(['MSA', 'Year as Date']).sort_index()

    growth_col = f'Pop Growth {diff} Year'
    lagged_col = f'Pop -{diff} Years'
    percent_col = f'Pop Percent {diff} Year'

    df[growth_col] = df.groupby('MSA')['Population'].diff(diff)
    df[lagged_col] = df.groupby('MSA')['Population'].shift(diff)
    df[percent_col] = df[growth_col] / df[lagged_col]

    return df.reset_index()


# Add the 1-row and 5-row growth features, then keep only the rows from 2019 onwards.
for growth_window in (1, 5):
    acs_df = add_pop_growth(acs_df, growth_window)

from_2019 = acs_df['Year'] >= 2019
acs_df = acs_df[from_2019].copy()

acs_df.head()

In [None]:
# Spot-check the ACS rows for 'Abilene, TX'.
acs_df[acs_df.MSA == 'Abilene, TX']

In [None]:
# Write the ACS population table to ACS.csv without the row index.
acs_df.to_csv("ACS.csv", index=False)

In [None]:
# Left-join the yearly ACS population features onto the monthly trend rows
# (every trend row is kept, matched on MSA and Year), then save the result.
combined_df = trend_df.merge(acs_df, how='left', on=['MSA', 'Year'])
combined_df.to_csv("combined.csv")

combined_df.head()

In [None]:
# Shape of the combined frame once rows with a missing Population are dropped.
combined_df.dropna(subset=['Population']).shape

In [None]:
# ACS years that remain after the `Year >= 2019` filter applied above:
#   5-year estimates: 2019, 2020
#   1-year estimates: 2019, 2021