In [None]:
pip install statsmodels

In [None]:
import os
import re

import pandas as pd
import numpy as np
import altair as alt
import datetime as dt

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.tsa.seasonal import seasonal_decompose, DecomposeResult

In [None]:
# Collect the input file paths for both data sets.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep positional references (e.g. msa_dfs[0]) and the plot column order
# reproducible between runs.  Dot-entries such as .DS_Store or
# .ipynb_checkpoints are skipped because they are not data files.
msa_files = [
    os.path.join("msa", file_path)
    for file_path in sorted(os.listdir('msa'))
    if not file_path.startswith('.')
]
ZHVI_files = [
    os.path.join('zillow', file_path)
    for file_path in sorted(os.listdir('zillow'))
    if not file_path.startswith('.')
]

In [None]:
# Read every permit workbook, skipping its first 7 rows, and discard the
# first remaining data row of each sheet.
msa_dfs = [
    workbook[1:]
    for workbook in (pd.read_excel(path, skiprows=7) for path in msa_files)
]
print(msa_files)

In [None]:
# Column labels as read from the first workbook, before any selection.
print(msa_dfs[0].columns)

# Columns carried forward from every permit workbook.
msa_cols = [
    'CSA',
    'CBSA',
    'Name',
    'Total',
    '1 Unit',
    '2 Units',
    '3 and 4 Units',
    '5 Units or More',
    'Num of Structures With 5 Units or More',
]
msa_dfs = [frame[msa_cols] for frame in msa_dfs]

msa_dfs[0].head()

In [None]:
# Show the column labels of the first permit frame.
print(msa_dfs[0].columns)

In [None]:
# Tag each permit frame with the text found between the last '_' and the
# last '.' of the path it was loaded from.
for df, name in zip(msa_dfs, msa_files):
    stamp_start = name.rfind('_') + 1
    stamp_end = name.rfind('.')
    df['Date'] = name[stamp_start:stamp_end]

# Preview the last frame processed by the loop.
df.head()

In [None]:
# Stack all permit frames and drop every column that contains a missing value.
permits_df = pd.concat(msa_dfs).dropna(axis=1)

# Remove surrounding whitespace from the area names.
permits_df['Name'] = permits_df['Name'].map(str.strip)

# Date holds a string whose first four characters are the year and whose
# remaining characters are the month.
permits_df['Year'] = permits_df['Date'].map(lambda stamp: int(stamp[:4]))
permits_df['Month'] = permits_df['Date'].map(lambda stamp: int(stamp[4:]))

permits_df.head()

In [None]:
# Load every Zillow CSV and keep only the rows whose RegionType is 'msa'.
ZHVI_dfs = [
    raw[raw['RegionType'] == 'msa']
    for raw in map(pd.read_csv, ZHVI_files)
]

ZHVI_dfs[0].head()

In [None]:
# Label every ZHVI frame with the base name (no directory, no extension) of
# the file it was read from.  os.path is used instead of slicing on '/' and a
# fixed 4-character suffix so the label is the same on every platform and for
# any extension length.
for df, f in zip(ZHVI_dfs, ZHVI_files):
    df['filename'] = os.path.splitext(os.path.basename(f))[0]

# Preview the last frame processed by the loop.
df.head()

In [None]:
# Stack the per-file ZHVI frames into a single DataFrame.
ZHVI_df = pd.concat(ZHVI_dfs)

In [None]:
# Show the column labels of the combined ZHVI frame.
print(ZHVI_df.columns)

In [None]:
# Preview the first rows of the combined ZHVI frame.
ZHVI_df.head()

In [None]:
# Rebuild the combined frame so this cell can be re-run on its own.
ZHVI_df = pd.concat(ZHVI_dfs)

# Partition the columns: labels that start with YYYY-MM-DD hold prices,
# everything else identifies the row.
date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}')
date_columns, other_columns = [], []
for column in ZHVI_df.columns:
    if date_pattern.match(column):
        date_columns.append(column)
    else:
        other_columns.append(column)

# Wide -> long: one row per (identifying columns, Date) with its Price.
# Rows with a missing Price are not dropped here.
ZHVI_df = pd.melt(
    ZHVI_df,
    id_vars=other_columns,
    value_vars=date_columns,
    var_name='Date',
    value_name='Price',
)

# Year and Month are the first and second "-"-separated fields of Date.
date_parts = ZHVI_df['Date'].str.split("-")
ZHVI_df['Year'] = date_parts.map(lambda parts: int(parts[0]))
ZHVI_df['Month'] = date_parts.map(lambda parts: int(parts[1]))

ZHVI_df.head(10)

In [None]:
# Distinct area names on each side.
zhvi_msa_set = set(ZHVI_df['RegionName'].unique())
permit_msa_set = set(permits_df['Name'].unique())

# Names present in both sources, and the leftovers on each side.
msa_intersection = zhvi_msa_set & permit_msa_set
unmatched_zillow_msas = zhvi_msa_set - permit_msa_set
unmatched_permit_msas = permit_msa_set - zhvi_msa_set

# Set of tuples of form (ZHVI msa, Census MSA); exact matches pair a name
# with itself.
msa_matches = {(shared_name, shared_name) for shared_name in msa_intersection}

print(
    f"Matched: {len(msa_matches)}, Unmatched Zillow: {len(unmatched_zillow_msas)}, "
    f"Unmatched Census {len(unmatched_permit_msas)}"
)

In [None]:
# Second matching pass for Zillow names with no identical Census name: a
# Zillow name is paired with the first Census name that contains both its
# city part and its state part as substrings.
#
# Both collections are iterated in sorted order.  Set iteration order for
# strings can change between kernel sessions, and because only the first hit
# is kept, unsorted iteration could select a different Census name on each run.
left_matches = set()
permit_candidates = sorted(unmatched_permit_msas)
for uzm in sorted(unmatched_zillow_msas):
    # Split on the last ", " so a name containing more than one separator
    # does not raise; names without any separator have no state part and are
    # left unmatched.
    city, separator, state = uzm.rpartition(", ")
    if not separator:
        continue
    for permit_msa in permit_candidates:
        if city in permit_msa and state in permit_msa:
            left_matches.add((uzm, permit_msa))
            break

# Build the frames from sorted lists so row order does not depend on set order.
left_match_df = pd.DataFrame(sorted(left_matches), columns=['Zillow', 'MSA'])
auto_match_df = pd.DataFrame(sorted(msa_matches), columns=['Zillow', 'MSA'])

msa_match_df = pd.concat([left_match_df, auto_match_df]).sort_values(['MSA', 'Zillow'])
msa_match_df.head()

In [None]:
# Write the Zillow/Census name pairs to msa_match.csv without the index column.
msa_match_df.to_csv('msa_match.csv', index=False)

In [None]:
# Names that now appear in the match table, per source.
zillow_msa_matches = set(msa_match_df['Zillow'].unique())
permit_msa_matches = set(msa_match_df['MSA'].unique())

# Whatever is still unpaired after both matching passes.
unmatched_zillow_msas = zhvi_msa_set - zillow_msa_matches
unmatched_permit_msas = permit_msa_set - permit_msa_matches

print(
    f"Matched: {len(msa_match_df)}, Unmatched Zillow: {len(unmatched_zillow_msas)}, "
    f"Unmatched Census {len(unmatched_permit_msas)}"
)

In [None]:
# Columns removed after the merge; Date_y is kept and converted to datetime.
drop_columns = ['Name', 'RegionID', 'RegionName', 'RegionType',  'SizeRank', 'Date_x', 'StateName']

# Attach permit rows to every matched pair (left join on the Census name),
# then keep only the rows that also have a ZHVI record for the same Zillow
# name, year and month (inner join).
big_df = (
    msa_match_df
    .merge(permits_df, how='left', left_on='MSA', right_on='Name')
    .merge(
        ZHVI_df,
        how='inner',
        left_on=['Zillow', 'Year', 'Month'],
        right_on=['RegionName', 'Year', 'Month'],
    )
)

big_df['Date_y'] = pd.to_datetime(big_df['Date_y'])
big_df = big_df.drop(columns=drop_columns)

big_df.head()

In [None]:
# Show the dtype of every column in the merged frame.
# big_df.shape
big_df.dtypes

In [None]:
# Distinct file labels and Zillow names, in order of first appearance.
z_fnames = big_df['filename'].unique()
z_cities = big_df['Zillow'].unique()

# One sub-frame per Zillow name, and one per file label.
city_dfs = [big_df[big_df['Zillow'].eq(city_name)] for city_name in z_cities]
z_cat_dfs = [big_df[big_df['filename'].eq(file_label)] for file_label in z_fnames]




In [None]:
# Seasonal component of the additive decomposition (period 12) of Price,
# one row per city (first four) and one column per ZHVI file.
reduced_cities = z_cities[:4]
reduced_cities_dfs = city_dfs[:4]

# squeeze=False keeps axes1 two-dimensional even when z_fnames has one entry,
# so axes1[i][j] is always valid.
fig1, axes1 = plt.subplots(4, len(z_fnames), squeeze=False)

skipped_panels = []  # (city, file, error) for series that could not be decomposed
for i, (city, df) in enumerate(zip(reduced_cities, reduced_cities_dfs)):
    city_files = {}
    for j, z_f in enumerate(z_fnames):
        # Price series for this city/file, indexed by date in ascending order.
        city_files[z_f] = (
            df[df['filename'] == z_f][['Date_y', 'Price']]
            .dropna()
            .set_index('Date_y')
            .sort_index()
        )
        try:
            res = seasonal_decompose(city_files[z_f], model='additive', period=12)
        except Exception as err:
            # Leave the panel empty but record why, instead of hiding the error.
            skipped_panels.append((city, z_f, repr(err)))
            continue
        axes1[i][j].plot(res.seasonal)
        # set_title is a method: it has to be called, not subscripted.
        axes1[i][j].set_title(city + ' ' + z_f)

if skipped_panels:
    print(f"{len(skipped_panels)} series could not be decomposed:")
    for skipped_city, skipped_file, reason in skipped_panels:
        print(f"  {skipped_city} / {skipped_file}: {reason}")


In [None]:
# Observed series from the additive decomposition (period 12) of Price,
# one row per city (first four) and one column per ZHVI file.
reduced_cities = z_cities[:4]
reduced_cities_dfs = city_dfs[:4]

# squeeze=False keeps axes2 two-dimensional even when z_fnames has one entry,
# so axes2[i][j] is always valid.
fig2, axes2 = plt.subplots(4, len(z_fnames), squeeze=False)

skipped_panels = []  # (city, file, error) for series that could not be decomposed
for i, (city, df) in enumerate(zip(reduced_cities, reduced_cities_dfs)):
    city_files = {}
    for j, z_f in enumerate(z_fnames):
        # Price series for this city/file, indexed by date in ascending order.
        city_files[z_f] = (
            df[df['filename'] == z_f][['Date_y', 'Price']]
            .dropna()
            .set_index('Date_y')
            .sort_index()
        )
        try:
            res = seasonal_decompose(city_files[z_f], model='additive', period=12)
        except Exception as err:
            # Leave the panel empty but record why, instead of hiding the error.
            skipped_panels.append((city, z_f, repr(err)))
            continue
        axes2[i][j].plot(res.observed)
        # set_title is a method: it has to be called, not subscripted.
        axes2[i][j].set_title(city + ' ' + z_f)

if skipped_panels:
    print(f"{len(skipped_panels)} series could not be decomposed:")
    for skipped_city, skipped_file, reason in skipped_panels:
        print(f"  {skipped_city} / {skipped_file}: {reason}")

In [None]:
# Trend component of the additive decomposition (period 12) of Price,
# one row per city (first four) and one column per ZHVI file.
reduced_cities = z_cities[:4]
reduced_cities_dfs = city_dfs[:4]

# squeeze=False keeps axes3 two-dimensional even when z_fnames has one entry,
# so axes3[i][j] is always valid.
fig3, axes3 = plt.subplots(4, len(z_fnames), squeeze=False)

skipped_panels = []  # (city, file, error) for series that could not be decomposed
for i, (city, df) in enumerate(zip(reduced_cities, reduced_cities_dfs)):
    city_files = {}
    for j, z_f in enumerate(z_fnames):
        # Price series for this city/file, indexed by date in ascending order.
        city_files[z_f] = (
            df[df['filename'] == z_f][['Date_y', 'Price']]
            .dropna()
            .set_index('Date_y')
            .sort_index()
        )
        try:
            res = seasonal_decompose(city_files[z_f], model='additive', period=12)
        except Exception as err:
            # Leave the panel empty but record why, instead of hiding the error.
            skipped_panels.append((city, z_f, repr(err)))
            continue
        axes3[i][j].plot(res.trend)
        # set_title is a method: it has to be called, not subscripted.
        axes3[i][j].set_title(city + ' ' + z_f)

if skipped_panels:
    print(f"{len(skipped_panels)} series could not be decomposed:")
    for skipped_city, skipped_file, reason in skipped_panels:
        print(f"  {skipped_city} / {skipped_file}: {reason}")

In [None]:
# Residual component of the additive decomposition (period 12) of Price,
# one row per city (first four) and one column per ZHVI file.
reduced_cities = z_cities[:4]
reduced_cities_dfs = city_dfs[:4]

# squeeze=False keeps axes4 two-dimensional even when z_fnames has one entry,
# so axes4[i][j] is always valid.
fig4, axes4 = plt.subplots(4, len(z_fnames), squeeze=False)

skipped_panels = []  # (city, file, error) for series that could not be decomposed
for i, (city, df) in enumerate(zip(reduced_cities, reduced_cities_dfs)):
    city_files = {}
    for j, z_f in enumerate(z_fnames):
        # Price series for this city/file, indexed by date in ascending order.
        city_files[z_f] = (
            df[df['filename'] == z_f][['Date_y', 'Price']]
            .dropna()
            .set_index('Date_y')
            .sort_index()
        )
        try:
            res = seasonal_decompose(city_files[z_f], model='additive', period=12)
        except Exception as err:
            # Leave the panel empty but record why, instead of hiding the error.
            skipped_panels.append((city, z_f, repr(err)))
            continue
        axes4[i][j].plot(res.resid)
        # set_title is a method: it has to be called, not subscripted.
        axes4[i][j].set_title(city + ' ' + z_f)

if skipped_panels:
    print(f"{len(skipped_panels)} series could not be decomposed:")
    for skipped_city, skipped_file, reason in skipped_panels:
        print(f"  {skipped_city} / {skipped_file}: {reason}")

In [None]:
# Seasonal component of the additive decomposition (period 12) of Price for
# the first four cities, one panel per city.
fig5, axes5 = plt.subplots(4, 1)

reduced_cities = z_cities[:4]
reduced_cities_dfs = city_dfs[:4]

skipped_panels = []  # (city, error) for series that could not be decomposed
for i, (city, df) in enumerate(zip(reduced_cities, reduced_cities_dfs)):
    city_files = {}
    # NOTE(review): no filter on 'filename' is applied here, so rows from every
    # ZHVI file are pooled into one series and a date can appear more than
    # once in the index when z_fnames has several entries -- confirm intended.
    city_files[city] = (
        df[['Date_y', 'Price']]
        .dropna()
        .set_index('Date_y')
        .sort_index()
    )
    try:
        res = seasonal_decompose(city_files[city], model='additive', period=12)
    except Exception as err:
        # Leave the panel empty but record why, instead of hiding the error.
        skipped_panels.append((city, repr(err)))
        continue
    axes5[i].plot(res.seasonal)
    # set_title is a method: it has to be called, not subscripted.
    axes5[i].set_title(city)

if skipped_panels:
    print(f"{len(skipped_panels)} series could not be decomposed:")
    for skipped_city, reason in skipped_panels:
        print(f"  {skipped_city}: {reason}")
        


In [None]:
# Seasonal component of the additive decomposition (period 12) of Price for
# every city (rows) and every ZHVI file (columns).
# squeeze=False keeps axes two-dimensional even when there is a single city
# or a single file, so axes[i][j] is always valid.
fig, axes = plt.subplots(len(city_dfs), len(z_fnames), figsize=(100, 200), squeeze=False)

skipped_panels = []  # (city, file, error) for series that could not be decomposed
for i, (city, df) in enumerate(zip(z_cities, city_dfs)):
    city_files = {}
    for j, z_f in enumerate(z_fnames):
        # Price series for this city/file, indexed by date in ascending order.
        city_files[z_f] = (
            df[df['filename'] == z_f][['Date_y', 'Price']]
            .dropna()
            .set_index('Date_y')
            .sort_index()
        )
        try:
            res = seasonal_decompose(city_files[z_f], model='additive', period=12)
        except Exception as err:
            # Leave the panel empty but record why, instead of hiding the error.
            skipped_panels.append((city, z_f, repr(err)))
            continue
        axes[i][j].plot(res.seasonal)
        # set_title is a method: it has to be called, not subscripted.
        axes[i][j].set_title(city + ' ' + z_f)

# A single summary avoids flooding the output when many series are skipped.
if skipped_panels:
    print(f"{len(skipped_panels)} of {len(city_dfs) * len(z_fnames)} series could not be decomposed")
        
        
    
