In [1]:
import os
import re

import pandas as pd
import numpy as np
import altair as alt
import datetime as dt

In [2]:
msa_files = [os.path.join("msa", file_path) for file_path in os.listdir('msa')]
ZHVI_files = [os.path.join('zillow', file_path) for file_path in os.listdir( 'zillow')]

In [3]:
msa_dfs = [pd.read_excel(f, skiprows=7)[1:] for f in msa_files]
print(msa_files)

['msa/msamonthly_202102.xls', 'msa/msamonthly_202103.xls', 'msa/msamonthly_202101.xls', 'msa/msamonthly_202110.xls', 'msa/msamonthly_202104.xls', 'msa/msamonthly_202105.xls', 'msa/msamonthly_202111.xls', 'msa/msamonthly_202107.xls', 'msa/msamonthly_202112.xls', 'msa/msamonthly_202106.xls', 'msa/msamonthly_202001.xls', 'msa/msamonthly_202203.xls', 'msa/msamonthly_201911.xls', 'msa/msamonthly_202202.xls', 'msa/msamonthly_202002.xls', 'msa/msamonthly_201912.xls', 'msa/msamonthly_202201.xls', 'msa/msamonthly_202003.xls', 'msa/msamonthly_202007.xls', 'msa/msamonthly_202205.xls', 'msa/msamonthly_202204.xls', 'msa/msamonthly_202012.xls', 'msa/msamonthly_202006.xls', 'msa/msamonthly_202010.xls', 'msa/msamonthly_202004.xls', 'msa/msamonthly_202206.xls', 'msa/msamonthly_202207.xls', 'msa/msamonthly_202005.xls', 'msa/msamonthly_202011.xls', 'msa/msamonthly_202008.xls', 'msa/msamonthly_202009.xls', 'msa/msamonthly_202108.xls', 'msa/msamonthly_202109.xls']


In [4]:
print(msa_dfs[0].columns)

msa_cols = ['CSA', 'CBSA', 'Name', 'Total', '1 Unit', '2 Units', 
            '3 and 4 Units','5 Units or More', 
            'Num of Structures With 5 Units or More']
msa_dfs = [df[msa_cols] for df in msa_dfs]

msa_dfs[0].head()

Index(['CSA', 'CBSA', 'Name', 'Total', '1 Unit', '2 Units', '3 and 4 Units',
       '5 Units or More', 'Num of Structures With 5 Units or More',
       'Monthly Coverage Percent*', 'Unnamed: 10', 'Total.1', '1 Unit.1',
       '2 Units.1', '3 and 4 Units.1', '5 Units or More.1',
       'Num of Structures With 5 Units or More.1'],
      dtype='object')


Unnamed: 0,CSA,CBSA,Name,Total,1 Unit,2 Units,3 and 4 Units,5 Units or More,Num of Structures With 5 Units or More
1,999.0,10180.0,"Abilene, TX ...",33.0,31.0,2.0,0.0,0.0,0.0
2,184.0,10420.0,"Akron, OH ...",45.0,45.0,0.0,0.0,0.0,0.0
3,999.0,10500.0,"Albany, GA ...",32.0,15.0,0.0,0.0,17.0,1.0
4,440.0,10540.0,"Albany-Lebanon, OR ...",58.0,37.0,0.0,0.0,21.0,3.0
5,104.0,10580.0,"Albany-Schenectady-Troy, NY ...",268.0,118.0,4.0,0.0,146.0,2.0


In [5]:
print(msa_dfs[0].columns)

Index(['CSA', 'CBSA', 'Name', 'Total', '1 Unit', '2 Units', '3 and 4 Units',
       '5 Units or More', 'Num of Structures With 5 Units or More'],
      dtype='object')


In [6]:
for df, name in zip(msa_dfs, msa_files):
    df['Date'] = name[name.rfind('_')+1:name.rfind('.')]

df.head()

Unnamed: 0,CSA,CBSA,Name,Total,1 Unit,2 Units,3 and 4 Units,5 Units or More,Num of Structures With 5 Units or More,Date
1,999.0,10180.0,"Abilene, TX ...",62.0,56.0,6.0,0.0,0.0,0.0,202109
2,184.0,10420.0,"Akron, OH ...",62.0,57.0,0.0,0.0,5.0,1.0,202109
3,999.0,10500.0,"Albany, GA ...",40.0,26.0,0.0,0.0,14.0,1.0,202109
4,440.0,10540.0,"Albany-Lebanon, OR ...",66.0,29.0,4.0,3.0,30.0,3.0,202109
5,104.0,10580.0,"Albany-Schenectady-Troy, NY ...",273.0,97.0,2.0,3.0,171.0,4.0,202109


In [7]:
permits_df = pd.concat(msa_dfs).dropna(axis = 1)
permits_df.Name = permits_df.Name.apply(lambda x: x.strip())

permits_df

Unnamed: 0,CSA,CBSA,Name,Total,1 Unit,2 Units,3 and 4 Units,5 Units or More,Num of Structures With 5 Units or More,Date
1,999.0,10180.0,"Abilene, TX",33.0,31.0,2.0,0.0,0.0,0.0,2021-02-01
2,184.0,10420.0,"Akron, OH",45.0,45.0,0.0,0.0,0.0,0.0,2021-02-01
3,999.0,10500.0,"Albany, GA",32.0,15.0,0.0,0.0,17.0,1.0,2021-02-01
4,440.0,10540.0,"Albany-Lebanon, OR",58.0,37.0,0.0,0.0,21.0,3.0,2021-02-01
5,104.0,10580.0,"Albany-Schenectady-Troy, NY",268.0,118.0,4.0,0.0,146.0,2.0,2021-02-01
...,...,...,...,...,...,...,...,...,...,...
364,148.0,49340.0,"Worcester, MA-CT",13.0,13.0,0.0,0.0,0.0,0.0,2021-09-01
365,999.0,49420.0,"Yakima, WA",48.0,34.0,6.0,8.0,0.0,0.0,2021-09-01
366,276.0,49620.0,"York-Hanover, PA",9.0,9.0,0.0,0.0,0.0,0.0,2021-09-01
367,566.0,49660.0,"Youngstown-Warren-Boardman, OH-PA",34.0,31.0,0.0,3.0,0.0,0.0,2021-09-01


In [8]:
ZHVI_dfs = [pd.read_csv(z) for z in ZHVI_files]

ZHVI_dfs = [df[df['RegionType']=='msa'] for df in ZHVI_dfs]
ZHVI_dfs[0].head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,2000-01-31,2000-02-29,2000-03-31,2000-04-30,2000-05-31,...,2021-11-30,2021-12-31,2022-01-31,2022-02-28,2022-03-31,2022-04-30,2022-05-31,2022-06-30,2022-07-31,2022-08-31
1,394913,1,"New York, NY",msa,NY,398618.0,400989.0,403125.0,407346.0,411288.0,...,944392.0,949401.0,955581.0,963163.0,973716.0,985181.0,997357.0,1007381.0,1014851.0,1019097.0
2,753899,2,"Los Angeles, CA",msa,CA,412833.0,414349.0,416476.0,420539.0,424798.0,...,1459690.0,1475813.0,1494850.0,1516710.0,1547884.0,1582369.0,1612876.0,1617590.0,1615957.0,1595977.0
3,394463,3,"Chicago, IL",msa,IL,295109.0,295865.0,296827.0,298700.0,300646.0,...,464815.0,468835.0,472579.0,476171.0,481214.0,486852.0,493379.0,498309.0,501462.0,502214.0
4,394514,4,"Dallas, TX",msa,TX,231125.0,231200.0,231294.0,231544.0,231819.0,...,546482.0,556356.0,568739.0,582375.0,595125.0,610729.0,625328.0,638078.0,641222.0,639479.0
5,394692,5,"Houston, TX",msa,TX,226539.0,226746.0,226568.0,226884.0,226956.0,...,455285.0,460482.0,465816.0,472578.0,481148.0,490962.0,500094.0,506863.0,510726.0,512294.0


In [9]:
for df,f in zip(ZHVI_dfs, ZHVI_files):
    df['filename'] = f[f.rfind('/')+1:-4]
df.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,2000-01-31,2000-02-29,2000-03-31,2000-04-30,2000-05-31,...,2021-12-31,2022-01-31,2022-02-28,2022-03-31,2022-04-30,2022-05-31,2022-06-30,2022-07-31,2022-08-31,filename
1,394913,1,"New York, NY",msa,NY,269152.0,270818.0,272236.0,275165.0,277939.0,...,653921.0,658777.0,664655.0,672580.0,681386.0,690621.0,698178.0,703790.0,706621.0,Metro_zhvi_bdrmcnt_4_uc_sfrcondo_tier_0.33_0.6...
2,753899,2,"Los Angeles, CA",msa,CA,286048.0,287357.0,288898.0,291929.0,294973.0,...,1035580.0,1052255.0,1070162.0,1094323.0,1119753.0,1142443.0,1145790.0,1142258.0,1125769.0,Metro_zhvi_bdrmcnt_4_uc_sfrcondo_tier_0.33_0.6...
3,394463,3,"Chicago, IL",msa,IL,216379.0,216954.0,217723.0,219175.0,220698.0,...,388057.0,391541.0,395013.0,399657.0,404845.0,410553.0,414859.0,417500.0,418126.0,Metro_zhvi_bdrmcnt_4_uc_sfrcondo_tier_0.33_0.6...
4,394514,4,"Dallas, TX",msa,TX,183601.0,183579.0,183670.0,183826.0,184014.0,...,445207.0,455516.0,466538.0,476764.0,489400.0,501440.0,511803.0,514244.0,512422.0,Metro_zhvi_bdrmcnt_4_uc_sfrcondo_tier_0.33_0.6...
5,394692,5,"Houston, TX",msa,TX,165886.0,165941.0,165744.0,165896.0,165865.0,...,338518.0,342701.0,348083.0,354884.0,362637.0,369808.0,375118.0,378198.0,379363.0,Metro_zhvi_bdrmcnt_4_uc_sfrcondo_tier_0.33_0.6...


In [10]:
ZHVI_df = pd.concat(ZHVI_dfs)

In [37]:
print(ZHVI_df.columns)

Index(['RegionID', 'SizeRank', 'RegionName', 'RegionType', 'StateName',
       '2000-01-31', '2000-02-29', '2000-03-31', '2000-04-30', '2000-05-31',
       ...
       '1999-03-31', '1999-04-30', '1999-05-31', '1999-06-30', '1999-07-31',
       '1999-08-31', '1999-09-30', '1999-10-31', '1999-11-30', '1999-12-31'],
      dtype='object', length=326)


In [38]:
ZHVI_df

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,2000-01-31,2000-02-29,2000-03-31,2000-04-30,2000-05-31,...,1999-03-31,1999-04-30,1999-05-31,1999-06-30,1999-07-31,1999-08-31,1999-09-30,1999-10-31,1999-11-30,1999-12-31
1,394913,1,"New York, NY",msa,NY,398618.0,400989.0,403125.0,407346.0,411288.0,...,,,,,,,,,,
2,753899,2,"Los Angeles, CA",msa,CA,412833.0,414349.0,416476.0,420539.0,424798.0,...,,,,,,,,,,
3,394463,3,"Chicago, IL",msa,IL,295109.0,295865.0,296827.0,298700.0,300646.0,...,,,,,,,,,,
4,394514,4,"Dallas, TX",msa,TX,231125.0,231200.0,231294.0,231544.0,231819.0,...,,,,,,,,,,
5,394692,5,"Houston, TX",msa,TX,226539.0,226746.0,226568.0,226884.0,226956.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,395186,922,"Vermillion, SD",msa,SD,,,,,,...,,,,,,,,,,
888,394743,924,"Ketchikan, AK",msa,AK,,,,,,...,,,,,,,,,,
889,753874,925,"Craig, CO",msa,CO,114605.0,114530.0,114562.0,114511.0,114621.0,...,,,,,,,,,,
890,394767,926,"Lamesa, TX",msa,TX,,,,,,...,,,,,,,,,,


In [39]:
# for df in ZHVI_dfs:
#     columns = df.columns.tolist()
#     new_columns = []
    
#     for column in columns:
#         if column.startswith('2'):
#             new_column = str(column)
#             # Joan: can you please explain what this line does?
#             new_columns.append(
#                 # What does this expression mean?
#                 new_column[:new_column.find('-')] +
#                 # And this one?
#                 new_column[new_column.find('-') + 1:new_column.rfind('-')]
#             )
#         else: new_columns.append(column)
#     df.columns = new_columns

In [40]:
dumb_df = ZHVI_df.copy()
dumb_df.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,2000-01-31,2000-02-29,2000-03-31,2000-04-30,2000-05-31,...,1999-03-31,1999-04-30,1999-05-31,1999-06-30,1999-07-31,1999-08-31,1999-09-30,1999-10-31,1999-11-30,1999-12-31
1,394913,1,"New York, NY",msa,NY,398618.0,400989.0,403125.0,407346.0,411288.0,...,,,,,,,,,,
2,753899,2,"Los Angeles, CA",msa,CA,412833.0,414349.0,416476.0,420539.0,424798.0,...,,,,,,,,,,
3,394463,3,"Chicago, IL",msa,IL,295109.0,295865.0,296827.0,298700.0,300646.0,...,,,,,,,,,,
4,394514,4,"Dallas, TX",msa,TX,231125.0,231200.0,231294.0,231544.0,231819.0,...,,,,,,,,,,
5,394692,5,"Houston, TX",msa,TX,226539.0,226746.0,226568.0,226884.0,226956.0,...,,,,,,,,,,


In [41]:
ZHVI_df = pd.concat(ZHVI_dfs)

date_columns = [column for column in ZHVI_df.columns if re.match(r'\d{4}-\d{2}-\d{2}', column)]
other_columns = [column for column in ZHVI_df.columns if not re.match(r'\d{4}-\d{2}-\d{2}', column)]
ZHVI_df = ZHVI_df.melt(id_vars=other_columns, value_vars=date_columns, var_name='Date', value_name='Price')

# ZHVI_df.dropna(subset=['Price',], axis='columns', inplace=True)
ZHVI_df['Year'] = ZHVI_df.Date.apply(lambda x: int(x.split("-")[0]))
ZHVI_df['Month'] = ZHVI_df.Date.apply(lambda x: int(x.split("-")[1]))

ZHVI_df.head(10)

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,filename,Date,Price,Year,Month
0,394913,1,"New York, NY",msa,NY,Metro_zhvi_uc_sfrcondo_tier_0.67_1.0_sm_sa_month,2000-01-31,398618.0,2000,1
1,753899,2,"Los Angeles, CA",msa,CA,Metro_zhvi_uc_sfrcondo_tier_0.67_1.0_sm_sa_month,2000-01-31,412833.0,2000,1
2,394463,3,"Chicago, IL",msa,IL,Metro_zhvi_uc_sfrcondo_tier_0.67_1.0_sm_sa_month,2000-01-31,295109.0,2000,1
3,394514,4,"Dallas, TX",msa,TX,Metro_zhvi_uc_sfrcondo_tier_0.67_1.0_sm_sa_month,2000-01-31,231125.0,2000,1
4,394692,5,"Houston, TX",msa,TX,Metro_zhvi_uc_sfrcondo_tier_0.67_1.0_sm_sa_month,2000-01-31,226539.0,2000,1
5,395209,6,"Washington, DC",msa,VA,Metro_zhvi_uc_sfrcondo_tier_0.67_1.0_sm_sa_month,2000-01-31,321180.0,2000,1
6,394856,7,"Miami, FL",msa,FL,Metro_zhvi_uc_sfrcondo_tier_0.67_1.0_sm_sa_month,2000-01-31,244156.0,2000,1
7,394974,8,"Philadelphia, PA",msa,PA,Metro_zhvi_uc_sfrcondo_tier_0.67_1.0_sm_sa_month,2000-01-31,224947.0,2000,1
8,394347,9,"Atlanta, GA",msa,GA,Metro_zhvi_uc_sfrcondo_tier_0.67_1.0_sm_sa_month,2000-01-31,258618.0,2000,1
9,394976,10,"Phoenix, AZ",msa,AZ,Metro_zhvi_uc_sfrcondo_tier_0.67_1.0_sm_sa_month,2000-01-31,234570.0,2000,1


In [44]:
#changes date column to a datetime and changes the day of the month to 1 to harmonize with the permits dates
melt_df['Date'] = pd.to_datetime(melt_df['Date']).apply(lambda dt: dt.replace(day=1))

melt_df.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,filename,Date,Price
0,394913,1,"New York, NY",msa,NY,Metro_zhvi_uc_sfrcondo_tier_0.67_1.0_sm_sa_month,2000-01-01,398618.0
1,753899,2,"Los Angeles, CA",msa,CA,Metro_zhvi_uc_sfrcondo_tier_0.67_1.0_sm_sa_month,2000-01-01,412833.0
2,394463,3,"Chicago, IL",msa,IL,Metro_zhvi_uc_sfrcondo_tier_0.67_1.0_sm_sa_month,2000-01-01,295109.0
3,394514,4,"Dallas, TX",msa,TX,Metro_zhvi_uc_sfrcondo_tier_0.67_1.0_sm_sa_month,2000-01-01,231125.0
4,394692,5,"Houston, TX",msa,TX,Metro_zhvi_uc_sfrcondo_tier_0.67_1.0_sm_sa_month,2000-01-01,226539.0


In [43]:
# permits_df['Date'] = pd.to_datetime(permits_df['Date'])
permits_df.head()

Unnamed: 0,CSA,CBSA,Name,Total,1 Unit,2 Units,3 and 4 Units,5 Units or More,Num of Structures With 5 Units or More,Date
1,999.0,10180.0,"Abilene, TX ...",33.0,31.0,2.0,0.0,0.0,0.0,2021-02-01
2,184.0,10420.0,"Akron, OH ...",45.0,45.0,0.0,0.0,0.0,0.0,2021-02-01
3,999.0,10500.0,"Albany, GA ...",32.0,15.0,0.0,0.0,17.0,1.0,2021-02-01
4,440.0,10540.0,"Albany-Lebanon, OR ...",58.0,37.0,0.0,0.0,21.0,3.0,2021-02-01
5,104.0,10580.0,"Albany-Schenectady-Troy, NY ...",268.0,118.0,4.0,0.0,146.0,2.0,2021-02-01


In [45]:
ZHVI_msas = ZHVI_dfs[0]['RegionName'].unique().tolist()
msa_msas = msa_dfs[0]['Name'].str.strip().unique().tolist()


combo_list = [city for city in ZHVI_msas if city in msa_msas]
ZHVI_unique = [city for city in ZHVI_msas if city not in msa_msas]
msa_unique = [city for city in msa_msas if city not in ZHVI_msas]


In [46]:
for item in msa_unique:
    state = item[-2:]
    for city in ZHVI_unique:
        if city[-2:] == state:
            continue
      # print(item)
      # print(city)
      # print()


In [19]:
ZHVI_state_dictionary = {}
for city in ZHVI_unique:
    if city[-2:] not in ZHVI_state_dictionary.keys():
        ZHVI_state_dictionary[city[-2:]] = [city]
    else:
        ZHVI_state_dictionary[city[-2:]].append(city)

In [15]:
print(ZHVI_state_dictionary['NJ'])

['Trenton, NJ', 'Atlantic City, NJ', 'Vineland, NJ']


In [21]:
msa_state_dictionary = {}
for city in msa_unique:
    if city[-2:] not in msa_state_dictionary.keys():
        msa_state_dictionary[city[-2:]] = [city]
    else:
        msa_state_dictionary[city[-2:]].append(city)

In [22]:
multi_state_msa = [city for city in msa_msas if '-' in city]

In [23]:
def split_cities(l):
    individual_cities = []
    multis = []
    for cities in l:
        c,s = cities.split(',')
        cs = c.split('-')
        ss = s.split('-')

    for c in cs:
        for s in ss:
            city = c + ', ' + s.strip()
            individual_cities.append(city)
            multis.append(cities)
    return individual_cities, multis

In [24]:
ind_cities, multis = split_cities(multi_state_msa)

In [25]:
cities_dict = {}
combo_cities = []
matched_cities = []

for city, combo in zip(ind_cities,multis):
    try:
        if city in ZHVI_state_dictionary[city[-2:]]:
            matched_cities.append(city)
            combo_cities.append(combo)
            cities_dict[combo] = city
    except: continue

{'Youngstown-Warren-Boardman, OH-PA': 'Warren, PA'}


In [26]:
permits_df.head()

Unnamed: 0,CSA,CBSA,Name,Total,1 Unit,2 Units,3 and 4 Units,5 Units or More,Num of Structures With 5 Units or More,Date,RegionName
1,999.0,10180.0,"Abilene, TX ...",33.0,31.0,2.0,0.0,0.0,0.0,2021-02-01,"Abilene, TX ..."
2,184.0,10420.0,"Akron, OH ...",45.0,45.0,0.0,0.0,0.0,0.0,2021-02-01,"Akron, OH ..."
3,999.0,10500.0,"Albany, GA ...",32.0,15.0,0.0,0.0,17.0,1.0,2021-02-01,"Albany, GA ..."
4,440.0,10540.0,"Albany-Lebanon, OR ...",58.0,37.0,0.0,0.0,21.0,3.0,2021-02-01,"Albany-Lebanon, OR ..."
5,104.0,10580.0,"Albany-Schenectady-Troy, NY ...",268.0,118.0,4.0,0.0,146.0,2.0,2021-02-01,"Albany-Schenectady-Troy, NY ..."


In [22]:
city_df = pd.DataFrame()
city_df['multi'] = combo_cities
city_df['corresponding'] = matched_cities

city_df.to_csv('citi_matches.csv', index = False)

In [41]:
zhvi_msa_set = set(ZHVI_df.RegionName.unique())
permit_msa_set = set(permits_df.Name.unique())

msa_intersection = zhvi_msa_set.intersection(permit_msa_set)
unmatched_zhvi_msas = zhvi_msa_set.difference(permit_msa_set)
unmatched_permit_msas = permit_msa_set.difference(zhvi_msa_set)

# Set of tuples of form (ZHVI msa, Census MSA)
msa_matches = set()

for matched_msa in msa_intersection:
    msa_matches.add((matched_msa, matched_msa))
    
print((f"Matched: {len(msa_matches)}, Unmatched Zillow: {len(unmatched_zhvi_msas)}, "
       f"Unmatched Census {len(unmatched_permit_msas)}"))

Matched: 216, Unmatched Zillow: 682, Unmatched Census 168


In [42]:
for uzm in unmatched_zhvi_msas:
    city, state = uzm.split(", ")
    

{'Harrisburg, PA', 'Warren, PA', 'Laramie, WY', 'Cleveland, OH', 'Warsaw, IN', 'Austin, TX', 'Lawrenceburg, TN', 'Marquette, MI', 'Madison, IN', 'Liberal, KS', 'Emporia, KS', 'Washington, DC', 'Huntingdon, PA', 'New York, NY', 'Clearlake, CA', 'Pontiac, IL', 'Gloversville, NY', 'Spencer, IA', 'Altus, OK', 'Milwaukee, WI', 'Urbana, OH', 'Carlsbad, NM', 'Montrose, CO', 'Findlay, OH', 'Marshall, MO', 'Palatka, FL', 'Vernon, TX', 'Jefferson, GA', 'Quincy, IL', 'Marion, OH', 'Talladega, AL', 'Columbus, NE', 'Gallup, NM', 'Storm Lake, IA', 'Lebanon, NH', 'New Castle, IN', 'Kennewick, WA', 'Fort Leonard Wood, MO', 'Whitewater, WI', 'Show Low, AZ', 'Ottawa, KS', 'Durham, NC', 'Arcadia, FL', 'Bardstown, KY', 'Lewisburg, TN', 'Middlesborough, KY', 'Myrtle Beach, SC', 'Ca-¦on City, CO', 'North Port, FL', 'Rexburg, ID', 'Huntington, WV', 'Sebastian, FL', 'Utica, NY', 'Salina, KS', 'Seattle, WA', 'San Luis Obispo, CA', 'Wheeling, WV', 'Kapaa, HI', 'Brookings, OR', 'Oil City, PA', 'Vidalia, GA', 'Ca