In [None]:
import os
import re

import pandas as pd
import numpy as np
import altair as alt
import datetime as dt

In [None]:
# os.listdir returns entries in arbitrary order. Sort the names so that cells
# which index the loaded lists by position (e.g. msa_dfs[0], ZHVI_dfs[0]) see
# the same file on every run and on every machine.
msa_files = [os.path.join("msa", file_path) for file_path in sorted(os.listdir('msa'))]
ZHVI_files = [os.path.join('zillow', file_path) for file_path in sorted(os.listdir('zillow'))]

In [None]:
# Load every permit workbook. The first 7 spreadsheet rows are skipped on read,
# and the first parsed row is then dropped as well.
msa_dfs = []
for workbook_path in msa_files:
    sheet = pd.read_excel(workbook_path, skiprows=7)
    msa_dfs.append(sheet[1:])
print(msa_files)

In [None]:
print(msa_dfs[0].columns)

# Columns retained from each permit workbook.
msa_cols = [
    'CSA',
    'CBSA',
    'Name',
    'Total',
    '1 Unit',
    '2 Units',
    '3 and 4 Units',
    '5 Units or More',
    'Num of Structures With 5 Units or More',
]
# Narrow every frame to those columns, in that order.
msa_dfs = [permit_frame[msa_cols] for permit_frame in msa_dfs]

msa_dfs[0].head()

In [None]:
# Show the column labels of the first permit frame.
print(msa_dfs[0].columns)

In [None]:
# Tag each permit frame with the date token embedded in its file path: the text
# between the last '_' and the last '.'.
# assign() returns a new frame instead of writing a column into the
# column-subset frames, which avoids pandas' SettingWithCopyWarning and keeps
# this cell safe to re-run.
msa_dfs = [
    df.assign(Date=name[name.rfind('_') + 1:name.rfind('.')])
    for df, name in zip(msa_dfs, msa_files)
]

# Preview the last tagged frame.
msa_dfs[-1].head()

In [None]:
# Stack all permit frames, discard every column that contains a missing value,
# then trim surrounding whitespace from the area names.
permits_df = pd.concat(msa_dfs)
permits_df = permits_df.dropna(axis=1)
permits_df.Name = permits_df.Name.apply(lambda label: label.strip())

permits_df

In [None]:
# Read each Zillow CSV and keep only the rows whose RegionType is 'msa'.
ZHVI_dfs = []
for csv_path in ZHVI_files:
    zillow_frame = pd.read_csv(csv_path)
    is_msa = zillow_frame['RegionType'] == 'msa'
    ZHVI_dfs.append(zillow_frame[is_msa])
ZHVI_dfs[0].head()

In [None]:
# Record which file each Zillow frame came from: the file name without its
# directory or extension. os.path is used instead of slicing on '/' and a fixed
# four-character suffix, so the result does not depend on the platform's path
# separator or on the extension length.
# assign() builds new frames rather than writing into the filtered frames,
# which avoids pandas' SettingWithCopyWarning.
ZHVI_dfs = [
    df.assign(filename=os.path.splitext(os.path.basename(f))[0])
    for df, f in zip(ZHVI_dfs, ZHVI_files)
]
ZHVI_dfs[-1].head()

In [None]:
# Stack the per-file Zillow frames into a single frame.
ZHVI_df = pd.concat(ZHVI_dfs)

In [None]:
# List the column labels of the combined Zillow frame.
print(ZHVI_df.columns)

In [None]:
# Display the combined Zillow frame.
ZHVI_df

In [None]:
# for df in ZHVI_dfs:
#     columns = df.columns.tolist()
#     new_columns = []
    
#     for column in columns:
#         if column.startswith('2'):
#             new_column = str(column)
#             # Joan: can you please explain what this line does?
#             new_columns.append(
#                 # What does this expression mean?
#                 new_column[:new_column.find('-')] +
#                 # And this one?
#                 new_column[new_column.find('-') + 1:new_column.rfind('-')]
#             )
#         else: new_columns.append(column)
#     df.columns = new_columns

In [None]:
# Keep an independent copy of ZHVI_df as it stands before the next cell
# rebuilds and reshapes it.
# NOTE(review): dumb_df is not referenced again in the visible cells — confirm
# whether it is still needed.
dumb_df = ZHVI_df.copy()
dumb_df.head()

In [None]:
# Re-concatenate the per-file frames so the reshape below always starts from
# the wide layout, even when this cell is run again.
ZHVI_df = pd.concat(ZHVI_dfs)

# Split the columns into date-named value columns and identifier columns.
date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}')
date_columns = []
other_columns = []
for column in ZHVI_df.columns:
    if date_pattern.match(column):
        date_columns.append(column)
    else:
        other_columns.append(column)

# Reshape wide -> long: one row per identifier combination and Date, with the
# cell value stored in Price.
ZHVI_df = ZHVI_df.melt(
    id_vars=other_columns,
    value_vars=date_columns,
    var_name='Date',
    value_name='Price',
)

# ZHVI_df.dropna(subset=['Price',], axis='columns', inplace=True)
# Year and Month are the first two '-'-separated fields of Date, as integers.
ZHVI_df['Year'] = ZHVI_df.Date.apply(lambda stamp: int(stamp.split("-")[0]))
ZHVI_df['Month'] = ZHVI_df.Date.apply(lambda stamp: int(stamp.split("-")[1]))

ZHVI_df.head(10)

In [None]:
# The datetime conversion below is disabled, so 'Date' keeps whatever string
# token was taken from the file name.
# permits_df['Date'] = pd.to_datetime(permits_df['Date'])
permits_df.head()

In [None]:
# Compare area names between the first Zillow frame and the first permit frame.
ZHVI_msas = ZHVI_dfs[0]['RegionName'].unique().tolist()
msa_msas = msa_dfs[0]['Name'].str.strip().unique().tolist()

# Set lookups make each membership test constant-time. The comprehensions still
# iterate the lists, so the results keep their original ordering.
zhvi_name_lookup = set(ZHVI_msas)
permit_name_lookup = set(msa_msas)

# Names present in both sources, only in Zillow, and only in the permit data.
combo_list = [city for city in ZHVI_msas if city in permit_name_lookup]
ZHVI_unique = [city for city in ZHVI_msas if city not in permit_name_lookup]
msa_unique = [city for city in msa_msas if city not in zhvi_name_lookup]


In [None]:
# NOTE(review): this nested loop currently has no observable effect. The only
# statement in the inner body is `continue`, and the prints below are commented
# out, so nothing is collected or displayed; it only leaves item, state and
# city bound to their last values. Confirm whether it can be removed or whether
# the same-state comparison was meant to drive the prints.
for item in msa_unique:
    # Last two characters of the permit-area name, compared as a state code.
    state = item[-2:]
    for city in ZHVI_unique:
        if city[-2:] == state:
            continue
      # print(item)
      # print(city)
      # print()


In [None]:
# Group the Zillow-only names by their last two characters.
ZHVI_state_dictionary = {}
for city in ZHVI_unique:
    state_key = city[-2:]
    ZHVI_state_dictionary.setdefault(state_key, []).append(city)

In [None]:
# Spot-check the grouping for one key; raises KeyError if no Zillow-only name
# ends in 'NJ'.
print(ZHVI_state_dictionary['NJ'])

In [None]:
# Group the permit-only names by their last two characters.
msa_state_dictionary = {}
for city in msa_unique:
    state_key = city[-2:]
    if state_key in msa_state_dictionary:
        msa_state_dictionary[state_key].append(city)
    else:
        msa_state_dictionary[state_key] = [city]

In [None]:
# Select every permit-area name that contains a hyphen anywhere in the string.
# NOTE(review): despite the variable name, the test is not limited to the state
# part, so hyphens in the city part also qualify a name — confirm this is intended.
multi_state_msa = [city for city in msa_msas if '-' in city]

In [None]:
def split_cities(l):
    """Expand hyphenated "City1-City2, ST1-ST2" names into single-city names.

    Each name is split at its last comma into a city part and a state part.
    Both parts are split on '-' and every (city, state) combination is emitted
    as "City, ST". Because the split is purely on '-', a single city whose own
    name contains a hyphen is also broken into pieces.

    Parameters
    ----------
    l : iterable of str
        Names with a comma between the city part and the state part.

    Returns
    -------
    individual_cities : list of str
        One "City, ST" entry per city/state combination, in input order.
    multis : list of str
        The original name each entry of ``individual_cities`` came from
        (same length, aligned by position).

    Raises
    ------
    ValueError
        If a name contains no comma.
    """
    individual_cities = []
    multis = []
    for cities in l:
        # Split on the last comma only, so a comma inside the city part does
        # not break the two-way unpacking.
        city_part, state_part = cities.rsplit(',', 1)
        # The expansion must run once per input name; running it after the
        # loop would only expand the final name.
        for c in city_part.split('-'):
            for s in state_part.split('-'):
                individual_cities.append(c.strip() + ', ' + s.strip())
                multis.append(cities)
    return individual_cities, multis

In [None]:
# Expand the hyphenated names: ind_cities holds the "City, ST" strings and
# multis holds, position by position, the name each one was derived from.
ind_cities, multis = split_cities(multi_state_msa)

In [None]:
# Match each expanded "City, ST" name against the Zillow-only names filed under
# the same two-character key. matched_cities and combo_cities are aligned by
# position; cities_dict maps a combined name to a matched city.
cities_dict = {}
combo_cities = []
matched_cities = []

for city, combo in zip(ind_cities, multis):
    # .get() handles keys with no Zillow-only names without a bare `except`,
    # which would also have hidden unrelated errors.
    if city in ZHVI_state_dictionary.get(city[-2:], []):
        matched_cities.append(city)
        combo_cities.append(combo)
        # A combined name with several matching cities keeps only the last one
        # here; the two lists above retain every match.
        cities_dict[combo] = city

In [None]:
# Preview the first rows of the combined permit frame.
permits_df.head()

In [None]:
# Pair each combined permit-area name with the Zillow name it matched, then
# write the pairs to disk without the index column.
city_df = pd.DataFrame({
    'multi': combo_cities,
    'corresponding': matched_cities,
})

city_df.to_csv('citi_matches.csv', index=False)

In [None]:
# Distinct area names on each side.
zhvi_msa_set = set(ZHVI_df.RegionName.unique())
permit_msa_set = set(permits_df.Name.unique())

# Names shared by both sources, and names found on one side only.
msa_intersection = zhvi_msa_set & permit_msa_set
unmatched_zhvi_msas = zhvi_msa_set - permit_msa_set
unmatched_permit_msas = permit_msa_set - zhvi_msa_set

# Set of tuples of form (ZHVI msa, Census MSA); exact matches pair a name with itself.
msa_matches = {(shared_name, shared_name) for shared_name in msa_intersection}

print(
    f"Matched: {len(msa_matches)}, Unmatched Zillow: {len(unmatched_zhvi_msas)}, "
    f"Unmatched Census {len(unmatched_permit_msas)}"
)

In [None]:
for uzm in unmatched_zhvi_msas:
    city, state = uzm.split(", ")
    