### Import Packages

In [1]:
import pandas as pd 
import numpy as np
from zipfile import ZipFile
import json

import os

# Append the repo's parent directory so files therein can be accessed in the notebook
import sys
import pathlib
sys.path.append(str(pathlib.Path().absolute().parent))

from src import helper

### Data Dictionary

In [2]:
# Load the data dictionary (variable name -> definition) from the references
# folder; the file is read with latin-1 encoding.
data_dict = pd.read_csv('../references/DataDictionary.csv', encoding='latin-1')

# Widen the column display only while rendering this table so the long
# definition text is not truncated. option_context restores the previous
# setting on exit, even if display() raises, so no later cell inherits it.
with pd.option_context('display.max_colwidth', 1000):
    display(data_dict)

Unnamed: 0,Variable,Definition
0,AgeOfInventory,"Each Wednesday, age of inventory is calculated as the median number of days all active listings as of that Wednesday have been current. These medians are then aggregated into the number reported by taking the median across weekly values."
1,DaysOnZillow,"The median days on market of homes sold within a given month, including foreclosure re-sales. The latest data is for one month prior to the current ZHVI (e.g., if the most current month for ZHVI data is January, the most current month for Days on Zillow data will be December)."
2,HomesSoldAsForeclosuresRatio,"The number of homes (per 10,000 homes) that were foreclosed upon in a given month. A foreclosure occurs when a homeowner loses their home to their lending institution or it is sold to a third party at an auction."
3,InventorySeasonallyAdjusted,A seasonally adjusted measure of the median of weekly snapshot of for-sale homes within a region for a given month
4,InventoryRaw,Median of weekly snapshot of for-sale homes within a region for a given month
5,MedianListingPricePerSqft,Median of list prices divided by the square footage of a home
6,MedianListingPrice,Median of the list price (or asking price) for homes listed on Zillow
7,MedianPctOfPriceReduction,Median of the percentage price reduction for homes with a price reduction during the month
8,MedianPriceCutDollar,Median of the price reduction for homes with a price reduction during the month
9,MedianRentalPricePerSqft,Median of the rental price per square foot of homes listed for rent on Zillow in a given region


### Load Data

In [3]:
# Show what is available in the raw data directory, in sorted order.
raw_dir_entries = os.listdir('../data/raw')
raw_dir_entries.sort()
raw_dir_entries

['City_time_series.csv.zip',
 'CountyCrossWalk_Zillow.csv',
 'County_time_series.csv.zip',
 'Metro_time_series.csv.zip',
 'Neighborhood_time_series.csv.zip',
 'State_time_series.csv.zip',
 'Zip_time_series.csv.zip',
 'all_available_metrics.json',
 'cities_crosswalk.csv.zip',
 'fields_per_level.json',
 'unzipped']

In [4]:
# Load every raw table into a dict of DataFrames, keyed by the file name with
# its extension(s) stripped (e.g. 'State_time_series.csv.zip' -> 'State_time_series').
csv_dfs = {}

raw_dir = '../data/raw'
unzipped_dir = f'{raw_dir}/unzipped'

# Pass 1: extract the zipped CSVs and read the plain CSVs directly.
for filename in sorted(os.listdir(raw_dir)):
    # Match on the suffix (not a substring) so names that merely contain
    # '.csv' somewhere in the middle are not picked up by accident.
    if filename.endswith('.csv.zip'):
        with ZipFile(f'{raw_dir}/{filename}', 'r') as a_zip:
            # Each archive is extracted into unzipped/<name>.csv/ .
            a_zip.extractall(f'{unzipped_dir}/{filename[:-len(".zip")]}')

    elif filename.endswith('.csv'):
        csv_dfs[filename[:-len('.csv')]] = pd.read_csv(f'{raw_dir}/{filename}')

# Pass 2: read the extracted CSVs. The read path assumes every archive holds a
# CSV named like the archive minus '.zip' (unzipped/<name>.csv/<name>.csv).
# The directory only exists once at least one archive has been extracted.
extracted_names = sorted(os.listdir(unzipped_dir)) if os.path.isdir(unzipped_dir) else []
for filename in extracted_names:
    # Skip stray entries (e.g. OS metadata files) that are not extraction folders.
    if not filename.endswith('.csv'):
        continue
    # NOTE(review): engine='python' is kept as-is; the reason for it is not
    # visible here. Confirm it is still needed, since the default C engine is
    # generally faster on files of this size.
    csv_dfs[filename[:-len('.csv')]] = pd.read_csv(
        f'{unzipped_dir}/{filename}/{filename}', engine='python'
    )

In [5]:
# Confirm which tables were loaded; each key is a source file name with its
# extension(s) stripped.
csv_dfs.keys()

dict_keys(['CountyCrossWalk_Zillow', 'City_time_series', 'County_time_series', 'Metro_time_series', 'Neighborhood_time_series', 'State_time_series', 'Zip_time_series', 'cities_crosswalk'])

### Inspect Data
- CountyCrosswalk
- cities_crosswalk


- State_TS
- County_TS
- Metro_TS
- City_TS
- Neighborhood_TS
- Zip_TS

In [6]:
# Bind each loaded table to a short name for the inspection cells that follow.

# Crosswalk (lookup) tables.
county_crosswalk, cities_crosswalk = (
    csv_dfs['CountyCrossWalk_Zillow'],
    csv_dfs['cities_crosswalk'],
)

# Time-series tables, listed in the same order as the inspection sections below.
state_ts, county_ts, metro_ts, city_ts, neighborhood_ts, zip_ts = (
    csv_dfs['State_time_series'],
    csv_dfs['County_time_series'],
    csv_dfs['Metro_time_series'],
    csv_dfs['City_time_series'],
    csv_dfs['Neighborhood_time_series'],
    csv_dfs['Zip_time_series'],
)

#### County Crosswalk

In [7]:
# Summarize the county crosswalk table with the project helper; per the output
# below it prints a few sample rows followed by the DataFrame.info() report.
helper.high_level_inspect_df(county_crosswalk)

  CountyName     StateName  StateFIPS  CountyFIPS MetroName_Zillow  \
0       Pike  Pennsylvania         42         103     New York, NY   
1      Bronx      New York         36           5     New York, NY   
2      Essex    New Jersey         34          13     New York, NY   

                                CBSAName  CountyRegionID_Zillow  \
0  New York-Newark-Jersey City, NY-NJ-PA                    280   
1  New York-Newark-Jersey City, NY-NJ-PA                    401   
2  New York-Newark-Jersey City, NY-NJ-PA                    504   

   MetroRegionID_Zillow   FIPS  CBSACode  
0              394913.0  42103   35620.0  
1              394913.0  36005   35620.0  
2              394913.0  34013   35620.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3144 entries, 0 to 3143
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   CountyName             3144 non-null   object 
 1   StateName 

#### Cities Crosswalk

In [8]:
# Summarize the cities crosswalk table with the project helper; per the output
# below it prints sample rows, the info() report, describe() statistics and the
# number of unique values in each column.
helper.high_level_inspect_df(cities_crosswalk)

          Unique_City_ID         City     County State
0   oak_grovechristianky    Oak Grove  Christian    KY
1  jarvisburgcurritucknc   Jarvisburg  Currituck    NC
2   mcminnvilleyamhillor  McMinnville    Yamhill    OR
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25341 entries, 0 to 25340
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unique_City_ID  25341 non-null  object
 1   City            25341 non-null  object
 2   County          25341 non-null  object
 3   State           25341 non-null  object
dtypes: object(4)
memory usage: 792.0+ KB
None
                count unique              top  freq
Unique_City_ID  25341  25341  hansonhopkinsky     1
City            25341  16609         Franklin    28
County          25341   1652        Jefferson   392
State           25341     51               PA  2692
Unique_City_ID : 25341 unique values.
City : 16609 unique values.
County : 1652 unique values.
State 

#### State time series

In [9]:
# Summarize the state-level time series with the project helper (sample rows
# and summary statistics of the numeric columns, per the output below).
helper.high_level_inspect_df(state_ts)

         Date RegionName  DaysOnZillow_AllHomes  \
0  1996-04-30    Alabama                    NaN   
1  1996-04-30    Arizona                    NaN   
2  1996-04-30   Arkansas                    NaN   

   InventorySeasonallyAdjusted_AllHomes  InventoryRaw_AllHomes  \
0                                   NaN                    NaN   
1                                   NaN                    NaN   
2                                   NaN                    NaN   

   MedianListingPricePerSqft_1Bedroom  MedianListingPricePerSqft_2Bedroom  \
0                                 NaN                                 NaN   
1                                 NaN                                 NaN   
2                                 NaN                                 NaN   

   MedianListingPricePerSqft_3Bedroom  MedianListingPricePerSqft_4Bedroom  \
0                                 NaN                                 NaN   
1                                 NaN                              

                                       count          mean           std  \
DaysOnZillow_AllHomes                 4845.0    110.117363     27.474195   
InventorySeasonallyAdjusted_AllHomes  4896.0  33292.662786  34926.704422   
InventoryRaw_AllHomes                 4896.0  33299.013685  35014.163714   
MedianListingPricePerSqft_1Bedroom    3586.0    182.470536     99.809488   
MedianListingPricePerSqft_2Bedroom    4534.0    135.490657     74.250919   
...                                      ...           ...           ...   
ZRI_AllHomes                          4254.0   1321.329807    371.006175   
ZRI_AllHomesPlusMultifamily           4336.0   1318.055581    368.783157   
ZriPerSqft_AllHomes                   4336.0      0.929661      0.284986   
Zri_MultiFamilyResidenceRental        4336.0   1233.050277    369.426763   
Zri_SingleFamilyResidenceRental       4254.0   1327.520686    383.309444   

                                             min          25%           50%  \
DaysOnZi

In [10]:
# Report how many rows each state contributes to the state-level time series.
states = state_ts['RegionName'].unique().tolist()

# Count the rows per state in a single pass instead of re-filtering the whole
# frame once per state. sort=False keeps the groups in first-appearance order.
records_per_state = state_ts.groupby('RegionName', sort=False).size()

for state in states:
    # .get(..., 0) mirrors the original filter, which yields 0 rows for a value
    # that never compares equal to itself (e.g. a missing RegionName).
    print(f'There are {records_per_state.get(state, 0)} records in {state}.')

There are 261 records in Alabama.
There are 261 records in Arizona.
There are 261 records in Arkansas.
There are 261 records in California.
There are 261 records in Colorado.
There are 261 records in Connecticut.
There are 261 records in Delaware.
There are 261 records in Florida.
There are 261 records in Georgia.
There are 261 records in Hawaii.
There are 261 records in Idaho.
There are 261 records in Illinois.
There are 261 records in Indiana.
There are 261 records in Iowa.
There are 261 records in Kansas.
There are 261 records in Kentucky.
There are 261 records in Louisiana.
There are 261 records in Maine.
There are 261 records in Maryland.
There are 261 records in Massachusetts.
There are 261 records in Michigan.
There are 261 records in Minnesota.
There are 261 records in Mississippi.
There are 261 records in Missouri.
There are 261 records in Nebraska.
There are 261 records in Nevada.
There are 261 records in NewHampshire.
There are 261 records in NewJersey.
There are 261 records

#### County time series

In [11]:
# Summarize the county-level time series with the project helper (sample rows
# and summary statistics of the numeric columns, per the output below).
# NOTE(review): RegionName shows up in the numeric summary, so it was parsed as
# a number. If these are county FIPS codes, leading zeros are lost (e.g. 1003)
# -- confirm whether the column should be read as a string instead.
helper.high_level_inspect_df(county_ts)

         Date  RegionName  DaysOnZillow_AllHomes  \
0  1996-04-30       10001                    NaN   
1  1996-04-30       10003                    NaN   
2  1996-04-30        1003                    NaN   

   InventorySeasonallyAdjusted_AllHomes  InventoryRaw_AllHomes  \
0                                   NaN                    NaN   
1                                   NaN                    NaN   
2                                   NaN                    NaN   

   MedianListingPricePerSqft_1Bedroom  MedianListingPricePerSqft_2Bedroom  \
0                                 NaN                                 NaN   
1                                 NaN                                 NaN   
2                                 NaN                                 NaN   

   MedianListingPricePerSqft_3Bedroom  MedianListingPricePerSqft_4Bedroom  \
0                                 NaN                                 NaN   
1                                 NaN                          

                                         count          mean           std  \
RegionName                            518791.0  30740.207658  15197.121398   
DaysOnZillow_AllHomes                  43692.0    112.776313     33.901225   
InventorySeasonallyAdjusted_AllHomes  177024.0    853.768884   1830.864948   
InventoryRaw_AllHomes                 177024.0    853.934393   1835.198584   
MedianListingPricePerSqft_1Bedroom      9432.0    246.774545    185.323088   
...                                        ...           ...           ...   
ZRI_AllHomes                          150077.0   1163.598939    347.136641   
ZRI_AllHomesPlusMultifamily           151251.0   1157.171807    339.960046   
ZriPerSqft_AllHomes                   139194.0      0.790006      0.264212   
Zri_MultiFamilyResidenceRental         97008.0   1083.978713    341.784842   
Zri_SingleFamilyResidenceRental       150983.0   1171.543114    366.933723   

                                              min          25% 

#### Metro time series

In [12]:
# Summarize the metro-level time series with the project helper (sample rows
# and summary statistics of the numeric columns, per the output below).
helper.high_level_inspect_df(metro_ts)

         Date RegionName  AgeOfInventory  DaysOnZillow_AllHomes  \
0  1996-04-30      10180             NaN                    NaN   
1  1996-04-30      10220             NaN                    NaN   
2  1996-04-30      10300             NaN                    NaN   

   InventorySeasonallyAdjusted_AllHomes  InventoryRaw_AllHomes  \
0                                   NaN                    NaN   
1                                   NaN                    NaN   
2                                   NaN                    NaN   

   InventorySeasonallyAdjusted_BottomTier  \
0                                     NaN   
1                                     NaN   
2                                     NaN   

   InventorySeasonallyAdjusted_MiddleTier  \
0                                     NaN   
1                                     NaN   
2                                     NaN   

   InventorySeasonallyAdjusted_TopTier  MedianListingPricePerSqft_1Bedroom  \
0                         

                                          count         mean           std  \
AgeOfInventory                          24696.0    90.707766     28.587343   
DaysOnZillow_AllHomes                   24982.0   112.709322     31.178244   
InventorySeasonallyAdjusted_AllHomes    68352.0  4508.221983  65064.674744   
InventoryRaw_AllHomes                   68352.0  4509.104269  65138.626469   
InventorySeasonallyAdjusted_BottomTier  39467.0  1764.748803  20065.256544   
...                                         ...          ...           ...   
ZRI_AllHomes                            58028.0  1135.414645    315.022427   
ZRI_AllHomesPlusMultifamily             58221.0  1128.008348    307.588897   
ZriPerSqft_AllHomes                     56812.0     0.775399      0.212946   
Zri_MultiFamilyResidenceRental          46731.0  1019.547859    287.047627   
Zri_SingleFamilyResidenceRental         58004.0  1137.873285    324.197993   

                                            min     25%       5

#### City time series

In [13]:
# Summarize the city-level time series with the project helper (sample rows
# and summary statistics of the numeric columns, per the output below).
# NOTE(review): RegionName values here look like the Unique_City_ID values of
# cities_crosswalk -- presumably the join key; verify before merging.
helper.high_level_inspect_df(city_ts)

         Date          RegionName  InventorySeasonallyAdjusted_AllHomes  \
0  1996-04-30  abbottstownadamspa                                   NaN   
1  1996-04-30   aberdeenbinghamid                                   NaN   
2  1996-04-30   aberdeenharfordmd                                   NaN   

   InventoryRaw_AllHomes  MedianListingPricePerSqft_1Bedroom  \
0                    NaN                                 NaN   
1                    NaN                                 NaN   
2                    NaN                                 NaN   

   MedianListingPricePerSqft_2Bedroom  MedianListingPricePerSqft_3Bedroom  \
0                                 NaN                                 NaN   
1                                 NaN                                 NaN   
2                                 NaN                                 NaN   

   MedianListingPricePerSqft_4Bedroom  \
0                                 NaN   
1                                 NaN   
2         

                                          count         mean         std  \
InventorySeasonallyAdjusted_AllHomes   771744.0   163.438201  451.979813   
InventoryRaw_AllHomes                  771744.0   163.469371  452.845056   
MedianListingPricePerSqft_1Bedroom       7670.0   260.912038  169.798120   
MedianListingPricePerSqft_2Bedroom      74574.0   167.764079  118.391115   
MedianListingPricePerSqft_3Bedroom     250633.0   145.076803   93.495189   
...                                         ...          ...         ...   
ZRI_AllHomes                          1098167.0  1402.930083  750.948172   
ZRI_AllHomesPlusMultifamily           1100850.0  1391.670705  728.289036   
ZriPerSqft_AllHomes                   1009397.0     0.910898    0.336117   
Zri_MultiFamilyResidenceRental         450219.0  1322.360111  528.027246   
Zri_SingleFamilyResidenceRental       1098919.0  1424.704015  808.380067   

                                             min         25%          50%  \
InventoryS

#### Neighborhood time series

In [14]:
# Summarize the neighborhood-level time series with the project helper (sample
# rows and summary statistics of the numeric columns, per the output below).
helper.high_level_inspect_df(neighborhood_ts)

         Date RegionName  InventorySeasonallyAdjusted_AllHomes  \
0  1996-04-30      10007                                   NaN   
1  1996-04-30      10329                                   NaN   
2  1996-04-30     104898                                   NaN   

   InventoryRaw_AllHomes  MedianListingPricePerSqft_1Bedroom  \
0                    NaN                                 NaN   
1                    NaN                                 NaN   
2                    NaN                                 NaN   

   MedianListingPricePerSqft_2Bedroom  MedianListingPricePerSqft_3Bedroom  \
0                                 NaN                                 NaN   
1                                 NaN                                 NaN   
2                                 NaN                                 NaN   

   MedianListingPricePerSqft_4Bedroom  \
0                                 NaN   
1                                 NaN   
2                                 NaN   

   M

                                         count         mean          std  \
InventorySeasonallyAdjusted_AllHomes  201408.0    78.207301   116.279778   
InventoryRaw_AllHomes                 201408.0    78.220940   116.704730   
MedianListingPricePerSqft_1Bedroom      2780.0   485.845341   448.100647   
MedianListingPricePerSqft_2Bedroom     11473.0   353.179001   416.904876   
MedianListingPricePerSqft_3Bedroom     34393.0   195.918099   260.892989   
...                                        ...          ...          ...   
ZRI_AllHomes                          554742.0  1609.351226  1007.138431   
ZRI_AllHomesPlusMultifamily           554828.0  1581.790773   974.824301   
ZriPerSqft_AllHomes                   546398.0     1.129954     0.592957   
Zri_MultiFamilyResidenceRental        335732.0  1431.808993   689.340929   
Zri_SingleFamilyResidenceRental       539670.0  1686.849315  1150.181356   

                                             min          25%          50%  \
Inventory

#### Zip time series

In [15]:
# Summarize the zip-level time series with the project helper (sample rows and
# summary statistics of the numeric columns, per the output below).
# NOTE(review): RegionName shows up in the numeric summary, so it was parsed as
# a number. If these are zip codes, leading zeros are lost (e.g. 1001) --
# confirm whether the column should be read as a string instead.
helper.high_level_inspect_df(zip_ts)

         Date  RegionName  InventorySeasonallyAdjusted_AllHomes  \
0  1996-04-30        1001                                   NaN   
1  1996-04-30        1002                                   NaN   
2  1996-04-30        1005                                   NaN   

   InventoryRaw_AllHomes  MedianListingPricePerSqft_1Bedroom  \
0                    NaN                                 NaN   
1                    NaN                                 NaN   
2                    NaN                                 NaN   

   MedianListingPricePerSqft_2Bedroom  MedianListingPricePerSqft_3Bedroom  \
0                                 NaN                                 NaN   
1                                 NaN                                 NaN   
2                                 NaN                                 NaN   

   MedianListingPricePerSqft_4Bedroom  \
0                                 NaN   
1                                 NaN   
2                                 NaN   



                                          count          mean           std  \
RegionName                            4383885.0  46837.261770  28833.342816   
InventorySeasonallyAdjusted_AllHomes  1051104.0    123.049029    117.287520   
InventoryRaw_AllHomes                 1051104.0    123.072143    118.017261   
MedianListingPricePerSqft_1Bedroom       8194.0    322.914946    301.027216   
MedianListingPricePerSqft_2Bedroom      86554.0    188.769142    186.646266   
...                                         ...           ...           ...   
ZRI_AllHomes                          1337362.0   1429.687799    719.716319   
ZRI_AllHomesPlusMultifamily           1339353.0   1414.117267    691.326536   
ZriPerSqft_AllHomes                   1253969.0      0.948460      0.424044   
Zri_MultiFamilyResidenceRental         723542.0   1325.079733    568.827402   
Zri_SingleFamilyResidenceRental       1334321.0   1463.457235    807.792422   

                                             min   