# Pare Down Data Scope for Project (CA vs. TX)
- look at all levels of granularity to determine what the State of California Coverage is: 
    - County
        - 58 total counties
        - 57 in the dataset (or 285,747 records)
    - City
        - 482 total cities
        - 807 in the dataset (or 201,417 records)
        - Note: Zillow draws no distinction b/t municipalities vs. cities therefore it's a pretty complete
    - Zip Code
        - 2.6K+ total zipcodes in CA
        - 1.3K in the dataset (or 332,663 records)
   
   
- Alternatively, consider paring down the scope of the work to Texas
    - County 
    - City
    - Zip

In [1]:
import pandas as pd 
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from collections import Counter
from pprint import pprint

import zipcodes

# Append the entire repo parent director so files therein can be accessed in notebook
import os
import sys
import pathlib
sys.path.append(str(pathlib.Path().absolute().parent))

from src import helper
%load_ext autoreload

### Import Data

In [2]:
county_ts = pd.read_csv('../data/raw/unzipped/County_time_series.csv/County_time_series.csv')
city_ts = pd.read_csv('../data/raw/unzipped/City_time_series.csv/City_time_series.csv')
zip_ts = pd.read_csv('../data/raw/unzipped/Zip_time_series.csv/Zip_time_series.csv')

fips_mapping = pd.read_pickle('../data/interim/fips_map.pickle')

In [3]:
county_ts.head(3)

Unnamed: 0,Date,RegionName,DaysOnZillow_AllHomes,InventorySeasonallyAdjusted_AllHomes,InventoryRaw_AllHomes,MedianListingPricePerSqft_1Bedroom,MedianListingPricePerSqft_2Bedroom,MedianListingPricePerSqft_3Bedroom,MedianListingPricePerSqft_4Bedroom,MedianListingPricePerSqft_5BedroomOrMore,...,ZHVI_BottomTier,ZHVI_CondoCoop,ZHVI_MiddleTier,ZHVI_SingleFamilyResidence,ZHVI_TopTier,ZRI_AllHomes,ZRI_AllHomesPlusMultifamily,ZriPerSqft_AllHomes,Zri_MultiFamilyResidenceRental,Zri_SingleFamilyResidenceRental
0,1996-04-30,10001,,,,,,,,,...,65600.0,,85700.0,86000.0,129700.0,,,,,
1,1996-04-30,10003,,,,,,,,,...,81900.0,,116800.0,116900.0,181400.0,,,,,
2,1996-04-30,1003,,,,,,,,,...,70600.0,,110600.0,106800.0,192300.0,,,,,


In [4]:
city_ts.head(3)

Unnamed: 0,Date,RegionName,InventorySeasonallyAdjusted_AllHomes,InventoryRaw_AllHomes,MedianListingPricePerSqft_1Bedroom,MedianListingPricePerSqft_2Bedroom,MedianListingPricePerSqft_3Bedroom,MedianListingPricePerSqft_4Bedroom,MedianListingPricePerSqft_5BedroomOrMore,MedianListingPricePerSqft_AllHomes,...,ZHVI_BottomTier,ZHVI_CondoCoop,ZHVI_MiddleTier,ZHVI_SingleFamilyResidence,ZHVI_TopTier,ZRI_AllHomes,ZRI_AllHomesPlusMultifamily,ZriPerSqft_AllHomes,Zri_MultiFamilyResidenceRental,Zri_SingleFamilyResidenceRental
0,1996-04-30,abbottstownadamspa,,,,,,,,,...,,,,,108700.0,,,,,
1,1996-04-30,aberdeenbinghamid,,,,,,,,,...,,,,,168400.0,,,,,
2,1996-04-30,aberdeenharfordmd,,,,,,,,,...,81300.0,137900.0,109600.0,108600.0,147900.0,,,,,


In [5]:
zip_ts.head(3)

Unnamed: 0,Date,RegionName,InventorySeasonallyAdjusted_AllHomes,InventoryRaw_AllHomes,MedianListingPricePerSqft_1Bedroom,MedianListingPricePerSqft_2Bedroom,MedianListingPricePerSqft_3Bedroom,MedianListingPricePerSqft_4Bedroom,MedianListingPricePerSqft_5BedroomOrMore,MedianListingPricePerSqft_AllHomes,...,ZHVI_BottomTier,ZHVI_CondoCoop,ZHVI_MiddleTier,ZHVI_SingleFamilyResidence,ZHVI_TopTier,ZRI_AllHomes,ZRI_AllHomesPlusMultifamily,ZriPerSqft_AllHomes,Zri_MultiFamilyResidenceRental,Zri_SingleFamilyResidenceRental
0,1996-04-30,1001,,,,,,,,,...,68700.0,67000.0,101900.0,107000.0,124800.0,,,,,
1,1996-04-30,1002,,,,,,,,,...,97000.0,81300.0,135200.0,145800.0,213600.0,,,,,
2,1996-04-30,1005,,,,,,,,,...,85400.0,,101200.0,101200.0,125700.0,,,,,


In [6]:
fips_mapping.head(3)

Unnamed: 0,RegionName,MetroName,StateName,CensusRegion,Unique_City_ID,City,County,State
0,10001,"Dover, DE",Delaware,South,highland_acreskentde,Highland Acres,Kent,DE
1,10001,"Dover, DE",Delaware,South,kentonkentde,Kenton,Kent,DE
2,10001,"Dover, DE",Delaware,South,farmingtonkentde,Farmington,Kent,DE


### Pare down to California at each level

#### County-Level Scope Down for California

In [7]:
county_ts_merged = county_ts.merge(fips_mapping,
                                    how='left',
                                    on='RegionName')

In [8]:
ca_county_ts = county_ts_merged[county_ts_merged['State'] == 'CA']
ca_county_ts.shape

(285747, 89)

![image.png](attachment:image.png)

In [9]:
ca_county_ts['RegionName'].nunique()

57

#### City-Level Scope Down for California

In [10]:
city_ts_merged = city_ts.merge(fips_mapping,
                              how='left',
                              left_on='RegionName',
                              right_on='Unique_City_ID')
print(city_ts.shape)
print(city_ts_merged.shape)

(3762566, 81)
(3762566, 89)


In [11]:
ca_city_ts = city_ts_merged[city_ts_merged['State'] == 'CA']
ca_city_ts.shape

(201417, 89)

![image.png](attachment:image.png)

In [12]:
ca_city_ts['Unique_City_ID'].nunique()

807

In [13]:
len(set([ca_cityID for ca_cityID in ca_city_ts['Unique_City_ID'] if ca_cityID[-2:] == 'ca' ]))

807

#### Zip-Level Scope Down for California
- The provided data only captures about 1.3K of possible 2.6K unique zip codes in the state of California
- Use the `zipcodes` package to see if I can link in the city data to see what we have in total for the state of California.

In [14]:
zip_ts['ZipCode_str'] = zip_ts['RegionName'].astype(str)
zip_ts['ZipCode_str'] = ['0' + zipcode if len(zipcode) == 4 else zipcode for zipcode in zip_ts['ZipCode_str']]
zip_ts.head(3)

Unnamed: 0,Date,RegionName,InventorySeasonallyAdjusted_AllHomes,InventoryRaw_AllHomes,MedianListingPricePerSqft_1Bedroom,MedianListingPricePerSqft_2Bedroom,MedianListingPricePerSqft_3Bedroom,MedianListingPricePerSqft_4Bedroom,MedianListingPricePerSqft_5BedroomOrMore,MedianListingPricePerSqft_AllHomes,...,ZHVI_CondoCoop,ZHVI_MiddleTier,ZHVI_SingleFamilyResidence,ZHVI_TopTier,ZRI_AllHomes,ZRI_AllHomesPlusMultifamily,ZriPerSqft_AllHomes,Zri_MultiFamilyResidenceRental,Zri_SingleFamilyResidenceRental,ZipCode_str
0,1996-04-30,1001,,,,,,,,,...,67000.0,101900.0,107000.0,124800.0,,,,,,1001
1,1996-04-30,1002,,,,,,,,,...,81300.0,135200.0,145800.0,213600.0,,,,,,1002
2,1996-04-30,1005,,,,,,,,,...,,101200.0,101200.0,125700.0,,,,,,1005


In [15]:
# California zipcodes all start with 9
potential_CA_zipcodes = [zipcode for zipcode in zip_ts['ZipCode_str'].unique().tolist() if zipcode[0] == '9']

# Filter down on these potential California zipcodes
maybe_ca_zip_ts = zip_ts[zip_ts['ZipCode_str'].isin(potential_CA_zipcodes)]
maybe_ca_zip_ts.shape

(511863, 77)

In [16]:
len(potential_CA_zipcodes)

2036

![image.png](attachment:image.png)

In [17]:
confirmed_CA_zips = []

for zipcode in potential_CA_zipcodes:
    zip_info = zipcodes.matching(zipcode)[0]
    if zip_info['state'] == 'CA':
        confirmed_CA_zips.append(zipcode)
    else:
        pass
    
len(confirmed_CA_zips)

1311

In [18]:
ca_zip_ts = maybe_ca_zip_ts[maybe_ca_zip_ts['ZipCode_str'].isin(confirmed_CA_zips)]
ca_zip_ts.shape

(332663, 77)

In [19]:
ca_zip_ts.head(3)

Unnamed: 0,Date,RegionName,InventorySeasonallyAdjusted_AllHomes,InventoryRaw_AllHomes,MedianListingPricePerSqft_1Bedroom,MedianListingPricePerSqft_2Bedroom,MedianListingPricePerSqft_3Bedroom,MedianListingPricePerSqft_4Bedroom,MedianListingPricePerSqft_5BedroomOrMore,MedianListingPricePerSqft_AllHomes,...,ZHVI_CondoCoop,ZHVI_MiddleTier,ZHVI_SingleFamilyResidence,ZHVI_TopTier,ZRI_AllHomes,ZRI_AllHomesPlusMultifamily,ZriPerSqft_AllHomes,Zri_MultiFamilyResidenceRental,Zri_SingleFamilyResidenceRental,ZipCode_str
11779,1996-04-30,90001,,,,,,,,,...,,114100.0,114100.0,128400.0,,,,,,90001
11780,1996-04-30,90002,,,,,,,,,...,,105700.0,105700.0,121300.0,,,,,,90002
11781,1996-04-30,90003,,,,,,,,,...,,103800.0,103800.0,124000.0,,,,,,90003


In [20]:
ca_county_ts.reset_index(drop=True, inplace=True)
ca_city_ts.reset_index(drop=True, inplace=True)
ca_zip_ts.reset_index(drop=True, inplace=True)

ca_county_ts.to_pickle('../data/interim/california-county-ts.pickle')
ca_city_ts.to_pickle('../data/interim/california-city-ts.pickle')
ca_zip_ts.to_pickle('../data/interim/california-zip-ts.pickle')

### ZipCode Data Expansion
**OBJECTIVE** Since there are only 1.3K zipcodes in the Zillow Data, try to use the city time series data to see if we can expand upon what's available to us in the zipcode regions. Explore to see if we can use latitude and longitude metadata to aid in this effort.

- The CA city data holds 110 unique cities more than the CA Zip data.
- Notably, sometimes a particular city can have > 1 zipcode. 
- This could be what's driving the difference between the two datasets.


In [21]:
ca_city_ts.head()

Unnamed: 0,Date,RegionName_x,InventorySeasonallyAdjusted_AllHomes,InventoryRaw_AllHomes,MedianListingPricePerSqft_1Bedroom,MedianListingPricePerSqft_2Bedroom,MedianListingPricePerSqft_3Bedroom,MedianListingPricePerSqft_4Bedroom,MedianListingPricePerSqft_5BedroomOrMore,MedianListingPricePerSqft_AllHomes,...,Zri_MultiFamilyResidenceRental,Zri_SingleFamilyResidenceRental,RegionName_y,MetroName,StateName,CensusRegion,Unique_City_ID,City,County,State
0,1996-04-30,actonlos_angelesca,,,,,,,,,...,,,6037.0,"Los Angeles-Long Beach-Anaheim, CA",California,West,actonlos_angelesca,Acton,Los Angeles,CA
1,1996-04-30,adelantosan_bernardinoca,,,,,,,,,...,,,6071.0,"Riverside, CA",California,West,adelantosan_bernardinoca,Adelanto,San Bernardino,CA
2,1996-04-30,agoura_hillslos_angelesca,,,,,,,,,...,,,6037.0,"Los Angeles-Long Beach-Anaheim, CA",California,West,agoura_hillslos_angelesca,Agoura Hills,Los Angeles,CA
3,1996-04-30,aguangariversideca,,,,,,,,,...,,,6065.0,"Riverside, CA",California,West,aguangariversideca,Aguanga,Riverside,CA
4,1996-04-30,ahwahneemaderaca,,,,,,,,,...,,,6039.0,"Madera, CA",California,West,ahwahneemaderaca,Ahwahnee,Madera,CA


In [22]:
# Get city names for the zip_ts data 
# Then check to see which cities do NOT overlap vs. those that DO overlap with that of city_time series

zip_city_map = dict()
zip_county_map = dict()
zip_latlong = dict()

for zipcode in ca_zip_ts['ZipCode_str'].unique():
    zipinfo = zipcodes.matching(zipcode)
    try: 
        for info in zipinfo:
            zip_city_map[zipcode] = info['city']
            zip_county_map[zipcode] = info['county']
            zip_latlong[zipcode] = {'lat' : info['lat'], 'long' : info['long']}
            
    except:
        pass

print(len(zip_city_map))
print(len(zip_county_map))
print(len(zip_latlong))

1311
1311
1311


In [23]:
ca_zip_ts['City'] = [zip_city_map[row.ZipCode_str] for row in ca_zip_ts.itertuples()]
ca_zip_ts['County'] = [zip_county_map[row.ZipCode_str] for row in ca_zip_ts.itertuples()]
ca_zip_ts['Lat'] = [zip_latlong[row.ZipCode_str]['lat'] for row in ca_zip_ts.itertuples()]
ca_zip_ts['Long'] = [zip_latlong[row.ZipCode_str]['long'] for row in ca_zip_ts.itertuples()]

In [24]:
len(set(ca_city_ts['City'].unique()) - (set(ca_zip_ts['City'].unique())))

110

In [25]:
# Cities missing from the ca_zip_ts table: 
(set(ca_city_ts['City'].unique()) - (set(ca_zip_ts['City'].unique())))

{'Alondra Park',
 'Alta',
 'Alum Rock',
 'Angels',
 'Annapolis',
 'Arden-Arcade',
 'August',
 'Avila Beach',
 'Avocado Heights',
 'Baker',
 'Bell',
 'Bell Canyon',
 'Belvedere',
 'Blairsden-Graeagle',
 'Bodfish',
 'Bolinas',
 'Broadmoor Village',
 'Buttonwillow',
 'Caliente',
 'Cambrian Park',
 'Canyon Lake',
 'Clio',
 'Commerce',
 'Coto de Caza',
 'Country Club',
 'Cudahy',
 'Del Monte Forest',
 'Del Rey Oaks',
 'Doyle',
 'East Foothills',
 'East La Mirada',
 'East Los Angeles',
 'East Palo Alto',
 'East Pasadena',
 'Easton',
 'Eastvale',
 'Edwards',
 'Emerald Lake Hills',
 'Essex',
 'Florence-Graham',
 'Florin',
 'Foster City',
 'Garden Acres',
 'Garden Valley',
 'Glen Avon',
 'Hidden Hills',
 'Hidden Meadows',
 'Highgrove',
 'Highlands-Baywood Park',
 'Hillsborough',
 'Home Gardens',
 'Homeland',
 'Hornbrook',
 'Kennedy',
 'Kensington',
 'Kentfield',
 'La Crescenta-Montrose',
 'La Habra Heights',
 'La Riviera',
 'Ladera Heights',
 'Lagunitas',
 'Lexington Hills',
 'Lincoln Village',

### Get Latitude Longitude Information for as many records as possible

### Check Data Sparsity Across All Sets
- County
- City
- Zip