In [56]:
import metatab as mt

doc = mt.open_package('../_packages/cdss.ca.gov-residential_care_facilities-2017-ca-6')
assert int(doc.get_value('Root.Version')) >=6 
geocodes = doc.resource('geocodes')
facilities = doc.resource('facilities')
fac_df = facilities.dataframe()
geo_df = geocodes.dataframe()

facg_df = fac_df.merge(geo_df, left_on='facility_number', right_on='unique_id')

facg_df.head()

In [58]:
facg_df.match.value_counts()


Match          10675
No_Match         558
Tie               72
Parse Error        5
Name: match, dtype: int64

In [61]:
# How many of the entries do not have tracts?
facg_df[facg_df.tract_fips.isnull()]

Unnamed: 0,facility_type,facility_number,facility_name,licensee,facility_administrator,facility_telephone_number,facility_address,facility_city,facility_state,facility_zip,...,match_address,lat,lon,tiger_id,side_of_street,state_fips,county_fips,tract_fips,block_fips,tract_geoid
713,RESIDENTIAL CARE ELDERLY,306002321,ALISO LAGUNA VILLA,"PRISM HEALTH, LLC",ROBERT GREGORY,(949) 425-8300,24552 PACIFIC PARK,ALISO VIEJO,CA,92600.0,...,,,,,,,,,,
1374,RESIDENTIAL CARE ELDERLY,366426705,AT HOME CARE - COCQUI,"AT HOME CARE, INC.","SANTA MARIA, PATRICIA",(760) 880-2227,18609 COCQUI RD.,APPLE VALLEY,CA,92207.0,...,,,,,,,,,,


In [16]:
zip_xwalk_doc = mt.open_package('http://s3.amazonaws.com/library.metatab.org/huduser.gov-zip_tract-2016-2.csv')
zip_xwalk_doc

In [17]:

zip_xwalk

Header,Type,Description
zip,integer,5 digit USPS ZIP code
tract,integer,11 digit unique 2000 or 2010 Census tract GEOID consisting of state FIPS + county FIPS + tract code. The decimal is implied and leading and trailing zeros have been preserved.
res_ratio,number,"The ratio of residential addresses in the ZIP – Tract, County, or CBSA part to the total number of residential addresses in the entire ZIP."
bus_ratio,number,"The ratio of business addresses in the ZIP – Tract, County, or CBSA part to the total number of business addresses in the entire ZIP."
oth_ratio,number,"The ratio of other addresses in the ZIP – Tract, County, or CBSA part to the total number of other addresses in the entire ZIP."
tot_ratio,number,"The ratio of all addresses in the ZIP – Tract, County, or CBSA part to the total number of all types of addresses in the entire ZIP."


In [18]:
zip_xwalk_df = zip_xwalk.dataframe()
zip_xwalk_df.head()

Unnamed: 0,zip,tract,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,501,36103158607,0.0,1.0,0.0,1.0
1,601,72001956600,0.136735,0.358911,0.347222,0.15427
2,601,72001956700,0.859996,0.636139,0.652778,0.842387
3,601,72001956300,0.003269,0.00495,0.0,0.003343
4,602,72003430300,0.037744,0.2725,0.315789,0.055036


In [None]:
zip_xwalk_doc = mt.open_package('http://library.metatab.org/huduser.gov-zip_tract-2016-2.csv')
zip_xwalk = zip_xwalk_doc.resource('zip-tract')
zip_xwalk_df = zip_xwalk.dataframe()
zx_groups = zip_xwalk_df.sort_values('res_ratio').groupby('zip')

In [41]:
def make_zip_map():
    """Create a map from zip to track that uses the HUD zip-tract cross walk as a probablilty
    map, with the facility it used as the probability. Using the facility ID makes the mapping stable. """

    def make_single_zip_map_f(groups, zip):
        """Function to create a closure for mapping for a single zip, from an id value to 
         tract"""
        import numpy as np
        import pandas as pd

        # Use the resigential ratios, the portion of the homes in the zip that are in each tract. 
        res_ratios = list(zx_groups.get_group(zip).cumsum().res_ratio)
        tracts = list(zx_groups.get_group(zip).tract)
        
        assert len(res_ratios) == len(tracts)

        def _f(id):
            # Use the end of the ID value to ensure repeadability
            n = float(id%100) / 100.0
            index = np.argmax(pd.Series(res_ratios) > n)

            return tracts[index]

        return _f
    
    f_map = {}
    
    # dict that returns, for each zip, the function to get a tract for the id number. 
    for zp in zx_groups.groups.keys():
        f_map[zp] = make_single_zip_map_f(zx_groups, zp)
        

    # Finally, put it all together in a single clsure. 
    def lookup(zip, n):

        try:
            print (int(zip), int(n)%100 / 100.0, )
            return str(f_map[int(zip)](int(n)%100 / 100.0))
        except KeyError:
            return None
        

    return lookup

zip_to_tract = make_zip_map() 

In [42]:
print(zip_to_tract(623, 40))

623 0.4
72023830102
