# Setup

In [1]:
# Show the kernel's working directory: every `../data/...` path used below is resolved relative to it.
pwd

'/home/umni2/a/umnilab/users/verma99/mk/spr_4711/code'

In [2]:
from mobilkit.umni import *
from project import *

In [3]:
import urllib.request
from zipfile import ZipFile

import fiona
import osmnx
import yaml

In [4]:
import ssl
# NOTE(review): this swaps the default HTTPS context factory for the unverified one, so TLS
# certificates are NOT validated for any later `urllib` HTTPS request made in this kernel
# (including the Census downloads below). Presumably a workaround for a local certificate
# problem -- confirm it is still needed, and prefer fixing the CA bundle over disabling checks.
ssl._create_default_https_context = ssl._create_unverified_context

In [5]:
# Start the session held by `SP` (a name brought in by the star imports above); judging by the
# log lines this cell emits, this launches Spark -- confirm in the project module.
SP.start()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/29 13:24:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/09/29 13:24:38 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [6]:
# Spatial scales of the maps, using the same codes that `get_tiger_zones` accepts:
# 'BG' (block group), 'COUNTY' and 'TRACT'.
SCALES = ('BG', 'COUNTY', 'TRACT')

# Zones
- The **UMN AAA data** uses the **2010** definitions of the census zones.
- **POI accessibility** uses travel times between zone centroids. The travel times obtained from Google Distance Matrix are for the **2010** zones.
- The **EJ Screen data** is available from 2015 to 2023. For the analysis, the **2020** data is used.

## Download
<!-- Note that the TIGER/LINE boundaries and ACS data of the subdivisions of Indiana at different scales were already downloaded in [../../spr_4608/code/1_Geometry_ACS.ipynb](../../spr_4608/code/1_Geometry_ACS.ipynb), but they were for the year 2020 (I think). The files were then directly copied from `$MK/spr_4608/data/geometry` to `$MK/spr_4711/data/acs`. -->

<!-- However, since the UMN access data used the 2010 definitions, they are downloaded explicitly from the Census website. -->

In [7]:
def get_tiger_zones(scale, year, state='Indiana', name_col='name',
                    save=True, overwrite=False):
    """Download the TIGER/Line zone boundaries of one state at one scale.

    The shapefile archive is fetched from www2.census.gov into a temporary
    zip under `../data/`, read with geopandas, reprojected to `CRS_DEG`
    (defined outside this block), reduced to a fixed set of columns and,
    optionally, cached as `../data/zones/{scale}_{year}.parquet`.

    Parameters
    ----------
    scale : str
        One of 'TABBLOCK', 'BG', 'TAZ', 'TRACT', 'COUNTY', 'CBSA'.
    year : int
        Vintage of the boundaries. Years up to 2015 use the older
        `geo/pvs/tiger{year}st` URL layout, whose column names carry a
        two-digit year suffix that is stripped here.
    state : str
        State name, looked up (upper-cased) in `mk.geo.US_STATES_FIPS`.
    name_col : str
        Source column (lower-cased) to expose as `name` in the result.
    save : bool
        Whether to write the result to the parquet cache.
    overwrite : bool
        If False and the cache file exists, nothing is downloaded.

    Returns
    -------
    GeoDataFrame with columns geoid, name, aland, awater, geometry. When the
    cache is reused, the cached file is loaded and returned so that the
    return type does not depend on whether a download happened.
    """
    # Validate first so a bad scale fails before any path or network work.
    assert scale in ['TABBLOCK', 'BG', 'TAZ', 'TRACT', 'COUNTY', 'CBSA']
    old_api = year <= 2015
    outfile = Path(f'../data/zones/{scale.lower()}_{year}.parquet')
    zipfile = Path(f'../data/tiger_boundary_{scale}.zip')
    if outfile.exists() and not overwrite:
        return gpd.read_parquet(outfile)
    print('Downloading zones at scale:', scale)
    # Normalise to a 2-character code whether the lookup yields an int or a
    # string; single-digit states would otherwise produce an invalid URL and
    # an empty county filter.
    fips = str(mk.geo.US_STATES_FIPS[state.upper()]).zfill(2)
    if old_api:
        # Two-digit suffix appended to file and column names (e.g. '10').
        year_label = f'{year % 100:02d}'
        # Directory names on the server use underscores instead of spaces.
        state_dir = state.replace(' ', '_')
        url = (f'https://www2.census.gov/geo/pvs/tiger{year}st/'
               f'{fips}_{state_dir}/{fips}/'
               f'tl_{year}_{fips}_{scale.lower()}{year_label}.zip')
    else:
        root = f'https://www2.census.gov/geo/tiger/TIGER{year}/{scale}'
        base = f'tl_{year}_{fips}_{scale.lower()}.zip'
        if scale == 'COUNTY':
            # Counties are published as one national file in this layout.
            base = f'tl_{year}_us_county.zip'
        url = f'{root}/{base}'
    zipfile.parent.mkdir(parents=True, exist_ok=True)
    try:
        urllib.request.urlretrieve(url, zipfile)
        df = gpd.read_file(zipfile).to_crs(CRS_DEG)
    finally:
        # Always remove the temporary archive, even if the download or the
        # read failed part-way.
        zipfile.unlink(missing_ok=True)
    df = df.rename(columns=str.lower)
    if old_api:
        df = df.rename(columns=lambda x: x.replace(year_label, ''))
    if scale == 'COUNTY':
        # Keep only the requested state's counties.
        df = df.query(f'statefp == "{fips}"').reset_index(drop=True)
    df = df[['geoid', name_col, 'aland', 'awater', 'geometry']]
    df = df.rename(columns={name_col: 'name'})
    if save:
        # U.mkfile presumably creates the parent folder and returns the path
        # -- confirm in the project utilities.
        df.to_parquet(U.mkfile(outfile))
    return df

In [8]:
# Fetch (or reuse the cached copy of) the boundaries for every combination of
# vintage and scale; the second tuple item is the source column holding the
# zone's display name at that scale.
years = [2010, 2020]
scales_names = [('COUNTY', 'name'), ('TRACT', 'namelsad'), ('BG', 'namelsad')]
for year in years:
    for scale, name_col in scales_names:
        get_tiger_zones(scale, year, name_col=name_col, overwrite=False)

## [Relationship files](https://www.census.gov/geographies/reference-files/time-series/geo/relationship-files.2020.html)
These files map the old (2010) zones to the new (2020) zones for [block groups](https://www2.census.gov/geo/docs/maps-data/data/rel2020/blkgrp/tab20_blkgrp20_blkgrp10_natl.txt), [tracts](https://www2.census.gov/geo/docs/maps-data/data/rel2020/tract/tab20_tract20_tract10_natl.txt), and [county subdivisions](https://www2.census.gov/geo/docs/maps-data/data/rel2020/cousub/tab20_cousub20_cousub10_natl.txt) (the file used below for the county scale).

In [9]:
def get_census_relationship(scale, keyword, state='Indiana', year1=2010,
                            year2=2020, save=True, overwrite=False):
    """Build the within-state crosswalk between two vintages of census zones.

    Reads the pipe-delimited relationship file
    `../data/zones/xtab-{scale}-{year1}-{year2}.txt`, which must already be on
    disk (this function does not download it), keeps the rows whose zones
    belong to `state` in both vintages, and returns the zone IDs together
    with their land/water areas and the areas of the overlapping part.

    Parameters
    ----------
    scale : str
        Scale label used in the input/output file names; lower-cased.
    keyword : str
        Token embedded in the raw column names; `_{keyword}` is stripped from
        the lower-cased names so that, e.g., `geoid_blkgrp_20` becomes
        `geoid_20`.
    state : str
        State name, looked up (upper-cased) in `mk.geo.US_STATES_FIPS`.
    year1, year2 : int
        The two vintages; their last two digits select the columns.
    save : bool
        Whether to cache the result as
        `../data/zones/relation-{scale}-{year1}-{year2}.parquet`.
    overwrite : bool
        If False and the cache exists, the cached table is returned as is.

    Returns
    -------
    DataFrame with columns geoid_*, arealand_*, areawater_* for both vintages
    plus arealand_part and areawater_part.
    """
    scale = scale.lower()
    label = f'{scale}-{year1}-{year2}'
    infile = f'../data/zones/xtab-{label}.txt'
    outfile = Path(f'../data/zones/relation-{label}.parquet')
    if outfile.exists() and not overwrite:
        return pd.read_parquet(outfile)
    # Two-digit vintage suffixes as they appear in the column names.
    y1, y2 = f'{year1 % 100:02d}', f'{year2 % 100:02d}'
    geoid_cols = [f'geoid_{y1}', f'geoid_{y2}']
    # Read the zone IDs as text. Letting pandas infer a numeric type drops
    # leading zeros, and turns the whole column into floats as soon as one
    # value is missing, which the later zero-padding cannot repair.
    header = pd.read_csv(infile, sep='|', nrows=0).columns
    id_dtypes = {col: str for col in header
                 if col.lower().startswith('geoid')}
    df = pd.read_csv(infile, sep='|', dtype=id_dtypes)
    df = df.rename(columns=str.lower)
    df = df.rename(columns=lambda x: x.replace('_' + keyword.lower(), ''))
    # Rows lacking an ID on either side cannot be attributed to a state.
    df = df.dropna(subset=geoid_cols)
    df = df.astype({col: str for col in geoid_cols})
    # Safety net: left-pad to a common width in case the IDs were stored
    # without their leading zeros.
    width = df[geoid_cols[0]].str.len().max()
    for y, col in zip([y1, y2], geoid_cols):
        if pd.notna(width):
            df[col] = df[col].str.zfill(int(width))
        df[f'state_{y}'] = df[col].str.slice(0, 2)
    # Works whether the lookup yields an int or an already-formatted string.
    fips = str(mk.geo.US_STATES_FIPS[state.upper()]).zfill(2)
    df = df[(df[f'state_{y1}'] == fips) & (df[f'state_{y2}'] == fips)]
    df = df[[f'geoid_{y1}', f'geoid_{y2}',
             f'arealand_{y1}', f'arealand_{y2}', 'arealand_part',
             f'areawater_{y1}', f'areawater_{y2}', 'areawater_part']]
    df = df.dropna().reset_index(drop=True)
    if save:
        # U.mkfile presumably creates the parent folder and returns the path
        # -- confirm in the project utilities.
        df.to_parquet(U.mkfile(outfile))
    return df

# Build (or reuse) the 2010-2020 crosswalk at each scale; `x` is only a scratch name that ends up
# holding the block-group table. The second argument is the token used in the raw file's column
# names -- note that the 'county' scale is fed from the county-subdivision ('cousub') columns.
x = get_census_relationship('county', 'cousub', overwrite=0)
x = get_census_relationship('tract', 'tract', overwrite=0)
x = get_census_relationship('bg', 'blkgrp', overwrite=0)

In [10]:
# Load the cached 2010-2020 block-group crosswalk (the file written by get_census_relationship('bg', ...)).
# `.disp()` is not a stock pandas method -- presumably a project helper that prints a summary and
# returns the frame; confirm in the project package.
bg1020 = pd.read_parquet('../data/zones/relation-bg-2010-2020.parquet').disp()

7,046 rows x 8 cols; Memory: 1.2 MiB


Unnamed: 0,geoid_10,geoid_20,arealand_10,arealand_20,arealand_part,areawater_10,areawater_20,areawater_part
,<object>,<object>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>
0.0,180010301001,180010301001,59364016,59364016,59364016,161597,161597,161597


# ACS

In [11]:
def download_acs_shp(scale, state='Indiana', year=2020,
                     fields=mk.acs.IMP_FIELDS, outroot='../data/acs',
                     save=True, overwrite=False):
    """Download ACS fields for every zone of a state at the given scale.

    Parameters
    ----------
    scale : str
        'COUNTY', 'TRACT' or 'BG' (case-insensitive); selects which
        geography levels are requested with a '*' wildcard.
    state : str or int
        State name (resolved through `mk.geo.US_STATES_FIPS`, as the zone
        helpers in this notebook do) or a FIPS code given as int or digit
        string.
    year : int
        Passed through to `mk.acs.download`.
    fields : dict
        Mapping whose keys are requested from `mk.acs.download` and whose
        values become the output column names; must yield a `popu` column.
    outroot, save, overwrite
        Accepted for signature compatibility; the current body neither
        writes nor reads any file, so they have no effect.

    Returns
    -------
    DataFrame indexed by `geoid` (the geography codes concatenated in
    state/county/tract/block-group order), restricted to rows with
    `popu` > 0.
    """
    # The original formatted `state` with ':02', which only yields a FIPS
    # code for an integer; the default is a state *name*, so resolve names
    # first and then zero-pad.
    if isinstance(state, str) and not state.isdigit():
        state = mk.geo.US_STATES_FIPS[state.upper()]
    state_fips = str(state).zfill(2)
    geo = {'state': state_fips} | {
        'COUNTY': {'county': '*'},
        'TRACT': {'county': '*', 'tract': '*'},
        'BG': {'county': '*', 'tract': '*', 'block group': '*'}
    }[scale.upper()]
    acs = (mk.acs.download(list(geo.items()), list(fields.keys()), year=year)
           .rename(columns=fields).reorder_levels(list(geo.keys())))
    # Concatenate the index levels into a single zone identifier.
    acs.index = acs.index.map(''.join).rename('geoid')
    # Drop unpopulated zones.
    acs = acs[acs['popu'] > 0]
    return acs

# %time x = download_acs_shp('bg', year=2013).disp()
# x.to_csv('../data/acs/acs_2013.csv')

In [None]:
# Load the ACS table saved by the commented-out lines of the previous cell.
# NOTE(review): without a dtype, `geoid` is parsed as int64 here (see the output below), whereas the
# zone and crosswalk tables keep their IDs as strings -- cast before joining on it.
acs13 = pd.read_csv('../data/acs/acs_2013.csv').disp()

4,808 rows x 33 cols; Memory: 1.2 MiB


Unnamed: 0,geoid,popu,hh,age_minor,age_adult,age_senior,age_median,sex_female,sex_male,race_white,race_black,edu_eligible,edu_bachelors,employ_total,employ_in_LF,employ_not_in_LF,inc_total,inc_avg,inc_total_hh,inc_median,pop_poor,pop_nonpoor,hh_poor,hh_nonpoor,cm_car,cm_pool,cm_pt,cm_bus,cm_subway,cm_taxi,cm_bike,cm_walk,cm_wfm
,<int64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>
0.0,180571109041,1567.0,500.0,409.0,1016.0,58.0,39.5,817.0,750.0,1480.0,5.0,970.0,707.0,701.0,725.0,443.0,102434200.0,65370.0,102271000.0,168750.0,,,16.0,484.0,588.0,8.0,0.0,0.0,0.0,0.0,48.0,0.0,596.0


# Misc.

## Social Vulnerability Index

In [13]:
# Load the tract-level SVI shapefile (2020, Indiana, per the file name).
# NOTE(review): the frame is not reprojected here (the output reports EPSG:4269), unlike the zone
# layers, which get_tiger_zones converts to CRS_DEG -- confirm before overlaying the two.
svi = gpd.read_file('../data/_etc/svi/tract/SVI2020_INDIANA_tract.shp').disp()

1,693 rows x 161 cols; Memory: 2.8 MiB; CRS: EPSG:4269


Unnamed: 0,ST,STATE,ST_ABBR,STCNTY,COUNTY,FIPS,LOCATION,AREA_SQMI,E_TOTPOP,M_TOTPOP,E_HU,M_HU,E_HH,M_HH,E_POV150,M_POV150,E_UNEMP,M_UNEMP,E_HBURD,M_HBURD,E_NOHSDP,M_NOHSDP,E_UNINSUR,M_UNINSUR,E_AGE65,M_AGE65,E_AGE17,M_AGE17,E_DISABL,M_DISABL,E_SNGPNT,M_SNGPNT,E_LIMENG,M_LIMENG,E_MINRTY,M_MINRTY,E_MUNIT,M_MUNIT,E_MOBILE,M_MOBILE,E_CROWD,M_CROWD,E_NOVEH,M_NOVEH,E_GROUPQ,M_GROUPQ,EP_POV150,MP_POV150,EP_UNEMP,MP_UNEMP,...,F_AGE17,F_DISABL,F_SNGPNT,F_LIMENG,F_THEME2,F_MINRTY,F_THEME3,F_MUNIT,F_MOBILE,F_CROWD,F_NOVEH,F_GROUPQ,F_THEME4,F_TOTAL,E_DAYPOP,E_NOINT,M_NOINT,E_AFAM,M_AFAM,E_HISP,M_HISP,E_ASIAN,M_ASIAN,E_AIAN,M_AIAN,E_NHPI,M_NHPI,E_TWOMORE,M_TWOMORE,E_OTHERRAC,M_OTHERRAC,EP_NOINT,MP_NOINT,EP_AFAM,MP_AFAM,EP_HISP,MP_HISP,EP_ASIAN,MP_ASIAN,EP_AIAN,MP_AIAN,EP_NHPI,MP_NHPI,EP_TWOMORE,MP_TWOMORE,EP_OTHERRA,MP_OTHERRA,SHAPE_STAr,SHAPE_STLe,geometry
,<object>,<object>,<object>,<object>,<object>,<object>,<object>,<float64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<float64>,<float64>,<float64>,<float64>,...,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<geometry>
0.0,18,Indiana,IN,18001,Adams,18001030100,"Census Tract 301, Adams County, Indiana",77.63825,4975,403,1977,189,1882,174,489,172,76,58,185,67,206,103,224,112,1040,238,1180,171,583,204,32,35,0,48,322,166,21,27,66,47,5,14,20,19,0,12,9.8,3.4,3.1,2.3,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3680,589,599,0,12,300,161,0,12,5,12,0,12,17,36,0,12,11.8,1.0,0.0,0.7,6.0,3.3,0.0,0.7,0.1,0.2,0.0,0.7,0.3,0.7,0.0,0.7,0.021526,0.77535,"POLYGON ((-85.073861 40.917823, -85.053433 40...."
