In [None]:
pip install censusdata

In [None]:
import time
import numpy as np
import pandas as pd
import geopandas as gpd
import censusdata
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon, MultiPolygon

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

## Files
We have a few files that contain information we're interested in
- `/kaggle/input/census-shapes/Census_Tract.shp`: shape file for census tracts in San Jose
- `/kaggle/input/censusbg/cb_2018_06_bg_500k.shp`: shape file of block groups for all of California. We will filter this down to the block groups in San Jose tracts.
- `/kaggle/input/transit/Major_Transit_Stops_(2021).csv`: CSV file of major transit stops in the Bay Area. We will filter this down to San Jose VTA stops.

In [None]:
#############
# Constants #
#############
# Codes used to identify the geography of interest in the census data.
# State code for California (matches the `state` value used in census queries).
CA_CODE = '06'
# County code for Santa Clara County (compared against the COUNTYFP column of the block group file).
SANTA_CLARA_COUNTY_CODE = '085'

In [None]:
def pts_in_geometry(transit_df, gdf):
    '''
    Flag which points fall inside the combined area of a set of polygons.

    transit_df: DataFrame with a 'geometry' column containing shapely.Points
    gdf: DataFrame with a 'geometry' column containing shapely.Polygons
    Returns: pandas Series (same index as transit_df) of True/False values, True when
        the point lies inside the union of all the polygons in gdf
    '''
    # dissolve() merges every polygon into a single row; take that one merged shape
    merged = gdf.dissolve()
    region = merged['geometry'][0]
    return transit_df['geometry'].apply(region.contains)

def load_sj_tracts(fn='/kaggle/input/census-shapes/Census_Tract.shp', tract_col='CENSUSTRAC'):
    '''
    Read the San Jose census tract shape file and normalise it.

    fn: str, filename of census tract shape file of San Jose
    tract_col: str, column name for the tract
    Returns: geopandas DataFrame of San Jose tracts, reprojected to EPSG:4326 and
        with the '.' characters stripped from the tract codes
    '''
    tracts = gpd.read_file(fn).to_crs('EPSG:4326')
    # drop the '.' so the tract codes are plain digit strings
    tracts[tract_col] = tracts[tract_col].map(lambda code: code.replace('.', ''))
    return tracts

def load_sj_bgs(sj_tracts_set, bg_fn='/kaggle/input/censusbg/cb_2018_06_bg_500k.shp'):
    '''
    Load the block groups that belong to the given San Jose tracts.

    sj_tracts_set: set of SJ tract codes (strings), in the same format as the
        TRACTCE column of the block group file
    bg_fn: str, filename of California block group shape file
    Returns: geopandas DataFrame of San Jose block groups
    '''
    ca_bgs = gpd.read_file(bg_fn)
    # Tract codes are only unique within a county, so restrict to Santa Clara County
    # before matching on TRACTCE; otherwise same-numbered tracts from other
    # California counties would be picked up as well.
    in_county = ca_bgs['COUNTYFP'] == SANTA_CLARA_COUNTY_CODE
    in_sj_tract = ca_bgs['TRACTCE'].isin(sj_tracts_set)
    return ca_bgs[in_county & in_sj_tract]

def load_sj_transit(sj_bgs, fn='/kaggle/input/transit/Major_Transit_Stops_(2021).csv'):
    '''
    Load transit stops, keep the ones inside San Jose, and tag each with its block group.

    sj_bgs: geopandas DataFrame containing the Polygons of the various block groups of San Jose
        (needs 'geometry', 'TRACTCE', 'BLKGRPCE' and 'GEOID' columns)
    fn: str, filename of the transit stops CSV (needs 'X' and 'Y' coordinate columns)
    Returns: DataFrame of the transit stops located in San Jose, with added
        'geometry', 'in_sj', 'tract', 'block_group' and 'geoid' columns
    Raises: AssertionError if a stop inside the combined San Jose area is not
        contained in any single block group
    '''
    transit = pd.read_csv(fn)
    # Build the points straight from the coordinate columns instead of a row-wise apply,
    # which would construct a full row Series for every stop.
    transit['geometry'] = [Point(x, y) for x, y in zip(transit['X'], transit['Y'])]

    _st = time.time()
    transit['in_sj'] = pts_in_geometry(transit, sj_bgs) # this is pretty fast
    _end = time.time()
    print('Elapsed time for computing pts in San Jose: {:.2f}s'.format(_end - _st))
    sj_transit = transit[transit['in_sj']].copy() # avoid chained assignment later

    # then also add a tract, block group column
    # TODO: there is probably a cleaner way to do this
    _st = time.time()
    # Extract the block group fields once; they do not change between stops, so there
    # is no need to re-iterate the DataFrame rows for every stop.
    bg_records = list(zip(
        sj_bgs['geometry'], sj_bgs['TRACTCE'], sj_bgs['BLKGRPCE'], sj_bgs['GEOID']
    ))
    tracts = []
    bgs = []
    geoids = []
    for stop_pt in sj_transit['geometry']:
        for bg_geom, tract, bg, geoid in bg_records:
            if bg_geom.contains(stop_pt):
                tracts.append(tract)
                bgs.append(bg)
                geoids.append(geoid) # we don't really need this but it might be useful later
                break  # a stop is assigned to the first block group that contains it
    _end = time.time()
    print('Elapsed time for computing tracts/bgs of each transit: {:.2f}s'.format(_end - _st))

    # The three lists grow together, so one length check covers all of them.
    # A mismatch means some stop matched no block group and the columns would misalign.
    assert len(tracts) == len(sj_transit), (
        '{} of {} San Jose stops were not contained in any block group'.format(
            len(sj_transit) - len(tracts), len(sj_transit)
        )
    )

    sj_transit['tract']       = pd.Series(tracts, index=sj_transit.index)
    sj_transit['block_group'] = pd.Series(bgs, index=sj_transit.index)
    sj_transit['geoid']       = pd.Series(geoids, index=sj_transit.index)
    return sj_transit

In [None]:
# San Jose tracts and the set of their tract codes
sj_tracts = load_sj_tracts(fn='/kaggle/input/census-shapes/Census_Tract.shp')
sj_tract_codes = sj_tracts['CENSUSTRAC'].unique()
sj_tracts_set = set(sj_tract_codes)

# Block groups within those tracts, then the transit stops that fall inside them
sj_bgs = load_sj_bgs(sj_tracts_set)
transit = load_sj_transit(sj_bgs)

Computing which tract and block group a given transit location lives in is a little gross but we can figure out how to speed that up later. For now, all that matters is that we can compute this mapping.

So now we have the following dataframes:
- `sj_bgs`: geopandas dataframe of the block groups in San Jose
- `transit`: dataframe of the Bay Area transit stops that fall inside San Jose, each tagged with its tract, block group, and geoid. We filter this down below to VTA stops.

## Filter transit geopandas dataframe
We want to filter various housing related stats on the block groups that contain:
- VTA stops (bus and rail)
- VTA bus stops
- VTA rail stops

In [None]:
# Keep only the stops operated by VTA, then split them by route type
is_vta = transit['agency_nm'].eq('VTA')
vta_transit = transit.loc[is_vta]

vta_route_type = vta_transit['route_type']
vta_bus = vta_transit.loc[vta_route_type.eq('Bus')]
vta_rail = vta_transit.loc[vta_route_type.eq('Tram, Streetcar, Light Rail')]

In [None]:
# Report how many distinct block groups / tracts contain each kind of VTA stop
vta_groups = {'any': vta_transit, 'bus': vta_bus, 'rail': vta_rail}
for vta_type, _vta_df in vta_groups.items():
    n_block_groups = _vta_df['geoid'].nunique(dropna=False)
    n_tracts = _vta_df['tract'].nunique(dropna=False)
    print(f'Num block groups containing {vta_type} VTA stops:', n_block_groups)
    print(f'Num tracts containing {vta_type} VTA stops      :', n_tracts)
    print('----------')

## Grabbing Census data
Now that we have loaded the transit and San Jose block group data, we can download the housing-related statistics with the `censusdata` package. See Appendix E of the [ACS reference manual](https://www2.census.gov/acs2011_5yr/summaryfile/ACS_2007-2011_SF_Tech_Doc.pdf) for the variable names we'll want to query.

In [None]:
def to_geoid(cgeo):
    '''
    Build the geoid string of a block-group level censusgeo object.

    cgeo: censusgeo object whose params() yields four (name, value) pairs:
        state, county, tract and block group
    Return: geoid of the censusgeo object
        A geoid is the state id, county id, tract id, and block group id concatenated
        together into one string; it uniquely identifies a block group.
    '''
    # unpacking enforces that exactly four geography levels are present
    state, county, tract, block_group = cgeo.params()
    levels = (state, county, tract, block_group)
    return ''.join(str(level[1]) for level in levels)

def get_block_stats(var_map, src='acs5', year=2019):
    '''
    Download block-group level census variables for Santa Clara County.

    var_map: dict mapping census variable code (str) -> readable column name (str).
        The keys are the variables that get queried; the values become the column names.
    src: str, data source passed to censusdata.download (e.g. 'acs5')
    year: int, year passed to censusdata.download
    Returns: the DataFrame produced by censusdata.download (its index holds censusgeo
        objects), with columns renamed per var_map plus added 'geoid', 'tract' and
        'block_group' columns
    '''
    # positions of the tract / block group entries within censusgeo.params()
    _TRACT_IDX = 2
    _BG_IDX = 3
    # Use the notebook-level constants rather than repeating the literal codes
    sc_geo = censusdata.censusgeo([
        ('state', CA_CODE),
        ('county', SANTA_CLARA_COUNTY_CODE),
        ('block group', '*'),
    ])
    var_lst = list(var_map.keys())
    df = censusdata.download(src, year, sc_geo, var_lst)

    df.columns = [var_map[c] for c in df.columns] # Rename them for easier eyeballing
    # Read each geography's params once and reuse them for the derived columns
    geo_params = [geo.params() for geo in df.index]
    df['geoid'] = [to_geoid(geo) for geo in df.index] # add geo id column for ease of grouping later
    df['tract'] = [params[_TRACT_IDX][1] for params in geo_params]
    df['block_group'] = [params[_BG_IDX][1] for params in geo_params]
    return df

def filter_tracts(df, tracts):
    '''
    df: DataFrame to filter down. It should have a 'tract' column that we can filter on.
    tracts: collection of tract codes to keep
    Returns: the rows of df whose tract is one of the given tracts
    '''
    keep = df['tract'].isin(tracts)
    return df.loc[keep]

def filter_bgs(df, geoids):
    '''
    Keep only the rows for the requested block groups.

    df: DataFrame to filter down. It should have a 'geoid' column that we can filter on.
    geoids: collection of geoids (str). Note: a geoid is a unique identifier of a census block group.
    Returns: the rows of df whose geoid is one of the given geoids
    '''
    keep = df['geoid'].isin(geoids)
    return df.loc[keep]

## Utility Functions to grab/filter census data
- `get_block_stats`: this will download the relevant ACS data given a dictionary of variables to download. The dictionary maps each census variable name to a readable string (an easier identifier for the variable), which is used as the column name.
- `filter_tracts`: this will filter a census dataframe down to the tracts of interest
- `filter_bgs`: this will filter a census dataframe down to the block groups of interest. You need to pass in a set/list of geoids (recall that a geoid is a unique identifier of a block group)

See sample usage below:

In [None]:
# Map each census variable code to a readable column name.
# The keys are the variables that get_block_stats queries; the values become the
# column names of the returned DataFrame.
var_map = {
    # B25024 variables: overall housing total and the two single-family categories
    'B25024_001E': 'Total',
    'B25024_002E': 'SingleDetached',
    'B25024_003E': 'SingleAttached',
    # B25003 variables: tenure (owner vs renter) totals
    'B25003_001E': 'TenureTotal',
    'B25003_002E': 'TenureOwner',
    'B25003_003E': 'TenureRenter',
    # B25003A variables: tenure for White householders
    'B25003A_001E': 'TotalWhiteHouseholder', # householder can be owner or renter
    'B25003A_002E': 'TotalWhiteOwner',
    'B25003A_003E': 'TotalWhiteRenter',
    # B25003B variables: tenure for Black householders
    'B25003B_001E': 'TotalBlackHouseholder',
    'B25003B_002E': 'TotalBlackOwner',
    'B25003B_003E': 'TotalBlackRenter',
    # etc etc -- add further variables here following the same pattern
}
# Download the stats for Santa Clara County and preview the first row
sc_stats = get_block_stats(var_map)
sc_stats.head(1)

## Filter Santa Clara stats
Now that we've downloaded the stats for Santa Clara County (county id = '085'), we can further filter them down to:
- San Jose tracts
- block groups containing VTA stops
- block groups containing VTA bus stops
- block groups containing VTA rail stops
- block groups that do not contain VTA stops

In [None]:
# get San Jose related stats by filtering on tract id
sj_stats = filter_tracts(sc_stats, sj_tracts_set)

# compute the geoids of interest
# Recall that a geoid (state/county/tract/blockgroup) is a unique identifier for a block group.
# get_block_stats already stores it in the 'geoid' column, so read it from there
# instead of rebuilding it from the index.
sj_geoids = set(sj_stats['geoid'])
vta_geoids = set(vta_transit['geoid'])
vta_bus_geoids = set(vta_bus['geoid'])
vta_rail_geoids = set(vta_rail['geoid'])
non_vta_geoids = sj_geoids.difference(vta_geoids)

vta_stats = filter_bgs(sj_stats, vta_geoids)
vta_bus_stats = filter_bgs(sj_stats, vta_bus_geoids)
vta_rail_stats = filter_bgs(sj_stats, vta_rail_geoids)
# San Jose block groups that contain no VTA stop
non_vta_stats = filter_bgs(sj_stats, non_vta_geoids)
# Misleading legacy name for the same frame, kept so code that still uses it keeps working
non_sj_stats = non_vta_stats

## Print out various stats of interest

In [None]:
def _share(stats, numerator_cols, denominator_col):
    '''Sum of the numerator columns divided by the sum of the denominator column.'''
    numerator = sum(stats[col].sum() for col in numerator_cols)
    return numerator / stats[denominator_col].sum()

# One entry per comparison:
# (numerator columns, denominator column, San Jose label, VTA label, divider line)
_comparisons = [
    (
        ['SingleDetached', 'SingleAttached'], 'Total',
        'San Jose   Single Family:total housing: {:.3f}',
        'VTA bgs Single Family:total housing: {:.3f}',
        '-------------',
    ),
    (
        ['TenureOwner'], 'TenureTotal',
        'San Jose  Owner:total housing : {:.3f}',
        'VTA bgs Owner:total housing: {:.3f}',
        '-------------',
    ),
    (
        ['TotalWhiteHouseholder'], 'TenureTotal',
        'San Jose White householder: Total Householder: {:.3f}',
        'VTA bgs White householder: Total Householder: {:.3f}',
        '------------',
    ),
    (
        ['TotalBlackHouseholder'], 'TenureTotal',
        'San Jose Black householder: Total Householder: {:.3f}',
        'VTA bgs Black householder: Total Householder: {:.3f}',
        '------------',
    ),
]

# Print the San Jose-wide ratio followed by the ratio for block groups with VTA stops
for _num_cols, _denom_col, _sj_label, _vta_label, _divider in _comparisons:
    print(_sj_label.format(_share(sj_stats, _num_cols, _denom_col)))
    print(_vta_label.format(_share(vta_stats, _num_cols, _denom_col)))
    print(_divider)

This is just a demo of how you might go about grabbing the relevant stats for comparing block groups containing a VTA stop vs block groups containing VTA Bus stops vs San Jose overall etc, etc.

A lot of the stat grabbing and normalizing in the cell above is manual. The same goes for populating `var_map` with the relevant entries, but the variables for ethnicities follow some structure, if I recall correctly.
Ex: the table ID for White householders ends in A (e.g. `B25003A`), for Black householders in B (e.g. `B25003B`), for American Indians in C, for Asians in D, etc.
