In [None]:
pip install censusdata

In [None]:
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, Image
from IPython.display import display, HTML

from shapely.geometry import Point, Polygon
from shapely.ops import transform
import geopandas as gpd
import pyproj
from pyproj import Proj, Transformer
import censusdata

import warnings
# Silence all warnings for the rest of the session so cell output stays readable.
warnings.filterwarnings('ignore')

# pandas display tweaks:
#   - let wide dataframes stretch across the page instead of wrapping their columns
#   - show floats with 2 decimal points
pd.options.display.expand_frame_repr = False
pd.options.display.precision = 2

In [None]:
def convert_lat_long(gdf):
    '''
    Re-project a geoDataFrame into lat/long coordinates.

    @gdf: geoDataFrame, with a column titled 'geometry' containing the Polygons.
          It is expected to have no coordinate system assigned yet.
    Returns: geoDataFrame with coordinates converted to lat/longs (EPSG:4326)
    '''
    # The input has no coordinate system, so first declare the one its numbers are in ...
    source_crs = 'esri:102643'
    # ... and only then convert to the lat/long coordinate system.
    target_crs = 'EPSG:4326'
    return gdf.set_crs(source_crs).to_crs(target_crs)

have a function:
    spits out summary stats of the tract groups

Ex:
- Given tract group 550554 -> what proportion are white/black/asian/etc homeowners

rank all tract groups by metrics: slide with top K (k = 3/5 or w/e)
- total houses
- housing density (houses/people living in tract group)
- which tract groups have highest homeownership proportion by __ subgroup 

TODO:
- getting intersections of polygons
        - we have polygons/shapes for census tracts, have them for these proposed housing areas -> get intersection -> get stats related to intersection
- get raw lat/longs of light rail stops/caltrain/buses
        - query census tracts that are within 1/2 mile from it

1) given tract shapes/lat longs and df of lat longs of transit sites -> figure out which tracts are close to which transit sites

a) given one lat/long (one bus stop) -> what tract is it in?
b) filter transit stops -> stuff in san jose
    - check each lat long, is this in a tract
        Polygon.contains(pt)


In [None]:
# Load the major transit stops and attach one shapely Point per stop, built from the
# X / Y columns. Zipping the two columns avoids the slow row-by-row apply(axis=1)
# and also works when the frame is empty.
transit = pd.read_csv('/kaggle/input/transit/Major_Transit_Stops_(2021).csv')
transit['geometry'] = [Point(x, y) for x, y in zip(transit['X'], transit['Y'])]

In [None]:
# Preview the first two stops to check the available columns.
transit.head(2)

The [Major Transit Stops data](https://opendata.mtc.ca.gov/datasets/major-transit-stops-2021/explore?location=37.320960%2C-121.874016%2C12.03) gives a few bits of info that we care about
- lat/long of the stop
- agency name (Probably mostly only care about VTA for San Jose since that makes the bulk of the San Jose public transport iirc)
- status: we just want the existing/built stations


In [None]:
# List the distinct values of the 'status' column so we know what to filter on.
print('transit statuses:', transit['status'].unique())

## SJ Census Tract Info
We can get the census tracts from https://gisdata-csj.opendata.arcgis.com/datasets/CSJ::census-tract/about
for some quick splicing. From their info, it seems like these tracts are from the 2010 census though. For now, it suffices just to get some quick development work done.

Later on, we will want to try to find the 2020 shape files because tracts are often updated before each census.

In [None]:
# Read the tract shapefile and convert it to lat/long (EPSG:4326) in one step.
sj_tracts = gpd.read_file('/kaggle/input/census-shapes/Census_Tract.shp').to_crs('EPSG:4326')
# Store the housing unit counts as integers.
sj_tracts['HOUSINGUNI'] = sj_tracts['HOUSINGUNI'].astype(int)
sj_tracts.head(2)

The things we care about in this census tract dataframe are:
- geometry: lat/longs for the coordinates that comprise the exterior/interior of a tract shape
- CENSUSTRAC: census tract id number

In [None]:
def pts_in_sj(transit_df, sj_tracts):
    '''
    Flag which transit stops are located in San Jose.

    transit_df: DataFrame with a 'geometry' column containing shapely.Points
    sj_tracts: geopandas DataFrame with a 'geometry' column containing shapely.Polygons
    Returns: pandas Series (same index as transit_df) containing True/False whether or not
        the given transit stop is in San Jose, as determined by whether or not the lat/lng
        of the stop is inside the union of the tracts in sj_tracts
    '''
    # dissolve() with no arguments unions all the polygons into a single row.
    # Take that row by position so we do not depend on its index label being 0.
    sj_poly = sj_tracts.dissolve()['geometry'].iloc[0]

    # One containment test per stop against the merged polygon. Doing the dissolve
    # instead of a double for-loop over (stop, tract) pairs is quite a bit faster
    # (<2s vs 6s when this was measured).
    return transit_df['geometry'].apply(lambda stop: sj_poly.contains(stop))

In [None]:
# Flag each stop as inside/outside San Jose, then keep only existing VTA stops in the city.
in_sj = pts_in_sj(transit, sj_tracts)
transit['in_sj'] = in_sj
is_vta = transit['agency_nm'] == 'VTA'
is_built = transit['status'] == 'Existing/Built'
# .copy() makes sj_vta an independent frame. Columns are added to it in later cells,
# and assigning into a filtered slice of `transit` triggers pandas' SettingWithCopy
# behaviour (the warning is hidden here because warnings are globally ignored).
sj_vta = transit[transit['in_sj'] & is_vta & is_built].copy()
print('Number of VTA stops in SJ:', len(sj_vta))

We've filtered down the transit stops to just the ones that are in San Jose. We also want to know which tract the transit stop is located in.
(So actually we could have gotten this if we used the double for-loop version of the `pts_in_sj` function but oh well.)

In the following `get_transit_tract` function, we construct a function `get_tract` that determines the tract that the input lat/lng point belongs to.

In [None]:
def get_transit_tract(transit, sj_tracts):
    '''
    Determine which census tract each transit stop is located in.

    transit: DataFrame of transit stops with a 'geometry' column (shapely Points)
    sj_tracts: DataFrame of SJ tracts with 'geometry' and 'CENSUSTRAC' columns
    Returns: Pandas Series (same index as transit) containing the tract for each
        transit stop, or None when no tract contains the stop
    '''
    # Pair up tract ids and polygons once, instead of re-walking the whole tract
    # frame with iterrows() for every single stop.
    tract_shapes = list(zip(sj_tracts['CENSUSTRAC'], sj_tracts['geometry']))

    def get_tract(p):
        '''
        p: shapely Point
        Returns: id of the first tract (in sj_tracts row order) that contains p, else None
        '''
        for tract_id, shape in tract_shapes:
            if shape.contains(p):
                return tract_id
        return None

    return transit['geometry'].apply(get_tract)

In [None]:
# Look up the containing tract for every San Jose VTA stop; %time reports how long it takes.
%time sj_transit_tracts = get_transit_tract(sj_vta, sj_tracts)
# Keep the result next to the stops as a 'tract' column.
sj_vta['tract'] = sj_transit_tracts

In [None]:
# Count the distinct tracts that have at least one VTA stop.
# Note: unique() keeps a missing value as its own entry, so a stop that matched no
# tract (None) would add one to this count.
print('Number of tracts containing a VTA stop:', len(sj_transit_tracts.unique()))

In [None]:
# 'HOUSINGUNI' was already cast to int when sj_tracts was loaded, so it is not re-cast here.
transit_tracts = set(sj_vta['tract'])

# Rows of sj_tracts whose tract contains at least one VTA stop (built once, used twice).
has_vta_stop = sj_tracts['CENSUSTRAC'].isin(transit_tracts)
total_pop = sj_tracts['POPULATION'].sum()
total_housing = sj_tracts['HOUSINGUNI'].sum()

print('Proportion of population in SJ in tracts with a VTA stop   : {:.3f}'.format(
    sj_tracts.loc[has_vta_stop, 'POPULATION'].sum() / total_pop
))
print('Proportion of housing units in SJ in tracts with a VTA stop: {:.3f}'.format(
    sj_tracts.loc[has_vta_stop, 'HOUSINGUNI'].sum() / total_housing
))
print('Total pop    :', total_pop)
print('Total housing:',  total_housing)

Again, note that these stats are probably not up to date. Let's now use the `censusdata` package to get updated figures for these tracts.

In [None]:
# Print the variables of table B25001 from the 2019 'acs5' source, to find the
# variable name(s) to download.
censusdata.printtable(censusdata.censustable('acs5', 2019, 'B25001'))

In [None]:
def _tot_housing_df(var_lst, src='acs5', year=2019):
    '''
    Download census variables for every tract in state '06', county '085'.

    var_lst: list of census variable names to download (ex: ['B25001_001E'])
    src: data source name passed through to censusdata.download
    year: data year passed through to censusdata.download
    Returns: whatever censusdata.download returns for those tracts and variables
    '''
    # The '*' wildcard selects all tracts of the county, not only the ones in San Jose.
    all_county_tracts = censusdata.censusgeo(
        [('state', '06'), ('county', '085'), ('tract', '*')]
    )
    downloaded = censusdata.download(src, year, all_county_tracts, var_lst)
    return downloaded

# Download variable B25001_001E for every tract and take a look at the first three
# rows of the returned dataframe
var_lst = ['B25001_001E']
tot_housing_df = _tot_housing_df(var_lst)
tot_housing_df.head(n=3)

To quickly filter this dataframe by tract
- add a tract column
- then we can do something like: `df[df['tract'].isin(transit_tracts)]`

In [None]:
# The census tract ids from sj_tracts have an extra '.' in the middle that we need to
# deal with. Print a few ids from each source to compare the two formats.
print(sj_tracts['CENSUSTRAC'][:3])
print([geo.params()[2][1] for geo in tot_housing_df.index[:3]])

In [None]:
# Preview the filtered VTA stops.
sj_vta.head(2)

In [None]:
def filter_tracts(df, tract_series):
    '''
    Keep only the rows of a censusdata DataFrame that belong to the given tracts.

    df: DataFrame to filter down. The index of this dataframe should hold censusgeo
        objects whose params() are the (state, county, tract) pairs.
        Side effect: a 'tract' column is added to df itself.
    tract_series: pandas Series (or other iterable) of census tract ids as strings.
        Ids may contain a '.', which is removed before matching against the
        dot-free tract ids from the censusgeo index (illustration: '5001.00' is
        matched as '500100'). Missing values (None/NaN) are ignored.
    Returns: new DataFrame with the rows of df whose tract is in tract_series
    '''
    tracts = {t.replace('.', '') for t in tract_series if pd.notna(t)}

    # We have to grab the tract from the index of the DataFrame: the third params()
    # pair is ('tract', <id>). Use the frame that was passed in (not a global one),
    # so the function works for any censusdata download.
    df['tract'] = [geo.params()[2][1] for geo in df.index]
    return df[df['tract'].isin(tracts)]

In [None]:
# Total housing units for all San Jose tracts vs. only the tracts with a VTA stop.
sj_housing_df = filter_tracts(tot_housing_df, sj_tracts['CENSUSTRAC'])
vta_housing_df = filter_tracts(tot_housing_df, sj_vta['tract'])

sj_units = sj_housing_df['B25001_001E'].sum()
vta_units = vta_housing_df['B25001_001E'].sum()

print('Total housing units in San Jose            :', sj_units)
print('Total housing units in tracts with VTA stop:', vta_units)
print('Prop housing units in tracts with VTA stop : {:.3f}'.format(vta_units / sj_units))

So the stats from 2010 seem sort of reasonable. In any case, we can now also filter housing stats by tracts containing a VTA stop once we grab those relevant stats from `censusdata.download`.

Let's see if there's a discrepancy in the proportion of single-family units (and of owner-occupied units) between San Jose overall and the tracts with a VTA stop.

In [None]:
# Census variable name -> readable column name.
var_map = {
    'B25024_001E': 'Total',
    'B25024_002E': 'Single_detached',
    'B25024_003E': 'Single_attached',
    'B25003_001E': 'TenureTotal',
    'B25003_002E': 'TenureOwner',
    'B25003_003E': 'TenureRenter',
}

# Download exactly the variables listed above and give the columns their readable names.
housing_df = _tot_housing_df(list(var_map)).rename(columns=var_map)

# Same split as before: all San Jose tracts vs. tracts with a VTA stop.
sj_df = filter_tracts(housing_df, sj_tracts['CENSUSTRAC'])
vta_df = filter_tracts(housing_df, sj_vta['tract'])

In [None]:
# Share of single family units (detached + attached) among all housing units.
sj_single_family = sj_df['Single_detached'].sum() + sj_df['Single_attached'].sum()
vta_single_family = vta_df['Single_detached'].sum() + vta_df['Single_attached'].sum()

print('San Jose   Single Family:total housing: {:.3f}'.format(sj_single_family / sj_df['Total'].sum()))
print('VTA tracts Single Family:total housing: {:.3f}'.format(vta_single_family / vta_df['Total'].sum()))
print('-------------')

# Share of owner-occupied units among the tenure total.
sj_owner_share = sj_df['TenureOwner'].sum() / sj_df['TenureTotal'].sum()
vta_owner_share = vta_df['TenureOwner'].sum() / vta_df['TenureTotal'].sum()

print('San Jose  Owner:total housing : {:.3f}'.format(sj_owner_share))
print('VTA tracts Owner:total housing: {:.3f}'.format(vta_owner_share))

Next steps:
- population related stats for tracts overall and tracts with VTA stops
- do this for various stats of interest (housing by ethnicity)
- zoom in on various tracts (ex: densest tracts, tracts with the most VTA stops, etc)

In [None]:
# Design notes:
# function take in Pt(lat/lng), geopandas DataFrame of tracts with columns for housing
# info, threshold distance -> filter DataFrame by distance to point
#
# for a given bus stop
# - exists in some tract
# - other tracts can be close too -> grab all tracts with distance <= threshold
#
# Return: dict bus stop -> list of tracts
#
# NOTE(review): this has not been run against real data yet -- spot-check a few stops.
def close_tracts(transit_stops, tract_df, dist):
    '''
    Find, for every transit stop, the tracts that lie within `dist` miles of it.

    transit_stops: DataFrame, with lat/longs/bus id (some unique identifier)
        - assume that this dataframe has columns: 'objectid', 'X', 'Y'
        - X / Y are assumed to be longitude / latitude in EPSG:4326
    tract_df: DataFrame with tract polygons, unique identifier of tract (tract id)
        - assume that this dataframe has columns: 'geometry', 'CENSUSTRAC'
        - geometries are assumed to be in EPSG:4326 (lat/long)
    dist: float, units in miles

    Return: dict from transit ('objectid') -> list of tract ids
    (Note that a stop is at distance 0 from the tract it is located in, so that
    tract is always part of its list.)
    Ex:
    {
        'bus_stop_1': ['500544', '343522'],
        'bus_stop_2': ['500544', '343522'],
        ...
    }
    '''
    # Lat/long degrees are not a distance, so project everything to a planar
    # coordinate system measured in feet before comparing against the threshold.
    # EPSG:2227 is the California State Plane zone 3 (US survey feet) system.
    feet_per_mile = 5280
    max_feet = dist * feet_per_mile
    to_feet = Transformer.from_crs('EPSG:4326', 'EPSG:2227', always_xy=True).transform

    # Project every tract once, outside of the per-stop loop.
    tracts_ft = [
        (tract_id, transform(to_feet, geom))
        for tract_id, geom in zip(tract_df['CENSUSTRAC'], tract_df['geometry'])
    ]

    _tmap = {}
    stops = zip(transit_stops['objectid'], transit_stops['X'], transit_stops['Y'])
    for stop_id, x, y in stops:
        pt_ft = transform(to_feet, Point(x, y))
        # Distance to the whole shape (0 when the stop is inside it), not only to
        # its exterior ring; this also works for multi-part geometries.
        _tmap[stop_id] = [
            tract_id
            for tract_id, geom_ft in tracts_ft
            if geom_ft.distance(pt_ft) <= max_feet
        ]

    return _tmap