# 1. Setup

In [1]:
# Imported explicitly so this cell does not depend on the star import below
# happening to re-export the standard-library `warnings` module.
import warnings

# NOTE(review): the short names used throughout this notebook (pd, np, gpd, D,
# CAT, Pdf, U, CRS_DEG, the `.disp()` method) are presumably all provided by
# this star import -- confirm against `mobilkit.umni`.
from mobilkit.umni import *

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# 2. Process data

## 2.1. Zones
Simplify geometry for faster loading and processing in the dashboard.

In [2]:
# CBSAs whose zones are kept in high detail ("HD"): their geometries are still
# simplified, but with a smaller tolerance than zones in the other CBSAs.
hd_cbsas = (
    'Indianapolis-Carmel-Greenwood, IN',
    'Chicago-Naperville-Elgin, IL-IN',
    'Fort Wayne, IN',
    'South Bend-Mishawaka, IN-MI',
)

In [16]:
# Dashboard zone layer: readable scale names, ACS population joined on the
# columns the two tables share (presumably just `geoid`), upper-cased attribute
# names (the geometry column keeps its name), and the CRS set to CRS_DEG.
zones = (
    gpd.read_parquet('../data/zones/in_2010.parquet')
    .assign(scale=lambda gdf: gdf['scale'].map(
        D(COUNTY='County', TRACT='Tract', BG='Block Group')))
    .merge(
        pd.read_parquet('../data/ses/acs_in_2019.parquet')
        [['geoid', 'popu']].astype({'popu': np.int32}))
    [['geoid', 'scale', 'county', 'cbsa', 'popu', 'geometry']]
    .rename(columns=lambda col: col if col == 'geometry' else col.upper())
    .set_crs(CRS_DEG)
    .disp()
)

# Simplify the geometries in place, after the preview above was displayed.
# Zones in the HD CBSAs get the smaller tolerance (more detail kept); all other
# zones get the larger one. Tolerances are expressed in the layer's CRS units.
hd_idx = zones['CBSA'].isin(hd_cbsas)
for mask, tolerance in ((hd_idx, 0.002), (~hd_idx, 0.005)):
    zones.loc[mask, 'geometry'] = zones.loc[mask, 'geometry'].simplify(tolerance)

6,407 rows x 6 cols; Memory: 1.7 MiB; CRS: EPSG:4326


Unnamed: 0,GEOID,SCALE,COUNTY,CBSA,POPU,geometry
,<object>,<object>,<object>,<object>,<int32>,<geometry>
0.0,18001,County,Adams,"Decatur, IN",35376,"POLYGON ((-84.80234 40.834685, -84.802336 40.8..."


In [17]:
# Write the zone layer for the dashboard as a zipped ESRI Shapefile.
zones.to_file('../data/dashboard/zones.shp.zip', driver='ESRI Shapefile')

## 2.2. SES
The column labels, categories, and descriptions were manually created in `data/ses/ses_cols.csv`.

In [5]:
# Indicator metadata lookup (columns: code, label, category, description);
# `code` matches the indicator column names of the ACS table.
ses_cols = pd.read_csv('../data/ses/ses_cols.csv').disp()

23 rows x 4 cols; Memory: 0.0 MiB


Unnamed: 0,code,label,category,description
,<object>,<object>,<object>,<object>
0.0,popu,Population,Demographics,Total population


In [6]:
# Reshape the wide ACS table into the dashboard's long format: for every zone
# and indicator there is one row with the raw value (IS_PCTILE=False) and one
# with its percentile among zones of the same scale (IS_PCTILE=True).
ses = (
    pd.read_parquet('../data/ses/acs_in_2019.parquet')
    .disp(0)
    # Wide -> long: one row per zone and indicator code.
    .melt(id_vars=['scale', 'geoid'], var_name='indicator', value_name='value')
    # Attach the metadata (inner join, so codes missing from `ses_cols` are
    # dropped); the human-readable label then replaces the raw code.
    .merge(ses_cols, left_on='indicator', right_on='code')
    .drop(columns=['code', 'indicator'])
    .rename(columns={'label': 'indicator'})
    # Integer percentile within each scale x indicator group; zones with a
    # missing value get percentile 0.
    .assign(percentile=lambda df: (
        100 * df.groupby(['scale', 'indicator'])['value'].rank(pct=True)
    ).fillna(0).astype(int))
    # Name the two measure columns False/True so that the melt below turns the
    # column name directly into the `is_pctile` flag.
    .rename(columns=D(value=False, percentile=True))
    .drop(columns='scale')
    .melt(id_vars=['geoid', 'category', 'indicator', 'description'],
          var_name='is_pctile')
    .astype(D(geoid=CAT, category=CAT, indicator=CAT, description=CAT,
              is_pctile=bool))
    .rename(columns=str.upper)
    .disp()
)

6,407 rows x 25 cols; Memory: 1.9 MiB


Unnamed: 0,scale,geoid,popu,pop_density,p_minor,p_poc,p_lowedu,m_income,p_pov,p_pov150,p_pov200,p_snap,p_unemploy,p_noinsur,p_disabled,p_lowenglish,p_snglparent,p_crowded,p_renter,p_mobilehome,p_noveh,m_hhperveh,p_nowfh,p_transit,p_walkbike
,<object>,<object>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>


281,908 rows x 6 cols; Memory: 4.4 MiB


Unnamed: 0,GEOID,CATEGORY,INDICATOR,DESCRIPTION,IS_PCTILE,VALUE
,<category>,<category>,<category>,<category>,<bool>,<float64>
0.0,18069,Demographics,Population,Total population,False,36359.0


In [7]:
# Save the long-format SES table for the dashboard as Parquet; the CSV export
# on the next line is kept commented out.
# ses.to_csv('../data/dashboard/ses.csv', index=False)
ses.to_parquet('../data/dashboard/ses.parquet')

## 2.3. EJ

In [8]:
# Long-format EJ table for the dashboard: for every zone and indicator there is
# one row with the raw value (IS_PCTILE=False) and one with its percentile
# among zones of the same scale (IS_PCTILE=True).
ejs = (
    pd.read_parquet('../data/ejs/ejs.parquet')
    .disp(0)
    # Keep only the zones that exist in the dashboard zone layer.
    .merge(zones[['GEOID']].rename(columns=str.lower), on='geoid')
    # Integer percentile within each scale x indicator group; zones with a
    # missing value get percentile 0.
    .assign(percentile=lambda df: (
        100 * df.groupby(['scale', 'indicator'])['value'].rank(pct=True)
    ).fillna(0).astype(int))
    # Name the two measure columns False/True so that the melt below turns the
    # column name directly into the `is_pctile` flag.
    .rename(columns=D(value=False, percentile=True))
    .drop(columns='scale')
    .melt(id_vars=['geoid', 'category', 'indicator', 'description'],
          var_name='is_pctile')
    .astype(D(geoid=CAT, is_pctile=bool, value=np.float32))
    .rename(columns=str.upper)
    .disp()
)

64,170 rows x 6 cols; Memory: 1.5 MiB


Unnamed: 0,geoid,scale,category,indicator,description,value
,<category>,<category>,<category>,<category>,<category>,<float64>


128,140 rows x 6 cols; Memory: 1.9 MiB


Unnamed: 0,GEOID,CATEGORY,INDICATOR,DESCRIPTION,IS_PCTILE,VALUE
,<category>,<category>,<category>,<category>,<bool>,<float32>
0.0,180010301001,Transportation,Diesel PM,Diesel particulate matter level in air,False,0.24585


In [9]:
# Save the long-format EJ table for the dashboard.
ejs.to_parquet('../data/dashboard/ej.parquet')

## 2.4. Accessibility

### 2.4.1. Jobs

In [10]:
# Job accessibility in the dashboard's long format. Every zone / earnings
# category / mode / time threshold combination ends up with four rows: total
# jobs and jobs per person, each as a raw value and as a percentile.
aaa = (
    pd.read_parquet('../data/aaa/in_aaa.parquet')
    .disp(0)
    # Keep only the rows flagged `is_rac`; the flag itself is then dropped.
    .query('is_rac')
    .drop(columns=['is_rac'])
    # Population of each dashboard zone, needed for the per-person measure.
    .merge(zones[['GEOID', 'POPU']].rename(columns=str.lower), on='geoid')
    .rename(columns=D(tmax='thresh'))
    # Readable category names; unmapped categories become missing and are
    # removed (together with any other incomplete row) by `dropna`.
    .assign(category=lambda df: df['category'].map(D(
        TOTAL='Overall', SALARY_LOW='Low earnings',
        SALARY_HIGH='High earnings')))
    .dropna()
    .assign(per_person=lambda df: df['njobs'] / df['popu'])
    .drop(columns='popu')
    # Stack `njobs` and `per_person` into one `value` column and replace the
    # measure name with a boolean `per_person` flag.
    .melt(id_vars=['geoid', 'scale', 'category', 'mode', 'thresh'])
    .assign(per_person=lambda df: df['variable'].map({
        'per_person': True, 'njobs': False}))
    .drop(columns='variable')
    # Inverted integer percentile within each group: the highest value maps to
    # 0, and missing values map to 100.
    .assign(percentile=lambda df: 100 - (
        100 * df.groupby(
            ['scale', 'category', 'mode', 'thresh', 'per_person'])['value']
        .rank(pct=True)
    ).fillna(0).astype(int))
    # Name the two measure columns False/True so that the melt below turns the
    # column name directly into the `is_pctile` flag.
    .rename(columns=D(value=False, percentile=True))
    .drop(columns='scale')
    .melt(id_vars=['geoid', 'category', 'mode', 'thresh', 'per_person'],
          var_name='is_pctile')
    .assign(target='Jobs')
    .rename(columns=str.upper)
    .disp()
)

718,032 rows x 7 cols; Memory: 10.9 MiB


Unnamed: 0,scale,mode,tmax,geoid,category,njobs,is_rac
,<category>,<category>,<category>,<category>,<category>,<float64>,<bool>


615,072 rows x 8 cols; Memory: 143.3 MiB


Unnamed: 0,GEOID,CATEGORY,MODE,THRESH,PER_PERSON,IS_PCTILE,VALUE,TARGET
,<object>,<object>,<category>,<category>,<bool>,<object>,<float64>,<object>
0.0,18001,Overall,BIKE,15,False,False,1186.442203,Jobs


### 2.4.2. POIs

In [11]:
# POI (non-work) accessibility in the dashboard's long format: one raw-value
# row and one percentile row per zone / category / mode / threshold /
# per-person combination.
poi = (
    pd.read_parquet('../data/access/in_poi_2010.parquet').disp(0)
    # Keep only the rows flagged `wt_decay`; the flag itself is then dropped.
    .query('wt_decay').drop(columns=['wt_decay'])
    # Restrict to the four travel-time thresholds shown in the dashboard.
    .pipe(lambda df: df[(df.thresh.isin(['15', '30', '45', '60']))])
    # Map the (purpose, kind) pairs of interest to one dashboard category; the
    # inner join drops every other pair. The join keys are passed as a list,
    # the documented form for several keys (a tuple can be read as one label).
    .merge(Pdf(columns=['purpose', 'kind', 'category'], data=[
        ('Overall', 'Overall', 'Overall'),
        ('Services', 'Primary', 'Essential Services'),
        ('Shopping', 'Essential', 'Essential Shopping')
    ]), on=['purpose', 'kind']).drop(columns=['purpose', 'kind'])
    # Zone scale is needed so that percentiles compare like with like.
    .merge(zones.rename(columns=str.lower)[['geoid', 'scale']], on='geoid')
    .rename(columns=D(access='value'))
    # Inverted integer percentile within each group: the highest value maps to
    # 0, and missing values map to 100.
    .assign(percentile=lambda df: 100 - (100 * (
        df.groupby(['scale', 'category', 'mode', 'thresh', 'per_person'])['value']
        .rank(pct=True))).fillna(0).astype(int))
    # Name the two measure columns False/True so that the melt below turns the
    # column name directly into the `is_pctile` flag.
    .rename(columns=D(value=False, percentile=True)).drop(columns='scale')
    .melt(['geoid', 'category', 'mode', 'thresh', 'per_person'], var_name='is_pctile')
    .assign(target='Non-work').rename(columns=str.upper)
).disp()

5,370,400 rows x 8 cols; Memory: 62.1 MiB


Unnamed: 0,purpose,kind,mode,thresh,geoid,wt_decay,per_person,access
,<category>,<category>,<category>,<category>,<category>,<bool>,<bool>,<float32>


1,166,016 rows x 8 cols; Memory: 280.0 MiB


Unnamed: 0,GEOID,CATEGORY,MODE,THRESH,PER_PERSON,IS_PCTILE,VALUE,TARGET
,<object>,<object>,<category>,<category>,<bool>,<object>,<float64>,<object>
0.0,18001,Overall,DRIVE,15,False,False,1353.0,Non-work


### 2.4.3. Combine

In [12]:
# t=0:03
# Stack the job and POI accessibility tables into the single table used by the
# dashboard, with title-cased mode names and compact dtypes.
access = (
    # `aaa` and `poi` each come out of a melt with their own 0..n-1 index, so
    # the index is rebuilt here; otherwise the combined frame would carry
    # duplicated row labels, which would also be written out with the data.
    pd.concat([aaa, poi], ignore_index=True).rename(columns=str.lower)
    .assign(mode=lambda df: df['mode'].str.title())
    .astype(D(target=CAT, category=CAT, mode=CAT, thresh=CAT,
              geoid=CAT, is_pctile=bool, value=np.float32))
    .rename(columns=str.upper)
).disp()

1,781,088 rows x 8 cols; Memory: 34.6 MiB


Unnamed: 0,GEOID,CATEGORY,MODE,THRESH,PER_PERSON,IS_PCTILE,VALUE,TARGET
,<category>,<category>,<category>,<category>,<bool>,<bool>,<float32>,<category>
0.0,18001,Overall,Bike,15,False,False,1186.442261,Jobs


In [13]:
# Save the combined accessibility table for the dashboard.
access.to_parquet('../data/dashboard/access.parquet')

In [22]:
# Sanity check: print how often each level of every categorical column occurs.
for col in access.select_dtypes(CAT).columns:
    print(access[col].value_counts(), '\n')

GEOID
18001           288
180973517003    288
180973517001    288
18097351700     288
180973516002    288
               ... 
181770107001    180
181630106002    180
181359521001    172
180759629002    172
180290802021    168
Name: count, Length: 6407, dtype: int64 

CATEGORY
Overall               614216
Essential Services    378896
Essential Shopping    377928
High earnings         205024
Low earnings          205024
Name: count, dtype: int64 

MODE
Bike       608248
Transit    586872
Drive      307376
Walk       278592
Name: count, dtype: int64 

THRESH
60    451648
45    449368
30    445220
15    434852
Name: count, dtype: int64 

TARGET
Non-work    1166016
Jobs         615072
Name: count, dtype: int64 



# 3. Miscellaneous

## 3.1. Long to wide
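Wide-format CSV export of the EJ indicators (one row per zone, one column per indicator), prepared for David.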

In [14]:
# Export the EJ indicators in wide form: one row per geoid and one column per
# indicator (pivot_table averages any repeated geoid/indicator pair).
# NOTE(review): `U.mkfile` presumably prepares the output location and returns
# the path -- confirm against the helper's definition.
(
    pd.read_parquet('../data/ejs/ejs.parquet')
    .pivot_table(values='value', index='geoid', columns='indicator')
    .disp()
    .to_csv(U.mkfile('../data/dashboard/david/ej.csv'))
)

6,417 rows x 10 cols; Memory: 1.2 MiB


indicator,Air Toxics Cancer,Air Toxics Respiratory HI,Diesel PM,Ozone,PM2.5,Proximity to NPL,Proximity to RMP,Proximity to TSDF,Proximity to Traffic,Water Discharge
geoid,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>,<float64>
18001,635.60506,7.94658,7.50542,1333.715654,259.173035,0.85877,21.12884,54.02471,6896.231984,0.042906


In [18]:
# (pd.read_csv('../data/acs/acs2019.csv').disp(0)
#  .to_csv(U.mkfile('../data/dashboard/david/ses.csv'), index=False))