# 1. Setup

In [1]:
from pqdm.processes import pqdm

from utils import *

# 2. Prepare data

## 2.1. Prepare data

### 2.1.1. POI categories
Each SafeGraph POI category was manually classified by trip purpose and essentiality.

In [2]:
with open('../data/pois/poi_categories.yml', 'r') as f:
    cats = yaml.safe_load(f)['SafeGraph']
cats = Pdf(sum([[D(purpose=k, top_category=x) for x in v] 
                for k, v in cats.items()], []))
cats = cats.query('purpose != "Overall"').disp()

20 rows x 2 cols; Memory: 0.0 MiB


Unnamed: 0,purpose,top_category
,<object>,<object>
90.0,Services,Child Day Care Services


### 2.1.2. Count POIs by purpose & BG

In [3]:
def get_sg_poi_count(categories=cats, year=2020, overwrite=False):
    outpath = Path(f'../data/pois/sg_count_{year}.parquet')
    if outpath.exists() and not overwrite:
        return pd.read_parquet(outpath)
    path = '/home/umni2/a/umnilab/data/safegraph/pois/us/2020-11-06.parquet'
    poi = pd.read_parquet(path, columns=['top_category', 'lon', 'lat'])
    poi = poi.merge(categories, 'left', on='top_category')
    poi = pdf2gdf(poi, 'lon', 'lat', CRS_DEG)
    bg = gpd.read_parquet(f'../data/zones/us/bg_{year}.parquet')
    poi = poi.sjoin(bg[['geoid', 'geometry']], predicate='within')
    poi = pd.concat([
        poi.groupby(['purpose', 'geoid']).size().reset_index(),
        poi.groupby('geoid').size().reset_index().assign(purpose='Overall')
    ]).reset_index(drop=1).rename(columns={0: 'n_pois', 'geoid': 'cbg'})
    poi.to_parquet(mkfile(outpath))
    return poi

tot_poi = get_sg_poi_count(overwrite=0).disp() # t=0:44

681,065 rows x 3 cols; Memory: 91.8 MiB


Unnamed: 0,purpose,cbg,n_pois
,<object>,<object>,<int64>
0.0,Leisure,010010201001,2


In [4]:
tot_poi.groupby('purpose')['n_pois'].sum().to_frame().T

purpose,Leisure,Overall,Services,Shopping
n_pois,1040318,5551806,559395,290096


### 2.1.3. Impedance function

In [5]:
imped_params = (
    pd.read_csv('../data/nhts/fitted-params-power-exponential.csv')
    .query(' & '.join(['mode != "Transit"', 'measure == "Duration"',
                       'parameter != "r2"', 'purpose != "Work"']))
    .pivot_table('value', ['mode', 'purpose'], 'parameter').reset_index()
    .assign(mode=lambda df: df['mode'].str.lower()).rename_axis(None, axis=1)
    .assign(purpose=lambda df: df['purpose']
            .str.replace('Social/Recreational', 'Leisure')
            .str.replace('Other', 'Services'))
).disp(2)

12 rows x 4 cols; Memory: 0.0 MiB


Unnamed: 0,mode,purpose,alpha,beta
,<object>,<object>,<float64>,<float64>
0.0,bike,Services,0.044686,0.981751
1.0,bike,Overall,0.025914,1.222447


### 2.1.4. Potential accessibility
This only requires total jobs at the destination, not the OD flows

In [6]:
# travel time thresholds (in minutes)
thresholds = (15, 30, 45, 60, 90)

In [7]:
def get_potential_access(level, region, scale, mode, thresh,
                         year=2020, params=imped_params):
    cols = 'region scale mode thresh kind geoid opport'.split()
    try:
        pois = get_sg_poi_count(year=year)
        pois['geoid'] = pois['cbg'].str[:D(county=5, tract=11, bg=12)[scale]]
        pois = pois.groupby(['purpose', 'geoid'])['n_pois'].sum().reset_index()
        # read travel time between the zones of given scale by given mode
        tt_file = Path(f'../data/osrm/{level}/{region}/{scale}_{mode}_{year}.parquet')
        tt_cols = D(src_geoid='src', trg_geoid='trg', duration='time')
        tt = pd.read_parquet(tt_file, columns=tt_cols.keys()).rename(columns=tt_cols)
        zones = tt['src'].unique()
        tt = pd.concat([tt, Pdf(D(src=zones, trg=zones, time=0))])
        tt = tt.set_index(['src', 'trg'])['time'] / 60 # travel time in minutes
        t = tt[tt <= thresh] # travel times for filtered OD pairs
        dfs = []
        params = params.query(f'mode == "{mode}"')
        assert set(params.purpose) == set(pois.purpose), "Purposes don't match"
        for _, r in params.iterrows(): # for each purpose
            od = np.exp(-r.alpha * t ** r.beta).rename('weight').reset_index()
            od = od.merge(pois[pois.purpose == r.purpose],
                        left_on='trg', right_on='geoid')
            od['wtd_pois'] = od['n_pois'] * od['weight']
            dfs.append(od)
        xs = (pd.concat(dfs).groupby(['src', 'purpose'])['wtd_pois'].sum()
              .reset_index().rename(columns=D(src='geoid'))
              .assign(region=region, scale=scale, mode=mode, thresh=str(thresh))
              [cols].rename(columns=D(purpose='kind', wtd_pois='opport')))
    except Exception:
        xs = Pdf([], columns=cols)
    return xs

x = get_potential_access('state', '01-alabama', 'tract', 'drive', 60).disp()
# x = get_potential_access('msa', '29820-las-vegas', 'bg', 'drive', 30); x

0 rows x 7 cols; Memory: 0.0 MiB


Unnamed: 0,region,scale,mode,thresh,kind,geoid,opport
,<object>,<object>,<object>,<object>,<object>,<object>,<object>


In [8]:
def get_potential_access_all(level, year=2020, thresholds=thresholds,
                             njobs=4, overwrite=False):
    outpath = Path(f'../data/access/poi_{level}_{year}.parquet')
    if outpath.exists() and not overwrite:
        return pd.read_parquet(outpath)
    files = Path(f'../data/osrm/{level}').glob(f'*/*_{year}.parquet')
    cases = sum([[D(level=level, region=f.parent.stem,
                    scale=f.stem.split('_')[0], mode=f.stem.split('_')[1],
                    thresh=t, year=year) for t in thresholds]
                 for f in files], [])
    np.random.shuffle(cases) # shuffle to hopefully reduce parallel load
    dfs = pqdm(cases, get_potential_access, n_jobs=njobs, argument_type='kwargs')
    cat_cols = 'region scale mode thresh purpose geoid'.split()
    access = (pd.concat(dfs).astype(D(wtd_pois=np.int32))
              .sort_values(cat_cols).reset_index(drop=1)
              .astype({x: CAT for x in cat_cols})[cat_cols + ['wtd_pois']])
    access.to_parquet(mkfile(outpath))
    return access

# xs_st = get_potential_access_all('state', overwrite=0, njobs=16).disp() # t=2:36
xs_msa = get_potential_access_all('msa', overwrite=1, njobs=16).disp() # t=30:08

QUEUEING TASKS | :   0%|          | 0/2100 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/2100 [00:00<?, ?it/s]

# 3. Visualize