# 1. Setup

In [2]:
import sys
sys.path.append('../..')
import mobiquity as mq
from mobiquity.names import *

import urllib.request

# 3. OSM database

## 3.1. Download

### 3.1.1. Census regions

In [3]:
def download_region_db(name, overwrite=False):
    """Download the Geofabrik OSM extract (.osm.pbf) of one US region.

    The extract is saved to ``DATA/osm/region/{name}.osm.pbf``. Nothing is
    downloaded when that file already exists, unless `overwrite` is true.

    Parameters
    ----------
    name : str
        Region name; it is lower-cased and spaces are replaced by hyphens.
        Must then be one of 'midwest', 'northeast', 'pacific', 'south',
        'west'.
    overwrite : bool
        Download again even if the output file already exists.

    Raises
    ------
    AssertionError
        If the normalized `name` is not one of the supported regions.
    """
    name = name.lower().replace(' ', '-')
    assert name in ['midwest', 'northeast', 'pacific', 'south', 'west'], name
    outpath = DATA / f'osm/region/{name}.osm.pbf'
    if outpath.exists() and not overwrite:
        return
    baseUrl = 'https://download.geofabrik.de/north-america'
    url = f'{baseUrl}/us-{name}-latest.osm.pbf'
    # Download into a sibling ".part" file and rename it only on success.
    # Otherwise an interrupted transfer would leave a truncated file at
    # `outpath` that the existence check above would accept as complete.
    partial = outpath.with_name(outpath.name + '.part')
    try:
        # `U.mkfile` is a project helper, used as before to prepare the
        # destination path (presumably creating parent folders - confirm).
        urllib.request.urlretrieve(url, U.mkfile(partial))
        partial.replace(outpath)
    finally:
        # Drop any leftover partial file after a failed download.
        if partial.exists():
            partial.unlink()

# download_region_db('pacific')

### 3.1.2. States

In [4]:
def download_state_db(name, overwrite=False):
    """Download the Geofabrik OSM extract (.osm.pbf) of one US state.

    The extract is saved to ``DATA/osm/state/{name}/{name}.osm.pbf``.
    Nothing is downloaded when that file already exists, unless
    `overwrite` is true.

    Parameters
    ----------
    name : str
        State name; it is lower-cased and spaces are replaced by hyphens
        before being inserted into the download URL and the output path.
    overwrite : bool
        Download again even if the output file already exists.
    """
    name = name.lower().replace(' ', '-')
    outpath = DATA / f'osm/state/{name}/{name}.osm.pbf'
    if outpath.exists() and not overwrite:
        return
    baseUrl = 'https://download.geofabrik.de/north-america/us'
    url = f'{baseUrl}/{name}-latest.osm.pbf'
    # Download into a sibling ".part" file and rename it only on success.
    # Otherwise an interrupted transfer would leave a truncated file at
    # `outpath` that the existence check above would accept as complete.
    partial = outpath.with_name(outpath.name + '.part')
    try:
        # `U.mkfile` is a project helper, used as before to prepare the
        # destination path (presumably creating parent folders - confirm).
        urllib.request.urlretrieve(url, U.mkfile(partial))
        partial.replace(outpath)
    finally:
        # Drop any leftover partial file after a failed download.
        if partial.exists():
            partial.unlink()

# for state in tqdm(mk.geo.US_STATES_FIPS.keys()):
#     download_state_db(state)

### 3.1.3. Extract regional database

In [5]:
# %%bash
# #! long time to run
# cd ../data/osm/region
# for rgn in midwest northeast south west; do
#     outfile=$rgn.osm
#     if [ ! -f $outfile ]; then osmium cat $rgn.osm.pbf -o $outfile; fi
# done

## 3.2. Extract for MSAs

### 3.2.1. Prepare MSA lists
Each of the 50 largest MSAs was manually assigned a US region label. This allows each MSA's OSM database to be extracted from the regional OSM extract rather than from a state extract, since an MSA can span multiple states but lies within a single region.

In [7]:
# Load the MSA -> US region lookup table (columns `cbsa` and `region`).
# NOTE(review): the result of `.disp()` is what gets bound to `msa2rgn`, so
# this presumably relies on `.disp()` returning the frame it displays - confirm.
msa2rgn = U.load(DATA / 'msa2region.csv').disp()

50 rows x 2 cols; Memory: 0.0 MiB


Unnamed: 0,cbsa,region
,<object>,<object>
0.0,"Atlanta-Sandy Springs-Roswell, GA",south
1.0,"Austin-Round Rock-San Marcos, TX",south


In [None]:
# Total population per `urba` label, computed from the rows of the ACS table
# with scale 'BG', keeping the 50 most populous labels.
# NOTE(review): `urba` must already exist in the kernel; it is not defined in
# the cells shown here.
top_msas = (
    U.load(
        DATA / 'ses/acs/acs_2021.parquet',
        filters=[('scale', '==', 'BG')],
        columns=['geoid', 'popu'],
    )
    .merge(urba, on='geoid')
    .groupby('urba')['popu']
    .sum()
    .astype(int)
    .sort_values(ascending=False)
    .reset_index()
    .head(50)
)
# Short name: the part of the label before the first ',', '-' or '/'.
top_msas['msa'] = [
    label.split(',')[0].split('-')[0].split('/')[0]
    for label in top_msas['urba']
]
top_msas.disp(); pass

### 3.2.2. Prepare GeoJSON
Save the boundary of each MSA as a GeoJSON file, grouped by the MSA's region, so that the `osmium extract` command can use it.

In [15]:
def get_msa_json(year=2020, msa2rgn=msa2rgn, top_msas=topMSAs):
    """Write one GeoJSON boundary file per MSA, grouped by US region.

    County zones of the given year are joined to the MSA table, dissolved
    into one geometry per MSA `key`, and reduced to the single largest
    polygon of each MSA. Each result is written to
    ``DATA/osm/msa/json/{region}/{key}.geojson``.

    Parameters
    ----------
    year : int
        Selects the zones file ``DATA/zones/zones_{year}.parquet``; only its
        rows with scale 'county' are read. The file must provide the
        columns `cbsa_id` and `geometry`.
    msa2rgn : DataFrame
        MSA-to-region lookup with columns `cbsa` and `region`.
    top_msas : DataFrame
        MSA table with columns `cbsa`, `geoid` (used as `cbsa_id`) and `key`.

    Notes
    -----
    NOTE(review): the default arguments are evaluated when this cell is
    executed, so `msa2rgn` and `topMSAs` must already exist in the kernel.
    `topMSAs` is not defined in the cells shown here - confirm where it
    comes from and that it carries the columns listed above.
    """
    msa = top_msas.merge(msa2rgn, on='cbsa').rename(
        columns=D(geoid='cbsa_id'))[['cbsa_id', 'key', 'region']]
    counties = gpd.read_parquet(DATA / f'zones/zones_{year}.parquet',
                                filters=[('scale', '==', 'county')])
    counties = counties.merge(msa, on='cbsa_id')
    counties = counties[['key', 'region', 'geometry']]
    # One (multi)polygon per MSA
    boundaries = counties.dissolve('key').reset_index()
    # For multipolygon features, keep only the largest polygon for simplicity.
    # `column` is the documented keyword of GeoDataFrame.explode; the earlier
    # `subset=` keyword was not a parameter of it.
    parts = boundaries.explode(column='geometry', index_parts=True)
    # Areas are computed in CRS_M (presumably a metric projection - confirm)
    parts['area'] = parts.to_crs(CRS_M).area
    largest = parts.sort_values('area').groupby('key').last()
    for key, row in largest.iterrows():
        outpath = DATA / f'osm/msa/json/{row.region}/{key}.geojson'
        # Single-row frame for this MSA, labelled with CRS_DEG for export
        feature = Gdf(row.to_frame().T, crs=CRS_DEG)
        feature.to_file(U.mkfile(outpath), driver='GeoJSON')

# get_msa_json() # t=0:14

### 3.2.3. Extract MSA database by region
This step is performed with the `./get_msa_osm.sh` script.