# Population Estimates: Samples submitted to Cloud Factory

In [1]:
from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

tqdm.pandas()

In [2]:
# Import functions
%run inference-functions.ipynb import load_data, get_bounds, assign_cbgs, visualize_tile_predvOSM, assign_cbgs_by_coverage
%run ADU_permit_matching-polygon-pipeline.ipynb import load_sources

In [3]:
# Paths
OAK_FP = '/oak/stanford/groups/deho/building_compliance/'
OUTPUT_FP = os.path.join(OAK_FP, 'outputs', 'Population-Estimates', 'outputs')
PERMIT_INPUT_FP = os.path.join(OAK_FP, 'outputs', 'Permit-Matching', 'inputs')

BUILD_FP = os.path.join(OAK_FP, 'outputs', 'cbg-inference-{}', 'inference_building_processed')
OSM_FP = os.path.join(OAK_FP, 'outputs', 'cbg-inference-2020', 'osm_building_processed')

In [4]:
# Load data
# * Residential parcels (assigned to CBGs)
sj_parcels_res = gpd.read_file(os.path.join(OUTPUT_FP, 'sj-parcels-res-cbgs'))
assert sj_parcels_res.duplicated('APN').sum() == 0

# * Census block groups
cbg_income_2016_SJ = gpd.read_file(os.path.join(OUTPUT_FP, 'Strata', 'cbg-income-strata'))

# * Permits
permits = pd.read_csv(os.path.join(OAK_FP, 'outputs/Permit-Matching/outputs/all_permits.csv'))
permits['geometry_permit'] = gpd.GeoSeries.from_wkt(permits['geometry_permit'])
permits_2015_20 = permits.loc[
    permits['issue_year'].isin([2015, 2016, 2017, 2018, 2019, 2020])].copy()

In [5]:
# Add income strata information to parcel data
sj_parcels_res = pd.merge(
    sj_parcels_res, cbg_income_2016_SJ[['GEOID', 'strata_inc']], 
    how='left', validate='many_to_one')

## 1. CF Batch 1: Sample 3.5% of the parcels in each strata
Sample ~5,000 residential parcels.

In [14]:
# Sample 3% of parcels
percentage_of_samples = 0.035
iteration1 = pd.DataFrame()
for strata_inc in tqdm(sj_parcels_res['strata_inc'].unique()):
    parcels_to_review = sj_parcels_res.loc[sj_parcels_res['strata_inc'] == strata_inc]['APN'].unique()

    # Sample parcels
    np.random.seed(42)
    strata_inc_N = len(parcels_to_review)
    parcels_to_review = np.random.choice(
        parcels_to_review, size=int(strata_inc_N * percentage_of_samples), replace=False)

    for i, parcel_apn in enumerate(parcels_to_review):
        parcel_dict = {'APN': [parcel_apn], 'strata_inc': [strata_inc]}
        iteration1 = pd.concat([iteration1, pd.DataFrame.from_dict(parcel_dict)])

100%|██████████| 51/51 [00:02<00:00, 24.79it/s]


In [15]:
iteration1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5569 entries, 0 to 0
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   APN         5569 non-null   object
 1   strata_inc  5569 non-null   object
dtypes: object(2)
memory usage: 130.5+ KB


In [8]:
iteration1.to_csv(
    os.path.join(OUTPUT_FP, '..', 'CloudFactory', 'Iteration1', 'sampled_APNs.csv'), index=False)

## 2. CF Round 2: Residential parcels associated with 2015-20 AANC permits
We submit a batch of parcels that match to AANC permits issued 2015-20 to estimate the number of permits that relate to the construction of detached ADUs. 

In [16]:
# 5k AANC permits issued during 2015-20
permits_2015_20_gdf = gpd.GeoDataFrame(permits_2015_20, geometry='geometry_permit', crs='EPSG:4326')

In [17]:
# 3k AANC permits issued during 2015-20 that spatially fall within our residential boundary
permits_2015_20_res = permits_2015_20_gdf.sjoin(
    gpd.GeoDataFrame(geometry=[sj_parcels_res.geometry.buffer(0).unary_union], crs='EPSG:4326'), 
    how='left', predicate='intersects')
permits_2015_20_res = permits_2015_20_res.loc[~permits_2015_20_res['index_right'].isna()]

In [18]:
# Grab only permits with spatially-matched APNs that coincide with the Permit APN
permits_2015_20_res = permits_2015_20_res.loc[permits_2015_20_res['APN'] == permits_2015_20_res['APN_parcel']]

In [19]:
# Aggregate at the parcel level
iteration2 = permits_2015_20_res.copy()
iteration2 = pd.DataFrame({'APN': iteration2['APN_parcel'].unique()})
print('[INFO] Number of parcels: {}'.format(len(iteration2)))

[INFO] Number of parcels: 2797


In [20]:
# Remove parcels that have been annotated in B1
iteration2 = iteration2.loc[~iteration2['APN'].isin(iteration1['APN'].unique())]
print('[INFO] Number of parcels: {}'.format(len(iteration2)))

[INFO] Number of parcels: 2682


In [21]:
# Remove parcels that are not in the residential area
iteration2 = iteration2.loc[iteration2['APN'].isin(sj_parcels_res['APN'].unique())]
print('[INFO] Number of parcels: {}'.format(len(iteration2)))

[INFO] Number of parcels: 2682


In [22]:
iteration2.to_csv(
    os.path.join(OUTPUT_FP, '..', 'CloudFactory', 'Iteration2', 'sampled_APNs.csv'), index=False)

In [23]:
# Generate tile centroids
parcel_centroids2 = iteration2.copy()
parcel_centroids2 = pd.merge(
    parcel_centroids2, sj_parcels_res[['APN', 'geometry']], how='left', validate='one_to_one')

parcel_centroids2 = gpd.GeoDataFrame(parcel_centroids2, crs='EPSG:4326')
parcel_centroids2['centroid'] = parcel_centroids2.geometry.centroid

parcel_centroids2[['APN', 'centroid']].to_csv(
    os.path.join(OUTPUT_FP, '..', 'CloudFactory', 'Iteration2', 'parcel_centroids.csv'), index=False)