# Population Estimates: Samples submitted to Cloud Factory

In [11]:
from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

tqdm.pandas()

In [12]:
# Import functions
%run inference-functions.ipynb import load_data, get_bounds, assign_cbgs, visualize_tile_predvOSM, assign_cbgs_by_coverage
%run ADU_permit_matching-polygon-pipeline.ipynb import load_sources

In [13]:
# Paths
OAK_FP = '/oak/stanford/groups/deho/building_compliance/'
OUTPUT_FP = os.path.join(OAK_FP, 'outputs', 'Population-Estimates', 'outputs')
PERMIT_INPUT_FP = os.path.join(OAK_FP, 'outputs', 'Permit-Matching', 'inputs')

BUILD_FP = os.path.join(OAK_FP, 'outputs', 'cbg-inference-{}', 'inference_building_processed')
OSM_FP = os.path.join(OAK_FP, 'outputs', 'cbg-inference-2020', 'osm_building_processed')

In [14]:
# Load data
# * Residential parcels (assigned to CBGs)
sj_parcels_res = gpd.read_file(os.path.join(OUTPUT_FP, 'sj-parcels-res-cbgs'))
assert sj_parcels_res.duplicated('APN').sum() == 0

# * Census block groups
cbg_income_2016_SJ = gpd.read_file(os.path.join(OUTPUT_FP, 'Strata', 'cbg-income-strata'))

# * Permits
sj_parcel_permit = pd.read_csv(os.path.join(PERMIT_INPUT_FP, '..', 'outputs', 'parcel_permit_found.csv'))
sj_permit_noparcel = pd.read_csv(os.path.join(PERMIT_INPUT_FP, '..', 'outputs', 'parcel_permit_notfound.csv'))
sj_parcel_permit['geometry_parcel'] = gpd.GeoSeries.from_wkt(sj_parcel_permit['geometry_parcel'])
sj_parcel_permit['geometry_permit'] = gpd.GeoSeries.from_wkt(sj_parcel_permit['geometry_permit'])

In [15]:
# Add income strata information to parcel data
sj_parcels_res = pd.merge(
    sj_parcels_res, cbg_income_2016_SJ[['GEOID', 'strata_inc']], 
    how='left', validate='many_to_one')

## 1. CF Round 1: Sample 3% of the parcels in each strata

In [16]:
# Sample 3% of parcels
percentage_of_samples = 0.03
iteration1 = pd.DataFrame()
for strata_inc in tqdm(sj_parcels_res['strata_inc'].unique()):
    parcels_to_review = sj_parcels_res.loc[sj_parcels_res['strata_inc'] == strata_inc]['APN'].unique()

    # Sample parcels
    np.random.seed(42)
    strata_inc_N = len(parcels_to_review)
    parcels_to_review = np.random.choice(
        parcels_to_review, size=int(strata_inc_N * percentage_of_samples), replace=False)

    for i, parcel_apn in enumerate(parcels_to_review):
        parcel_dict = {'APN': [parcel_apn], 'strata_inc': [strata_inc]}
        iteration1 = pd.concat([iteration1, pd.DataFrame.from_dict(parcel_dict)])

100%|██████████| 51/51 [00:03<00:00, 12.91it/s]


In [7]:
iteration1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5081 entries, 0 to 0
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   APN         5081 non-null   object
 1   strata_inc  5081 non-null   object
dtypes: object(2)
memory usage: 119.1+ KB


In [10]:
iteration1.to_csv(
    os.path.join(OUTPUT_FP, '..', 'CloudFactory', 'Iteration1', 'sampled_APNs.csv'), index=False)

In [18]:
iteration1['strata_inc'].unique()

array(['IS_35', 'IS_45', 'IS_40', 'IS_30', 'IS_17', 'IS_11', 'IS_19',
       'IS_22', 'IS_23', 'IS_43', 'IS_18', 'IS_12', 'IS_37', 'IS_1',
       'IS_49', 'IS_32', 'IS_29', 'IS_31', 'IS_14', 'IS_39', 'IS_13',
       'IS_38', 'IS_36', 'IS_8', 'IS_42', 'IS_9', 'IS_15', 'IS_25',
       'IS_21', 'IS_2', 'IS_10', 'IS_6', 'IS_24', 'IS_47', 'IS_4',
       'IS_27', 'IS_33', 'IS_16', 'IS_48', 'IS_28', 'IS_46', 'IS_34',
       'IS_0', 'IS_20', 'IS_26', 'IS_41', 'IS_3', 'IS_7', 'IS_5', 'IS_44'],
      dtype=object)