# Population Estimates: Compute confidence levels for all parcels

In [1]:
import os
import warnings

import geopandas as gpd
import numpy as np
import pandas as pd
from tqdm import tqdm

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

tqdm.pandas()

In [2]:
# Import functions
# NOTE(review): `%run` executes the whole target notebook; the trailing
# `import ...` text looks like it is passed along as plain arguments rather than
# restricting which names get defined -- confirm against the IPython docs.
# NOTE(review): `parcel_level_data` and `process_parcel_buildings` are called
# later in this notebook but are not listed here; presumably one of these two
# notebooks defines them -- verify.
%run inference-functions.ipynb import load_data, get_bounds, assign_cbgs, visualize_tile_predvOSM, assign_cbgs_by_coverage
%run ADU_permit_matching-polygon-pipeline.ipynb import load_sources

In [3]:
# Paths
# Root folder; every other path in this cell is built from it.
OAK_FP = '/oak/stanford/groups/deho/building_compliance/'
# Population-Estimates outputs: main() reads its inputs from here and writes the
# confidence files under 'Confidences_construction'.
OUTPUT_FP = os.path.join(OAK_FP, 'outputs', 'Population-Estimates', 'outputs')
# Permit-Matching inputs folder; main() reaches the sibling 'outputs' folder via '..'.
PERMIT_INPUT_FP = os.path.join(OAK_FP, 'outputs', 'Permit-Matching', 'inputs')

# BUILD_FP carries a '{}' placeholder (presumably the inference year, compare the
# fixed '2020' in OSM_FP -- confirm). Neither path is referenced in the cells
# shown here; they may be consumed by the functions loaded via %run.
BUILD_FP = os.path.join(OAK_FP, 'outputs', 'cbg-inference-{}', 'inference_building_processed')
OSM_FP = os.path.join(OAK_FP, 'outputs', 'cbg-inference-2020', 'osm_building_processed')

In [4]:
# Define parameters
# * Share of parcels in each income strata to process. main() multiplies the
#   stratum size by this value, so it is a fraction: 1 means every parcel.
percentage_of_samples = 1

# * Divide compute ----- MODIFY AGENT HERE (N Nathan, A Andrea) ---------
# main() keeps only the parcels whose income strata were assigned to this agent
# and writes its outputs into a sub-folder named after it.
agent = 'N'

# Note: These parameters are relevant for how OSM footprints are used
# to identify the main building and to decide whether to snap to OSM small building
# footprints.
# The limit_2016_inferences parameter determines whether OSM footprints are used
# symmetrically for 2020 and 2016. If True, we only include OSM footprints for 2016
# if these have been partially discovered by model inferences. If False, we include
# any OSM small building for the 2016 footprints (as done for 2020).
# The dict is forwarded unchanged to process_parcel_buildings via parcel_confidence.

model_params = {'area_threshold_main': 30, 'area_threshold_small': 20,
                'flatten_threshold': 0.85, 'main_expansion_type': 'raw_polygons', 
                'main_polygon_definition': 'OSM', 'negative_buffer': 0.5, 'simplify_tolerance': 0,
                'limit_2016_inferences': True}

# 1. Compute confidences

In [5]:
def parcel_confidence(sj_parcels_res, sj_parcel_permit, parcel_apn, model_params):
    """Assign a confidence code to one parcel by comparing 2016 and 2020 small buildings.

    The parcel's 2020 non-main buildings (after OSM processing) are compared with
    the 2016 inferences flagged ``small == 1``.

    Parameters
    ----------
    sj_parcels_res : DataFrame with an 'APN' column
        Parcel table; the row(s) matching ``parcel_apn`` are used as parcel bounds.
    sj_parcel_permit : DataFrame
        Permit table, passed through to ``parcel_level_data``.
    parcel_apn : scalar
        APN of the parcel to evaluate.
    model_params : dict
        Forwarded unchanged to ``process_parcel_buildings``.

    Returns
    -------
    (int, dict)
        The code and the per-year building frames (keys '2016' and '2020') with
        main buildings removed. Codes, evaluated in this order:

        * 1 -- the parcel has no non-main building in 2020;
        * 3 -- there is no small 2016 inference at all;
        * 2 -- exactly one small building on each side (no overlap check), or
          more small 2016 inferences than 2020 non-main buildings;
        * otherwise 3 if at least one 2020 building overlaps no small 2016
          inference, else 2.
    """
    # Collect parcel data (the permit subset is returned but not needed here)
    (inferred_buildings_2020_parcel, inferred_buildings_2016_parcel,
     osm_buildings_parcel, _permits_parcel) = parcel_level_data(
        parcel_apn, sj_parcels_res, sj_parcel_permit)
    parcel_bounds = sj_parcels_res[sj_parcels_res['APN'] == parcel_apn]

    # Prepare data: the same OSM footprints are supplied for both years
    inferred_buildings_parcel = {'2016': inferred_buildings_2016_parcel,
                                 '2020': inferred_buildings_2020_parcel}
    osm_buildings_parcel = {'2016': osm_buildings_parcel,
                            '2020': osm_buildings_parcel}

    # Small 2016 inferences used for the comparison. This is a separate, filtered
    # frame; the dict above still holds the raw 2016 inferences (or None).
    if inferred_buildings_2016_parcel is None:
        small_2016 = pd.DataFrame(columns=['GEOID', 'area', 'small', 'large', 'geometry', 'iou'])
    else:
        small_2016 = inferred_buildings_2016_parcel
    small_2016 = small_2016[small_2016['small'] == 1].reset_index(drop=True)

    # Incorporate OSM data
    parcel_buildings, _ = process_parcel_buildings(
        inferred_buildings_parcel, osm_buildings_parcel, parcel_bounds, model_params)

    for year in ['2016', '2020']:
        if parcel_buildings[year] is None:
            # No building was returned for this year: substitute an empty frame
            parcel_buildings[year] = pd.DataFrame(columns=['main_building_flag', 'OSM_flag',
                                                           'build_confidence', 'geometry', 'area'])
            continue

        # Keep only the non-main buildings
        year_buildings = parcel_buildings[year].copy()
        parcel_buildings[year] = year_buildings[
            year_buildings['main_building_flag'] == False].reset_index(drop=True)

    buildings_2020 = parcel_buildings['2020']

    if buildings_2020.empty:
        return 1, parcel_buildings
    if small_2016.empty:
        return 3, parcel_buildings
    if len(small_2016) == 1 and len(buildings_2020) == 1:
        return 2, parcel_buildings
    if len(small_2016) > len(buildings_2020):
        return 2, parcel_buildings

    # A 2020 building is paired when it overlaps any small 2016 inference.
    # Testing the intersection area directly is equivalent to IoU > 0 and avoids
    # dividing by a zero union area for degenerate geometries.
    geometries_2016 = list(small_2016['geometry'])
    every_building_paired = all(
        any(geometry_2020.intersection(geometry_2016).area > 0
            for geometry_2016 in geometries_2016)
        for geometry_2020 in buildings_2020['geometry'])

    return (2 if every_building_paired else 3), parcel_buildings
            

In [6]:
def main():
    """Compute parcel confidences for this agent's income strata, with checkpoints.

    Reads the residential parcels, the CBG income strata and the matched permits,
    assigns each income stratum to agent 'A' or 'N' (fixed seed), keeps the
    parcels of the globally configured ``agent`` and calls ``parcel_confidence``
    for every sampled parcel that is not already in the output files.

    Results go to ``<OUTPUT_FP>/Confidences_construction/<agent>/``:
    ``parcel-confidence.csv`` plus one building file per year. If the CSV already
    exists the run resumes from it. Outputs are written after every 50 newly
    processed parcels and at the end of each stratum, so no processed parcel is
    left unsaved.

    Uses the module-level ``OUTPUT_FP``, ``PERMIT_INPUT_FP``, ``agent``,
    ``percentage_of_samples`` and ``model_params``. Returns None.
    """
    years = ['2016', '2020']
    save_every = 50  # newly processed parcels between two checkpoints

    # Building files written with `to_file` come back with shortened column
    # names ('main_build', 'build_conf'). Map them back to the names produced in
    # memory so reloaded and freshly computed frames line up when concatenated.
    truncated_to_full = {'main_build': 'main_building_flag',
                         'build_conf': 'build_confidence'}

    # Load data
    # * Residential parcels (assigned to CBGs)
    sj_parcels_res = gpd.read_file(os.path.join(OUTPUT_FP, 'sj-parcels-res-cbgs'))
    assert sj_parcels_res.duplicated('APN').sum() == 0

    # * Census block groups
    cbg_income_2016_SJ = gpd.read_file(os.path.join(OUTPUT_FP, 'Strata', 'cbg-income-strata'))

    # * Permits (geometries are stored as WKT text in the CSV)
    sj_parcel_permit = pd.read_csv(os.path.join(PERMIT_INPUT_FP, '..', 'outputs', 'parcel_permit_found.csv'))
    for geometry_col in ('geometry_parcel', 'geometry_permit'):
        sj_parcel_permit[geometry_col] = gpd.GeoSeries.from_wkt(sj_parcel_permit[geometry_col])

    # Add income strata information to parcel data
    sj_parcels_res = pd.merge(
        sj_parcels_res, cbg_income_2016_SJ[['GEOID', 'strata_inc']],
        how='left', validate='many_to_one')

    # Divide compute (split up the income strata)
    unique_strata = sj_parcels_res['strata_inc'].unique()

    # Seed chosen to balance the number of APNs between agents; changing it
    # changes which strata each agent owns.
    np.random.seed(44)
    a1_strata = np.random.choice(unique_strata, size=len(unique_strata)//2, replace=False)

    sj_parcels_res['agent'] = sj_parcels_res['strata_inc'].apply(
        lambda strata_inc: 'A' if strata_inc in a1_strata else 'N')

    # Drop parcels for other agent
    sj_parcels_res = sj_parcels_res.loc[sj_parcels_res['agent'] == agent]

    # Set up output files
    parcel_output = os.path.join(OUTPUT_FP, 'Confidences_construction', agent, 'parcel-confidence.csv')
    building_output = os.path.join(OUTPUT_FP, 'Confidences_construction', agent, 'building-confidence-{}')

    if os.path.exists(parcel_output):
        # Resume. Identifiers are read as text so they are not re-typed as numbers.
        parcel_conf_df = pd.read_csv(parcel_output, dtype={'APN': str, 'GEOID': str})
        buildings_gpd = {
            year: gpd.read_file(building_output.format(year)).rename(columns=truncated_to_full)
            for year in years}

        reviewed_parcels = set(parcel_conf_df['APN'])
        reviewed_istrata = parcel_conf_df['strata_inc'].unique()

        print('[INFO] Number of reviewed income strata: {}'.format(len(reviewed_istrata)))
        print('[INFO] Number of reviewed parcels: {}'.format(len(reviewed_parcels)))

    else:
        # exist_ok: the folder may exist already even though the CSV does not
        os.makedirs(os.path.dirname(parcel_output), exist_ok=True)

        parcel_conf_df = pd.DataFrame()
        buildings_gpd = {'2016': gpd.GeoDataFrame(geometry=[]), '2020': gpd.GeoDataFrame(geometry=[])}
        reviewed_parcels = set()

    # Results accumulated since the last checkpoint. They are merged into the
    # running frames in one concat per checkpoint instead of one per parcel.
    new_rows = []
    new_buildings = {year: [] for year in years}

    def _checkpoint():
        """Merge pending results into the running frames and write them to disk."""
        nonlocal parcel_conf_df
        if not new_rows:
            return  # nothing processed since the last save

        parcel_conf_df = pd.concat([parcel_conf_df, pd.DataFrame(new_rows)])
        new_rows.clear()
        parcel_conf_df.to_csv(parcel_output, index=False)

        for year in years:
            buildings_gpd[year] = pd.concat([buildings_gpd[year]] + new_buildings[year])
            new_buildings[year].clear()
            buildings_gpd[year].to_file(building_output.format(year))

    # Compute confidence for each parcel, order by income strata
    for strata_inc in tqdm(sj_parcels_res['strata_inc'].unique()):
        parcels_to_review = sj_parcels_res.loc[sj_parcels_res['strata_inc'] == strata_inc]['APN'].unique()

        # Sample parcels (re-seeded per stratum so the sample is reproducible)
        np.random.seed(42)
        strata_inc_N = len(parcels_to_review)
        parcels_to_review = np.random.choice(
            parcels_to_review, size=int(strata_inc_N * percentage_of_samples), replace=False)

        for parcel_apn in parcels_to_review:
            # Compare as text: APNs reloaded from the CSV are strings
            if str(parcel_apn) in reviewed_parcels:
                continue

            # Compute confidence
            parcel_conf, parcel_build = parcel_confidence(
                sj_parcels_res, sj_parcel_permit, parcel_apn, model_params)

            # Get geoid
            parcel_geoid = sj_parcels_res.loc[sj_parcels_res['APN'] == parcel_apn].iloc[0]['GEOID']

            # Append
            new_rows.append({'APN': parcel_apn, 'GEOID': parcel_geoid,
                             'strata_inc': strata_inc, 'confidence': parcel_conf})

            for year in years:
                if parcel_build[year] is not None:
                    parcel_build[year]['APN'] = parcel_apn
                    new_buildings[year].append(parcel_build[year])

            if len(new_rows) >= save_every:
                _checkpoint()

        # Always save at the end of a stratum so its last parcels are not lost
        _checkpoint()

In [None]:
# Run the confidence computation for the agent configured in the parameters cell.
main()

[INFO] Number of reviewed income strata: 2
[INFO] Number of reviewed parcels: 9540


  0%|          | 0/26 [00:00<?, ?it/s]

# 2. Finalize
Concatenate both parcel confidence datasets.

In [20]:
# Load parcel and CBG data
cbg_income_2016_SJ = gpd.read_file(os.path.join(OUTPUT_FP, 'Strata', 'cbg-income-strata'))
sj_parcels_res = gpd.read_file(os.path.join(OUTPUT_FP, 'sj-parcels-res-cbgs'))

# Load separate parcel confidence data frames.
# The loop variable is deliberately not called `agent`: that global is the
# compute setting read by main(), and overwriting it here would silently change
# which agent a later main() run processes.
agent_frames = []
for agent_id in ['A', 'N']:
    parcel_output = os.path.join(OUTPUT_FP, 'Confidences_construction', agent_id, 'parcel-confidence.csv')
    agent_frames.append(pd.read_csv(parcel_output, dtype={'GEOID': 'str'}))

parcel_conf_df = pd.concat([pd.DataFrame()] + agent_frames)

reviewed_parcels = parcel_conf_df['APN'].unique()
reviewed_istrata = parcel_conf_df['strata_inc'].unique()

print('[INFO] Number of reviewed income strata: {}'.format(len(reviewed_istrata)))
print('[INFO] Number of reviewed parcels: {}'.format(len(reviewed_parcels)))

# Drop duplicates and keep only the columns that are rebuilt below
parcel_conf_df = parcel_conf_df.drop_duplicates(subset=['APN', 'confidence'])
parcel_conf_df = parcel_conf_df[['APN', 'confidence']]

# Re-do GEOID (because it incorrectly saves as int).
# validate='one_to_one' makes this fail loudly if an APN is still duplicated,
# i.e. if it was saved with two different confidence values.
parcel_conf_df = pd.merge(
    parcel_conf_df, sj_parcels_res[['APN', 'GEOID']],
    on='APN', how='left', validate='one_to_one')

# Re-do strata_income
parcel_conf_df = pd.merge(
    parcel_conf_df, cbg_income_2016_SJ[['GEOID', 'strata_inc']],
    on='GEOID', how='left', validate='many_to_one')

# Save
parcel_conf_df.to_csv(os.path.join(OUTPUT_FP, 'Confidences_construction', 'parcel-confidence.csv'), index=False)

[INFO] Number of reviewed income strata: 51
[INFO] Number of reviewed parcels: 162685


In [28]:
# Reload the combined parcel confidences (GEOID kept as text) and summarise them
parcel_conf_df = pd.read_csv(
    os.path.join(OUTPUT_FP, 'Confidences_construction', 'parcel-confidence.csv'),
    dtype={'GEOID': str},
)
parcel_conf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162685 entries, 0 to 162684
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   APN         162685 non-null  object 
 1   confidence  38886 non-null   float64
 2   GEOID       159883 non-null  object 
 3   strata_inc  159883 non-null  object 
dtypes: float64(1), object(3)
memory usage: 5.0+ MB


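Building confidences. Known issue: because of the GPD limit on column-name length, some column names were shortened when the building files were written, so the saved files ended up with different column names and the buildings were not concatenated correctly.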

In [30]:
# Combine the per-agent building files into one frame per year.
# The loop variable is named `agent_id` so that the global `agent` compute
# setting read by main() is left untouched by this cell.
buildings_gpd = {'2016': gpd.GeoDataFrame(),  '2020': gpd.GeoDataFrame()}

for agent_id in ['A', 'N']:
    # '{}' is filled in with the year below
    building_output = os.path.join(OUTPUT_FP, 'Confidences_construction', agent_id, 'building-confidence-{}')

    for year in ['2016', '2020']:
        buildings_gpd[year] = pd.concat(
            [buildings_gpd[year], gpd.read_file(building_output.format(year))])

In [None]:
# Save one combined building file per year next to the per-agent folders
for year in ('2016', '2020'):
    combined_building_fp = os.path.join(
        OUTPUT_FP, 'Confidences_construction', 'building-confidence-{}'.format(year))
    buildings_gpd[year].to_file(combined_building_fp)

In [None]:
# Cast the 2020 building confidence column to float (only 2020 is converted here).
# NOTE(review): presumably the column is read back from file as text -- confirm.
buildings_gpd['2020']['build_conf'] = buildings_gpd['2020']['build_conf'].astype('float')

In [11]:
buildings_gpd['2020'].head()

Unnamed: 0,main_build,OSM_flag,build_conf,area,APN,geometry
0,1,0,0.8265858678962559,304.303273,61248010,"POLYGON ((-121.80168 37.36919, -121.80168 37.3..."
1,1,0,0.7925487726368186,139.626724,67847079,"POLYGON ((-121.78191 37.24780, -121.78191 37.2..."
2,1,0,0.870618975395968,214.732877,69525043,"POLYGON ((-121.83270 37.23293, -121.83270 37.2..."
3,0,1,0.0,24.972158,69525043,"POLYGON ((-121.83257 37.23306, -121.83257 37.2..."
4,1,0,0.7890311011722161,173.054128,61259042,"POLYGON ((-121.79939 37.35804, -121.79939 37.3..."


In [41]:
# Path template for agent N's building confidence files; the '{}' placeholder is
# filled in with the year via .format(...) wherever this template is used.
building_output = os.path.join(OUTPUT_FP, 'Confidences_construction', 'N', 'building-confidence-{}')