# Process CF Parcel-level Annotations

1. Batch 1: 5,500 residential parcels from a simple random sample (3.5%) across each of the 51 income strata
2. Batch 2: ~3,000 residential parcels matching to AANC permits
3. Batch 3: ~16,000 residential parcels (latest estimate) according Neyman allocation per the power analysis to detect a significant effect in the permitted new small building proportion in low vs. high income neighborhoods

**Outline:**

Part 1: 
- Extract CF polygons -> This gives us a map of small buildings in each year (gdf)
- Identify newly constructed small buildings (entirely new, replacement of old building)

*Part 1 outputs:*
- parcel_density: DataFrame of number of small buildings per parcel in 2016 and 2020
- CF_area_coverage: shp of the parcels annotated by CF
- sbuild_2020_compared: geoDataFrame of 2020 small buildings, including a flag for whether they are new constructions

Part 2: 
- Permit-matching for newly constructed small buildings
- Outcome analysis 

In [76]:
import geopandas as gpd
import glob
import matplotlib.pyplot as plt
import os
import pandas as pd
import pyproj
import seaborn as sns
from tqdm import tqdm
from shapely import wkt
import shapely

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

tqdm.pandas()

In [77]:
# Load functions 
%run ADU_permit_matching-polygon-pipeline.ipynb import compare_building_footprint

In [78]:
# Paths
OAK_FP = '/oak/stanford/groups/deho/building_compliance/'
CF_FP = os.path.join(OAK_FP, 'CloudFactory')
OUTPUT_FP = os.path.join(OAK_FP, 'outputs', 'Population-Estimates', 'outputs')

# Parameters
batch = 1
completion = 100 # (completion rate of the batch so we can update as CF progresses without overwriting)
construction_detection = 'CF' #['AT', 'CF', 'hybrid'][1]
min_sqft = [0, 11.1484][1] # Note that 120sqft = 11.1484 sqm
limit_to_GT = False

subset = 100

In [79]:
# Input data
# * Universe of SJ res parcels assigned to each CBG
sj_parcels_cbgs = gpd.read_file(os.path.join(OUTPUT_FP, 'sj-parcels-res-cbgs'))

# * Income-stratified CBGs (from Pop-Estimates-Strata.ipynb)
cbg_income_2016_SJ = gpd.read_file(os.path.join(OUTPUT_FP, 'Strata', 'cbg-income-strata'))

In [80]:
# Functions
def compare_smallbuildings(gdf, comp_list, name_list, model_params):
    """
    Compares the area of each building in gdf to the buildings in the GeoDataFrames in
    comp_list to check for expansions/constructions.
    """
    match_cols = ['building_index', 'APN', 'geometry'] +  [
        'expansion_{}_flag'.format(comp_name) for comp_name in name_list] + [
        'diff_{}_value'.format(comp_name) for comp_name in name_list]
    
    if gdf is None:
        return None
    
    for comp_name in name_list:
        gdf['expansion_{}_flag'.format(comp_name)] = None
        gdf['diff_{}_value'.format(comp_name)] = None
    
    for comp, comp_name in zip(comp_list, name_list):
        
        if comp is None:
            continue
        
        # Check expansion
        if len(gdf) > 0:
            comp['geometry_comp'] = comp['geometry']

            # Get polygon in comp gdf to which we compare each polygon
            comp = comp.reset_index()
            gdf = gdf.sjoin(comp[['geometry', 'geometry_comp']], how='left', predicate='intersects')
            gdf['iou_comp'] = gdf.apply(lambda row: (
                row['geometry'].intersection(comp.iloc[int(row['index_right'])]['geometry'])).area / 
                                    row['geometry'].area if pd.notnull(row['index_right']) else None, axis=1)

            # Have to account for potentially various matches for one inference (we compare to 
            # the one with highest overlap)
            gdf = gdf.sort_values('iou_comp', ascending=False)
            gdf.drop_duplicates(subset=['geometry'], keep='first', inplace=True)

            # Check expansion for each polygon
            gdf[['expansion_{}_flag'.format(comp_name), 'diff_{}_value'.format(comp_name)]] = gdf.apply(
                lambda row: compare_building_footprint(
                    base_geom=comp.iloc[int(row['index_right'])]['geometry'], 
                    new_geom=row['geometry'].union(comp.iloc[int(row['index_right'])]['geometry']), 
                    diff_type='protruding_poly', 
                    model_params=model_params, 
                    main_building_flag=False
                ) if pd.notnull(row['index_right']) else (True, True), 
                axis=1, result_type="expand")
            
            # Set area equal to the full polygon where we have a fully new small building
            gdf.loc[gdf['diff_{}_value'.format(comp_name)] == True, 'diff_{}_value'.format(comp_name)] = gdf.loc[
                gdf['diff_{}_value'.format(comp_name)] == True].to_crs('EPSG:26910')['geometry'].area

        gdf = gdf[match_cols]

    return gdf

In [81]:
# Ground truth small building consutrction cases
# Batch 1
test_cases = {
    'true(new)': [
        '45120021', '44732030', '43933037', '44626011', '46432056', '43428123', 
        '27453041', '26462005', '30341002', '30334023', '47701055', '48125057',
        '29928063', '37317034', '27726026', '30330004', '30327016', '24960051', 
        '24529043', '46227003', '26123040', '41934030', '44228010', '49704006', 
        '49724012', '49902047', '49927002', '59949012', '65219015', '01511081', 
        '48611010', '48612015', '48635017', '48804008', '48804026', '49109059',
        '49119021', '49407021', '49709005', '49902084', '49916019', '56911050', 
        '56938060', '58629019', '59205028', '64920009', '65424001', '65937019', 
        '67032055', '67622042', '67849070', '68433070', '69205043', '69210133', 
        '69226014', '70617022', '68701028', '70101032', '68452040', '67018013', 
        '69501022', '67646016', '68922029', '26449031', 
    ],
    'true(replace)': [
        '43406087', '42933015', '27409002', '26435035', '24509006', '26117066', 
        '24902060', '48144016', '42914030', '48419045', '49461011', '59214041',
        '67646033', '67832020', '47736020', '48446044', '49141060', '68918083', 
        '67303059', '67008008', '24928012', '47745061', '48129033', '48425023'
    ],
    'false': [
        '44708058', '44629050', '43917076', '43409088', '42929009', '44626008', 
        '41944094', '42926032', '43915071', '42903018', '46714029', '44706008',
        '43920020', '42940022', '44627033', '46705041', '46707017', '43946024', 
        '43429050', '42108046', '42923006', '43903050', '42117003', '42950063',
        '43422021', '41917050', '42916008', '43424064', '42950046', '44638005', 
        '44709053', '45515080', '44232048', '42947023', '43925046', '43910021', 
        '46237003', '42908005', '44627041', '42906013', '43914059', '42924011', 
        '42114072', '27445085', '40304054', '24953062', '24927036', '24958007',
        '25404032', '27423086', '27437020', '27438048', '28213044', '28214011',
        '43412093', '43909051', '44245040', '46738010', '46740005', '01506001', 
        '47228068', '47229033', '47229085', '48426033', '48816030', 
        '56922048', # this is a hard case but it seems the small builds were just moved around
        '58923068', '59108040', '64906009', '65438025', '68945071', '42935022', 
        '43917140', '46207052', '65215028', '69525051', '67310023', '67845058', 
        '46705073'
    ], 
    'unsure': [
        '48605010' # roof cover
    ]
}

## 1. Check batch completion and extract CF polygons

In [82]:
# Load batch data and batch query
# * Submitted parcels
submitted_parcels = glob.glob(os.path.join(CF_FP, 'images', 'batch{}'.format(batch), 'images', '*.png'))

In [83]:
# * Annotated parcels
annotated_parcels_files = glob.glob(
    os.path.join(CF_FP, 'exports', 'Batch{}_{}'.format(batch, completion), '*.json'))
annotated_parcels = [p.split(os.path.sep)[-1].replace('.json', '') for p in annotated_parcels_files]
if len(annotated_parcels) == 0:
    with open(os.path.join(CF_FP, 'exports', 'Batch{}_{}'.format(batch, completion), 'apn.txt'), 'r') as f:
        annotated_parcels = f.readlines()

smallbuild_parcels = sbuild['APN'].unique()

# Generate shp of annotated area
annotated_parcels_shp = sj_parcels_cbgs.loc[sj_parcels_cbgs['APN'].isin(annotated_parcels)].copy()

In [84]:
# Load extracted polygons
sbuild = pd.read_csv(
    os.path.join(OAK_FP, 'CloudFactory/results/Batch{}_{}/processed_buildings.csv'.format(
        batch, completion)), 
    dtype={'apn': str})
sbuild.rename(columns={'apn': 'APN'}, inplace=True)
sbuild.reset_index(inplace=True, drop=False)
sbuild.rename(columns={'index': 'building_index'}, inplace=True)
assert len(sbuild['building_index'].unique()) == len(sbuild)

print('[INFO] Number of unique buildings: {}'.format(len(sbuild)))

[INFO] Number of unique buildings: 850


In [85]:
# Impose minimum size for small buildings

# * Compute 2016 and 2020 areas
wgs84 = pyproj.CRS('EPSG:26910')
utm = pyproj.CRS('EPSG:4326')
project = pyproj.Transformer.from_crs(utm, wgs84, always_xy=True).transform

sbuild['2016_min_area'] = sbuild['2016'].progress_apply(
    lambda geom: 0 if pd.isnull(geom) else shapely.ops.transform(project, wkt.loads(geom)).area)
sbuild['2020_min_area'] = sbuild['2020'].progress_apply(
    lambda geom: 0 if pd.isnull(geom) else shapely.ops.transform(project, wkt.loads(geom)).area)

n_builds = len(sbuild)
sbuild = sbuild.loc[(sbuild['2016_min_area'] >= min_sqft) | (sbuild['2020_min_area'] >= min_sqft)]

print('[INFO] Dropping {} small buildings that do not meet the minimum area threshold.'.format(
    n_builds - len(sbuild)))
sbuild.drop(['2016_min_area', '2020_min_area'], axis=1, inplace=True)

100%|██████████| 850/850 [00:00<00:00, 6255.77it/s]
100%|██████████| 850/850 [00:00<00:00, 6034.23it/s]

[INFO] Dropping 36 small buildings that do not meet the minimum area threshold.





In [86]:
# Make gdf for each year
sbuild16 = sbuild[['building_index', 'APN', '2016']].copy()
sbuild16 = sbuild16.loc[~sbuild16['2016'].isna()]
sbuild16['2016'] = sbuild16['2016'].apply(wkt.loads)
sbuild16 = gpd.GeoDataFrame(sbuild16, geometry='2016', crs='EPSG:4326')

sbuild20 = sbuild.copy()
sbuild20.drop('2016', axis=1, inplace=True)
sbuild20 = sbuild20.loc[~sbuild20['2020'].isna()]
sbuild20['2020'] = sbuild20['2020'].apply(wkt.loads)
sbuild20 = gpd.GeoDataFrame(sbuild20, geometry='2020', crs='EPSG:4326')

In [87]:
print('[INFO] Annotated {} parcels out of {} submitted parcels for batch {} at {}% completion'.format(
    len(annotated_parcels), len(submitted_parcels), batch, 
    int(len(annotated_parcels) / len(submitted_parcels) * 100)))
print('[INFO] {} parcels including small buildings across both years'.format(len(smallbuild_parcels)))

[INFO] Annotated 5536 parcels out of 5541 submitted parcels for batch 1 at 99% completion
[INFO] 825 parcels including small buildings across both years


### 1.1 Generate dataframe of small building counts per parcel and CF area coverage

In [88]:
if not os.path.exists(os.path.join(OUTPUT_FP, 'Outcomes', 'B{}-{}'.format(
    batch, completion))):
    os.makedirs(os.path.join(OUTPUT_FP, 'Outcomes', 'B{}-{}'.format(batch, completion)))

In [89]:
parcel_density = sbuild.notna().groupby(sbuild['APN'])[['2016', '2020']].sum().reset_index()

parcel_density = pd.DataFrame({'APN': annotated_parcels}).merge(
    parcel_density, how='left', validate='one_to_one')

parcel_density.fillna(0, inplace=True)
parcel_density.to_csv(
    os.path.join(OUTPUT_FP, 'Outcomes', 'B{}-{}'.format(batch, completion), 
                 'parcel_density-min{}.csv'.format(int(min_sqft))), 
    index=False)

In [90]:
CF_area_coverage = sj_parcels_cbgs.loc[sj_parcels_cbgs['APN'].isin(annotated_parcels)]
CF_area_coverage.to_file(
    os.path.join(OUTPUT_FP, 'Outcomes', 'B{}-{}'.format(batch, completion), 'CF_area_coverage'))

In [91]:
print('[INFO] {} small buildings found in 2016; {} small buildings found in 2020'.format(
    int(parcel_density['2016'].sum()), int(parcel_density['2020'].sum())
))

[INFO] 753 small buildings found in 2016; 800 small buildings found in 2020


## 2. Identify newly constructed small buildings

Two ways to identify construction events: 1) Using CF annotations directly (i.e., whether a polygon was copy-pasted or not) `CF`; 2) using area-thresholds (this does not allow us to detect material changes to buildings that did not have a significant change on the building footprint) `AT`; 3) the hybrid method uses CF annotations directly for annotations made after the CF guideliness were issued, and uses the area threshold method for those made before the guidelines were issued.

In [92]:
# Cloud Factory detection
sbuild_2020_compared = sbuild20.copy()

sbuild_2020_compared['area'] = sbuild_2020_compared.to_crs('EPSG:26910').geometry.area
sbuild_2020_compared['year'] = '2020'

# Construction event if exact_match == 0
sbuild_2020_compared['expansion_2016_flag_CF'] = 1 - sbuild_2020_compared['exact_match']

In [93]:
# Area Threshold detection
if construction_detection != 'CF':
    sbuild_2020_compared_AT = gpd.GeoDataFrame()

    for parcel_apn in tqdm(sbuild20['APN'].unique()):
        # Get annual small building gdfs
        parcel_sbuild_2020 = sbuild20.loc[sbuild20['APN'] == parcel_apn].copy()
        parcel_sbuild_2016 = sbuild16.loc[sbuild16['APN'] == parcel_apn].copy()

        parcel_sbuild_2020.rename(columns={'2020': 'geometry'}, inplace=True)
        parcel_sbuild_2016.rename(columns={'2016': 'geometry'}, inplace=True)

        parcel_sbuild_2020 = gpd.GeoDataFrame(parcel_sbuild_2020, crs='EPSG:4326')
        parcel_sbuild_2016 = gpd.GeoDataFrame(parcel_sbuild_2016, crs='EPSG:4326')

        # Check for new constructions
        parcel_sbuild_2020 = compare_smallbuildings(
            gdf=parcel_sbuild_2020, comp_list=[parcel_sbuild_2016], 
            name_list=['2016'], 
            model_params={'area_threshold_main': None, 'area_threshold_small': 20, 
                          'negative_buffer': 0.5})

        # Append
        sbuild_2020_compared_AT = pd.concat([sbuild_2020_compared_AT, parcel_sbuild_2020])

    if 'index_right' in sbuild_2020_compared.columns:
        sbuild_2020_compared_AT.drop('index_right', inplace=True, axis=1)

    sbuild_2020_compared_AT.rename(columns={'expansion_2016_flag': 'expansion_2016_flag_AT'}, inplace=True)

In [94]:
# Merge
if construction_detection != 'CF':
    sbuild_2020_compared = pd.merge(
        sbuild_2020_compared, sbuild_2020_compared_AT[['building_index', 'expansion_2016_flag_AT']], 
        on='building_index', validate='one_to_one'
    )

In [95]:
if construction_detection == 'CF':
    sbuild_2020_compared['expansion_2016_flag'] = sbuild_2020_compared['expansion_2016_flag_CF']
elif construction_detection == 'AT':
    sbuild_2020_compared['expansion_2016_flag'] = sbuild_2020_compared['expansion_2016_flag_AT']
elif construction_detection == 'hybrid':
    sbuild_2020_compared['expansion_2016_flag'] = sbuild_2020_compared.apply(
        lambda row: row['expansion_2016_flag_CF'] if row['correct_procedure'] == 1 else row['expansion_2016_flag_AT'], axis=1)
else:
    raise Exception('[ERROR] Check method for construction events.')
    
sbuild_2020_compared['expansion_2016_flag'] = sbuild_2020_compared['expansion_2016_flag'].astype(bool)

In [96]:
# Limit to ground truth cases
if limit_to_GT:
    print('[INFO] Number of predicted construction events: {}'.format(
        sbuild_2020_compared['expansion_2016_flag'].sum()))
    
    # Drop false positives
    # Note: False cases have NO small building construction events. True parcels can have
    # false events as there can be multiple small buildings in a parcel
    sbuild_2020_compared.loc[
        sbuild_2020_compared['APN'].isin(test_cases['false']), ['expansion_2016_flag']] = False 
    
    print('[INFO] Number of construction events after dropping FPs: {}'.format(
        sbuild_2020_compared['expansion_2016_flag'].sum()))
    
    # Note that we can't add false negatives because there can be multiple buildings
    # at the parcel level. We can only add those that belong to parcels that have a single building
    # as we can then indeed affirm that the building is associated with a construction event. 
    single_build_apns = sbuild_2020_compared.groupby('APN')['building_index'].count().reset_index()
    single_build_apns = single_build_apns.loc[single_build_apns['building_index'] < 2]
    # * negatives
    FN_sbuild = sbuild_2020_compared.loc[sbuild_2020_compared['expansion_2016_flag'] == False].copy()
    # * Potential FNs: some are TNs but belong to parcels where another small building is a FN
    PFN_sbuild = FN_sbuild.loc[
        FN_sbuild['APN'].isin(test_cases['true(new)'] + test_cases['true(replace)'])]
    # * FNs with a single small building
    FN_single_sbuild = PFN_sbuild.loc[PFN_sbuild['APN'].isin(single_build_apns)]
    FN_single_sbuild = FN_single_sbuild['APN'].unique()
    
    #sbuild_2020_compared.loc[
    #    sbuild_2020_compared['APN'].isin(FN_single_sbuild), ['expansion_2016_flag']] = True 
    
    print('[INFO] Number of construction events after adding FNs: {}'.format(
        sbuild_2020_compared['expansion_2016_flag'].sum()))
    
    print('[INFO] Note: Potentially missing {} construction events'.format(
        len(PFN_sbuild) - len(FN_single_sbuild)))

In [97]:
print('[INFO] We have {} small building annotations across {} parcels for both years.'.format(
    sbuild[['2016', '2020']].notna().sum().sum(), len(smallbuild_parcels)))
print('[INFO] We have {} small building annotations for 2020; and '
      'identify {} of these to be built after 2016'.format(
      len(sbuild_2020_compared), sbuild_2020_compared['expansion_2016_flag'].sum()))

[INFO] We have 1553 small building annotations across 825 parcels for both years.
[INFO] We have 800 small building annotations for 2020; and identify 195 of these to be built after 2016


In [98]:
sbuild_2020_compared[['APN', 'area', 'year', 'expansion_2016_flag']].to_csv(os.path.join(
    OUTPUT_FP, 'Outcomes', 'B{}-{}'.format(batch, completion), 
    'sbuild_2020_compared-{}-min{}.csv'.format(construction_detection, int(min_sqft))), index=False)

In [99]:
sbuild_2020_compared.to_file(os.path.join(
    OUTPUT_FP, 'Outcomes', 'B{}-{}'.format(batch, completion), 
    'sbuild_2020_compared-{}-min{}'.format(construction_detection, int(min_sqft))), index=False)

### 2.1 Check whether we pass test cases
These are manually collected ground truth cases on parcels with no small building construction events (false), cases with entirely new small buildings (true-new) and cases with small buildings that replace old structures (true-replace).

In [100]:
# These checks ensure that at least one small building in the parcel is marked for a
# construction event. 
new_add = sbuild_2020_compared.loc[sbuild_2020_compared['APN'].isin(test_cases['true(new)'])].copy()
new_add = new_add.groupby('APN')['expansion_2016_flag'].sum().reset_index()
new_add['expansion'] = new_add['expansion_2016_flag'].apply(lambda x: min(x, 1))

print('[INFO] True (new) additions: {}/{} out of {} cases'.format(
    new_add['expansion'].sum(), len(new_add), len(test_cases['true(new)'])))

replace_add = sbuild_2020_compared.loc[sbuild_2020_compared['APN'].isin(test_cases['true(replace)'])].copy()
replace_add = replace_add.groupby('APN')['expansion_2016_flag'].sum().reset_index()
replace_add['expansion'] = replace_add['expansion_2016_flag'].apply(lambda x: min(x, 1))

print('[INFO] True (replace) additions: {}/{} out of {} cases'.format(
    replace_add['expansion'].sum(), len(replace_add), len(test_cases['true(replace)'])))

# This check ensures that no small building in these parcels is marked for a construction event. 
false_add = sbuild_2020_compared.loc[sbuild_2020_compared['APN'].isin(test_cases['false'])].copy()
false_add = false_add.groupby('APN')['expansion_2016_flag'].sum().reset_index()
false_add['expansion'] = false_add['expansion_2016_flag'].apply(lambda x: min(x, 1))

print('[INFO] False additions: {}/{} out of {} cases'.format(
    len(false_add) - false_add['expansion'].sum(), len(false_add), len(test_cases['false'])))

[INFO] True (new) additions: 49/51 out of 64 cases
[INFO] True (replace) additions: 21/21 out of 24 cases
[INFO] False additions: 37/60 out of 79 cases


In [65]:
new_add.loc[new_add['expansion'] == 0]

Unnamed: 0,APN,expansion_2016_flag,expansion
27,68452040,0,0
34,70101032,0,0


In [58]:
# Check all construction cases
ground_truth_cases = test_cases['true(new)'] + test_cases['true(replace)'] + test_cases['false']
sbuild_2020_compared = sbuild_2020_compared[['APN', 'area', 'year', 'expansion_2016_flag']]
sbuild_2020_compared.loc[
    (sbuild_2020_compared['expansion_2016_flag'] == True) & ~(sbuild_2020_compared['APN'].isin(ground_truth_cases))]

Unnamed: 0,APN,area,year,expansion_2016_flag
29,24915056,31.365,2020,True
36,24928012,67.68,2020,True
42,24938019,43.875,2020,True
51,25411059,28.62,2020,True
73,26421037,46.71,2020,True
74,26436110,10.08,2020,True
88,26449031,14.175,2020,True
103,27412054,25.74,2020,True
139,28826037,153.36,2020,True
155,40358067,13.455,2020,True


### 2.2 Test specific cases

In [34]:
parcel_apn = '27409002'
parcel_sbuild = sbuild.loc[sbuild['APN'] == parcel_apn].copy()
parcel_sbuild_2020 = parcel_sbuild.loc[parcel_sbuild['year'] == '2020'].copy()
parcel_sbuild_2016 = parcel_sbuild.loc[parcel_sbuild['year'] == '2016'].copy()

# Check for new constructions
parcel_sbuild_2020 = compare_smallbuildings(
    gdf=parcel_sbuild_2020, comp_list=[parcel_sbuild_2016], 
    name_list=['2016'], 
    model_params={'area_threshold_main': None, 'area_threshold_small': 20, 
                  'negative_buffer': 0.5})

In [35]:
parcel_sbuild_2020

Unnamed: 0,APN,area,year,geometry,expansion_2016_flag,diff_2016_value
865,27409002,37.485,2020,"POLYGON ((-121.92719 37.33260, -121.92723 37.3...",False,0.0
