# Identify permitted and potentially unpermitted attached and detached units in San Jose (2020)

In [1]:
import os
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
from shapely.geometry import Point
import numpy as np
import math

In [2]:
# File paths
# Everything lives under the shared project directory on Oak.
OAK_FP = '/oak/stanford/groups/deho/building_compliance/'
OUTPUTS_DIR = os.path.join(OAK_FP, 'outputs')

# Inputs/outputs of the permit-matching step
INPUT_FP = os.path.join(OUTPUTS_DIR, 'Permit-Matching')
SJ_RES_PARCELS_FP = os.path.join(INPUT_FP, 'inputs', 'san_jose_parcels_res.geojson')

# Supplementary zoning layer
ZONING_FP = os.path.join(OAK_FP, 'san_jose_suppl', 'san_jose_Zoning_Districts.geojson')

# Building footprints; BUILD_FP is a template, fill in the year with BUILD_FP.format(year)
BUILD_FP = os.path.join(OUTPUTS_DIR, 'cbg-inference-{}', 'inference_building_processed')
OSM_FP = os.path.join(OUTPUTS_DIR, 'cbg-inference-2020', 'osm_building_processed')

In [3]:
# Load data
# * Residential parcels (rows without an APN cannot be looked up by APN later, so drop them)
sj_parcels_res = gpd.read_file(SJ_RES_PARCELS_FP)
sj_parcels_res = sj_parcels_res[sj_parcels_res['APN'].notna()]

# * Building permits
permits_dir = os.path.join(INPUT_FP, 'inputs', 'permits')
bldg_active = gpd.read_file(os.path.join(permits_dir, 'bldg_active.geojson'))
bldg_recent = gpd.read_file(os.path.join(permits_dir, 'bldg_recent.geojson'))
bldg_expired = gpd.read_file(os.path.join(permits_dir, 'bldg_expired.geojson'))

# * Zoning
# Keep districts whose code contains R-1, R-2 or R-M, excluding the exact code 'R-MH'.
# na=False makes a missing zoning code count as "not residential" instead of
# putting NaN into the boolean mask.
sj_zoning = gpd.read_file(ZONING_FP)
zoning_code = sj_zoning['ZONING']
is_r1 = zoning_code.str.contains('R-1', na=False)
is_r2 = zoning_code.str.contains('R-2', na=False)
is_rm = zoning_code.str.contains('R-M', na=False) & (zoning_code != 'R-MH')
sj_residential = sj_zoning[is_r1 | is_r2 | is_rm]

# * parcel+permit
# The CSVs store geometries as WKT text; parse them back into geometry objects.
sj_parcel_permit = pd.read_csv(os.path.join(INPUT_FP, 'outputs', 'parcel_permit_found.csv'))
sj_permit_noparcel = pd.read_csv(os.path.join(INPUT_FP, 'outputs', 'parcel_permit_notfound.csv'))
sj_parcel_permit['geometry_parcel'] = gpd.GeoSeries.from_wkt(sj_parcel_permit['geometry_parcel'])
sj_parcel_permit['geometry_permit'] = gpd.GeoSeries.from_wkt(sj_parcel_permit['geometry_permit'])
sj_permit_noparcel['geometry'] = gpd.GeoSeries.from_wkt(sj_permit_noparcel['geometry'])

Handle active permits differently than expired permits
- Expired permits barely report dwelling units and square footage
- Should we filter as finely as we do for active permits, or filter only on the work description (as we do for expired permits) so that we catch ALL the construction permits we can find? I worry that some of the buildings we infer will fall under non-SFR permits.

Keep this block of code in case we later want to filter more specifically for possible dwelling units. For now we work with the more relaxed conditions so that we have more candidate permits to compare against.

In [4]:
# Filter permits
# Keep only construction-type permits from each source and remember which
# source (active / recent / expired) every permit came from.
cols = list(bldg_active.columns) + ['permit_status']
construction_work = ['New Construction', 'Additions/Alterations']

permit_parts = []
for permit_frame, status in zip([bldg_active, bldg_recent, bldg_expired], ['active', 'recent', 'expired']):
    # The status column is also written onto the source frame, as before.
    permit_frame['permit_status'] = status
    permit_parts.append(permit_frame[permit_frame['WORKDESC'].isin(construction_work)])

# Concatenate once instead of growing a frame inside the loop; starting from an
# empty frame also degrades dtypes and triggers pandas warnings.
permits = pd.concat(permit_parts).reset_index(drop=True)

# Label the coordinates as WGS84. allow_override mirrors the original
# unconditional assignment `permits.crs = "EPSG:4326"` (no reprojection happens).
permits = permits.set_crs("EPSG:4326", allow_override=True)

# Functions

In [5]:
def match_parcel(parcel_apn):
    """Build the table of buildings for one parcel.

    Reads the notebook-level tables ``sj_parcels_res`` and ``sj_parcel_permit``,
    gathers the parcel's inferred (2020 and 2016) and OSM building footprints
    plus its permits, reconciles the footprints with ``process_OSM_data`` and
    then passes the result through ``process_permit_data``.

    Parameters
    ----------
    parcel_apn : value compared against the ``APN`` column of ``sj_parcels_res``.

    Returns
    -------
    Whatever ``process_permit_data`` returns for the parcel's buildings.
    """
    # Obtain parcel-level data.
    # Both tables are separate arguments: the original wrote
    # `sj_parcels_res. sj_parcel_permit`, which is an attribute access.
    parcel_inputs = parcel_level_data(
        parcel_apn, sj_parcels_res, sj_parcel_permit)
    inferred_buildings_2020_parcel, inferred_buildings_2016_parcel, osm_buildings_parcel, permits_parcel = parcel_inputs

    # Incorporate OSM data
    parcel_buildings = process_OSM_data(
        inferred_buildings_2020_parcel, inferred_buildings_2016_parcel, osm_buildings_parcel)

    # Incorporate permit data
    parcel_buildings = process_permit_data(permits_parcel, parcel_buildings)

    return parcel_buildings

In [6]:
def parcel_level_data(parcel_apn, sj_parcels_res, sj_parcel_permit):
    """Collect the building footprints and permits that belong to one parcel.

    Parameters
    ----------
    parcel_apn : value matched against ``sj_parcels_res['APN']`` and
        ``sj_parcel_permit['APN_parcel']``.
    sj_parcels_res : parcel table with ``APN`` and ``geometry`` columns.
    sj_parcel_permit : parcel/permit table with an ``APN_parcel`` column.

    Returns
    -------
    tuple
        ``(inferred_2020, inferred_2016, osm, permits)``. Each of the three
        building tables is ``None`` when no building passes the overlap
        filter. ``permits`` is the (possibly empty) slice of
        ``sj_parcel_permit`` for this parcel.

    Raises
    ------
    IndexError
        If no row of ``sj_parcels_res`` has the requested APN.
    """
    parcel_bounds = sj_parcels_res[sj_parcels_res['APN'] == parcel_apn]
    # Geometry of the first matching parcel row; extracted once and reused.
    parcel_geom = parcel_bounds['geometry'].values[0]

    # Note: gpd.clip is deliberately not used here because it truncates the
    # footprints at the parcel boundary; whole footprints are kept instead.

    def mask_buildings1(parcel_bounds, fp, min_overlap=0.7):
        # Read only the features that intersect the parcel geometry.
        df_out = gpd.read_file(fp, mask=parcel_bounds)
        # 'iou' is the share of each footprint's own area that lies inside the
        # parcel (not a true intersection-over-union). Areas are taken in the
        # layer's native CRS, so only the ratio is meaningful.
        df_out['iou'] = df_out['geometry'].intersection(parcel_bounds).area/df_out['geometry'].area
        df_out = df_out[df_out['iou'] > min_overlap]
        if df_out.empty:
            return None
        return df_out

    inferred_buildings_2020_parcel = mask_buildings1(parcel_geom, BUILD_FP.format('2020'))
    inferred_buildings_2016_parcel = mask_buildings1(parcel_geom, BUILD_FP.format('2016'))
    osm_buildings_parcel = mask_buildings1(parcel_geom, OSM_FP)

    # An empty table (not None) is returned when the parcel has no permits;
    # process_permit_data measures it with len(). The original's misspelled,
    # never-read `permits_parcels = None` assignment was dropped.
    permits_parcel = sj_parcel_permit[sj_parcel_permit['APN_parcel'] == parcel_apn]

    return inferred_buildings_2020_parcel, inferred_buildings_2016_parcel, osm_buildings_parcel, permits_parcel

In [7]:
# Minimum protruding area for a main-building expansion to be flagged. Both are
# compared against the 'area' value returned by compute_largest_protruding_poly.
MIN_ATTACHED_SIZE = 20  # inferred 2020 footprint vs. OSM footprint
MIN_ATTACHED_SIZE_2020_2016_DIFF = 30  # inferred 2020 footprint vs. inferred 2016 footprint

def process_OSM_data(inferred_buildings_2020_parcel, inferred_buildings_2016_parcel, osm_buildings_parcel):
    """Reconcile inferred and OSM building footprints for a single parcel.

    Parameters
    ----------
    inferred_buildings_2020_parcel, inferred_buildings_2016_parcel, osm_buildings_parcel :
        GeoDataFrames of footprints (with ``area`` and ``geometry`` columns),
        or ``None``. An empty frame is treated exactly like ``None``.

    Returns
    -------
    gpd.GeoDataFrame
        One row per building with the columns ``main_building_flag``,
        ``OSM_flag``, ``expansion_flag`` and ``geometry``. Empty when neither
        inferred nor OSM footprints are available.
    """
    gpd_cols = ['main_building_flag', 'OSM_flag', 'expansion_flag', 'geometry']

    def _none_if_empty(frame):
        # Normalise "missing" and "empty" so every branch below can rely on
        # `is None` checks alone.
        return None if frame is None or len(frame) == 0 else frame

    inferred_buildings_2020_parcel = _none_if_empty(inferred_buildings_2020_parcel)
    inferred_buildings_2016_parcel = _none_if_empty(inferred_buildings_2016_parcel)
    osm_buildings_parcel = _none_if_empty(osm_buildings_parcel)

    # Drop OSM index_left column (on a new frame, so the caller's table is not mutated)
    if osm_buildings_parcel is not None and 'index_left' in osm_buildings_parcel.columns:
        osm_buildings_parcel = osm_buildings_parcel.drop(columns='index_left')

    # No inference nor OSM data
    if inferred_buildings_2020_parcel is None and osm_buildings_parcel is None:
        return gpd.GeoDataFrame(geometry=[], columns=gpd_cols)

    # No inference but OSM data: then we fully rely on OSM data
    if inferred_buildings_2020_parcel is None:
        parcel_buildings = osm_buildings_parcel.sort_values('area', ascending=False).copy()
        # The main building is the largest one, i.e. the first row after
        # sorting. Flag by position: index labels survive the sort, so testing
        # the label against 0 would pick an arbitrary (or no) building.
        parcel_buildings['main_building_flag'] = [
            position == 0 for position in range(len(parcel_buildings))]
        parcel_buildings['OSM_flag'] = True
        parcel_buildings['expansion_flag'] = False
        return parcel_buildings[gpd_cols]

    # Inference and no OSM data
    if osm_buildings_parcel is None:

        # i. Identify main building (largest inferred 2020 footprint)
        inferred_buildings_2020_parcel = inferred_buildings_2020_parcel.sort_values(
            'area', ascending=False)
        inference_main_build = inferred_buildings_2020_parcel.iloc[[0]]
        inference_main_build_geom = inference_main_build['geometry'].iloc[0]
        expansion_flag = False

        parcel_buildings = inference_main_build.copy()

        # ii. Compare to 2016 footprint (only when 2016 inferences exist)
        if inferred_buildings_2016_parcel is not None:
            # Identify 2016 main building; keep it as a one-row frame so it can
            # replace `parcel_buildings` below without changing its type.
            inference_main_build_2016 = inferred_buildings_2016_parcel.sort_values(
                'area', ascending=False).iloc[[0]]
            inference_main_build_geom_2016 = inference_main_build_2016['geometry'].iloc[0]

            # Compare
            union_main_build = inference_main_build_geom.union(inference_main_build_geom_2016)
            diff_20_16 = compute_largest_protruding_poly(union_main_build, inference_main_build_geom_2016)

            if diff_20_16['area'] > MIN_ATTACHED_SIZE_2020_2016_DIFF:
                # NOTE(review): on expansion the 2016 footprint is reported as
                # the main building (the OSM branch reports the union instead);
                # kept as written - confirm this is intended.
                parcel_buildings = inference_main_build_2016.copy()
                expansion_flag = True

        parcel_buildings['expansion_flag'] = expansion_flag
        parcel_buildings['main_building_flag'] = True
        parcel_buildings['OSM_flag'] = False
        parcel_buildings = parcel_buildings[gpd_cols]

        # iii. Add inferred small buildings (every 2020 footprint but the largest)
        inferred_small = inferred_buildings_2020_parcel.iloc[1:].copy()
        inferred_small['expansion_flag'] = False
        inferred_small['OSM_flag'] = False
        inferred_small['main_building_flag'] = False
        parcel_buildings = pd.concat([parcel_buildings, inferred_small[gpd_cols]])

        return parcel_buildings[gpd_cols]

    # Inference and OSM data

    # i. Identify main building(s) in OSM and inferences
    osm_main_build = osm_buildings_parcel.sort_values('area', ascending=False).iloc[[0]]
    osm_main_build_geom = osm_main_build['geometry'].iloc[0]

    # Inferred footprints touching the OSM main building are treated as
    # (pieces of) the inferred main building.
    inference_main_build = inferred_buildings_2020_parcel.sjoin(
        osm_main_build, how='left', predicate='intersects')
    inference_main_build = inference_main_build.loc[
        inference_main_build['index_right'].notna()].drop(columns='index_right')

    expansion_flag = False
    osm_flag = True
    parcel_main_build_geom = osm_main_build_geom

    # * Check for building expansion. Note we care about concentrated building
    # expansions along a single wall and not general changes in building footprint due to noisy
    # inferences so we isolate the largest protruding polygon.
    # With no inferred footprint on the OSM main building there is nothing to
    # compare, so the OSM geometry is kept as-is.
    if not inference_main_build.empty:
        inference_main_build_geom = inference_main_build.geometry.unary_union
        union_main_build = inference_main_build_geom.union(osm_main_build_geom)
        diff_main_build = compute_largest_protruding_poly(union_main_build, osm_main_build_geom)

        if diff_main_build['area'] > MIN_ATTACHED_SIZE:
            expansion_flag = True
            osm_flag = False
            parcel_main_build_geom = union_main_build

    # Generate main building gpd
    parcel_main_build = gpd.GeoDataFrame(geometry=[parcel_main_build_geom], crs='EPSG:4326')
    parcel_main_build['main_building_flag'] = True
    parcel_main_build['expansion_flag'] = expansion_flag
    parcel_main_build['OSM_flag'] = osm_flag

    # ii. Match small buildings
    osm_buildings_parcel_small = osm_buildings_parcel.sort_values(
        'area', ascending=False).iloc[1:].copy()
    # Small inferred buildings are those that do not intersect the inferred
    # main building.
    inferred_buildings_2020_parcel_small = inferred_buildings_2020_parcel.sjoin(
        inference_main_build[['GEOID_left', 'geometry']], how='left', predicate='intersects')
    inferred_buildings_2020_parcel_small = inferred_buildings_2020_parcel_small.loc[
        inferred_buildings_2020_parcel_small['index_right'].isna()].drop(columns='index_right')

    parcel_small_build = match_small_buildings(
        inferred_buildings_2020_parcel_small, osm_buildings_parcel_small, gpd_cols)

    # Main building first, then the small buildings
    parcel_buildings = pd.concat([parcel_main_build[gpd_cols], parcel_small_build[gpd_cols]])
    return parcel_buildings[gpd_cols]



In [8]:
def compute_largest_protruding_poly(union_build, base_build):
    """Return the largest connected piece of ``union_build`` lying outside ``base_build``.

    The difference ``union_build - base_build`` is split into its individual
    parts and the part with the largest area is returned.

    Parameters
    ----------
    union_build, base_build : shapely geometries; coordinates are treated as
        EPSG:4326.

    Returns
    -------
    pandas.Series
        With entries ``geometry`` (the largest protruding part) and ``area``
        (its area after projecting to EPSG:26910). When nothing protrudes the
        geometry is empty and the area is ``0.0``.
    """
    diff_geom = union_build.difference(base_build)
    if diff_geom.is_empty:
        # Nothing sticks out of the base footprint. Return a zero-area result
        # so callers can compare against their thresholds without risking an
        # empty table at `.iloc[0]` below.
        return pd.Series({'geometry': diff_geom, 'area': 0.0})

    diff_main_build = gpd.GeoSeries(diff_geom, crs='EPSG:4326')
    # One row per connected part of the difference
    diff_main_build = diff_main_build.explode(ignore_index=True, index_parts=False)

    diff_main_build = gpd.GeoDataFrame(geometry=diff_main_build, crs='EPSG:4326')
    # Measure areas in a projected CRS rather than in degrees
    diff_main_build['area'] = diff_main_build.to_crs('EPSG:26910').area
    return diff_main_build.sort_values('area', ascending=False).iloc[0]

In [9]:
def match_small_buildings(inferred, osm, gpd_cols):
    """Merge the small (non-main) buildings from OSM and from the inferences.

    OSM footprints are preferred. Inferred footprints are only added when they
    do not intersect any OSM footprint.

    Parameters
    ----------
    inferred : small inferred buildings (may be empty).
    osm : small OSM buildings (may be empty).
    gpd_cols : list of column names to return.

    Returns
    -------
    Table restricted to ``gpd_cols``; every row has ``main_building_flag`` and
    ``expansion_flag`` set to False, and ``OSM_flag`` telling which source the
    footprint came from.
    """
    def _with_flags(frame, from_osm):
        # Stamp the three flag columns shared by every small building.
        frame['main_building_flag'] = False
        frame['OSM_flag'] = from_osm
        frame['expansion_flag'] = False
        return frame

    has_osm = len(osm) > 0
    has_inferred = len(inferred) > 0

    # Neither source has small buildings
    if not has_osm and not has_inferred:
        return gpd.GeoDataFrame(geometry=[], columns=gpd_cols)

    # Exactly one source has small buildings: use it as-is
    if has_osm != has_inferred:
        single_source = osm.copy() if has_osm else inferred.copy()
        return _with_flags(single_source, has_osm)[gpd_cols]

    # Both sources present: start from OSM ...
    osm_small = _with_flags(osm.copy(), True)

    # ... and add only the inferences that touch no OSM footprint
    joined = inferred.sjoin(osm, predicate='intersects', how='left')
    unmatched = joined.loc[joined['index_right'].isna()]
    unmatched = unmatched[[
        'GEOID_left', 'area_left', 'small_left', 'large_left', 'geometry']]
    # Strip the '_left' suffix that the spatial join added
    unmatched = unmatched.rename(
        columns={name: name.replace('_left', '') for name in unmatched.columns})
    unmatched = _with_flags(unmatched, False)

    combined = pd.concat([osm_small, unmatched[gpd_cols]])
    return combined[gpd_cols]

In [10]:
def process_permit_data(permits_parcel, parcel_buildings):
    """Incorporate permit information into a parcel's building table.

    Permit matching is not implemented yet, so ``parcel_buildings`` is returned
    unchanged in every case. The original returned ``None`` whenever the parcel
    had permits, which broke callers that take ``len()`` of the result.

    Parameters
    ----------
    permits_parcel : permits of the parcel; may be empty or ``None``.
    parcel_buildings : building table of the parcel.

    Returns
    -------
    ``parcel_buildings``, unchanged.
    """
    # No permits on this parcel: nothing to incorporate
    if permits_parcel is None or len(permits_parcel) == 0:
        return parcel_buildings

    # TO DO: match the permits to the buildings
    return parcel_buildings

# Main

In [None]:
# Loop over each parcel and collect its building table
parcel_apns = sj_parcels_res['APN'].unique()
shortlist_parts = []
for parcel_apn in parcel_apns:
    shortlist_parcel = match_parcel(parcel_apn)

    # Keep non-empty results only (also skips a missing result)
    if shortlist_parcel is not None and len(shortlist_parcel) > 0:
        shortlist_parts.append(shortlist_parcel)

# Concatenate once: appending with pd.concat inside the loop re-copies the
# accumulated frame on every iteration.
# NOTE(review): rows do not record which parcel they came from - confirm whether
# an APN column is needed downstream.
shortlist = pd.concat(shortlist_parts) if shortlist_parts else gpd.GeoDataFrame()

In [11]:
# Check: run the footprint-reconciliation step on a single hand-picked parcel
parcel_apn = '48114037'
check_inputs = parcel_level_data(parcel_apn, sj_parcels_res, sj_parcel_permit)
(inferred_buildings_2020_parcel,
 inferred_buildings_2016_parcel,
 osm_buildings_parcel,
 permits_parcel) = check_inputs
parcel_buildings = process_OSM_data(
    inferred_buildings_2020_parcel,
    inferred_buildings_2016_parcel,
    osm_buildings_parcel,
)


  df_out['iou'] = df_out['geometry'].intersection(parcel_bounds).area/df_out['geometry'].area

  df_out['iou'] = df_out['geometry'].intersection(parcel_bounds).area/df_out['geometry'].area

  df_out['iou'] = df_out['geometry'].intersection(parcel_bounds).area/df_out['geometry'].area
  return merge(


In [2]:
# 3.5 Sort shortlists
# TO DO
# Risk levels:
# highest risk: no permitted additions of any kind, and you detect detached
# no permitted additions of any kind, and you detect attached
# permitted detached ADU, you detect more than one detached
# permitted detached ADU, you detect attached and detached
# permitted attached ADU, you detect attached and detached
# permitted attached ADU, you detect a detached (or vice versa)
# permitted attached ADU and detached ADU, you detect anything unpermitted