# Functions to run the polygon pipeline for ADU permit-matching

In [2]:
import glob
import rasterio
from rasterio.transform import from_bounds
from rasterio import plot

from tqdm import tqdm
tqdm.pandas()

# Inference functions
%run inference-functions.ipynb import get_bounds

## Load data

In [None]:
def load_sources():
    """Return the on-disk locations of the San Jose NAIP data, keyed by year.

    Returns:
        tuple: ``(tif_fp, inferences_dir, img_fp)``. Each element is a dict
        with the keys '2016', '2018' and '2020' mapping to a directory path:
        raw GeoTIFF tiles, model inference outputs, and super-resolved
        images respectively.
    """
    naip_root = '/oak/stanford/groups/deho/building_compliance/san_jose_naip_512'

    # 2016 and 2018 live in per-year sub-folders; 2020 sits at the root.
    tif_fp = {year: '{}/{}/raw_tif'.format(naip_root, year) for year in ('2016', '2018')}
    tif_fp['2020'] = naip_root + '/raw_tif'

    inferences_dir = {year: '{}/{}/infer'.format(naip_root, year) for year in ('2016', '2018')}
    # The trailing slash on the 2020 entry is kept on purpose (unchanged value).
    inferences_dir['2020'] = naip_root + '/phase2_superresx2/infer/'

    img_fp = {year: '{}/{}/superresx2'.format(naip_root, year) for year in ('2016', '2018')}
    img_fp['2020'] = naip_root + '/phase2_superresx2'

    return tif_fp, inferences_dir, img_fp

## Process polygons

In [2]:
def process_OSM_data(inferred_buildings_2020_parcel, inferred_buildings_2016_parcel, osm_buildings_parcel,
                     min_area_thresh, flatten_threshold, parcel_bounds):
    """Combine 2020 inferences, 2016 inferences and OSM footprints for one parcel.

    Args:
        inferred_buildings_2020_parcel: GeoDataFrame of 2020 inferred
            footprints for the parcel, or None.
        inferred_buildings_2016_parcel: GeoDataFrame of 2016 inferred
            footprints for the parcel, or None.
        osm_buildings_parcel: GeoDataFrame of OSM footprints for the parcel,
            or None. NOTE: an 'index_left' column, if present, is dropped
            from this frame in place.
        min_area_thresh: area threshold forwarded to the expansion checks.
        flatten_threshold: overlap threshold forwarded to the merge/flatten
            steps.
        parcel_bounds: parcel geometry used to clip the inferences.

    Returns:
        tuple: ``(parcel_buildings, data_dict)``.
        ``parcel_buildings`` is a GeoDataFrame with the columns
        main_building_flag, OSM_flag, expansion_OSM_flag, diff_OSM_value,
        expansion_2016_flag, diff_2016_value, geometry and area.
        ``data_dict`` holds the intermediate frames under the keys
        '2020_output', '2016_output' and 'osm_output' (for debugging).
        When there are neither 2020 inferences nor OSM data, an empty frame
        (without the 'area' column) and ``np.nan`` are returned instead.
    """
    gpd_cols = [
        'main_building_flag', 'OSM_flag', 'expansion_OSM_flag', 'diff_OSM_value',
        'expansion_2016_flag', 'diff_2016_value', 'geometry']
    parcel_buildings = gpd.GeoDataFrame(geometry=[], columns=gpd_cols)

    # Drop OSM index_left column
    if osm_buildings_parcel is not None and 'index_left' in osm_buildings_parcel.columns:
        osm_buildings_parcel.drop('index_left', axis=1, inplace=True)

    # Identify main buildings
    parcel_builds, parcel_main_geoms = identify_main_buildings(
        inferred_buildings_2020_parcel, inferred_buildings_2016_parcel, osm_buildings_parcel, parcel_bounds)
    ib_2020_parcel, ib_2016_parcel, osm_parcel = parcel_builds
    ib_2020_main_geom, ib_2016_main_geom, osm_main_geom = parcel_main_geoms

    # Process the 2016 inferences
    # Note: We process 2016 in a special manner. We do not default to OSM as we do for
    # 2020, but rather just use OSM to complete the predictions that were inferred
    # by the model.
    ib_2016_parcel = merge_buildings(
        gdf=ib_2016_parcel, comp=osm_parcel,
        area_threshold_expansion=min_area_thresh, flatten_threshold=flatten_threshold,
        limit_to_inferences=True)

    # Case 1: No inference nor OSM data -----------------
    if ib_2020_parcel is None and osm_parcel is None:
        # np.nan is the canonical spelling; the upper-case alias used before
        # is not available in recent NumPy releases.
        return parcel_buildings[gpd_cols], np.nan

    if ib_2020_parcel is None:
        # Case 2: No inference but OSM data -----------------
        # We fully rely on OSM data
        parcel_buildings = osm_parcel.copy()
        parcel_buildings['OSM_flag'] = True

        # Compare to 2016 footprints
        parcel_buildings = compare_buildings(
            gdf=parcel_buildings, comp_list=[ib_2016_parcel], name_list=['2016'],
            area_threshold_expansion=min_area_thresh, flatten_threshold=flatten_threshold)
        parcel_buildings['expansion_OSM_flag'] = False
        parcel_buildings['diff_OSM_value'] = 0

    elif osm_parcel is None:
        # Case 3: Inference and no OSM data -----------------
        # Compare to 2016 footprints
        parcel_buildings = compare_buildings(
            gdf=ib_2020_parcel, comp_list=[ib_2016_parcel], name_list=['2016'],
            area_threshold_expansion=min_area_thresh, flatten_threshold=flatten_threshold)

        # Reflect lack of OSM
        parcel_buildings['OSM_flag'] = False
        parcel_buildings['expansion_OSM_flag'] = None
        parcel_buildings['diff_OSM_value'] = None

    else:
        # Case 4: Inference and OSM data -----------------
        # Merge with OSM footprints
        ib_2020_parcel = merge_buildings(
            gdf=ib_2020_parcel, comp=osm_parcel,
            area_threshold_expansion=min_area_thresh, flatten_threshold=flatten_threshold,
            limit_to_inferences=False)

        # Check for 2016 and OSM expansions
        parcel_buildings = compare_buildings(
            gdf=ib_2020_parcel, comp_list=[osm_parcel, ib_2016_parcel], name_list=['OSM', '2016'],
            area_threshold_expansion=min_area_thresh, flatten_threshold=flatten_threshold)

        # Raw main building expansion check for 2020 vs 2016
        #if ib_2020_parcel is not None and ib_2016_parcel is not None:
        #    raw_main_2020 = inferred_buildings_2020_parcel.sort_values('area', ascending=False).iloc[0]['geometry']
        #    raw_main_2016 = inferred_buildings_2016_parcel.sort_values('area', ascending=False).iloc[0]['geometry']
        #    main_exp = (raw_main_2020.intersection(raw_main_2016)).area / (raw_main_2020.union(raw_main_2016)).area
        #    if main_exp < 0.8:
        #        parcel_buildings.loc[(parcel_buildings['main_building_flag'] == True), 'expansion_2016_flag'] = True

    # Compute building area (in the projected CRS EPSG:26910)
    parcel_buildings['area'] = parcel_buildings.to_crs('EPSG:26910').geometry.area

    # Generate data dict (for debugging)
    data_dict = {'2020_output': ib_2020_parcel, '2016_output': ib_2016_parcel, 'osm_output': osm_parcel}
    return parcel_buildings[gpd_cols + ['area']], data_dict


In [None]:
def compare_buildings(gdf, comp_list, name_list, area_threshold_expansion, flatten_threshold):
    """Flag footprints in ``gdf`` that protrude beyond their match in each comparison set.

    For every ``(comp, name)`` pair two columns are filled on ``gdf``:
    ``expansion_<name>_flag`` and ``diff_<name>_value``. Each footprint is
    matched to the intersecting ``comp`` footprint that covers the largest
    share of it, and the pair is handed to ``compare_building_footprint``.
    Footprints with no intersecting match are flagged as expansions and
    their whole area (in EPSG:26910) is used as the diff value.

    Args:
        gdf: GeoDataFrame of footprints to evaluate, or None. The new
            flag/value columns are initialised on this frame in place.
        comp_list: comparison GeoDataFrames (entries may be None, in which
            case the corresponding columns stay None).
        name_list: names paired with ``comp_list``; used in column names.
        area_threshold_expansion: area above which a protrusion counts as
            an expansion.
        flatten_threshold: unused here; kept for signature compatibility.

    Returns:
        None if ``gdf`` is None, otherwise the evaluated GeoDataFrame. After
        at least one non-None comparison it is restricted to GEOID, area,
        iou, main_building_flag, OSM_flag, geometry and the flag/value
        columns.
    """
    match_cols = ['GEOID', 'area', 'iou', 'main_building_flag',
                  'OSM_flag', 'geometry'] + [
        'expansion_{}_flag'.format(comp_name) for comp_name in name_list] + [
        'diff_{}_value'.format(comp_name) for comp_name in name_list]

    if gdf is None:
        return None

    for comp_name in name_list:
        gdf['expansion_{}_flag'.format(comp_name)] = None
        gdf['diff_{}_value'.format(comp_name)] = None

    for comp, comp_name in zip(comp_list, name_list):

        if comp is None:
            continue

        flag_col = 'expansion_{}_flag'.format(comp_name)
        diff_col = 'diff_{}_value'.format(comp_name)

        # Check expansion
        if len(gdf) > 0:
            # reset_index() returns a new frame, so the helper column below is
            # added to a private copy and no longer leaks into the caller's
            # comparison frame. It also makes 'index_right' positional.
            comp = comp.reset_index()
            comp['geometry_comp'] = comp['geometry']

            gdf = gdf.sjoin(comp[['geometry', 'geometry_comp']], how='left', predicate='intersects')

            def _overlap_share(row):
                # Share of the footprint covered by its match (not a true IoU).
                if pd.isnull(row['index_right']):
                    return None
                match_geom = comp.iloc[int(row['index_right'])]['geometry']
                return row['geometry'].intersection(match_geom).area / row['geometry'].area

            gdf['iou_comp'] = gdf.apply(_overlap_share, axis=1)

            # Have to account for potentially various matches for one inference:
            # keep the match with the largest overlap share.
            gdf = gdf.sort_values('iou_comp', ascending=False)
            gdf.drop_duplicates(subset=['geometry'], keep='first', inplace=True)

            def _expansion(row):
                # (True, True) marks "no match"; the placeholder diff value is
                # replaced by the footprint area just below.
                if pd.isnull(row['index_right']):
                    return (True, True)
                match_geom = comp.iloc[int(row['index_right'])]['geometry']
                return compare_building_footprint(
                    base_geom=match_geom,
                    new_geom=row['geometry'].union(match_geom),
                    diff_type='protruding_poly',
                    area_threshold=area_threshold_expansion)

            # Compare expansion
            gdf[[flag_col, diff_col]] = gdf.apply(_expansion, axis=1, result_type="expand")

            unmatched = gdf[diff_col] == True
            gdf.loc[unmatched, diff_col] = gdf.loc[unmatched].to_crs('EPSG:26910')['geometry'].area

        # Dropping the join columns here also lets the next comparison sjoin again.
        gdf = gdf[match_cols]

    return gdf

In [None]:
def merge_buildings(gdf, comp, area_threshold_expansion, flatten_threshold, limit_to_inferences):
    """Merge inferred footprints with a comparison (OSM) set for one parcel.

    Non-main buildings default to the ``comp`` footprints; inferred non-main
    buildings are kept only when they protrude from, or do not touch, the
    ``comp`` small buildings. The main building always comes from ``gdf``.
    The result is passed through ``flatten_geometries``.

    Args:
        gdf: inferred footprints (GeoDataFrame) or None.
        comp: comparison footprints (GeoDataFrame) or None.
        area_threshold_expansion: area threshold for the protrusion check.
        flatten_threshold: overlap threshold for ``flatten_geometries``.
        limit_to_inferences: when True, ``comp`` small buildings are kept
            only if more than 30% of their area is covered by the inferences
            or they cover more than 60% of an intersecting inference.
            NOTE: in this mode ``gdf`` has its index reset in place.

    Returns:
        None if ``gdf`` is None, ``gdf`` itself if ``comp`` is None,
        otherwise the merged and flattened GeoDataFrame.
    """
    if gdf is None:
        return None
    if comp is None:
        return gdf

    match_cols = ['GEOID', 'area', 'iou', 'main_building_flag', 'OSM_flag', 'geometry']

    # Default to the comparison set for everything but the main building.
    osm_small = comp.loc[comp['main_building_flag'] == False].copy()
    osm_small_union = osm_small.geometry.unary_union

    # Inferred small buildings that the comparison set is missing.
    inferred_small = gdf.loc[gdf['main_building_flag'] == False].copy()
    if len(inferred_small) > 0:

        def _protrudes_from_osm(row):
            protrudes, _ = compare_building_footprint(
                base_geom=osm_small_union,
                new_geom=row['geometry'],
                diff_type='protruding_poly',
                area_threshold=area_threshold_expansion)
            return protrudes

        inferred_small['inf_not_covered'] = inferred_small.apply(_protrudes_from_osm, axis=1)

        inferred_small = inferred_small.sjoin(osm_small[['geometry']], how='left')
        not_covered = inferred_small['inf_not_covered'] == True
        no_osm_match = inferred_small['index_right'].isna()
        inferred_small = inferred_small.loc[not_covered | no_osm_match]

        # One inference can match several comparison footprints.
        inferred_small = inferred_small.drop_duplicates(subset=['geometry'])
    inferred_small['OSM_flag'] = False

    # Keep only comparison buildings supported by the inferences (lenient:
    # more than 30% of the comparison footprint is covered by inferences, or
    # more than 60% of an intersecting inference is covered by it).
    if limit_to_inferences and len(osm_small) > 0:
        inferred_union = gdf.geometry.unary_union
        covered_area = osm_small['geometry'].intersection(inferred_union).area
        osm_small['osm_coverage'] = covered_area / osm_small['geometry'].area

        # Positional lookups below rely on a 0..n-1 index (mutates the input).
        gdf.reset_index(inplace=True, drop=True)
        osm_small = osm_small.sjoin(gdf[['geometry']], how='left', predicate='intersects')

        def _inference_share_covered(row):
            if pd.isnull(row['index_right']):
                return 0
            inferred_geom = gdf.iloc[int(row['index_right'])]['geometry']
            return row['geometry'].intersection(inferred_geom).area / inferred_geom.area

        osm_small['inf_coverage'] = osm_small.apply(_inference_share_covered, axis=1)

        supported = (osm_small['osm_coverage'] > 0.3) | (osm_small['inf_coverage'] > 0.6)
        osm_small = osm_small.loc[supported]

    small_buildings = pd.concat([osm_small[match_cols], inferred_small[match_cols]])

    # Add back main building
    main_building = gdf.loc[gdf['main_building_flag'] == True][match_cols]
    merged = pd.concat([main_building, small_buildings[match_cols]])

    # Flatten
    return flatten_geometries(merged, flatten_threshold)
    



In [None]:
def simplify_gdf(gdf):
    """Return a copy of ``gdf`` with simplified geometries, in EPSG:4326.

    The frame is reprojected to EPSG:26910, each geometry is simplified with
    a tolerance of 0.5 (in that CRS's units) while preserving topology, and
    the result is reprojected to EPSG:4326. The input frame is not modified.
    """
    projected = gdf.to_crs('EPSG:26910')
    simplified_shapes = projected['geometry'].simplify(tolerance=0.5, preserve_topology=True)
    projected['geometry'] = simplified_shapes
    return projected.to_crs('EPSG:4326')

In [None]:
def get_inference_main_building(inference_buildings, osm_buildings_parcel):
    """Flag (and, with OSM, rebuild) the main building among a parcel's inferences.

    Args:
        inference_buildings: (gpd.GeoDataFrame) all inferred buildings of the
            parcel, or None.
        osm_buildings_parcel: (gpd.GeoDataFrame) all OSM buildings of the
            parcel with 'main_building_flag' set, or None.

    Returns:
        None if ``inference_buildings`` is None, otherwise a GeoDataFrame
        with the columns GEOID, area, geometry, iou, OSM_flag and
        main_building_flag:
          * without OSM, the largest inference is flagged as main;
          * with OSM but no inference touching the OSM main building, the
            OSM main building is prepended and no inference is flagged;
          * otherwise the inferences touching the OSM main building are
            dissolved into one main polygon, which is completed with the OSM
            main footprint inside its minimum rotated rectangle and has OSM
            small buildings cut out of it.
    """
    if inference_buildings is None:
        return None

    inference_buildings = inference_buildings.sort_values('area', ascending=False)
    cols = ['GEOID', 'area', 'geometry', 'iou', 'OSM_flag', 'main_building_flag']

    # If OSM is unavailable, we extract the largest polygon
    if osm_buildings_parcel is None:
        inference_buildings = inference_buildings.reset_index(drop=True)
        inference_buildings['main_building_flag'] = inference_buildings.index == 0
        inference_buildings['OSM_flag'] = False

        # Simplify
        inference_buildings = simplify_gdf(inference_buildings)
        return inference_buildings[cols]

    # If OSM is available, we use the union between the polygons that overlap with
    # the main building in OSM and the main building in OSM.
    osm_main_build = osm_buildings_parcel.loc[osm_buildings_parcel['main_building_flag'] == True]

    # Identify inference buildings overlapping with OSM main building
    inference_buildings = inference_buildings.sjoin(
        osm_main_build[['geometry']], how='left', predicate='intersects')
    inference_buildings['OSM_flag'] = False

    # If there is no overlap with the OSM main building, we use OSM.
    if inference_buildings['index_right'].isna().mean() == 1:
        inference_buildings['main_building_flag'] = False

        # Simplify small builds
        inference_buildings = simplify_gdf(inference_buildings)

        inference_buildings = pd.concat([osm_main_build[cols], inference_buildings[cols]])
        inference_buildings = inference_buildings.sort_values('area', ascending=False)
        return inference_buildings[cols]

    inference_buildings['main_building_flag'] = inference_buildings['index_right'].notna()

    # Combine main building polygon: every main piece shares one dissolve id.
    # -1 can never collide with the positional ids 0..n-1 (the previous
    # sentinel, 99, did once a parcel had 100 or more polygons).
    inference_buildings['dissolve_idx'] = np.where(
        inference_buildings['main_building_flag'], -1, np.arange(len(inference_buildings)))
    inference_buildings = inference_buildings.dissolve(
        by='dissolve_idx', aggfunc={
            "area": "sum",
            'GEOID': 'first',
            'iou': 'mean',
            'main_building_flag': 'max'
        },).reset_index()
    inference_buildings.drop(['dissolve_idx'], axis=1, inplace=True)
    inference_buildings = inference_buildings.sort_values('area', ascending=False)
    inference_buildings = inference_buildings.reset_index(drop=True)

    # Simplify
    inference_buildings = simplify_gdf(inference_buildings)
    inference_buildings['OSM_flag'] = False

    # Replace with OSM coverage (as determined by the minimum rotated rectangle) plus
    # whatever was not covered in OSM (potential expansions)
    # NOTE(review): row 0 is the largest polygon by summed area, which is
    # assumed to be the dissolved main building -- confirm.
    inf_main_build = inference_buildings.iloc[0]['geometry']
    inf_mrr = inf_main_build.minimum_rotated_rectangle
    inf_union = inf_mrr.intersection(osm_main_build.geometry.unary_union)
    inf_union = inf_union.union(inf_main_build)

    # * Remove overlap with OSM small buildings
    osm_small_build = osm_buildings_parcel.loc[osm_buildings_parcel['main_building_flag'] == False]
    if len(osm_small_build) > 0:
        inf_union_diff = inf_union.difference(osm_small_build.geometry.unary_union)

        # Add back the OSM small buildings we removed from main inference
        if inf_union != inf_union_diff:
            covered_osm_buildings = osm_small_build.sjoin(gpd.GeoDataFrame(geometry=[inf_union]))
            covered_osm_buildings = covered_osm_buildings.loc[~covered_osm_buildings['index_right'].isna()]
            covered_osm_buildings['OSM_flag'] = True
            inference_buildings = pd.concat([inference_buildings[cols], covered_osm_buildings[cols]])

        inf_union = inf_union_diff

        # * Keep largest polygon (in case OSM difference broke up the polygon).
        # Multi-part geometries are not iterable in Shapely 2.0; iterate the
        # parts through `.geoms` instead.
        if isinstance(inf_union, shapely.geometry.multipolygon.MultiPolygon):
            inf_union = max(inf_union.geoms, key=lambda part: part.area)

    inference_buildings.at[0, 'geometry'] = inf_union

    return inference_buildings[cols]

def identify_main_buildings(inferred_buildings_2020_parcel, inferred_buildings_2016_parcel,
                            osm_buildings_parcel, parcel_bounds):
    """Flag the main building in the OSM, 2016 and 2020 footprints of a parcel.

    OSM: duplicate geometries are dropped (in place on the input frame), the
    largest footprint by 'area' is flagged as main and 'OSM_flag' is set.
    Inferences: each year is clipped to ``parcel_bounds`` and passed to
    ``get_inference_main_building`` together with the processed OSM frame.

    Returns:
        tuple: ``(parcel_builds, parcel_main_geoms)`` where ``parcel_builds``
        is ``(2020 frame, 2016 frame, OSM frame)`` and ``parcel_main_geoms``
        holds the geometry of the first row of each, in the same order.
        Entries are None for inputs that were None.
    """
    osm_main_build_geom = None
    if osm_buildings_parcel is not None:
        # Filter duplicate geometries
        osm_buildings_parcel.drop_duplicates(subset=['geometry'], inplace=True)

        # Largest footprint first; row 0 becomes the main building.
        osm_buildings_parcel = osm_buildings_parcel.sort_values('area', ascending=False)
        osm_buildings_parcel = osm_buildings_parcel.reset_index(drop=True)
        osm_buildings_parcel['main_building_flag'] = osm_buildings_parcel.apply(
            lambda row: row.name == 0, axis=1)
        osm_buildings_parcel['OSM_flag'] = True
        osm_main_build_geom = osm_buildings_parcel.iloc[0]['geometry']

    def _flag_year(year_inferences):
        # Clip one year's inferences to the parcel and flag its main building.
        if year_inferences is None:
            return None, None
        clipped = gpd.clip(year_inferences, parcel_bounds).reset_index(drop=True)
        flagged = get_inference_main_building(clipped, osm_buildings_parcel)
        return flagged, flagged.iloc[0]['geometry']

    builds_2016, main_geom_2016 = _flag_year(inferred_buildings_2016_parcel)
    builds_2020, main_geom_2020 = _flag_year(inferred_buildings_2020_parcel)

    parcel_builds = builds_2020, builds_2016, osm_buildings_parcel
    parcel_main_geoms = main_geom_2020, main_geom_2016, osm_main_build_geom
    return parcel_builds, parcel_main_geoms



def compute_largest_protruding_poly(union_build, base_build):
    """Find the largest compact piece of ``union_build`` lying outside ``base_build``.

    We care about concentrated expansions along a single wall rather than
    diffuse footprint changes caused by noisy inferences, so the difference
    is split into pieces, thin strips are removed, and only the largest
    remaining piece is returned.

    :param union_build: geometry of the new footprint (interpreted as EPSG:4326)
    :param base_build: geometry of the reference footprint
    :return: the row of the largest piece, with its 'geometry' (EPSG:4326)
        and its 'area' (computed in EPSG:26910)
    """
    geographic_crs = 'EPSG:4326'
    projected_crs = 'EPSG:26910'

    protrusion = union_build.difference(base_build)
    pieces = gpd.GeoDataFrame(geometry=[protrusion], crs=geographic_crs)

    # Break up polygons: shrink, split into parts, grow back.
    pieces = pieces.to_crs(projected_crs)
    pieces['geometry'] = pieces.geometry.buffer(-0.2)
    pieces = pieces.explode(ignore_index=True, index_parts=False)
    pieces['geometry'] = pieces.geometry.buffer(0.2)

    def _drop_thin_strips(geom):
        # Remove "pizza crusts": anything that does not survive a 0.5 erosion.
        opened = geom.buffer(-0.5).buffer(0.5*1.1)
        return opened.intersection(geom)

    pieces['geometry'] = pieces['geometry'].apply(_drop_thin_strips)
    pieces = pieces.to_crs(geographic_crs)

    # Return largest polygon
    pieces['area'] = pieces.to_crs(projected_crs).area
    ranked = pieces.sort_values('area', ascending=False)
    return ranked.iloc[0]
  
    
def compare_building_footprint(base_geom, new_geom, diff_type, area_threshold):
    """Decide whether ``new_geom`` expands beyond ``base_geom``.

    Returns:
        tuple: ``(expansion_flag, diff_value)``. Both are None when
        ``base_geom`` is None. Otherwise ``diff_value`` is the area of the
        largest protruding polygon and ``expansion_flag`` tells whether it
        exceeds ``area_threshold``.

    Raises:
        Exception: if ``base_geom`` is given and ``diff_type`` is anything
            other than 'protruding_poly'.
    """
    if base_geom is None:
        return None, None

    if diff_type != 'protruding_poly':
        raise Exception('[ERROR] Raw poly comparison not implemented.')

    largest_protrusion = compute_largest_protruding_poly(new_geom, base_geom)
    diff_value = largest_protrusion['area']

    expansion_flag = True if diff_value > area_threshold else False
    return expansion_flag, diff_value


def flatten_geometries(gdf, threshold):
    """Drop footprints that largely overlap another footprint of the same frame.

    A footprint is "unique" when no other footprint covers more than
    ``threshold`` of its own area (areas measured in EPSG:26910). Non-unique
    footprints are removed, except an inferred main building
    (main_building_flag True and OSM_flag False), which is always kept.

    Args:
        gdf: GeoDataFrame with 'geometry', 'main_building_flag' and
            'OSM_flag' columns. It is not modified.
        threshold: maximum tolerated overlap, as a fraction of the
            footprint's own area.

    Returns:
        GeoDataFrame of the retained footprints, with a 'unique' column, the
        'area' column recomputed in EPSG:26910, sorted by decreasing area.
    """
    # Start by dropping duplicate geometries (on a new frame, so the caller's
    # frame is left untouched).
    gdf = gdf.drop_duplicates('geometry').reset_index(drop=True)

    # Reproject once instead of once per pair of footprints.
    projected_geoms = list(gdf.to_crs('EPSG:26910').geometry)

    def check_overlapping_polygons(position, geom):
        # NOTE(review): a zero-area footprint still divides by zero here, as
        # it did before -- confirm whether such inputs can occur.
        return not any(
            geom.intersection(other).area / geom.area > threshold
            for other_position, other in enumerate(projected_geoms)
            if other_position != position)

    gdf['unique'] = [
        check_overlapping_polygons(position, geom)
        for position, geom in enumerate(projected_geoms)]

    # Get unique geometries (copy so the column assignment below is not made
    # on a slice of the unfiltered frame).
    keep = (gdf['unique'] == True) | ((gdf['main_building_flag'] == True) & (gdf['OSM_flag'] == False))
    gdf = gdf.loc[keep].copy()

    # Recompute area
    gdf['area'] = gdf.to_crs('EPSG:26910').geometry.area
    gdf = gdf.sort_values('area', ascending=False)

    return gdf

## Plotting tools

In [None]:
def run_osm_apn(parcel_apn, area_threshold, flatten_threshold):
    """Run the polygon pipeline for one parcel and show a three-panel summary.

    Panels: raw 2020 inferences over OSM, raw 2016 inferences over OSM, and
    the pipeline output. Reads the module-level ``sj_parcels_res`` and
    ``sj_parcel_permit`` frames.

    Returns:
        The GeoDataFrame of buildings produced by ``process_OSM_data``.
    """
    (inferred_buildings_2020_parcel, inferred_buildings_2016_parcel,
     osm_buildings_parcel, _permits_parcel) = parcel_level_data(
        parcel_apn, sj_parcels_res, sj_parcel_permit)
    parcel_bounds = sj_parcels_res[sj_parcels_res['APN'] == parcel_apn]

    # Incorporate OSM data
    parcel_buildings, _ = process_OSM_data(
        inferred_buildings_2020_parcel, inferred_buildings_2016_parcel,
        osm_buildings_parcel, area_threshold, flatten_threshold, parcel_bounds)

    fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(18, 8))

    # Raw inferences (red) over OSM (blue), one panel per year.
    raw_panels = ((ax1, inferred_buildings_2020_parcel), (ax2, inferred_buildings_2016_parcel))
    for axis, raw_inferences in raw_panels:
        parcel_bounds.plot(ax=axis, edgecolor='black', facecolor='none')
        if osm_buildings_parcel is not None:
            osm_buildings_parcel.plot(ax=axis, color='blue', alpha=0.7)
        if raw_inferences is not None:
            raw_inferences.plot(ax=axis, color='red', alpha=0.7)
        axis.axis('off')

    # Pipeline output; OSM is drawn fully transparent on top of it.
    parcel_bounds.plot(ax=ax3, edgecolor='black', facecolor='none')
    parcel_buildings.plot(ax=ax3, color='blue', alpha=0.7)
    if osm_buildings_parcel is not None:
        osm_buildings_parcel.plot(ax=ax3, color='blue', alpha=0)
    ax3.axis('off')

    plt.show()
    return parcel_buildings

In [11]:
def run_osm_apn_full_detail(parcel_apn, area_threshold, flatten_threshold, sat_imagery=None, attach=None, title=True):
    """Run the polygon pipeline for one parcel and plot every intermediate step.

    Row 1: raw 2020 inferences, raw 2016 inferences, OSM annotations.
    Row 2: OSM-adjusted 2020 polygons, OSM-adjusted 2016 polygons, output.
    Row 3 (only with ``sat_imagery``): 2020 and 2016 imagery clipped to the
    parcel. Reads the module-level ``sj_parcels_res`` and
    ``sj_parcel_permit`` frames.

    Args:
        parcel_apn: APN of the parcel to process.
        area_threshold: area threshold forwarded to ``process_OSM_data``.
        flatten_threshold: overlap threshold forwarded to ``process_OSM_data``.
        sat_imagery: optional dict with the keys 'sj_parcels_res',
            'tiles_gdf', 'img_fp' and 'tif_fp' (the last three keyed by year).
        attach: optional ``(fig, axs)`` pair to draw on; ``axs`` must have
            2 rows of 3 axes (3 rows with ``sat_imagery``). When given, the
            figure is neither shown nor closed here.
        title: whether to set panel titles.

    Returns:
        The GeoDataFrame of buildings produced by ``process_OSM_data``.
    """
    inferred_buildings_2020_parcel, inferred_buildings_2016_parcel, osm_buildings_parcel, permits_parcel = parcel_level_data(parcel_apn, sj_parcels_res, sj_parcel_permit)
    parcel_bounds = sj_parcels_res[sj_parcels_res['APN'] == parcel_apn]

    # Incorporate OSM data
    parcel_buildings, data_dict = process_OSM_data(
        inferred_buildings_2020_parcel, inferred_buildings_2016_parcel,
        osm_buildings_parcel, area_threshold, flatten_threshold, parcel_bounds)

    # process_OSM_data returns a NaN sentinel instead of a dict when the
    # parcel has neither inferences nor OSM data; plot empty panels then.
    if not isinstance(data_dict, dict):
        data_dict = {}
    ib_2020_parcel = data_dict.get('2020_output')
    ib_2016_parcel = data_dict.get('2016_output')
    osm_parcel = data_dict.get('osm_output')

    # Define axis -- depends on whether we want an independent plot, or to attach to another plot
    if attach is None:
        if sat_imagery is None:
            fig, axs = plt.subplots(ncols=3, nrows=2, figsize=(18, 10))
        else:
            fig, axs = plt.subplots(ncols=3, nrows=3, figsize=(18, 15))
    else:
        fig, axs = attach

    # Get individual axis -- number depends on whether we visualize satellite imagery
    if sat_imagery is None:
        (ax1, ax2, ax3), (ax4, ax5, ax6) = axs
    else:
        (ax1, ax2, ax3), (ax4, ax5, ax6), (ax7, ax8, ax9) = axs

    for ax in [a for alist in axs for a in alist]:
        parcel_bounds.plot(ax=ax, edgecolor='black', facecolor='none')
        ax.axis('off')

    # Faint OSM backdrop under the raw inferences.
    if osm_buildings_parcel is not None:
        for ax in (ax1, ax2):
            osm_buildings_parcel.plot(ax=ax, color='blue', alpha=0.3)

    panels = (
        (ax1, inferred_buildings_2020_parcel, 'red', '2020 inferences (raw)'),
        (ax2, inferred_buildings_2016_parcel, 'red', '2016 inferences (raw)'),
        (ax3, osm_parcel, 'blue', 'OSM annotations'),
        (ax4, ib_2020_parcel, 'red', 'OSM-adjusted 2020 polygons'),
        (ax5, ib_2016_parcel, 'red', 'OSM-adjusted 2016 polygons'),
        (ax6, parcel_buildings, 'purple', 'Output'),
    )
    for ax, layer, color, panel_title in panels:
        if layer is not None:
            layer.plot(ax=ax, color=color, alpha=0.7)
        if title:
            ax.set_title(panel_title)

    # Satellite images
    if sat_imagery is not None:
        # `import rasterio` alone does not load the mask submodule.
        import rasterio.mask

        for year, ax in zip(['2020', '2016'], (ax7, ax8)):
            # Get imagery
            file_name = get_file_name_from_parcel(
                parcel_apn, sat_imagery['sj_parcels_res'], sat_imagery['tiles_gdf'][year])
            img_file, superres_file = find_image_file_and_superrestile(
                sat_imagery['img_fp'][year], sat_imagery['tif_fp'][year], file_name)

            with rasterio.open(superres_file) as src:
                out_image, out_transform = rasterio.mask.mask(
                    src, parcel_bounds.to_crs('EPSG:26910')['geometry'], crop=True, nodata=255)

            # Plot
            rasterio.plot.show(out_image, transform=out_transform, ax=ax)

    # Plot independently
    if attach is None:
        plt.show()
        plt.close()

    return parcel_buildings

## Visualizing satellite imagery

In [3]:
def get_file_name_from_parcel(parcel_apn, sj_parcels_res, tiles_gdf_year):
    """Return the 'file' of the tile that overlaps the parcel the most.

    Args:
        parcel_apn: APN of the parcel to look up in ``sj_parcels_res``.
        sj_parcels_res: parcels frame with 'APN' and 'geometry' columns.
        tiles_gdf_year: tiles frame for one year with 'file' and 'geometry'
            columns. It is not modified.

    Returns:
        The 'file' value of the tile with the largest intersection area with
        the first parcel matching ``parcel_apn``.
    """
    matching_parcels = sj_parcels_res[sj_parcels_res['APN'] == parcel_apn]
    parcel_geom = matching_parcels.iloc[0]['geometry']

    # The column is named 'iou' but holds the raw intersection area.
    overlap_area = tiles_gdf_year['geometry'].intersection(parcel_geom).area
    ranked_tiles = tiles_gdf_year.assign(iou=overlap_area).sort_values('iou', ascending=False)

    best_tile = ranked_tiles.iloc[0]
    return best_tile['file']

In [10]:
def find_image_file_and_superrestile(img_fp, tif_fp, file_name):
    """Locate a tile's image array and make sure its georeferenced GeoTIFF exists.

    The ``.npy`` image is looked up under ``img_fp/{train,val,test}/images``
    when ``img_fp/train`` exists (split data), otherwise directly under
    ``img_fp``. The GeoTIFF lives in ``img_fp/../superres_tif``; when it is
    missing it is written from the image array (3 bands, uint8), using the
    bounds of the original raster ``tif_fp/<file_name>.tif`` and EPSG:26910.

    Args:
        img_fp: directory of the image arrays.
        tif_fp: directory of the original GeoTIFF tiles.
        file_name: tile name without extension.

    Returns:
        tuple: ``(img_file, superres_tile)`` paths. ``img_file`` is None when
        the data is split and the tile is in none of the splits.

    Raises:
        FileNotFoundError: if the GeoTIFF has to be generated but the image
            array could not be located.
    """
    if os.path.exists(os.path.join(img_fp, 'train')):
        # Split data (train, val and test); the last split containing the tile wins.
        img_file = None
        for dirname in ['train', 'val', 'test']:
            candidate = os.path.join(img_fp, dirname, 'images', '{}.npy'.format(file_name))
            if os.path.exists(candidate):
                img_file = candidate
    else:
        # Data that is not split
        img_file = os.path.join(img_fp, '{}.npy'.format(file_name))

    # Generate file
    superres_dir = os.path.join(img_fp, '..', 'superres_tif')
    superres_tile = os.path.join(superres_dir, '{}.tif'.format(file_name))
    if not os.path.exists(superres_tile):
        if img_file is None:
            raise FileNotFoundError(
                'No image array found for tile {} under {}'.format(file_name, img_fp))

        os.makedirs(superres_dir, exist_ok=True)

        tile_img = np.load(img_file).astype(np.uint8)
        height, width = tile_img.shape[0], tile_img.shape[1]

        # Get original raster bounds; the handle is closed right away.
        with rasterio.open(os.path.join(tif_fp, '{}.tif'.format(file_name))) as raster_original:
            bounds = raster_original.bounds

        # from_bounds expects (west, south, east, north, width, height).
        t = from_bounds(*bounds, width, height)
        raster_crs = rasterio.crs.CRS({"init": "epsg:26910"})

        with rasterio.open(superres_tile, 'w', driver='GTiff',
                           height=height, width=width,
                           count=3, dtype=str(tile_img.dtype),
                           crs=raster_crs, transform=t) as raster_new:
            for band in range(3):
                raster_new.write(tile_img[:, :, band], band + 1)

    return img_file, superres_tile

In [1]:
def get_tile_dicts_all_years(oak_fp, inferences_dir):
    """Load tile bounds and build a tiles GeoDataFrame for 2016, 2018 and 2020.

    Args:
        oak_fp: root directory containing
            ``outputs/cbg-inference-<year>/tile_bounds.json``.
        inferences_dir: dict mapping each year to the directory holding the
            ``.npy`` inference tiles.

    Returns:
        tuple: ``(tile_bounds_dict_all, tiles_gdf)``. The first maps each
        year to the content of its ``tile_bounds.json``; the second maps each
        year to a GeoDataFrame (EPSG:4326) with a 'file' column (tile name)
        and a 'geometry' column from ``get_bounds`` (None for tiles missing
        from the bounds dict).
    """
    years = ['2016', '2018', '2020']

    # Load tile dict for each year
    tile_bounds_dict_all = {}
    for year in years:
        output_fp = os.path.join(oak_fp, 'outputs', 'cbg-inference-{}'.format(year))
        with open(os.path.join(output_fp, 'tile_bounds.json'), "r") as f:
            tile_bounds_dict_all[year] = json.load(f)

    # Get tiles for all years
    tiles_gdf = {}
    for year in years:
        year_bounds = tile_bounds_dict_all[year]

        tile_paths = glob.glob(os.path.join(inferences_dir[year], '*.npy'))
        tile_names = [path.split(os.path.sep)[-1].replace('.npy', '') for path in tile_paths]
        tile_metrics_pd = pd.DataFrame(tile_names, columns=['file'])

        # Dict membership is O(1); the previous list(...keys()) rebuilt a
        # list for every tile.
        tile_metrics_pd['geometry'] = tile_metrics_pd['file'].progress_apply(
            lambda name, bounds=year_bounds: get_bounds(bounds, name) if name in bounds else None
        )
        tiles_gdf[year] = gpd.GeoDataFrame(tile_metrics_pd.copy(), crs='EPSG:4326')

    return tile_bounds_dict_all, tiles_gdf

## Ground truth cases

In [None]:
def load_ground_truth_parcels():
    """Return the hand-labelled ground-truth parcels, grouped by case.

    Returns:
        dict: maps each case label to a ``(apn_list, label)`` tuple, in the
        order positive small, negative small, positive main, negative main.
    """
    labelled_apns = (
        # Positive small building constructions
        ('Positive small build',
         ['23044043', '24960042', '68932067', '27447043', '69414018']),
        # Negative small building constructions
        ('Negative small build',
         ['24960056', '27406055', '42937040', '47701057', '44249007', '40306200',
          '09218018', '46742046', '27725060', '26434063']),
        # Positive main building constructions
        ('Positive main build',
         ['58630050', '42905080', '47202096', '24960042', '48608012', '41933001']),
        # Negative main building constructions
        ('Negative main build',
         ['41934035', '49936015', '49722020', '37804025', '44710075', '44234038',
          '46702025', '43944074', '42116035', '24509050', '26444013', '70845021',
          '48809009']),
    )
    return {label: (apns, label) for label, apns in labelled_apns}