# Prepare Solar Panel Dataset from New Panel Delineation

* Calls all panel shapes existing within all existing and digitized array boundaries (bound by NAIP availability)
* Checks for panel delination quality and resulting array shape quality, removes low quality panel delineations based on perimeter to area ratio
    * USPVDB and CCVPV have both been manually validated for correctness in their original creation
    * Saves arrays (excluding USPVDB and CCVPV) where panels are not present OR panel delination was low quality, and exports as shape file to assess for comissions
* Removes arrays and panels manually validated as comissions from remote sensing datasets
* Creates new array shapes by buffering and dissolving panel boundaries
    * USPVDB and CCVPV are both high array quality. So keep those shapes in the array dataset, but improve all other array boundaries if the new shape is of high quality
* Saves high-quality panels objects as new shape file
* Saves highest-quality array objects as new shape file

## Import Libraries and Variables

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import geopandas as gpd
import os 

# Load config file
def load_config(filename):
    config = {}
    with open(filename, 'r') as f:
        for line in f:
            # Strip whitespace and split by '='
            key, value = line.strip().split('=')
            # Try to convert to numeric values if possible
            try:
                value = float(value) if '.' in value else int(value)
            except ValueError:
                pass  # Leave as string if not a number
            config[key] = value
    return config

In [2]:
# Set folder paths
wd = r'S:\Users\stidjaco\R_files\BigPanel'
downloaded_path = os.path.join(wd, r'Data\Downloaded')
derived_path = os.path.join(wd, r'Data\Derived')
derivedTemp_path = os.path.join(derived_path, r'intermediateProducts')

# Set getPanels from GEE output folder path for two different approaches
getPanels_path = os.path.join(derivedTemp_path, r'getPanelsGEEOutput')

# GM-SEUS output initial paths
gmseusArraysInitPath = os.path.join(derivedTemp_path, r'initialGMSEUS_Arrays.shp')
gmseusPanelsInitPath = os.path.join(derivedTemp_path, r'initialGMSEUS_Panels.shp')

# Set GM-SEUS NAIP classified panels and arrays path
gmseusNaipPanelsPath = os.path.join(derived_path, r'GMSEUS/GMSEUS_NAIP_Panels.shp')
gmseusNaipArraysPath = os.path.join(derived_path, r'GMSEUS/GMSEUS_NAIP_Arrays.shp')

# Set a GM-SEUS array path for acquiring installation year (exploded grouped arrays -- SAM) and panel path
gmseusArraysInstYrPath = os.path.join(derivedTemp_path, r'GMSEUS_Arrays_instYr.shp')
gmseusPanelsFinalPath = os.path.join(derivedTemp_path, r'GMSEUS_Panels_ExistingAndNAIP.shp')
gmseusArraysFromPanelsPath = os.path.join(derivedTemp_path, r'GMSEUS_ArraysFromPanels.shp')

# Call the GM-SEUS arrays init to get crs
gmseusArraysInit = gpd.read_file(gmseusArraysInitPath)
gmseusCrs = gmseusArraysInit.crs

# Set paths to USPVDB and CCVPV (we check if GMSEUSgeorect arrays are from these datasets)
uspvdb_path = os.path.join(derivedTemp_path, r'uspvdb_poly.shp')
ccvpv_path = os.path.join(derivedTemp_path, r'ccvpv_poly.shp')

# Load the config from the text file
config = load_config('config.txt')

# Set general variables
gee_crs = 4326 # native projection of Google Earth Engine exports
minPanelRowArea = config['minPanelRowArea'] # 15 m2, minimum area for a single panel row from the 1st percentile panel area from Stid et al., 2022
maxPanelRowArea = config['maxPanelRowArea'] # 254 m2 95th perccentile for a single panel row from Stid et al., 2022. MSU Solar Carport has max 1890m2
minNumPanelRows = config['minNumPanelRows'] # 3 panels, minimum number of panels rows to form a ground mounted solar array, definition from Stid et al., 2022
minPmArRatio = config['minPmArRatio'] # 18.8%, 20% was minimum ratio of panel perimeter to area ratio for panels from Stid et al., 2022, MSU Solar Carport has min 18.9%
panelArrayBuff = config['panelArrayBuff'] # 10m buffer, 20m maximum distance between panel rows to form an array. We used 5m in Stid et al., 2022, but there are lower packing factors at greater latitudes (nativeID: '1229957948')
arrayArrayBuff = config['arrayArrayBuff'] # 20m buffer, 40m maximum distance between arrays subsections of the same mount type to form a complete array. In Stid et al., 2022, we used 50m, but we checked for same installation year in addition to mount type.

# Set limits for mount classification
lengthRatioThresh = config['lengthRatioThresh']  # If length ratio < 3.0, set to dual_axis or else fixed_axis_diagonal, else single- or fixed-axis
areaRatioThresh = config['areaRatioThresh']  # If area ratio < 0.15, set to fixed_diag_axis, else dual_axis

# Set the threshold for Z-scores (3 standard deviations is a common choice, adjust if needed) and unique mount proportion. 
z_threshold = 3 # 3 standard deviations
uniqueMountThreshold = 0.1 # 10% of the panel mount types in the array

# Remove new arrays where the new array area is less than 0.25 * gmseus array area and where new array PmArRatio is less than the 99th percentile of gmseus PmArRatio
newAreaThreshold = 0.25 # 25% of the original gmseus array area
gmseusPmArRatioThreshold = 0.99 # 99th percentile

# For this script wwhere we filter panel-rows based on shared geometrical similarity, the arrayID column is subArrID, because we exlode all sub array geometries in script3
arrayIDcol = 'subArrID'

## Helper functions

In [10]:
# Function to assign mount type to solar panel-rows based on azimuth and panel geometry. Also returns all relevant design parameters for each panel-row. Requires the setting of a length ratio threshold and an area ratio threshold.
def assignMountType(feature):
    # Estimate azimuth of solar panel-row short edge
    def getAzimuth(feature):
        # Get the minimum bounding rectangle (oriented)
        mbr = feature.geometry.minimum_rotated_rectangle
        
        # Get the coordinates of the MBR
        coords = list(mbr.exterior.coords)
        
        # Calculate distances between consecutive vertices to determine lengths of edges
        edge_lengths = []
        for i in range(len(coords) - 1):  # last point is a duplicate of the first
            p1, p2 = coords[i], coords[i + 1]
            dist = np.sqrt((p2[0] - p1[0])**2 + (p2[1] - p1[1])**2)
            # Set a tempArea 
            # panels = panels**2 + (p2[1] - p1[1])**2)
            edge_lengths.append(dist)
        
        # Identify shorter and longer sides
        short_edge_index = np.argmin(edge_lengths[:2])  # first two edges are enough to find shorter side
        
        # Use the shorter edge for azimuth calculation
        p1, p2 = coords[short_edge_index], coords[short_edge_index + 1]
        
        # Calculate the azimuth (angle relative to north, counterclockwise)
        delta_x = p2[0] - p1[0]
        delta_y = p2[1] - p1[1]

        # Azimuth relative to north (y-axis)
        angle_radians = np.arctan2(delta_x, delta_y)
        angle_degrees = np.degrees(angle_radians)

        # Normalize the angle to 0-360 degrees
        if angle_degrees < 0:
            angle_degrees += 360
        if angle_degrees > 360:
            angle_degrees -= 360
        
        # In the northern hemisphere, the a solar panel-row azimuth angle will never be towards the north (270 to 360 and 0 to 90 degrees). Therefore, if the azimuth is between 270 and 360 or 0 and 90, we need to add 180 degrees to the azimuth to get the correct orientation of the panel.
        if 270 <= angle_degrees <= 360 or 0 <= angle_degrees <= 90:
            angle_degrees += 180

        return angle_degrees
    
    # Get the ratio of the long edge to the short edge of the panel (and the lengths of the short and long edges)
    def getLengthRatio(feature):
        # Get the minimum bounding rectangle (oriented)
        mbr = feature.geometry.minimum_rotated_rectangle
        
        # Get the coordinates of the MBR
        coords = list(mbr.exterior.coords)
        
        # Calculate distances between consecutive vertices
        edge_lengths = []
        for i in range(len(coords) - 1):  # last point is a duplicate of the first
            p1, p2 = coords[i], coords[i + 1]
            dist = np.sqrt((p2[0] - p1[0])**2 + (p2[1] - p1[1])**2)
            edge_lengths.append(dist)
        
        # Sort the edge lengths to identify short and long sides
        sorted_lengths = sorted(edge_lengths[:2])  # Only need two sides (since rectangle has equal opposite sides)
        short_edge = sorted_lengths[0]
        long_edge = sorted_lengths[1]
        
        # Calculate the ratio of long edge to short edge
        length_ratio = long_edge / short_edge
        return length_ratio, short_edge, long_edge
    
    # Run the geteAzimuth function to get the azimuth of each panel row, getLengthRatio function to get the long and short edge ratio, and the and getAreaRatio function to get the panel area to bounding box ratio
    azimuth = getAzimuth(feature)
    length_ratio, short_edge, long_edge = getLengthRatio(feature)

    # Assign mount type based on azimuth and area ratio 
    # Fixed-axis: If the azimuth is within 60 degrees of S, and length ratio is greater than 2.5
    # Single-axis: If the azimuth is within 30 degrees of E or W (in southward radians), and length ratio is greater than 2.5
    # Dual-axis: Any azimuth and the length ratio is less than 2.5
    def classify_mount_type(azimuth, length_ratio):
        # Check if azimuth is within 60 degrees to to S (180) -- Should never be north
        if (abs(azimuth - 180) <= 60):
            if length_ratio >= lengthRatioThresh:
                return 'fixed_axis'
        
        # Check if azimuth is within 30 degrees of close to E (90) or W (270)
        elif (abs(azimuth - 90) <= 30 or abs(azimuth - 270) <= 30):
            if length_ratio >= lengthRatioThresh:
                return 'single_axis'
        
        # Otherwise, classify as dual-axis
        if length_ratio < lengthRatioThresh: # if area_ratio > areaRatioThresh and length_ratio < lengthRatioThresh:
            return 'dual_axis'
        
        # Default case -- no panel-rows should be missed, but default to fixed-axis
        return 'fixed_axis'
    
    # Classify the mount type
    mount = classify_mount_type(azimuth, length_ratio)

    # Assign mount type based on azimuth, and return the mount type, azimuth, length ratio, short edge, and long edge
    return mount, azimuth, length_ratio, short_edge, long_edge

# Function to check for and remove erroneous geometries in arrays
def checkArrayGeometries(arrays): 
    # For a collection of reasons, array boundaries may contain erroneous geometries that result in a near-zero area, linestrings, or points. 
    # To check for and remove these, we'll explode arrays, calculate a temporary area, remove subarrays that are less than a minimum area, then dissolve by tempID.
    arrays['tempDissolveID'] = (1 + np.arange(len(arrays)))  # Create a temporary ID for dissolving
    arrays = arrays.explode(index_parts=False)
    arrays['tempArea'] = arrays['geometry'].area
    arrays = arrays[arrays['tempArea'] >= minPanelRowArea]
    arrays = arrays.dissolve(by=['tempDissolveID'], as_index=False)
    arrays = arrays.drop(columns=['tempArea', 'tempDissolveID'])
    arrays = arrays.reset_index(drop=True)
    return arrays

# Function to create an array from a set of panel rows based on the distance between them
def createArrayFromPanels(panels, buffDist, dissolveID):
    # Count panels per group before dissolving
    panelCounts = panels.groupby(dissolveID).size().reset_index(name='numPanels')

    # Get the total area of the panels within each group (sum of area column)
    panelAreas = panels.groupby(dissolveID)['area'].sum().reset_index(name='pnlArea')
    
    # Buffer the geometries by buffDist, dissovle boundaries, and unbuffer by buffDist* -1. Assign the number of objects being dissovle into a numPanels column.
    arrays = panels.copy()
    arrays['geometry'] = arrays.buffer(buffDist)
    arrays = arrays.dissolve(by=[dissolveID], as_index=False)
    arrays['geometry'] = arrays.buffer(buffDist * -1)

    # Merge the panel counts and panel areas back into the dissolved array DataFrame. Select only the dissolveID and respective columns in the right df
    arrays = arrays.merge(panelCounts[[dissolveID, 'numPanels']], on=dissolveID, how='left')
    arrays = arrays.merge(panelAreas[[dissolveID, 'pnlArea']], on=dissolveID, how='left')

    # Due to the buffering and unbuffering, some mulitpolygons contain erroneous geometries that result in a near-zero area, linestrings, or points. Remove these.
    arrays = checkArrayGeometries(arrays)

    # Reset index
    arrays = arrays.reset_index(drop=True)
    return arrays

## Process Panel Data from GEE Outputs

In [4]:
# Function to read in the shapefiles/geojsons from folder and return a processed panel geodataframe
def getPanels_method(path):

    # Function to load geodataframes if different files are present in the folder
    def load_gdf(path, extension, target_crs):
        files = [f for f in os.listdir(os.path.join(path)) if f.endswith(f'.{extension}')]
        dfs = [gpd.read_file(os.path.join(path, file)) for file in files]
        # Directly concatenate, set crs, and reproject
        return gpd.GeoDataFrame(pd.concat(dfs, ignore_index=True)).set_crs(gee_crs).to_crs(target_crs)
    
    # Handle both GeoJson and Shp files, both may be present depending on script4 output requirements (vertex limit of geojson)
    # Check what file extensions are present in the folder (either/or geojson or shapefile). 
    # If both are present, load both and concatenate. If only one is present, load that one.
    geoJsonFileNum = len([f for f in os.listdir(os.path.join(path)) if f.endswith('.geojson')])
    shpFileNum = len([f for f in os.listdir(os.path.join(path)) if f.endswith('.shp')])
    if geoJsonFileNum > 0 and shpFileNum > 0:
        solarPanelsJSON = load_gdf(path, 'geojson', gmseusCrs)
        solarPanelsSHP = load_gdf(path, 'shp', gmseusCrs)
        solarPanels = pd.concat([solarPanelsJSON, solarPanelsSHP], ignore_index=True)
        print('Both geojson and shapefile found in the folder. Concatenating both.')
    elif geoJsonFileNum > 0:
        solarPanels = load_gdf(path, 'geojson', gmseusCrs)
    elif shpFileNum > 0:
        solarPanels = load_gdf(path, 'shp', gmseusCrs)
    else:
        raise ValueError('No valid file extensions found in the folder. Please provide either a geojson or shapefile.')

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Explode arrays into panels, remove array with missing panels

    # Remove rows where pnlsPres == "No"
    solarPanels = solarPanels[solarPanels['pnlsPres'] != 'No']

    # Remove rows where pnlNum is 1
    solarPanels = solarPanels[solarPanels['pnlNum'] > 1]

    # Explode the multipolygons
    solarPanels = solarPanels.explode(index_parts=False)

    # Reset the index
    solarPanels = solarPanels.reset_index(drop=True)

    # Drop the subset, pnlsPres columns
    solarPanels = solarPanels.drop(columns='pnlsPres')

    # Set area
    solarPanels['area'] = solarPanels.geometry.area

    # Return the geodataframe
    return solarPanels

# Get the solar panels geodataframe
solarPanels = getPanels_method(getPanels_path)

# Export the solar panels geodataframes
solarPanels.to_file(os.path.join(derivedTemp_path, 'solarPanelsInit.shp'), driver='ESRI Shapefile')

# Print the number of panels in each geodataframe and the total area of solar panels
print(f'Total number of newly delineated panel-rows: {len(solarPanels)}')
print(f'Total area of solar panels on initial processing: {solarPanels.geometry.area.sum() / 1e6:.2f} km2')

Total number of newly delineated panel-rows: 2390864
Total area of solar panels on initial processing: 686.66 km2


## Filter for High Quality Panels and Create New Array Dataset

### Remove inividual panels by within array design/shape similarity

In [5]:
# Call solar panels
solarPanels = gpd.read_file(os.path.join(derivedTemp_path, 'solarPanelsInit.shp'))

# Get the mount type, azimuth, length ratio, area ratio, short edge, and long edge for each panel
solarPanels[['mount', 'azimuth', 'lengthRatio', 'shortEdge', 'longEdge']] = solarPanels.apply(assignMountType, axis=1, result_type='expand')

# Make a new arrayID nativeID the nativeID + Source columns
#solarPanels['arrayID'] = solarPanels['nativeID'] + '_' + solarPanels['Source']

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Create initial solar arrays from solar panels, and remove low quality arrays and panels

# First, drop any panel below the minimum panel area, and greater than the max panel area
solarPanels = solarPanels[solarPanels.geometry.area > minPanelRowArea]
solarPanels = solarPanels[solarPanels.geometry.area < maxPanelRowArea]

# Save solarArrays as copy of solarPanels
solarPanels = solarPanels.reset_index(drop=True)

# Set an initial panelID 1 through n for the entire dataset
solarPanels['panelID'] = range(1, len(solarPanels) + 1)

# Create solar arrays from solar panels
solarArrays = createArrayFromPanels(solarPanels, panelArrayBuff, arrayIDcol)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Determine panel quality based on panel design parameters within arrays

# Here, we will remove commissions based on a number of quality control metrics and determining if panels are outliers. We consider 1 universal metric and 5 within array (local) metrics: PmArRatio (universal), PmArRatio (local), lengthRatio, areaRatio, compactness, and mount proportion. 
# First, we will determine if a panel is universally an outlier with perimeter to area ratio (PmArRatio) using a set threshold, then within arrays using mount technology proportions, PmArRatio, lengthRatio, areaRatio, and compactness (Poslby-Popper ratio). If 2 or more fail, we will remove the panel.
# ~~~~~~~~
# PmArRatio is well contrained for square to rectangular objects of solar panel-row size (see percentiles of gmseusInitPanels).We will set a universal miminum threshold for PmArRatio across the entire array to remove large area panel objects that are not solar panels. In gmseusInit, 0.19 was the minimum PmArRatio. We will use 0.18 as the minimum threshold for solar panels.
# Universally, low compactness can include long skinny panel-rows and high compactness can include-dual axis or square panel-rows. So we will only use this metric to address within-array varaibility.
# Within each array, we will also remove panel-objects where the mount type composes less than 10% of the array. 

# Set failure threshold (max would be 5)
failureThreshold = 2

# Set an QAQC column as zero, which we will append to for each failed quality control metric. 
solarPanels['QAQC'] = 0

# Calculate the perimeter to area ratio and compacntess for each panel
solarPanels['perimeter'] = solarPanels.geometry.length
solarPanels['PmArRatio'] = solarPanels['perimeter'] / solarPanels['area']
solarPanels['compactness'] = (4 * np.pi * solarPanels['area']) / (solarPanels['perimeter'] ** 2)

# Calculate area ratio, the calculate the panel area to bounding box area ratio, not the minimum bounding rectangle
solarPanels['bboxArea'] = solarPanels.geometry.bounds.apply(lambda row: (row['maxx'] - row['minx']) * (row['maxy'] - row['miny']), axis=1)
solarPanels['areaRatio'] = solarPanels['area'] / solarPanels['bboxArea']
solarPanels = solarPanels.drop(columns='bboxArea')

# First, universal removal of comissions based on constrained geometries of ground mounted solar panel-rows (PmArRatio). If PmArRatio is less than the minimum threshold, remove the panel.
solarPanels = solarPanels[solarPanels['PmArRatio'] >= minPmArRatio]

# Second, remove local (within array) comissions based on within-array mount similarity. If the mount type composes less than 10% of the array, add 1 to QAQC column.
solarPanels['uniqueMount'] = solarPanels.groupby([arrayIDcol, 'mount'])['mount'].transform('count') / solarPanels.groupby(arrayIDcol)['mount'].transform('count')
solarPanels.loc[solarPanels['uniqueMount'] < uniqueMountThreshold, 'QAQC'] += 1
solarPanels = solarPanels.drop(columns='uniqueMount')

# Third, remove local (within array) comissions based on within-array PmArRatio, lengthRatio, areaRatio, and compactness (Poslby-Popper ratio). If the panel is an outlier in any of these metrics, add 1 to QAQC column.
solarPanels['PmArRatioZ'] = (solarPanels['PmArRatio'] - solarPanels.groupby(arrayIDcol)['PmArRatio'].transform('mean')) / solarPanels.groupby(arrayIDcol)['PmArRatio'].transform('std')
solarPanels['lengthRatioZ'] = (solarPanels['lengthRatio'] - solarPanels.groupby(arrayIDcol)['lengthRatio'].transform('mean')) / solarPanels.groupby(arrayIDcol)['lengthRatio'].transform('std')
solarPanels['areaRatioZ'] = (solarPanels['areaRatio'] - solarPanels.groupby(arrayIDcol)['areaRatio'].transform('mean')) / solarPanels.groupby(arrayIDcol)['areaRatio'].transform('std')
solarPanels['compactnessZ'] = (solarPanels['compactness'] - solarPanels.groupby(arrayIDcol)['compactness'].transform('mean')) / solarPanels.groupby(arrayIDcol)['compactness'].transform('std')
solarPanels.loc[solarPanels['PmArRatioZ'].abs() > z_threshold, 'QAQC'] += 1
solarPanels.loc[solarPanels['lengthRatioZ'].abs() > z_threshold, 'QAQC'] += 1
solarPanels.loc[solarPanels['areaRatioZ'].abs() > z_threshold, 'QAQC'] += 1
solarPanels.loc[solarPanels['compactnessZ'].abs() > z_threshold, 'QAQC'] += 1

# Drop the Z-score columns
solarPanels = solarPanels.drop(columns=['PmArRatioZ', 'lengthRatioZ', 'areaRatioZ', 'compactnessZ'])

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Remove low quality arrays and panels

# Get solar panels that fail three or more quality control metrics
solarPanelsDrop = solarPanels[solarPanels['QAQC'] >= failureThreshold]

# Get solar panels that pass quality control metrics
solarPanels = solarPanels[solarPanels['QAQC'] < failureThreshold]

# If the resulting array has three or fewer panels, add these to solarPanelsDrop and remove them from solarPanels
panelCounts = solarPanels.groupby(arrayIDcol).size().reset_index(name='numPanels')
solarPanels = solarPanels.merge(panelCounts, on=arrayIDcol, how='left')
solarPanels = solarPanels.reset_index(drop=True)

# Get the arrays with too few panels
solarPanelsTooFew = solarPanels[solarPanels['numPanels'] <= minNumPanelRows]
solarPanelsTooFew = solarPanelsTooFew.drop(columns='numPanels')
solarPanelsDrop = pd.concat([solarPanelsDrop, solarPanelsTooFew], ignore_index=True)
solarPanels = solarPanels[solarPanels['numPanels'] > minNumPanelRows]

# Export 
#solarPanels.to_file(os.path.join(derivedTemp_path, 'solarPanels.shp'), driver='ESRI Shapefile')
#solarPanelsDrop.to_file(os.path.join(derivedTemp_path, 'solarPanelsDrop.shp'), driver='ESRI Shapefile')

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Regenerate solar arrays from the filtered solar panels and export

# Regenerate solar arrays from the filtered solar panels
solarPanels = solarPanels.reset_index(drop=True)
solarPanels = solarPanels.drop(columns=['numPanels'])
solarArrays = createArrayFromPanels(solarPanels, panelArrayBuff, arrayIDcol)

# Set a new arrayID and panelID that is 1 through n for the entire dataset. First drop old columns.
solarArrays = solarArrays.reset_index(drop=True)
solarPanels = solarPanels.reset_index(drop=True)
solarArrays = solarArrays.drop(columns=['panelID'])
solarPanels = solarPanels.drop(columns=[arrayIDcol, 'panelID'])
solarPanels['panelID'] = range(1, len(solarPanels) + 1)

# Get arrayID for each panel from a spatial join. Get only the arrayID column
solarPanels = gpd.sjoin(solarPanels, solarArrays[[arrayIDcol, 'geometry']], how='left', predicate='intersects')
solarPanels = solarPanels.drop(columns='index_right')

# Reset the index
solarPanels = solarPanels.reset_index(drop=True)
solarArrays = solarArrays.reset_index(drop=True)

# Export high quality arrays and panels
solarArrays.to_file(os.path.join(derivedTemp_path, 'solarArrays_ArrayQAQC.shp'), driver='ESRI Shapefile')
solarPanels.to_file(os.path.join(derivedTemp_path, 'solarPanels_PanelQAQC.shp'), driver='ESRI Shapefile')

# Export solar panels that are dropped
solarPanelsDrop.to_file(os.path.join(derivedTemp_path, 'solarPanelsDropped_PanelQAQC.shp'), driver='ESRI Shapefile')

# Print the number of panels dropped due to quality control within array
print(f'Total number of panels dropped due to quality control within arrays: {len(solarPanelsDrop)}')

  solarArrays.to_file(os.path.join(derivedTemp_path, 'solarArrays_ArrayQAQC.shp'), driver='ESRI Shapefile')
  solarPanels.to_file(os.path.join(derivedTemp_path, 'solarPanels_PanelQAQC.shp'), driver='ESRI Shapefile')
  solarPanelsDrop.to_file(os.path.join(derivedTemp_path, 'solarPanelsDropped_PanelQAQC.shp'), driver='ESRI Shapefile')


Total number of panels dropped due to quality control within arrays: 35954


### Remove array-wide panels by quality of new array delineation (new area compared to initial area, and perimeter to area ratio)
Importantly, we consider subarray shapes as an array, allowing for unique interarray design distinctions. Thus when we compare the new array area and PmArRatio to the initial, we must explode the initial arrays (this logic is already built into our new solar array delination)

In [7]:
# Call solar panels and arrays
solarPanels = gpd.read_file(os.path.join(derivedTemp_path, 'solarPanels_PanelQAQC.shp'))
solarArrays = gpd.read_file(os.path.join(derivedTemp_path, 'solarArrays_ArrayQAQC.shp'))

# Call initial gmseus arrays
gmseusArraysInit = gpd.read_file(gmseusArraysInitPath)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ QAQC (Export high quality arrays and panels compared to initial GM-SEUS arrays)

# Reset the index, and calculate the area of the solar arrays
solarArrays = solarArrays.reset_index()
solarArrays['arrayArea'] = solarArrays.geometry.area

# Calculate the perimeter to area ratio of array 
solarArrays['PmArRatio'] = solarArrays.length / solarArrays.area

# Calculate array area and PmArRatio for gmseus initial arrays. First, explode initial gmseus arrays to match logic of panel-row generated arrays (dependent on sub-array shapes, allows for unique array designs within an array area).
gmseusArraysInit = gmseusArraysInit.explode(index_parts=False)
gmseusArraysInit = gmseusArraysInit.reset_index(drop=True)
gmseusArraysInit['arrayArea_gmseus'] = gmseusArraysInit.geometry.area
gmseusArraysInit['PmArRatio_gmseus'] = gmseusArraysInit.length / gmseusArraysInit.area

# Perform a spatial join to get the gmseus array area and PmArRatio
solarArrays = gpd.sjoin(solarArrays, gmseusArraysInit[['arrayArea_gmseus', 'PmArRatio_gmseus', 'geometry']], how='left', predicate='intersects')
solarArrays = solarArrays.drop(columns='index_right')

# Remove new arrays where the new array area is less than 0.25 * gmseus array area and where new array PmArRatio is less than the 99th percentile of gmseus PmArRatio
solarArraysHighQuality = solarArrays[(solarArrays['arrayArea'] > newAreaThreshold * solarArrays['arrayArea_gmseus']) & (solarArrays['PmArRatio'] < gmseusArraysInit['PmArRatio_gmseus'].quantile(gmseusPmArRatioThreshold))]
solarArraysHighQuality = solarArraysHighQuality.drop(columns=['arrayArea_gmseus', 'PmArRatio_gmseus'])
solarArraysHighQuality = solarArraysHighQuality.reset_index(drop=True)

# Drop level_0 and index columns if they exist
solarArraysHighQuality = solarArraysHighQuality.drop(columns=['level_0', 'index'], errors='ignore')

# Reset index, and filter for panels that are in high quality arrays
solarPanels = solarPanels.reset_index(drop=True)
solarPanelsHighQuality = solarPanels[solarPanels[arrayIDcol].isin(solarArraysHighQuality[arrayIDcol])].reset_index(drop=True)

# Export high quality arrays and panels
solarArraysHighQuality.to_file(os.path.join(gmseusNaipArraysPath), driver='ESRI Shapefile')
solarPanelsHighQuality.to_file(os.path.join(gmseusNaipPanelsPath), driver='ESRI Shapefile')

# Print the number of panels removed due to array delineation quality control
print(f'Total number of panel-rows removed due to array delineation quality control: {len(solarPanels) - len(solarPanelsHighQuality)}')

# Print the final number of arrays and panels
print(f'Total number of solar arrays: {len(solarArraysHighQuality)}')
print(f'Total number of solar panel-rows: {len(solarPanelsHighQuality)}')

# Print the final area in km2 of arrays and panels
print(f'Total area of solar arrays: {solarArraysHighQuality.geometry.area.sum() / 1e6:.2f} km2')
print(f'Total area of solar panel-rows: {solarPanelsHighQuality.geometry.area.sum() / 1e6:.2f} km2')

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Export low quality arrays and panels to understand the quality of the delineation

# Reset all the indices
solarArrays = solarArrays.reset_index()
solarPanels = solarPanels.reset_index()
solarArraysHighQuality = solarArraysHighQuality.reset_index()
solarPanelsHighQuality = solarPanelsHighQuality.reset_index()

# Export low quality arrays and panels (not in solarArraysQAQC nativeID)
solarArraysLowQuality = solarArrays[~solarArrays[arrayIDcol].isin(solarArraysHighQuality[arrayIDcol])]
solarPanelsLowQuality = solarPanels[~solarPanels[arrayIDcol].isin(solarArraysHighQuality[arrayIDcol])]

# Export low quality arrays and panels
solarArraysLowQuality.to_file(os.path.join(derivedTemp_path, 'solarArraysLowArrayQuality.shp'), driver='ESRI Shapefile')
solarPanelsLowQuality.to_file(os.path.join(derivedTemp_path, 'solarPanelsLowArrayQuality.shp'), driver='ESRI Shapefile')

Total number of panel-rows removed due to array delineation quality control: 48262
Total number of solar arrays: 10355
Total number of solar panel-rows: 2291032
Total area of solar arrays: 867.18 km2
Total area of solar panel-rows: 396.60 km2


  solarArraysLowQuality.to_file(os.path.join(derivedTemp_path, 'solarArraysLowArrayQuality.shp'), driver='ESRI Shapefile')


## Merge New Panel Shapes with Existing Ones to Create the Highest Quality Panel Shapefile

In [21]:
# This chunk takes 2800+ minutes to run for 3mil naip panels and 1mil existing panels

# Call in GM-SEUS initial panels and GM-SEUS NAIP panels
gmseusPanelsInit = gpd.read_file(gmseusPanelsInitPath)
gmseusNaipPanels = gpd.read_file(gmseusNaipPanelsPath)

# Set an empty instYr column for the gmseus NAIP panels
gmseusNaipPanels['instYr'] = np.nan

# Set desired panel columns prior to merging
panelColumnsInit = ['nativeID', 'Source', 'area', 'instYr', 'geometry']

# Select desired columns from gmseusPanelsInit and gmseusNaipPanels
gmseusPanelsInit = gmseusPanelsInit[panelColumnsInit]
gmseusNaipPanels = gmseusNaipPanels[panelColumnsInit]

# Select which dataset to give priority to
priorityPanels = gmseusPanelsInit.copy()
nonPriorityPanels = gmseusNaipPanels.copy()

# Save the panel dataset source to each dataset
priorityPanels['pnlSource'] = 'existing'
nonPriorityPanels['pnlSource'] = 'gmseus'

# Buffer priority panels by 10 meters, dissolve, and unbuffer by -10 meters to create array geometries, and remove non-priority panels intersecting with priority panel arrays
priorityPanels_dissolved = priorityPanels.copy()
priorityPanels_dissolved['dissovleID'] = 1
priorityPanels_dissolved = createArrayFromPanels(priorityPanels, panelArrayBuff, 'dissolveID')
priorityPanels_dissolved = priorityPanels_dissolved.drop(columns='dissovleID')

# Explode the priority panel arrays to allow parallel processing of spatial join
priorityPanels_dissolved = priorityPanels_dissolved.explode(index_parts=False)
priorityPanels_dissolved = priorityPanels_dissolved.reset_index(drop=True)

# Buffer priority panels by 10 meters, dissolve, and unbuffer by -10 meters to create array geometries, and remove non-priority panels intersecting with priority panel arrays
#priorityPanels_buffer = priorityPanels.copy()
#priorityPanels_buffer['geometry'] = priorityPanels_buffer.buffer(panelArrayBuff)
#priorityPanels_dissolved = priorityPanels_buffer.dissolve()
#priorityPanels_dissolved['geometry'] = priorityPanels_dissolved.buffer(-panelArrayBuff)

# Remove non-priority panels that intersect with priority panel arrays. Add a tempID column to non-priority panels that is 1 through n
nonPriorityPanels['tempID'] = range(1, len(nonPriorityPanels) + 1)
intersecting_panels = gpd.sjoin(nonPriorityPanels, priorityPanels_dissolved[['geometry']], how="left", predicate="intersects") # used to be inner, check if this works
nonPriorityPanels = nonPriorityPanels[~nonPriorityPanels['tempID'].isin(intersecting_panels['tempID'])]
nonPriorityPanels = nonPriorityPanels.drop(columns='tempID')

# Merge the panel data
mergedPanels = gpd.GeoDataFrame(pd.concat([priorityPanels, nonPriorityPanels], ignore_index=True), crs=gmseusCrs)

# Reset the index
mergedPanels = mergedPanels.reset_index(drop=True)

# Set the final panelID as 1 through n for the entire dataset
mergedPanels['panelID'] = range(1, len(mergedPanels) + 1)

# If pnlSource is 'gmseus' set Source to 'gmseus'. Else, maintain the Source column
mergedPanels.loc[mergedPanels['pnlSource'] == 'gmseus', 'Source'] = 'gmseus'

# Drop pnlSource column
mergedPanels = mergedPanels.drop(columns='pnlSource')

# Print number of rows in mergedPanels
print(f'Number of final panel-rows in GM-SEUS panel-row dataset: {len(mergedPanels)}')

# Print the total number of panel-rows that have Source as 'gmseus' and that are not 'gmseus
print(f'Total number of panels from GM-SEUS: {len(mergedPanels[mergedPanels["Source"] == "gmseus"])}')
print(f'Total number of panels from Existing Sources: {len(mergedPanels[mergedPanels["Source"] != "gmseus"])}')

# Print the total sum of 'area' in the mergedPanels dataset in km2
print(f'Total area of panels in GM-SEUS panel-row dataset is {mergedPanels["area"].sum() / 1e6} km2')

# Export the mergedPanels dataset
mergedPanels.to_file(gmseusPanelsFinalPath, driver='ESRI Shapefile')

Number of final panel-rows in GM-SEUS panel-row dataset: 2924238
Total number of panels from GM-SEUS: 1853057
Total number of panels from Existing Sources": 1071181
Total area of panels in GM-SEUS panel-row dataset is 468.0124977148468 km2


## Create Arrays from Panels for All New an Existing Panel-Rows

In [11]:
# Call in gmseusPanelsFinal
gmseusPanelsFinal = gpd.read_file(gmseusPanelsFinalPath)

# Call in GM-SEUS initial arrays. Explode the arrays to match the logic of panel-row generated arrays (dependent on sub-array shapes, allows for unique array designs within an array area).
gmseusArraysInit = gpd.read_file(gmseusArraysInitPath)
gmseusArraysInit = gmseusArraysInit.explode(index_parts=False)
gmseusArraysInit = gmseusArraysInit.reset_index(drop=True)

# Add a column to gmseusArraysInit that called arrayIDcol (variable set at top of script) and is 1 through n for the entire dataset
gmseusArraysInit[arrayIDcol] = range(1, len(gmseusArraysInit) + 1) 

# Spatially join gmseus arrays to panels, copy the arrayID to the panels, and drop the index columns. 
gmseusPanelsFinal = gpd.sjoin(gmseusPanelsFinal, gmseusArraysInit[[arrayIDcol, 'geometry']], how='left', predicate='intersects')
gmseusPanelsFinal  = gmseusPanelsFinal.reset_index(drop=True)
gmseusPanelsFinal  = gmseusPanelsFinal.drop(columns=['index_left', 'index_right'], errors='ignore')
gmseusPanelsFinal = gmseusPanelsFinal.dropna(subset=[arrayIDcol]) # Redundent, drop panels that do not have an arrayID, in theory should never be the case

# Create arrays from panels -- despite the title, these arrays are generated from new naip panels and existing panels
gmseusArraysFromPanels = createArrayFromPanels(gmseusPanelsFinal, panelArrayBuff, arrayIDcol)

# Print the number of arrays with panel-row-level infomraiton
print(f'Total number of arrays with panel-row-level information: {len(gmseusArraysFromPanels)}')

# Print the total area of gmseus arrays final
print(f'Total area of GM-SEUS arrays: {gmseusArraysFromPanels.geometry.area.sum() / 1e6:.2f} km2')

# Export the GM-SEUS Naip Arrays and gmseusArraysInit with arrayIDcol
gmseusArraysFromPanels.to_file(gmseusArraysFromPanelsPath, driver='ESRI Shapefile')
gmseusArraysInit.to_file(gmseusArraysInitPath, driver='ESRI Shapefile')

Total number of arrays with panel-row-level information: 12702
Total area of GM-SEUS arrays: 1047.29 km2


## Merge New Arrays with Existing Array Dataset to Create the Highest Quality Array Shapefile
* We replace SAM, OSM, and CWSD array boundaries with new array boundaries where detected. These array shapes either do not conform with our array definition, were derived by low spatial resolution methods creating problematic array bounds, or do not have have a standardized delineation methods.
* This is what is uploaded to GEE asset and used in script6. 

In [46]:
# Call GMSEUS arrays init and Naip final
gmseusArraysInit = gpd.read_file(gmseusArraysInitPath)
gmseusArraysFromPanels = gpd.read_file(gmseusArraysFromPanelsPath)

# Call in USPVDB and CCVPV original polygons. Here, we call in all spatial array datasets that we are NOT replacing with new NAIP derived arrays.
uspvdb = gpd.read_file(uspvdb_path)
ccvpv = gpd.read_file(ccvpv_path)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Select when to merge GM-SEUS arrays with existing arrays to create the highest quality array dataset

# Add a replaceID column to gmseusArraysFromPanels that is 1
gmseusArraysFromPanels['replaceID'] = 1

# Set a replaceID column to uspvdb and ccvpv that is 0
uspvdb['replaceID'] = 0
ccvpv['replaceID'] = 0

# In gmseusArraysInit, if Source is SAM, CWSD, or OSM, replace the array geometry with the gmseusArraysFromPanels geometry that intersects that array. If not, retain geometries. Set newBound to 1 if a geometry is replaced.
# If source is GMSEUSgeorect, check if array intersects with USPVDB or CCVPV (non-replace geometries identified above). If not, replace the array geometry with the gmseusArraysFromPanels geometry
# Additionally, if source is SAM or CWSD, explode the arrays since SAM and CWSD does not necessarily have project level infomration, and merges many array boundaries into one. 

# Set a new column to gmseusArraysInit that is newBound, which will be a binary indicator for if a new array boundary is used. 
gmseusArraysInit['newBound'] = 0

# ~~~~~~~~~~~~~ Replace SAM, CWSD, and OSM arrays with GMSEUSnaip arrays

# Filter for SAM, CWSD, and OSM sources
replaceBoundaries = gmseusArraysInit[gmseusArraysInit['Source'].isin(['SAM', 'OSM', 'CWSD'])]

# Perform a spatial join to find intersecting geometries from gmseusArraysFromPanels
replaceBoundaries_joined = gpd.sjoin(replaceBoundaries, gmseusArraysFromPanels[['geometry', 'replaceID']], how='right', predicate='intersects') # KEY: Take geometry from right side of join

# Group by arrayID to get all intersecting geometries, and drop rows where replaceID is null
replaceBoundaries_joined = replaceBoundaries_joined.groupby(arrayIDcol).first().reset_index()
replaceBoundaries_joined = replaceBoundaries_joined.dropna(subset=['replaceID']).reset_index()

# If gmseusArraysInit arrayID is in replaceBoundaries_joined arrayID, set the geometry to the geometry in replaceBoundaries_joined and set newBound to 1
gmseusArraysInit = gmseusArraysInit.merge(replaceBoundaries_joined[[arrayIDcol, 'geometry']], on=arrayIDcol, how='left')
gmseusArraysInit['geometry'] = np.where(gmseusArraysInit['geometry_y'].isnull(), gmseusArraysInit['geometry_x'], gmseusArraysInit['geometry_y'])
gmseusArraysInit['newBound'] = np.where(gmseusArraysInit['geometry_y'].isnull(), gmseusArraysInit['newBound'], 1)
gmseusArraysInit = gmseusArraysInit.drop(columns=['geometry_x', 'geometry_y'])

# Save as geodataframe
gmseusArraysInit = gpd.GeoDataFrame(gmseusArraysInit, crs=gmseusCrs)

# # Export gmseusArraysInit
# gmseusArraysInit.to_file(os.path.join(derivedTemp_path, 'gmseusArraysInitTESTSPOT1.shp'), driver='ESRI Shapefile')

## OLD APPROACH
# # Filter for SAM, CWSD, and OSM sources
# replaceBoundaries = gmseusArraysInit[gmseusArraysInit['Source'].isin(['SAM', 'OSM', 'CWSD'])]
# # Perform a spatial join to find intersecting geometries from gmseusArraysFromPanels
# replaceBoundaries_joined = gpd.sjoin(replaceBoundaries, gmseusArraysFromPanels[['geometry']], how='left', predicate='intersects')
# # Merge the joined data to bring in the intersecting geometries as a new column
# replaceBoundaries_merged = replaceBoundaries_joined.merge(gmseusArraysFromPanels[['geometry']], left_on='index_right', right_index=True, suffixes=('', '_new'))
# # Replace the geometry in the original dataframe and set newBound to 1
# gmseusArraysInit.loc[replaceBoundaries_merged.index, 'geometry'] = replaceBoundaries_merged['geometry_new']
# gmseusArraysInit.loc[replaceBoundaries_merged.index, 'newBound'] = 1
# # Export gmseusArraysInit
# gmseusArraysInit.to_file(os.path.join(derivedTemp_path, 'gmseusArraysInitTESTSPOT1.shp'), driver='ESRI Shapefile')

# ~~~~~~~~~~~~~ Replace GMSEUSgeorect arrays with GMSEUSnaip arrays if they do not intersect with USPVDB or CCVPV

# Filter for GMSEUSgeorect source, and then USPVDB, and CCVPV sources
gmseusGeorect = gmseusArraysInit[gmseusArraysInit['Source'] == 'GMSEUSgeorect']

# Perform spatial joins to identify intersecting rows
intersect_uspvdb = gpd.sjoin(gmseusGeorect, uspvdb, how='left', predicate='intersects')
intersect_ccvpv = gpd.sjoin(gmseusGeorect, ccvpv, how='left', predicate='intersects')

# Drop drop nulls and duplicates
intersect_uspvdb = intersect_uspvdb.dropna(subset=['replaceID'])
intersect_ccvpv = intersect_ccvpv.dropna(subset=['replaceID'])
intersect_uspvdb = intersect_uspvdb.drop_duplicates(subset=arrayIDcol)
intersect_ccvpv = intersect_ccvpv.drop_duplicates(subset=arrayIDcol)

# Filter gmseusGeorect for arrays that do not intersect with USPVDB or CCVPV
gmseusGeorect = gmseusGeorect[~gmseusGeorect[arrayIDcol].isin(intersect_uspvdb[arrayIDcol])]
gmseusGeorect = gmseusGeorect[~gmseusGeorect[arrayIDcol].isin(intersect_ccvpv[arrayIDcol])]

# Perform a spatial join to find intersecting geometries from gmseusArraysFromPanels
gmseusGeorect_joined = gpd.sjoin(gmseusGeorect, gmseusArraysFromPanels[['geometry', 'replaceID']], how='right', predicate='intersects') # KEY: Take geometry from right side of join

# Group by arrayID to get all intersecting geometries, and drop rows where replaceID is null
gmseusGeorect_joined = gmseusGeorect_joined.groupby(arrayIDcol).first().reset_index()
gmseusGeorect_joined = gmseusGeorect_joined.dropna(subset=['replaceID']).reset_index()

# If gmseusArraysInit arrayID is in gmseusGeorect_joined arrayID, set the geometry to the geometry in gmseusGeorect_joined and set newBound to 1
gmseusArraysInit = gmseusArraysInit.merge(gmseusGeorect_joined[[arrayIDcol, 'geometry']], on=arrayIDcol, how='left')
gmseusArraysInit['geometry'] = np.where(gmseusArraysInit['geometry_y'].isnull(), gmseusArraysInit['geometry_x'], gmseusArraysInit['geometry_y'])
gmseusArraysInit['newBound'] = np.where(gmseusArraysInit['geometry_y'].isnull(), gmseusArraysInit['newBound'], 1)
gmseusArraysInit = gmseusArraysInit.drop(columns=['geometry_x', 'geometry_y'])

# Save as geodataframe
gmseusArraysInit = gpd.GeoDataFrame(gmseusArraysInit, crs=gmseusCrs)

# Export gmseusArraysInit
# gmseusArraysInit.to_file(os.path.join(derivedTemp_path, 'gmseusArraysInitTESTSPOT2.shp'), driver='ESRI Shapefile')

## OLD APPROACH
# # Filter for GMSEUSgeorect source, and then USPVDB, and CCVPV sources
# gmseus_georect = gmseusArraysInit[gmseusArraysInit['Source'] == 'GMSEUSgeorect']
# # Perform spatial joins to identify intersecting rows
# intersect_uspvdb = gpd.sjoin(gmseus_georect, uspvdb, how='left', predicate='intersects')
# intersect_ccvpv = gpd.sjoin(gmseus_georect, ccvpv, how='left', predicate='intersects')
# # Combine the indices of intersecting rows
# intersecting_indices = intersect_uspvdb.index.union(intersect_ccvpv.index)
# # Filter out intersecting rows from gmseus_georect
# gmseus_georect = gmseus_georect.drop(intersecting_indices)
# # Perform a spatial join to find intersecting geometries from gmseusArraysFromPanels
# gmseus_georect_joined = gpd.sjoin(gmseus_georect, gmseusArraysFromPanels[['geometry']], how='left', predicate='intersects')
# # Merge the joined data to bring in the intersecting geometries as a new column
# gmseus_georect_merged = gmseus_georect_joined.merge(gmseusArraysFromPanels[['geometry']], left_on='index_right', right_index=True, suffixes=('', '_new'))
# # Replace the geometry in the original dataframe and set newBound to 1
# gmseusArraysInit.loc[gmseus_georect_merged.index, 'geometry'] = gmseus_georect_merged['geometry_new']
# gmseusArraysInit.loc[gmseus_georect_merged.index, 'newBound'] = 1

# ~~~~~~~~~~~~~ For SAM arrays, explode the arrays to get individual array boundaries because SAM does not contain project level information

# We do this because SAM has a tendency to identify multiple arrays as one array. Qualitative, this is only the case for SAM, while CWSD tends to underrepsent array area for newly generated bounds and OSM is hand delineated meaning arrays are distinct.

# Filter for SAM sources
sam = gmseusArraysInit[gmseusArraysInit['Source'] == 'SAM']

# Explode the arrays
sam_exploded = sam.explode(index_parts=False)

# Drop SAM arrays from gmseusArraysInit
gmseusArraysInit = gmseusArraysInit[gmseusArraysInit['Source'] != 'SAM']

# Concatenate the exploded SAM arrays with gmseusArraysInit
gmseusArraysInit = pd.concat([gmseusArraysInit, sam_exploded], ignore_index=True)

# # Export gmseusArraysInit
# gmseusArraysInit.to_file(os.path.join(derivedTemp_path, 'gmseusArraysInitTESTSPOT3.shp'), driver='ESRI Shapefile')

# ~~~~~~~~~~~~~ Save and export the final GM-SEUS arrays and print some statistics

# Save copy of gmseusArraysInit as gmseusArraysFinal
gmseusArraysFinal = gmseusArraysInit.copy()

# Drop roofProp, initID, id, and Subset columns
gmseusArraysFinal = gmseusArraysFinal.drop(columns=['roofProp', 'initID', 'id', 'Subset'], errors='ignore')

# Add a temporary ID column that is 1 through n for the exploded arrays
gmseusArraysFinal['tempID'] = range(1, len(gmseusArraysFinal) + 1)

# Count the number of arrays with new boundaries
newBoundCount = len(gmseusArraysFinal[gmseusArraysFinal['newBound'] == 1])

# Print the number of arrays with new boundaries
print(f'Total number of arrays with new boundaries: {newBoundCount}')

# Print the total number of arrays in gmseusArraysFinal
print(f'Total number of arrays in GM-SEUS arrays: {len(gmseusArraysFinal)}')

# Calculate and print the total area of gmseusArraysFinal
print(f'Final total area of GM-SEUS arrays: {gmseusArraysFinal.geometry.area.sum() / 1e6:.2f} km2')

# Export gmseusArraysFinal
gmseusArraysFinal.to_file(gmseusArraysInstYrPath, driver='ESRI Shapefile')

Total number of arrays with new boundaries: 6522
Total number of arrays in GM-SEUS arrays: 22383
Final total area of GM-SEUS arrays: 2947.25 km2


## SOLVE issue with initial spatial joining
We omitted 537 new array boundaries from the original InstYr analysis due to improper spatial join logic. We've fixed this, but need to re-run LandTrendr for these subarray chunks. Export this followign chunk, and run landtrendr under an "NewestUpdate" folder

In [49]:
# Call in gmseusArraysInstYr and the saved version (which has already included the instYr column)
gmseusArraysInstYr = gpd.read_file(gmseusArraysInstYrPath)
gmseusArraysInstYrCOMPLETE = gpd.read_file(os.path.join(derivedTemp_path, r'GMSEUS_Arrays_instYrSAVE.shp'))

# Set completeID to 1
gmseusArraysInstYrCOMPLETE['completeID'] = 1

# Get gmseusArraysInstYr that do not intersect with COMPLETE using spatial join
gmseusArraysInstYr = gpd.sjoin(gmseusArraysInstYr, gmseusArraysInstYrCOMPLETE[['geometry', 'completeID']], how='left', predicate='intersects')

# Fill NaN values in completeID with 0
gmseusArraysInstYr['completeID'] = gmseusArraysInstYr['completeID'].fillna(0)

# Keep only arrays where completeID is 0
gmseusArraysInstYr = gmseusArraysInstYr[gmseusArraysInstYr['completeID'] == 0]

# Drop completeID column and reset index
gmseusArraysInstYr = gmseusArraysInstYr.drop(columns='completeID')
gmseusArraysInstYr = gmseusArraysInstYr.reset_index(drop=True)

# Export gmseusArraysInstYr
gmseusArraysInstYr.to_file(os.path.join(derivedTemp_path, r'GMSEUS_Arrays_instYr_Update.shp'))

  gmseusArraysInstYr.to_file(os.path.join(derivedTemp_path, r'GMSEUS_Arrays_instYr_Update.shp'))
