# Grab OpenStreetMap Data for Solar Panels and Arrays
* Not all panel objects have an array associated with them, and vise versa in this dataset. 
* We process them here prior to combining with other datasets.

In [1]:
# Import Libraries
import osmnx as ox
import geopandas as gpd
import pandas as pd
import numpy as np
import os
#from shapely.geometry import MultiPolygon, Polygon, MultiPoint
import re

# Load config file
def load_config(filename):
    config = {}
    with open(filename, 'r') as f:
        for line in f:
            # Strip whitespace and split by '='
            key, value = line.strip().split('=')
            # Try to convert to numeric values if possible
            try:
                value = float(value) if '.' in value else int(value)
            except ValueError:
                pass  # Leave as string if not a number
            config[key] = value
    return config

## Set Paths and Variables

In [2]:
# Set paths
wd = r'S:\Users\stidjaco\R_files\BigPanel'
downloaded_path = os.path.join(wd, r'Data\Downloaded')
osmDownloadPath = os.path.join(downloaded_path, r'SolarDB\OSM')
osmPanelsPath = os.path.join(osmDownloadPath, r'Panels')
osmArraysPath = os.path.join(osmDownloadPath, r'Arrays')
uspvdb_path = os.path.join(downloaded_path, r'SolarDB\USPVDB\uspvdb_v2_0_20240801.shp')

# Get US Boundary to subset global/non-CONUS datasets
uspvdb = gpd.read_file(uspvdb_path) # USPVDB shapefile

# Load the config from the text file
config = load_config('config.txt')

# Set variables to restrict OSM download to ground-mounted solar arrays and panel-rows
minPanelRowArea = config['minPanelRowArea'] # 15 m2, minimum area for a single panel row from the 1st percentile panel area from Stid et al., 2022
maxPanelRowArea = config['maxPanelRowArea'] # 254 m2 95th perccentile for a single panel row from Stid et al., 2022. MSU Solar Carport has max 1890m2
minNumPanelRows = config['minNumPanelRows'] # 3 panels, minimum number of panels rows to form a ground mounted solar array, definition from Stid et al., 2022
minPmArRatio = config['minPmArRatio'] # 18.8%, 20% was minimum ratio of panel perimeter to area ratio for panels from Stid et al., 2022, MSU Solar Carport has min 18.9%
panelArrayBuff = config['panelArrayBuff'] # 10m buffer, 20m maximum distance between panel rows to form an array. We used 5m in Stid et al., 2022, but there are lower packing factors at greater latitudes (nativeID: '1229957948')
arrayArrayBuff = config['arrayArrayBuff'] # 20m buffer, 40m maximum distance between arrays subsections of the same mount type to form a complete array. In Stid et al., 2022, we used 50m, but we checked for same installation year in addition to mount type.

# Set limits for mount classification
lengthRatioThresh = config['lengthRatioThresh']  # If length ratio < 3.0, set to dual_axis or else fixed_axis_diagonal, else single- or fixed-axis
areaRatioThresh = config['areaRatioThresh']  # If area ratio < 0.15, set to fixed_diag_axis, else dual_axis

## Helper Functions

In [3]:
# Function to check if folder exists, if not create it
def checkFolder(folder):
    if not os.path.exists(folder):
        os.makedirs(folder)

# Function to assign mount type to solar panel-rows based on azimuth and panel geometry. Also returns all relevant design parameters for each panel-row. Requires the setting of a length ratio threshold and an area ratio threshold.
def assignMountType(feature):
    # Estimate azimuth of solar panel-row short edge
    def getAzimuth(feature):
        # Get the minimum bounding rectangle (oriented)
        mbr = feature.geometry.minimum_rotated_rectangle
        
        # Get the coordinates of the MBR
        coords = list(mbr.exterior.coords)
        
        # Calculate distances between consecutive vertices to determine lengths of edges
        edge_lengths = []
        for i in range(len(coords) - 1):  # last point is a duplicate of the first
            p1, p2 = coords[i], coords[i + 1]
            dist = np.sqrt((p2[0] - p1[0])**2 + (p2[1] - p1[1])**2)
            # Set a tempArea 
            # panels = panels**2 + (p2[1] - p1[1])**2)
            edge_lengths.append(dist)
        
        # Identify shorter and longer sides
        short_edge_index = np.argmin(edge_lengths[:2])  # first two edges are enough to find shorter side
        
        # Use the shorter edge for azimuth calculation
        p1, p2 = coords[short_edge_index], coords[short_edge_index + 1]
        
        # Calculate the azimuth (angle relative to north, counterclockwise)
        delta_x = p2[0] - p1[0]
        delta_y = p2[1] - p1[1]

        # Azimuth relative to north (y-axis)
        angle_radians = np.arctan2(delta_x, delta_y)
        angle_degrees = np.degrees(angle_radians)

        # Normalize the angle to 0-360 degrees
        if angle_degrees < 0:
            angle_degrees += 360
        if angle_degrees > 360:
            angle_degrees -= 360
        
        # In the northern hemisphere, the a solar panel-row azimuth angle will never be towards the north (270 to 360 and 0 to 90 degrees). Therefore, if the azimuth is between 270 and 360 or 0 and 90, we need to add 180 degrees to the azimuth to get the correct orientation of the panel.
        if 270 <= angle_degrees <= 360 or 0 <= angle_degrees <= 90:
            angle_degrees += 180

        return angle_degrees
    
    # Get the ratio of the long edge to the short edge of the panel (and the lengths of the short and long edges)
    def getLengthRatio(feature):
        # Get the minimum bounding rectangle (oriented)
        mbr = feature.geometry.minimum_rotated_rectangle
        
        # Get the coordinates of the MBR
        coords = list(mbr.exterior.coords)
        
        # Calculate distances between consecutive vertices
        edge_lengths = []
        for i in range(len(coords) - 1):  # last point is a duplicate of the first
            p1, p2 = coords[i], coords[i + 1]
            dist = np.sqrt((p2[0] - p1[0])**2 + (p2[1] - p1[1])**2)
            edge_lengths.append(dist)
        
        # Sort the edge lengths to identify short and long sides
        sorted_lengths = sorted(edge_lengths[:2])  # Only need two sides (since rectangle has equal opposite sides)
        short_edge = sorted_lengths[0]
        long_edge = sorted_lengths[1]
        
        # Calculate the ratio of long edge to short edge
        length_ratio = long_edge / short_edge
        return length_ratio, short_edge, long_edge
    
    # Run the geteAzimuth function to get the azimuth of each panel row, getLengthRatio function to get the long and short edge ratio, and the and getAreaRatio function to get the panel area to bounding box ratio
    azimuth = getAzimuth(feature)
    length_ratio, short_edge, long_edge = getLengthRatio(feature)

    # Assign mount type based on azimuth and area ratio 
    # Fixed-axis: If the azimuth is within 60 degrees of S, and length ratio is greater than 2.5
    # Single-axis: If the azimuth is within 30 degrees of E or W (in southward radians), and length ratio is greater than 2.5
    # Dual-axis: Any azimuth and the length ratio is less than 2.5
    def classify_mount_type(azimuth, length_ratio):
        # Check if azimuth is within 60 degrees to to S (180) -- Should never be north
        if (abs(azimuth - 180) <= 60):
            if length_ratio >= lengthRatioThresh:
                return 'fixed_axis'
        
        # Check if azimuth is within 30 degrees of close to E (90) or W (270)
        elif (abs(azimuth - 90) <= 30 or abs(azimuth - 270) <= 30):
            if length_ratio >= lengthRatioThresh:
                return 'single_axis'
        
        # Otherwise, classify as dual-axis
        if length_ratio < lengthRatioThresh: # if area_ratio > areaRatioThresh and length_ratio < lengthRatioThresh:
            return 'dual_axis'
        
        # Default case -- no panel-rows should be missed, but default to fixed-axis
        return 'fixed_axis'
    
    # Classify the mount type
    mount = classify_mount_type(azimuth, length_ratio)

    # Assign mount type based on azimuth, and return the mount type, azimuth, length ratio, short edge, and long edge
    return mount, azimuth, length_ratio, short_edge, long_edge

# Function to check for and remove erroneous geometries in arrays
def checkArrayGeometries(arrays): 
    # For a collection of reasons, array boundaries may contain erroneous geometries that result in a near-zero area, linestrings, or points. 
    # To check for and remove these, we'll explode arrays, calculate a temporary area, remove subarrays that are less than a minimum area, then dissolve by tempID.
    arrays['tempDissolveID'] = (1 + np.arange(len(arrays)))  # Create a temporary ID for dissolving
    arrays = arrays.explode(index_parts=False)
    arrays['tempArea'] = arrays['geometry'].area
    arrays = arrays[arrays['tempArea'] >= minPanelRowArea]
    arrays = arrays.dissolve(by=['tempDissolveID'], as_index=False)
    arrays = arrays.drop(columns=['tempArea', 'tempDissolveID'])
    arrays = arrays.reset_index(drop=True)
    return arrays

# Function to create an array from a set of panel rows based on the distance between them
def createArrayFromPanels(panels, buffDist, dissolveID, areaID='area'):
 
    # Count panels per group before dissolving
    panelCounts = panels.groupby(dissolveID).size().reset_index(name='numPanels')

    # Get the total area of the panels within each group (sum of area column). 
    panelAreas = panels.groupby(dissolveID)[areaID].sum().reset_index(name='pnlArea')
    
    # Buffer the geometries by buffDist, dissovle boundaries, and unbuffer by buffDist* -1. Assign the number of objects being dissovle into a numPanels column.
    arrays = panels.copy()
    arrays['geometry'] = arrays.buffer(buffDist)
    arrays = arrays.dissolve(by=[dissolveID], as_index=False)
    arrays['geometry'] = arrays.buffer(buffDist * -1)

    # Merge the panel counts and panel areas back into the dissolved array DataFrame. Select only the dissolveID and respective columns in the right df
    arrays = arrays.merge(panelCounts[[dissolveID, 'numPanels']], on=dissolveID, how='left')
    arrays = arrays.merge(panelAreas[[dissolveID, 'pnlArea']], on=dissolveID, how='left')

    # Due to the buffering and unbuffering, some mulitpolygons contain erroneous geometries that result in a near-zero area, linestrings, or points. Remove these.
    arrays = checkArrayGeometries(arrays)

    # Reset index
    arrays = arrays.reset_index(drop=True)
    return arrays

# Define a function that groups solar panels by mount type and proximity
def groupArrayByMountAndProximity(gdf, buffer_distance):
    # Set a temporary gdf to buffer
    gdfBuffer = gdf.copy()

    # Create a buffered version of the geometries
    gdfBuffer['geometry'] = gdfBuffer.buffer(buffer_distance)

    # Dissolve by mount
    gdfBuffer = gdfBuffer.dissolve(by = 'mount')

    # Explode the dissolved gdf
    gdfBuffer = gdfBuffer.explode(index_parts=False).reset_index(drop = True)

    # Assign a temp ID to the gdfBuffer
    gdfBuffer['arrayID'] = range(0, len(gdfBuffer))

    # Assign each panel the corresponding arrayID and total panel num in array by spatial join.
    gdfOut = gpd.sjoin(gdf, gdfBuffer[['arrayID', 'geometry']], how='left', predicate='intersects').drop(columns='index_right')

    # Group polygons into multiploygons by array ID. Keep the column
    gdfOut = gdfOut.dissolve(by = 'arrayID').reset_index()

    # Drop the arrayID column
    gdfOut = gdfOut.drop(columns='arrayID', errors='ignore')
    return gdfOut

## OSM Get and Process Functions

In [4]:
# Define a function that grabs all solar panel and array data from OSM for a given state
def getSolarOSMData(state):

    # Define your area of interest, As a test case, use Michigan
    place = state + ', USA'

    # Define custom tag for solar panels and arrays (NOTE: We will still have to filter out the arrays from panels)
    panelsFilter = {'generator:source': 'solar'}
    arraysFilter = {'plant:source': 'solar'}

    # Retrieve the data from OSM
    panelData = ox.features_from_place(place, panelsFilter)
    arrayData = ox.features_from_place(place, arraysFilter)
 
    # First, check if 'geometry' column exists and clean DataFrames by removing non-Polygon geometries
    # Solves an issue where states like West Virgina, do not have arrayData
    if 'geometry' in panelData.columns:
        panelData = panelData[panelData['geometry'].apply(lambda x: x.geom_type in ['Polygon', 'MultiPolygon'])]
    else:
        panelData = gpd.GeoDataFrame(columns=['geometry'])  # Empty GeoDataFrame if no geometry column
    if 'geometry' in arrayData.columns:
        arrayData = arrayData[arrayData['geometry'].apply(lambda x: x.geom_type in ['Polygon', 'MultiPolygon'])]
    else:
        arrayData = gpd.GeoDataFrame(columns=['geometry'])  # Empty GeoDataFrame if no geometry column

    ''' This code block tests for non-Polygon or MultiPolygon geometries in the dataframes.
    # Create the same if else structure but for not 'Polygon' or 'MultiPolygon' geometries
    # Solves an issue where states like West Virgina, do not have arrayData
    if 'geometry' in panelData.columns:
        panelData = panelData[panelData['geometry'].apply(lambda x: x.geom_type not in ['Polygon', 'MultiPolygon'])]
    else:
        panelData = gpd.GeoDataFrame(columns=['geometry'])
    if 'geometry' in arrayData.columns:
        arrayData = arrayData[arrayData['geometry'].apply(lambda x: x.geom_type not in ['Polygon', 'MultiPolygon'])]
    else:
        arrayData = gpd.GeoDataFrame(columns=['geometry'])
    '''
    
    # Set CRS as WGS84 (OSM native proj). Then, transform the data to USPVDB CRS. Solves naive projection issues for emtpy array and panel
    panelData = panelData.set_crs('EPSG:4326')
    arrayData = arrayData.set_crs('EPSG:4326')
    panelData = panelData.to_crs(uspvdb.crs)
    arrayData = arrayData.to_crs(uspvdb.crs)

    # Save the index as osmid
    panelData['osmid'] = panelData.index
    arrayData['osmid'] = arrayData.index

    # Reset index
    panelData = panelData.reset_index(drop=True)
    arrayData = arrayData.reset_index(drop=True)

    # Save osmid as string
    panelData['osmid'] = panelData['osmid'].astype(str)
    arrayData['osmid'] = arrayData['osmid'].astype(str)

    # osmid column is currently structured as (way, 1155615180), we want only the number
    panelData['osmid'] = panelData['osmid'].str.split(', ').str[1].str.replace(')', '')
    arrayData['osmid'] = arrayData['osmid'].str.split(', ').str[1].str.replace(')', '')

    # From panelData, remove all rows where 'location', 'building', or 'generator:place' column is = 'roof'. 
    # If these columns are not present, do nothing.
    if 'location' in panelData.columns:
        panelData = panelData[~panelData['location'].isin(['roof'])]
    if 'building' in panelData.columns:
        panelData = panelData[~panelData['building'].isin(['roof'])]
    if 'generator:place' in panelData.columns:
        panelData = panelData[~panelData['generator:place'].isin(['roof'])]

    # Select the following columns for panelData: start_date, generator:method, osmid, source, name, geometry
    # If any of these columns do not exist, create an empty column of NA values (as a string)
    # Rename them to: instYr, modType, nativeID, Source, ProjName, geometry
    # Ensure required columns exist, creating them with NA values if missing
    # NOTE: there may be a 'generator:output:electricity' column, but we will ignore it for now and estiamte capacity from area and installation year later.
    required_columns = {
        'start_date': 'instYr',
        'generator:method': 'modType',
        'osmid': 'nativeID',
        'source': 'Source',
        'name': 'ProjName',
        'geometry': 'geometry'}
    for col, new_col in required_columns.items():
        if col not in panelData.columns:
            panelData[col] = pd.NA
        panelData[new_col] = panelData[col]

    # Select only the new columns
    panelData = panelData[list(required_columns.values())]

    # For arrayData, select the following columns: start_date, plant:method, osmid, source, name, plant:output:electricity, geometry
    # If any of these columns do not exist, create an empty column of NA values (as a string)
    # Rename them to: instYr, modType, nativeID, Source, ProjName, cap_mw, geometry
    # Ensure required columns exist, creating them with NA values if missing
    required_columns = {
        'start_date': 'instYr',
        'plant:method': 'modType',  
        'osmid': 'nativeID',
        'source': 'Source',
        'name': 'ProjName',
        'plant:output:electricity': 'cap_mw',
        'geometry': 'geometry'}
    for col, new_col in required_columns.items():
        if col not in arrayData.columns:
            arrayData[col] = pd.NA
        arrayData[new_col] = arrayData[col]
    
    # Select only the new columns
    arrayData = arrayData[list(required_columns.values())]

    return panelData, arrayData

# Define Function to Process Solar OSM Data
def processSolarOSMData(state):    

    # Get the solar panel and array data from OSM
    panelData, arrayData = getSolarOSMData(state)

    #~~~~~~~~~~~~~~~~~~~~# 
    # Process Array Data # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #~~~~~~~~~~~~~~~~~~~~#

    # If arrayData is not empty, process arrayData
    if not arrayData.empty:
        # Capacity (cap_mw) is currently formated as a string and contains: '1 GW', '1 MW', '1 kW', or 'yes'. 
        # Formatting can also include '1GW', '1MW', '1kW', or the lower case version of any of these. 
        # It may also contain other strings that should be treated as nan, including existing nan values.
        # If the string contains GW, remove everything except the number and multiply by 1000.
        # If the string contais MW, remove everything except the number and leave as is.
        # If the string contains kW, remove everything except the number and divide by 1000.
        # If the string contains 'yes', set to -9999 (null value).
        # If the string contains anything else, set to -9999 (null value).
        # Function to process capacity
        def process_capacity(value):
            if pd.isna(value):
                return np.nan
            value = value.lower().strip()  # Make the string lowercase for easier matching and strip whitespaces
            try:
                if 'gw' in value:
                    return float(value.replace('gw', '').strip()) * 1000
                elif 'mw' in value:
                    return float(value.replace('mw', '').strip())
                elif 'kw' in value:
                    return float(value.replace('kw', '').strip()) / 1000
                elif value == 'yes':
                    return -9999
                else:
                    return -9999
            except ValueError:  # If the string cannot be converted to a float
                return -9999

        # Apply the function to the 'cap_mw' column dynamically. Round to 3 decimal places.
        arrayData['cap_mw'] = arrayData['cap_mw'].apply(process_capacity).round(3)

        # ~~~~~~~~~~~~~~~~~ Get Panel Boundaries In Array Data (e.g. MSU Solar Carport, and 1229957948)

        # Explode the MultiPolygons into individual Polygons
        arrayData = arrayData.explode(index_parts=False).reset_index(drop=True)

        # Filter out any rows where the geometry is invalid or empty
        arrayData = arrayData[arrayData.geometry.notna()]

        # Calculate the area of each array (in square meters)
        arrayData['area'] = arrayData['geometry'].apply(lambda x: x.area if x.is_valid and x.area > 0 else np.nan)

        # Calculate the perimeter-to-area ratio of each array
        arrayData['PmArRatio'] = arrayData['geometry'].apply(lambda x: x.length / x.area if x.is_valid and x.area > 0 else np.nan)

        # Drop rows where area or PmArRatio couldn't be calculated (NaN values)
        arrayData = arrayData.dropna(subset=['area', 'PmArRatio'])

        # IF: an array is less than minimum panel size, remove it (more likely to be rooftop or inverter station)
        arrayData = arrayData[arrayData['area'] >= minPanelRowArea]

        # IF: an array has a perimeter to area ratio greater than 0.188 or area is less than the max panel row area, save it to an panelArrayData dataframe. 
        # Then remove it from arrayData
        panelInArrayData = arrayData[(arrayData['PmArRatio'] > minPmArRatio) | (arrayData['area'] < maxPanelRowArea)].reset_index(drop=True)
        arrayData = arrayData[~arrayData['nativeID'].isin(panelInArrayData['nativeID'])]

        # Dissolve by nativeID to return to multipolygon
        arrayData = arrayData.dissolve(by = 'nativeID').reset_index()

        # Remove panelArrayData shapes that are already in panelData, then merge the remaining dataframes
        # First, check if data exists, then clean DataFrames by dropping rows with invalid or empty geometries, then remove overlapping arrays
        if panelInArrayData is not None:
            panelInArrayData = panelInArrayData[panelInArrayData.geometry.notna()]
            if not panelInArrayData.empty:
                panelInArrayData = panelInArrayData[~panelInArrayData.intersects(panelData.unary_union)]
        panelData = pd.concat([panelData, panelInArrayData])
    
    # Else, if arrayData is empty, return an empty gdf for arrayData and panelInArrayData
    else:
        arrayData = gpd.GeoDataFrame(columns=['instYr', 'modType', 'nativeID', 'Source', 'ProjName', 'cap_mw', 'geometry'])
        panelInArrayData = gpd.GeoDataFrame(columns=['instYr', 'modType', 'nativeID', 'Source', 'ProjName', 'geometry'])

    #~~~~~~~~~~~~~~~~~~~~# 
    # Process Panel Data # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #~~~~~~~~~~~~~~~~~~~~#

    # IF: panelData is not empty, process panelData
    if not panelData.empty:
        # Explode the MultiPolygons into individual Polygons
        panelData = panelData.explode(index_parts=False).reset_index(drop=True)
        panelData['nativeID'] = panelData['nativeID'] + '_' + panelData.groupby('nativeID').cumcount().astype(str)

        # Filter out any rows where the geometry is invalid or empty
        panelData = panelData[panelData.geometry.notna()]

        # Calculate the area of each panel (in square meters)
        panelData['area'] = panelData['geometry'].apply(lambda x: x.area if x.is_valid and x.area > 0 else np.nan)

        # Calculate the perimeter-to-area ratio of each panel
        panelData['PmArRatio'] = panelData['geometry'].apply(lambda x: x.length / x.area if x.is_valid and x.area > 0 else np.nan)

        # Drop rows where area or PmArRatio couldn't be calculated (NaN values)
        panelData = panelData.dropna(subset=['area', 'PmArRatio'])

        # IF: a panel is less than the mimum panel row area, remove it (more likely to be rooftop or inverter station)
        panelData = panelData[panelData['area'] >= minPanelRowArea]

        # ~~~~~~~~~~~~~~~~~~ Get Array Boundaries In Panel Data

        # IF: a panel has a perimeter to area ratio less than 0.188 or area is greater than max panel row area, save it to an arrayPanelData dataframe. 
        # Then remove it from panelData
        arrayInPanelData = panelData[(panelData['PmArRatio'] < minPmArRatio) | (panelData['area'] > maxPanelRowArea)]
        panelData = panelData[~panelData['nativeID'].isin(arrayInPanelData['nativeID'])]

        # ~~~~~~~~~~~~~~~~~~ Get New Array Boudaries From Panel Data

        # Get the mount type for each panel based on the geometry. assignMountType returns multiple columns, so only return the mount column.
        panelData['mount'] = panelData.apply(assignMountType, axis=1).apply(lambda x: x[0]) # panelData['mount'] = panelData.apply(assignMountType, axis=1)

        # Buffer the geometries by panelArrayBuff, dissovle boundaries by overlap, explode again, and unbuffer by panelArrayBuff * -1. 
        arrayFromPanelData = panelData.copy()
        arrayFromPanelData['geometry'] = arrayFromPanelData.buffer(panelArrayBuff)
        arrayFromPanelData = arrayFromPanelData.dissolve().explode(index_parts=False).reset_index(drop=True)

        # Unbuffer the geometries by the same distance (negative buffer)
        arrayFromPanelData['geometry'] = arrayFromPanelData.buffer(panelArrayBuff * -1)

        # Check for and remove erroneous geometries in arrays
        arrayFromPanelData = checkArrayGeometries(arrayFromPanelData)

        # Save the most common mount type for each array based on panels that intersect with the array
        arrayFromPanelData['mount'] = arrayFromPanelData['geometry'].apply(lambda x: panelData[panelData.intersects(x)]['mount'].mode()[0])

        # IF any arrayFromPanelData shape is within a 10m buffer (arrayArrayBuff) of another arrayFromPanelData, merge them into a single array shape
        arrayFromPanelData = groupArrayByMountAndProximity(arrayFromPanelData, arrayArrayBuff)

        # Assign each array a unique identifier
        arrayFromPanelData['arrayID'] = arrayFromPanelData.index

        # Save the number of panels in each array based number of intersecting panels
        arrayFromPanelData['PnlNum'] = arrayFromPanelData['geometry'].apply(lambda x: len(panelData[panelData.intersects(x)]))

        # Assign each panel the corresponding arrayID and total panel num in array by spatial join.
        panelData = gpd.sjoin(panelData, arrayFromPanelData[['arrayID', 'PnlNum', 'geometry']], how='left', predicate='intersects').drop(columns='index_right')

        # Remove arrays and panels that do not meet the minimum number of panels in an array
        arrayFromPanelData = arrayFromPanelData[arrayFromPanelData['PnlNum'] >= minNumPanelRows]
        panelData = panelData[panelData['PnlNum'] >= minNumPanelRows]
    
    # Else, if panelData is empty, return an empty gdf for panelData, arrayInPanelData, and arrayFromPanelData
    else:
        panelData = gpd.GeoDataFrame(columns=['instYr', 'modType', 'nativeID', 'Source', 'ProjName', 'geometry', 'mount'])
        arrayInPanelData = gpd.GeoDataFrame(columns=['instYr', 'modType', 'nativeID', 'Source', 'ProjName', 'geometry'])
        arrayFromPanelData = gpd.GeoDataFrame(columns=['instYr', 'modType', 'nativeID', 'Source', 'ProjName', 'geometry', 'mount', 'arrayID', 'PnlNum'])

    #~~~~~~~~~~~~~~~~~~# 
    # Merge Array Data # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #~~~~~~~~~~~~~~~~~~# 

    # Remove arrays with overlap in the following level of priority: arrayData, arrayInPanelData, arrayFromPanelData
    # This order maintains arrays composed of subarray sections (multipolygons)
    # First, check if data exists, then clean DataFrames by dropping rows with invalid or empty geometries, then remove overlapping arrays
    # Solves an issue where an array df is empty, or contains errant geometries
    if arrayInPanelData is not None:
        arrayInPanelData = arrayInPanelData[arrayInPanelData.geometry.notna()]
        if not arrayInPanelData.empty and not arrayData.empty:
            arrayInPanelData = arrayInPanelData[~arrayInPanelData.intersects(arrayData.unary_union)]
    if arrayFromPanelData is not None:
        arrayFromPanelData = arrayFromPanelData[arrayFromPanelData.geometry.notna()]
        if not arrayFromPanelData.empty and not arrayData.empty:
            arrayFromPanelData = arrayFromPanelData[~arrayFromPanelData.intersects(arrayData.unary_union)]
        if arrayInPanelData is not None and not arrayInPanelData.empty:
            arrayFromPanelData = arrayFromPanelData[~arrayFromPanelData.intersects(arrayInPanelData.unary_union)]

    # For arrayFromPanelData and arrayInPanelData, select the following columns: instYr, modType, nativeID, Source, ProjName, PnlNum, geometry
    arrayFromPanelData = arrayFromPanelData[['instYr', 'modType', 'nativeID', 'Source', 'ProjName', 'geometry']]
    arrayInPanelData = arrayInPanelData[['instYr', 'modType', 'nativeID', 'Source', 'ProjName', 'geometry']]

    # For arrayFromPanelData and arrayInPanelData, add a cap_mw column and set it to -9999
    arrayFromPanelData['cap_mw'] = -9999
    arrayInPanelData['cap_mw'] = -9999

    # Merge the array data
    arrayData = pd.concat([arrayData, arrayInPanelData, arrayFromPanelData])

    #~~~~~~~~~~~~~~~~~~~~~~~~~# 
    # Fill Gaps and Save Data # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #~~~~~~~~~~~~~~~~~~~~~~~~~#

    # Save the final number of panels in each array based number of intersecting panels (overwrites initial count)
    arrayData['PnlNum'] = arrayData['geometry'].apply(lambda x: len(panelData[panelData.intersects(x)]))

    # For each array and each panel, calculate the area and save as a new column
    panelData['area'] = panelData.area
    arrayData['area'] = arrayData.area

    # instYr values are formatted in several ways: 'YYYY-MM' or 'YYYY-MM-DD', 'MM/YYYY', 'YYYY', m/d/YYYY, with additional examples of: 'October 2018', 'March 1, 2015', '6/11/2011', '11/2019', '20183', '2021.0'
    # We want to extract the year from these strings. If the year is not present, or if the year is not between 1900 and 2025, set to -9999
    # Set the instYr column to an integer type when finished.
    # Function to extract a valid year from various formats
    def extract_year(instYr):
        # Patterns to capture the year from different formats
        patterns = [
            r'(\b\d{4})[-/]\d{2}[-/]\d{2}',    # Match 'YYYY-MM-DD' or 'YYYY/MM/DD' (e.g., '2020-05-25')
            r'(\b\d{4})[-/]\d{2}',              # Match 'YYYY-MM' or 'YYYY/MM' (e.g., '2020-05')
            r'\b(\d{1,2})/\d{1,2}/(\d{4})\b',   # Match 'm/d/YYYY' or 'MM/DD/YYYY' (e.g., '6/11/2011')
            r'([A-Za-z]+\s+\d{1,2},?\s+)?(\d{4})',  # Match 'Month YYYY' (e.g., 'October 2018') or 'March 1, 2015'
            r'(\b\d{4}\b)'                      # Match standalone 'YYYY' (e.g., '2020')
        ]
        
        # Iterate through the patterns and try to match
        for pattern in patterns:
            match = re.search(pattern, str(instYr))
            if match:
                # Extract the year from the matched group
                year = int(match.group(1)) if len(match.groups()) == 1 else int(match.group(2))
                # Ensure the year is valid (between 1983 and 2025) -- 1983 is the install year of Solar One, the first commercial solar power plant in the US
                if 1983 <= year <= 2025:
                    return year
        # If no valid year is found, return -9999
        return -9999

    # Apply the function to the 'instYr' column for both panelData and arrayData
    panelData['instYr'] = panelData['instYr'].apply(extract_year).astype(int)
    arrayData['instYr'] = arrayData['instYr'].apply(extract_year).astype(int)
        
    # For both dataframes, fill missing Source and ProjName with 'Unknown'
    panelData['Source'] = panelData['Source'].fillna('Unknown')
    arrayData['Source'] = arrayData['Source'].fillna('Unknown')
    panelData['ProjName'] = panelData['ProjName'].fillna('Unknown')
    arrayData['ProjName'] = arrayData['ProjName'].fillna('Unknown')

    # For respective modType columns, replace 'photovoltaic' with 'c-Si' and 'thermal' with 'csp'. 
    panelData['modType'] = panelData['modType'].replace('photovoltaic', 'c-si')
    arrayData['modType'] = arrayData['modType'].replace('photovoltaic', 'c-si')
    panelData['modType'] = panelData['modType'].replace('thermal', 'csp')
    arrayData['modType'] = arrayData['modType'].replace('thermal', 'csp')

    # For both dataframes, fill missing modType with 'c-Si'
    panelData['modType'] = panelData['modType'].fillna('c-si')
    arrayData['modType'] = arrayData['modType'].fillna('c-si')

    # Save the data to a shapefile in the OSM download folder for the state
    panelData.to_file(os.path.join(osmPanelsPath, state + 'SolarPanels.shp'))
    arrayData.to_file(os.path.join(osmArraysPath, state + 'SolarArrays.shp'))

    # If desired, return the dataframes
    #return panelData, arrayData

## Run the Function for Each CONUS State

In [6]:
# First, check if the OSM download folder exists, if not create it
checkFolder(osmPanelsPath)
checkFolder(osmArraysPath)

# Get a list of all 48 states in the contiguous US )
states = ['Alabama', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']

# Loop through each state and get the solar data
for state in states:
    processSolarOSMData(state)
    print(state + ' data has been downloaded and processed.')

  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Alabama data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Arizona data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Arkansas data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


California data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Colorado data has been downloaded and processed.
Connecticut data has been downloaded and processed.
Delaware data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Florida data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Georgia data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Idaho data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Illinois data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Indiana data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Iowa data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Kansas data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Kentucky data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Louisiana data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Maine data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Maryland data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Massachusetts data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Michigan data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Minnesota data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Mississippi data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Missouri data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  _to_file_fiona(df, filename, driver, schema, crs, mode, **kwargs)


Montana data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Nebraska data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Nevada data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


New Hampshire data has been downloaded and processed.
New Jersey data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


New Mexico data has been downloaded and processed.
New York data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


North Carolina data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


North Dakota data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Ohio data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Oklahoma data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Oregon data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Pennsylvania data has been downloaded and processed.
Rhode Island data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


South Carolina data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


South Dakota data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Tennessee data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Texas data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Utah data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Vermont data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Virginia data has been downloaded and processed.
Washington data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  gdf = gdf.dropna(axis="columns", how="all")


West Virginia data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Wisconsin data has been downloaded and processed.


  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)
  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


Wyoming data has been downloaded and processed.


## Compile State Data into a National OSM Solar Dataset

In [7]:
# Function to load geodataframes from all files in a folder
def load_gdf(path, extension, target_crs):
    files = [f for f in os.listdir(path) if f.endswith(f'.{extension}')]
    dfs = [gpd.read_file(os.path.join(path, file)) for file in files]
    # Directly concatenate and reproject
    return gpd.GeoDataFrame(pd.concat(dfs, ignore_index=True)).to_crs(target_crs)

# Load all solar panel and array data
panels = load_gdf(osmPanelsPath, 'shp', uspvdb.crs)
arrays = load_gdf(osmArraysPath, 'shp', uspvdb.crs)

# Print the number of solar panels and arrays
print(f'Total number of solar panels: {len(panels)}')
print(f'Total number of solar arrays: {len(arrays)}')

# Print sum of area of arrays and panels in km
print(f'Total area of solar panels: {panels.area.sum() / 1e6:.2f} km^2')
print(f'Total area of solar arrays: {arrays.area.sum() / 1e6:.2f} km^2')

# Save the data to a shapefile in the OSM download folder
panels.to_file(os.path.join(osmDownloadPath, 'OSMSolarPanels.shp'))
arrays.to_file(os.path.join(osmDownloadPath, 'OSMSolarArrays.shp'))

  return gpd.GeoDataFrame(pd.concat(dfs, ignore_index=True)).to_crs(target_crs)


Total number of solar panels: 872972
Total number of solar arrays: 10531
Total area of solar panels: 116.41 km^2
Total area of solar arrays: 2438.02 km^2
