In [1]:
import os
import rasterio
import geopandas as gpd
from shapely.geometry import box
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

from rasterio import features

from rasterio.features import rasterize
from rasterio.transform import from_origin
import pandas as pd
from rasterio import features
from shapely.geometry import Polygon, MultiPolygon

# FUNCTION 

The raster_to_polygons_optimized function is designed to convert a raster dataset into a GeoDataFrame of polygons, where each polygon represents an individual cell from the raster. 

The function takes a single argument, raster_path, which specifies the file path of the raster to be processed. It begins by opening the raster file using rasterio, extracting the transformation parameters, reading the first band of data, and identifying the no-data value specified in the raster's metadata. 

The function then identifies all valid cells—those not equal to the no-data value—and iterates over these cells. For each valid cell, it calculates the polygon geometry that corresponds to the cell's spatial extent using the raster's transformation parameters and assigns the cell's value to this polygon. 

These polygons and their associated values are then aggregated into a GeoDataFrame, which is assigned the same Coordinate Reference System (CRS) as the source raster. 

The resulting GeoDataFrame, containing a 'value' column for cell values and a 'geometry' column for the corresponding polygons, is returned by the function. This optimized approach ensures efficient processing by focusing only on valid raster cells, thereby excluding areas with no-data values and reducing computational overhead.

In [2]:
def raster_to_polygons_optimized(raster_path):
    """
    Convert a raster dataset to a GeoDataFrame where each valid cell (non-no-data) is represented as a polygon.

    This function opens a raster file, reads its first band, and converts each valid cell into a polygon. The value
    of each cell is retained and associated with the corresponding polygon. The resulting GeoDataFrame contains
    polygons with their associated values and is set to the same CRS as the input raster.

    Parameters:
    - raster_path (str): The file path of the raster to be processed.

    Returns:
    - gpd.GeoDataFrame: A GeoDataFrame with two columns: 'value' containing the cell values and 'geometry'
      containing the corresponding polygon geometries for each valid raster cell.
    """
    # Extract the base filename without extension to use as the column name
    filename = os.path.splitext(os.path.basename(raster_path))[0]

    # Open the raster file using rasterio
    with rasterio.open(raster_path) as src:
        
        # Extract the affine transform for the raster
        transform = src.transform
        # Read the first band of the raster
        data = src.read(1)
        # Retrieve the no-data value set in the raster's metadata
        nodata = src.nodata

    # Identify valid cells (cells not equal to the no-data value)
    valid_cells = np.argwhere(data != nodata)

    polygons = []  # List to store polygon geometries
    values = []  # List to store the values of the valid cells

    # Iterate over the indices of valid cells to create polygons
    for (j, i) in valid_cells:
        # Extract the value of the current cell
        value = data[j, i]
        # Create a polygon geometry for the current cell based on its coordinates and the raster's transform
        polygon = box(
            transform[2] + i * transform[0],  # Minx
            transform[5] + (j + 1) * transform[4],  # Miny
            transform[2] + (i + 1) * transform[0],  # Maxx
            transform[5] + j * transform[4]  # Maxy
        )
        # Append the current polygon and its value to their respective lists
        polygons.append(polygon)
        values.append(value)

    # Create a GeoDataFrame with the polygons and their associated values
    gdf = gpd.GeoDataFrame({filename: values, 'geometry': polygons})
    # Set the CRS of the GeoDataFrame to match the CRS of the source raster
    gdf.crs = src.crs

    return gdf


In [None]:
#gdf_test = raster_to_polygons_optimized()

A raster transform is a set of coefficients that define the spatial relationship between the coordinates of pixels in a raster dataset and the geographic coordinates on the surface of the Earth. This transform allows you to map the row and column indices of pixels in the raster grid to their corresponding geographic locations (such as latitude and longitude or coordinates in a projected coordinate system).

The raster transform typically includes six coefficients, often represented in an affine transformation matrix. These coefficients are:

a: The width of a pixel in the units of the coordinate system (e.g., meters).
b and d: These coefficients typically rotate the raster, but in most north-up images, these are zero.
c: The X-coordinate of the upper-left corner of the upper-left pixel.
e: The height of a pixel in the units of the coordinate system, which is usually a negative value because pixel row indices increase downward, while geographic coordinates usually increase upward.
f: The Y-coordinate of the upper-left corner of the upper-left pixel.
In the context of the rasterio library, the transform is often used to calculate the geographic coordinates of the center of a pixel given its row and column indices, or vice versa. The transform object can perform these calculations through methods like * (to convert pixel coordinates to geographic coordinates) and ~ (to convert geographic coordinates to pixel coordinates).

An example affine transform could look like this: [a, b, c, d, e, f], where a, d, and e are the pixel sizes and rotation terms, and c and f give the geographic coordinates of the upper-left pixel.

Understanding the raster transform is crucial for tasks like georeferencing, where you need to align the raster data with a specific geographic location, or when performing spatial analyses that require the conversion of pixel coordinates to real-world geographic locations.

# Process Raster(s) and Create 'INPUT REPORTING UNITS'

In [3]:

# Folder containing all your raster files
raster_folder = r"C:\Users\bsf31\Documents\post-meds\data\signal\Moore\built_model\rasters"
# Get a list of all raster files in the folder (assuming they are TIFF files)
raster_files = [f for f in os.listdir(raster_folder) if f.endswith('.tif')]

# Initialize variables to store the smallest resolution
smallest_res = None

# Iterate through each raster file to find the smallest resolution
for raster_file in raster_files:
    with rasterio.open(os.path.join(raster_folder, raster_file)) as src:
        # src.res returns a tuple (width, height) of pixels
        if smallest_res is None or (src.res[0] < smallest_res[0] and src.res[1] < smallest_res[1]):
            smallest_res = src.res

# smallest_res now contains the smallest resolution by width and height
print("Smallest resolution:", smallest_res)

# Initialize an empty GeoDataFrame to store the combined results
combined_gdf = gpd.GeoDataFrame()

# Process each raster file
for raster_file in raster_files:
    raster_path = os.path.join(raster_folder, raster_file)
    with rasterio.open(raster_path) as src:
        # Check if the raster's resolution matches the most common resolution
        if src.res != smallest_res:
            print(f"Skipping {raster_file} due to mismatching resolution.")
            continue  # Skip this file

    # If resolution matches, process the raster
    current_gdf = raster_to_polygons_optimized(raster_path)
    
    # Merge the current GeoDataFrame with the combined one
    if combined_gdf.empty:
        combined_gdf = current_gdf
    else:
        combined_gdf = combined_gdf.merge(current_gdf, on='geometry', how='outer')

# Save the combined GeoDataFrame to a GeoPackage
output_gpkg_path = r'C:\Users\bsf31\Documents\post-meds\data\signal\Moore\built_model\input_reporting_units\input_reporting_units.gpkg'
combined_gdf.to_file(output_gpkg_path, layer='built_env', driver='GPKG')


Smallest resolution: (97.38204323985136, 99.43096076870454)


In [32]:
output_gpkg_path

'C:\\Users\\bsf31\\Documents\\post-meds\\data\\signal\\Moore\\built_model\\input_reporting_units\\input_reporting_units.gpkg'

# Optional - Append New Single File Raster Data

In [None]:
# Path to the existing GeoPackage and the new raster file
existing_gpkg_path = output_gpkg_path
# Load the existing GeoPackage into a GeoDataFrame
existing_gdf = gpd.read_file(existing_gpkg_path, layer='combined_layer')



In [None]:
new_raster_path = 'path_to_new_raster_file.tif'
# Process the new raster to a GeoDataFrame
new_gdf = raster_to_polygons_optimized(new_raster_path)

# Merge the new GeoDataFrame with the existing one
# Ensure the merge is done based on geometry or another suitable common attribute
updated_gdf = existing_gdf.merge(new_gdf, on='geometry', how='outer')

# Update the GeoPackage with the updated GeoDataFrame
# This example overwrites the existing layer; you could also choose to add a new layer
updated_gdf.to_file(existing_gpkg_path, layer='combined_layer', driver='GPKG')


In [4]:
study_path = r"C:\Users\bsf31\Documents\post-meds\data\signal\RWMP_AREA_2229.gpkg"
study_area = gpd.read_file(study_path)

# REFERENCE RASTER

In [23]:
output_crs = 'EPSG:2229'
output_resolution = (97.37954817629196214, 99.34198426966294448)


In [13]:
output_resolution

(97.37954817629196, -99.34198426966294)

In [6]:
study_area_bounds =study_area.total_bounds
study_area_bounds

array([5807047.11341745, 1961361.28632066, 6127434.03567656,
       2040906.05493562])

# Vector to Raster

In [15]:
def vector_to_raster(input_vector, output_raster, attribute, study_area_bounds, value_mapping, single_value=None, resolution=(abs(0.00026949458523585647), abs(-0.00026949458523585647)), dtype='uint16'):
    # Check if input_vector is a GeoDataFrame or a file path
    if isinstance(input_vector, gpd.GeoDataFrame):
        gdf = input_vector
    else:
        # Read the input vector file (GeoPackage or Shapefile) into a GeoDataFrame
        gdf = gpd.read_file(input_vector)
    # Reproject the GeoDataFrame to the desired CRS (EPSG:2229)
    gdf = gdf.to_crs(epsg=2229)

    # Ensure that categorical column is string
    gdf[attribute] = gdf[attribute].astype(str)
   

   # If single_value is None, convert the attribute column to numerical values
    # If single_value is provided, set the attribute column to the provided single_value
    if single_value is None:
        gdf[attribute] = gdf[attribute].replace(value_mapping).astype(dtype)
    else:
        gdf[attribute] = single_value


    # Use the study area bounds and resolution to calculate the width and height of the output raster
    minx, miny, maxx, maxy = study_area_bounds
    width = int((maxx - minx) / resolution[0])
    height = int((maxy - miny) / resolution[1])

    out_transform = rasterio.transform.from_bounds(minx, miny, maxx, maxy, width, height)


    # Define the metadata for the output raster file
    out_meta = {
        'driver': 'GTiff',
        'width': width,
        'height': height,
        'count': 1,
        'dtype': dtype,
        'crs': 'EPSG:2229',
        'transform': out_transform,
        'nodata': 0
    }
    
    # Open the output raster file for writing with the specified metadata
    with rasterio.open(output_raster, 'w', **out_meta) as dst:
        # Create a generator of tuples containing the geometry and attribute value for each feature in the input vector data
        shapes = ((geom, value) for geom, value in zip(gdf['geometry'], gdf[attribute]))
        
        # Burn the geometries and their corresponding attribute values into a raster array
        burned = features.rasterize(
            shapes=shapes,         # The generator of geometry-attribute tuples
            fill=0,                # The default value for pixels not covered by any geometry
            out_shape=(height, width), # The shape of the output raster array (number of rows and columns)
            transform=out_transform,   # The affine transformation matrix that maps pixel coordinates to the coordinate reference system
            dtype=dtype            # The data type of the raster array
        )
        
        # Write the burned raster array to the output raster file
        dst.write(burned, 1)

# Functions to call Vector to Raster

In [16]:
# To write text file of value mapping
def write_value_mapping(value_mapping, output_file):
    with open(output_file, 'w') as f:
        for key, value in value_mapping.items():
            f.write(f'{key}: {value}\n')

In [17]:
def process_columns(input_vector, output_dir, study_area_bounds, resolution, columns, value_mapping, single_value=None, file_name=None):
    if file_name is None:
        file_name = os.path.splitext(os.path.basename(input_vector))[0]

    for column in columns:
        column_output_dir = os.path.join(output_dir, column)
        os.makedirs(column_output_dir, exist_ok=True)

        output_raster = f"{column_output_dir}/{file_name}_{column}_raster.tif"
        vector_to_raster(input_vector, output_raster, column, study_area_bounds, value_mapping[column], single_value=single_value, resolution=resolution)
        

In [18]:
def process_files_from_list(file_list, output_dir, study_area_bounds, resolution, columns, single_value=None):
    # Generate a global value mapping from the list of unique values for each column
    global_value_mapping = {column: set() for column in columns}
    
    for file_path in file_list:
        gdf = gpd.read_file(file_path)
        for column in columns:
            global_value_mapping[column].update(gdf[column].astype(str).unique())
    
    for column in columns:
        global_value_mapping[column] = {value: idx for idx, value in enumerate(sorted(list(global_value_mapping[column])), 1)}
        print(f"Global value mapping for {column}:")
        for value in global_value_mapping[column]:
            print(f"{value}: {global_value_mapping[column][value]}")
    
    # Write the global value mapping for each column to separate .txt files in the corresponding column folder
    for column in columns:
        column_output_dir = os.path.join(output_dir, column)
        os.makedirs(column_output_dir, exist_ok=True)

        output_value_mapping_file = f"{column_output_dir}/{column}_global_value_mapping.txt"
        write_value_mapping(global_value_mapping[column], output_value_mapping_file)

    # Process each file with the global value mapping
    for file_path in file_list:
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        process_columns(file_path, output_dir, study_area_bounds, columns=columns, resolution=resolution, value_mapping=global_value_mapping, single_value=single_value, file_name=file_name)

In [19]:
# Where files will save, can add subfolders if desired
output_dir = os.path.join(r"C:\Users\bsf31\Documents\post-meds\data\signal\Moore\built_model\base")
os.makedirs(output_dir, exist_ok=True)
vector = [os.path.join(output_dir, 'esha_riparian_rwmp_epsg2229.gpkg')]

In [24]:
process_files_from_list(vector, output_dir, study_area_bounds, output_resolution, columns=['OVERLAY'])


Global value mapping for OVERLAY:
Environmentally Sensitive Habitat Overlay: 1
Riparian Corridor Overlay: 2


  gdf[attribute] = gdf[attribute].replace(value_mapping).astype(dtype)


In [25]:
# Where files will save, can add subfolders if desired
output_dir = os.path.join(r"C:\Users\bsf31\Documents\post-meds\data\signal\Moore\built_model\base")
os.makedirs(output_dir, exist_ok=True)
vector = [os.path.join(output_dir, 'ag_greenbelts_02182021.gpkg')]

In [26]:
process_files_from_list(vector, output_dir, study_area_bounds, output_resolution, columns=['Id'], single_value=1)


Global value mapping for Id:
0: 1


In [27]:
# Where files will save, can add subfolders if desired
output_dir = os.path.join(r"C:\Users\bsf31\Documents\post-meds\data\signal\Moore\built_model\base")
os.makedirs(output_dir, exist_ok=True)
vector = [os.path.join(output_dir, 'lu_catagories.gpkg')]

In [28]:
process_files_from_list(vector, output_dir, study_area_bounds, output_resolution, columns=['Category'])


Global value mapping for Category:
Agricultural: 1
Commercial: 2
Critical Infrastructure: 3
High-Risk Infrastructure: 4
Miscellaneous: 5
Null: 6
Residential: 7


  gdf[attribute] = gdf[attribute].replace(value_mapping).astype(dtype)


In [4]:
input_reporting_units = gpd.read_file(r"C:\Users\bsf31\Documents\post-meds\data\signal\Moore\built_model\input_reporting_units\input_reporting_units.gpkg")

In [5]:
input_reporting_units.columns

Index(['ag_greenbelts_02182021_Id_raster',
       'esha_riparian_rwmp_epsg2229_OVERLAY_raster',
       'lu_catagories_Category_raster', 'geometry'],
      dtype='object')

In [6]:
columns_to_check = ['ag_greenbelts_02182021_Id_raster', 'esha_riparian_rwmp_epsg2229_OVERLAY_raster', 'lu_catagories_Category_raster' ]  # Add more column names if needed

for column in columns_to_check:
    if input_reporting_units[column].isnull().any():
        print(f"Warning: The '{column}' column contains missing values.")
        # Replace missing values with an appropriate value based on your requirements
        input_reporting_units[column] = input_reporting_units[column].fillna(0) 
    else:
        print(f"No missing values found in the '{column}' column.")

# Save the updated input reporting units file
output_file = r'C:\Users\bsf31\Documents\post-meds\data\signal\Moore\built_model\input_reporting_units\input_reporting_units_updated.gpkg'
input_reporting_units.to_file(output_file, driver='GPKG')

print("Input reporting units file updated with null values fixed. Updated file saved to:", output_file)

Input reporting units file updated with null values fixed. Updated file saved to: C:\Users\bsf31\Documents\post-meds\data\signal\Moore\built_model\input_reporting_units\input_reporting_units_updated.gpkg
