## Goal of this Notebook

The goal of this notebook is to perform geospatial raster interpolation and change detection for the SOMA region using Databricks and Sedona. The workflow includes reading raster data, interpolating missing tiles, merging results, and generating output GeoTIFFs for different years. The notebook also computes logical differences between raster bands to detect changes over time and saves the results to Unity Catalog tables and S3 storage.


In [0]:
%run ../get_user

In [0]:
%run ./merge_images

In [0]:
# Look up the identity of the user running this notebook and derive the short
# username that suffixes every table name and output path below.
# `get_username_from_email` is not defined in this notebook; presumably it is
# provided by the `%run ../get_user` cell above — confirm.
current_user_row = spark.sql("SELECT current_user()").collect()[0]
user_email = current_user_row[0]
username = get_username_from_email(user_email)
print(username)

In [0]:
from pyspark.sql import functions as f
from pyspark.sql.functions import expr, explode, col
from sedona.spark import *
from pyspark.sql.window import Window
from pyspark.sql import functions as F
import math
from pyspark.sql import SparkSession
import os


In [0]:
# S3 bucket that receives the merged GeoTIFF outputs
# (s3://<bucket>/outputs/geotiff/<username>/...).
dataset_bucket_name: str = "revodata-databricks-geospatial"
# Unity Catalog catalog name; matches the `geospatial` prefix of the tables
# this notebook reads and writes.
catalog_name: str = "geospatial"

In [0]:
# Build (or reuse) the Spark session through Sedona's builder.
# The only setting applied is a per-bucket S3A credentials provider: reads from
# the `wherobots-examples` bucket use anonymous AWS credentials.
_examples_bucket_credentials_key = (
    "spark.hadoop.fs.s3a.bucket.wherobots-examples.aws.credentials.provider"
)
_anonymous_credentials_provider = (
    "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"
)

config = (
    SedonaContext.builder()
    .config(_examples_bucket_credentials_key, _anonymous_credentials_provider)
    .getOrCreate()
)

# Sedona-enabled session handle created from the session built above.
sedona = SedonaContext.create(config)

In [0]:
# Interpolation step.
#
# Reads the per-user classification tiles, computes the band-1 maximum of each
# tile, runs RS_Interpolate on the tiles whose maximum equals the marker value,
# recombines them with the untouched tiles, and overwrites the per-user
# interpolation table in Unity Catalog.
#
# Reads notebook globals: `spark`, `username`, `catalog_name`.
# Writes: `<catalog_name>.soma.interpolation_<username>` (mode "overwrite").

# Band-1 maximum that routes a tile through RS_Interpolate.
# NOTE(review): 999 presumably marks pixels that have no classification —
# confirm against the notebook that produces the classification table.
INTERPOLATION_MARKER = 999

# Columns written to the output table; both branches select exactly these so
# that the strict unionByName below cannot fail on a schema mismatch.
output_columns = ["tile_x", "tile_y", "rn", "year", "index", "raster_binary"]

# Table names are built from `catalog_name` rather than a repeated literal so
# the catalog is configured in one place.
classification_table = f"{catalog_name}.soma.classification_{username}"
interpolation_table = f"{catalog_name}.soma.interpolation_{username}"

classification_df = (
    spark.table(classification_table)
    .withColumn("tile", expr("RS_FromGeoTiff(raster_binary)"))
    .repartitionByRange(20, "rn")
    # 'max' is single-quoted, like the other SQL string literals in this
    # notebook: a double-quoted token is parsed as an identifier when
    # spark.sql.ansi.doubleQuotedIdentifiers is enabled.
    .withColumn("maxValue", expr("RS_SummaryStats(tile, 'max', 1, false)"))
)
display(classification_df)

# NOTE: a row whose maxValue is NULL satisfies neither filter below and is
# therefore absent from the output table (unchanged from the original logic).

# Tiles that are written back as-is.
no_interpolation = (
    classification_df
    .filter(classification_df["maxValue"] != INTERPOLATION_MARKER)
    .select(*output_columns)
)

# Tiles that are interpolated and then re-encoded as GeoTIFF bytes.
# The RS_Interpolate arguments are passed positionally exactly as before; see
# the Sedona RS_Interpolate reference for their meaning.
interpolated_df = (
    classification_df
    .filter(classification_df["maxValue"] == INTERPOLATION_MARKER)
    .select(
        "tile_x",
        "tile_y",
        "rn",
        "year",
        "index",
        expr("RS_Interpolate(tile, 2.0, 'variable', 48.0, 6.0)").alias("tile"),
    )
    .withColumn("raster_binary", expr("RS_AsGeoTiff(tile)"))
    .select(*output_columns)
)

union_df = interpolated_df.unionByName(no_interpolation, allowMissingColumns=False)
union_df.write.mode("overwrite").saveAsTable(interpolation_table)

In [0]:
# Per-year GeoTIFF export.
#
# Reloads the interpolated tiles from Unity Catalog, decodes each GeoTIFF blob
# into a raster `tile` column, and hands the 2025 and 2022 subsets to
# `merge_tiffs` together with their S3 destination paths.
#
# Reads notebook globals: `spark`, `username`, `catalog_name`,
# `dataset_bucket_name`, and `merge_tiffs` (not defined in this notebook;
# presumably provided by the `%run ./merge_images` cell — confirm).
# `union_df` is reused by the change-detection cell that follows.

# Table name is built from `catalog_name` so the catalog is configured once.
interpolation_table = f"{catalog_name}.soma.interpolation_{username}"

union_df = (
    spark.table(interpolation_table)
    .withColumn("tile", expr("RS_FromGeoTiff(raster_binary)"))
    .repartitionByRange(10, "rn")
)

display(union_df)

# Shared S3 prefix for this user's GeoTIFF outputs; the resulting paths are
# identical to the previously spelled-out ones.
geotiff_output_dir = f"s3://{dataset_bucket_name}/outputs/geotiff/{username}"
output_tiff_2025 = f"{geotiff_output_dir}/interpolation_2025.tif"
output_tiff_2022 = f"{geotiff_output_dir}/interpolation_2022.tif"

union_df_2025 = union_df.filter(union_df["year"] == 2025)
union_df_2022 = union_df.filter(union_df["year"] == 2022)

# Same call order as before: 2025 first, then 2022.
merge_tiffs(union_df_2025, output_tiff_2025)
merge_tiffs(union_df_2022, output_tiff_2022)

In [0]:
# Change detection.
#
# For every `rn`, the tiles in `union_df` are combined into one multi-band
# raster with RS_Union_Aggr(tile, index). RS_LogicalDifference is evaluated on
# bands 1 and 2 of that raster and the result is added to the raster with
# RS_AddBandFromArray. The rasters are stored as GeoTIFF bytes in Unity
# Catalog, then read back and passed to `merge_tiffs` to produce one output
# GeoTIFF on S3.
#
# Reads notebook globals: `spark`, `union_df` (from the previous cell),
# `username`, `catalog_name`, `dataset_bucket_name`, `merge_tiffs`.
# Writes: `<catalog_name>.soma.change_detection_<username>` (mode "overwrite")
# and `difference_output.tif` under the user's GeoTIFF output prefix.

# Temp views are registered as before. No other cell in this notebook queries
# them; they are kept so ad-hoc SQL against these names continues to work.
union_df.createOrReplaceTempView("union_df_vw")

# Range-partition once, straight after the aggregation. Everything up to the
# table write is a narrow (per-row) transformation that keeps this
# partitioning, so the extra repartitionByRange calls the pipeline used to
# make after each step only added shuffles and range-sampling passes.
merged_raster = (
    union_df
    .groupBy("rn")
    .agg(expr("RS_Union_Aggr(tile, index)").alias("raster"))
    .repartitionByRange(10, "rn")
)

merged_raster.createOrReplaceTempView("merged_raster_vw")

diff_raster = merged_raster.withColumn(
    "diff_band",
    expr(
        "RS_LogicalDifference("
        "RS_BandAsArray(raster, 1), RS_BandAsArray(raster, 2)"
        ")"
    ),
)

result_df = (
    diff_raster
    .select(
        "rn",
        # Aliased once; the previous expression named the column twice
        # (`AS raster` inside expr plus `.alias("raster")`).
        expr("RS_AddBandFromArray(raster, diff_band)").alias("raster"),
    )
    .withColumn("raster_binary", expr("RS_AsGeoTiff(raster)"))
)

# One name for the table that is written and then read back, built from
# `catalog_name` so the catalog is configured in one place.
change_detection_table = f"{catalog_name}.soma.change_detection_{username}"

result_df.select("rn", "raster_binary").write.mode("overwrite").saveAsTable(
    change_detection_table
)

# Merge TIFFs: reload the persisted rasters instead of reusing the lazy
# pipeline above, decode them, and merge into a single GeoTIFF.
result_df = (
    spark.table(change_detection_table)
    .withColumn("tile", expr("RS_FromGeoTiff(raster_binary)"))
    .repartitionByRange(10, "rn")
)
output_tiff = (
    f"s3://{dataset_bucket_name}/outputs/geotiff/{username}/difference_output.tif"
)
merge_tiffs(result_df, output_tiff)