In [0]:
from pyspark.sql import functions as f
from pyspark.sql.functions import expr, explode, col
from sedona.spark import *
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from osgeo import gdal

In [0]:
# Azure Data Lake Storage (ABFSS) coordinates of the Buffalo orthophoto dataset.
dataset_storage_account_name = "melikadatabricksstorage"
dataset_container_name = "geospatial-dataset"
dataset_dir = "raster/orthophoto/Buffalo"

# File names located under `dataset_dir`.
geotiff_name = "2022_4BandImagery_NewYork_J1187738.tif"
geopackage_name = "Buffalo.gpkg"

# Catalog name, kept as a plain string; nothing is created here.
catalog_name = "geospatial"

# Layout: abfss://<container>@<account>.dfs.core.windows.net/<dir>/<geotiff>
file_url = "abfss://{container}@{account}.dfs.core.windows.net/{directory}/{tiff}".format(
    container=dataset_container_name,
    account=dataset_storage_account_name,
    directory=dataset_dir,
    tiff=geotiff_name,
)

In [0]:
# S3 bucket holding the SoMa (San Francisco) orthophotos; `file_urls` maps the
# capture year (as a string) to the GeoTIFF for that year.
dataset_bucket_name = "revodata-databricks-geospatial"
file_urls = {
    year: f"s3://{dataset_bucket_name}/geospatial-dataset/raster/orthophoto/soma/{year}/{tiff_name}"
    for year, tiff_name in (
        ("2022", "2022_4BandImagery_SanFranciscoCA_J1191044.tif"),
        ("2025", "2025_4BandImagery_SanFranciscoCA_J1191043.tif"),
    )
}
catalog_name = "geospatial"


In [0]:
# Sanity check: show the resolved S3 URL of the 2022 image.
print(file_urls["2022"])

In [0]:
# Obtain a Spark session through Sedona's builder and wrap it in a Sedona
# context; `sedona` is the handle used for the sedona.read / sedona.sql calls
# in the rest of the notebook.
# NOTE(review): the credentials-provider setting is scoped to a bucket named
# "wherobots-examples", which is not the bucket this notebook reads from —
# confirm whether it is still needed.
config = (
    SedonaContext.builder()
    .config(
        "spark.hadoop.fs.s3a.bucket.wherobots-examples.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
    )
    .getOrCreate()
)

sedona = SedonaContext.create(config)




In [0]:
# Load the 2025 GeoTIFF as binary content and decode it into a `raster` column,
# then expose the frame to SQL as `image_new_vw`.
df_image_new = (
    sedona.read.format("binaryFile")
    .load(file_urls["2025"])
    .withColumn("raster", f.expr("RS_FromGeoTiff(content)"))
)
df_image_new.createOrReplaceTempView("image_new_vw")

# Inspect schema and contents; the count() result is not displayed.
df_image_new.printSchema()
df_image_new.count()
display(df_image_new)

In [0]:
# Load the 2022 GeoTIFF as binary content and decode it into a `raster` column,
# then expose the frame to SQL as `image_old_vw`.
df_image_old = (
    sedona.read.format("binaryFile")
    .load(file_urls["2022"])
    .withColumn("raster", f.expr("RS_FromGeoTiff(content)"))
)
df_image_old.createOrReplaceTempView("image_old_vw")

# Inspect schema and contents; the count() result is not displayed.
df_image_old.printSchema()
df_image_old.count()
display(df_image_old)

In [0]:
# Raster metadata of the 2025 image, as returned by RS_MetaData.
# Uses the `sedona` session created in this notebook (like the other raster
# queries) instead of relying on an implicitly provided `spark` global.
metadata_info = sedona.sql("SELECT RS_MetaData(raster) FROM image_new_vw")
display(metadata_info)

In [0]:
# Show the number of bands in the 2025 raster.
sedona.sql("SELECT RS_NumBands(raster) FROM image_new_vw").show()

In [0]:
# Render the 2025 raster as an image for a quick visual check.
# NOTE(review): 500 is presumably the rendered image width — confirm against the Sedona docs.
htmlDf = sedona.sql("SELECT RS_AsImage(raster, 500) FROM image_new_vw")
SedonaUtils.display_image(htmlDf)

In [0]:
%python
# Summary statistics of the 2025 raster, as returned by RS_SummaryStatsAll.
all_stats = sedona.sql("SELECT RS_SummaryStatsAll(raster) FROM image_new_vw")
display(all_stats)

In [0]:
# Pixel data type of the 2025 raster. Queried through the `sedona` session
# created in this notebook, consistent with the other raster queries.
display(sedona.sql("select RS_BandPixelType(raster) from image_new_vw"))

In [0]:
# RS_Count of the 2025 raster. Queried through the `sedona` session created in
# this notebook, consistent with the other raster queries.
display(sedona.sql("select RS_Count(raster) from image_new_vw"))

In [0]:
# Split the 2025 raster with RS_TileExplode(raster, 100, 100) and rename the
# generated x / y columns to tile_x / tile_y.
tiled_df_new = (
    df_image_new
    .selectExpr("RS_TileExplode(raster, 100, 100)")
    .withColumnRenamed("x", "tile_x")
    .withColumnRenamed("y", "tile_y")
)
tiled_df_new.show(3)

# Register the tiles for SQL access and report how many were produced.
tiled_df_new.createOrReplaceTempView("tiled_df_new_vw")
tiled_df_new.count()

In [0]:
# Give every 2025 tile a sequential number `rn` (ordered by tile_x, tile_y) and
# tag it with its capture year. `rn` is later used to pair 2022 and 2025 tiles.
# `lit` is referenced through the imported `F` alias because it is not among
# the names imported explicitly from pyspark.sql.functions.
# Note: a window without partitionBy makes Spark process all rows in a single partition.
window_spec = Window.orderBy("tile_x", "tile_y")
tiled_df_new = (
    tiled_df_new
    .withColumn("rn", F.row_number().over(window_spec))
    .withColumn("year", F.lit(2025))
)
display(tiled_df_new.limit(10))

In [0]:
# Print the schema of the 2022 image frame. The call parentheses are required:
# without them the cell only shows the bound-method object.
df_image_old.printSchema()

In [0]:
# Split the 2022 raster with RS_TileExplode(raster, 100, 100) and rename the
# generated x / y columns to tile_x / tile_y.
tiled_df_old = (
    df_image_old
    .selectExpr("RS_TileExplode(raster, 100, 100)")
    .withColumnRenamed("x", "tile_x")
    .withColumnRenamed("y", "tile_y")
)
tiled_df_old.show(3)

# Register the tiles for SQL access and report how many were produced.
tiled_df_old.createOrReplaceTempView("tiled_df_old_vw")
tiled_df_old.count()

In [0]:
# Give every 2022 tile a sequential number `rn` (ordered by tile_x, tile_y) and
# tag it with its capture year. `rn` is later used to pair 2022 and 2025 tiles.
# `lit` is referenced through the imported `F` alias because it is not among
# the names imported explicitly from pyspark.sql.functions.
# Note: a window without partitionBy makes Spark process all rows in a single partition.
window_spec = Window.orderBy("tile_x", "tile_y")
tiled_df_old = (
    tiled_df_old
    .withColumn("rn", F.row_number().over(window_spec))
    .withColumn("year", F.lit(2022))
)
display(tiled_df_old.limit(10))

In [0]:
# Preview a single 2025 tile as an image.
first_tile = tiled_df_new.limit(1)
first_tile.createOrReplaceTempView("first_tile_vw")

htmlDf = sedona.sql("SELECT RS_AsImage(tile) FROM first_tile_vw")
SedonaUtils.display_image(htmlDf)

In [0]:
# Per-pixel normalised difference of bands 4 and 1 for the 2025 tiles:
#   ndvi = (band4 - band1) / (band4 + band1)
# NOTE(review): assumes band 4 is near-infrared and band 1 is red — confirm
# against the imagery's band order.
tiled_df_new = tiled_df_new.withColumn(
    "ndvi",
    F.expr(
        "RS_Divide(  RS_Subtract(RS_BandAsArray(tile, 4), RS_BandAsArray(tile, 1)), "
        "  RS_Add(RS_BandAsArray(tile, 4), RS_BandAsArray(tile, 1)))"
    ),
)

display(tiled_df_new.limit(1))

In [0]:
# Per-pixel normalised difference of bands 2 and 4 for the 2025 tiles:
#   ndwi = (band2 - band4) / (band2 + band4)
# NOTE(review): assumes band 2 is green and band 4 is near-infrared — confirm
# against the imagery's band order.
tiled_df_new = tiled_df_new.withColumn(
    "ndwi",
    F.expr(
        "RS_Divide(  RS_Subtract(RS_BandAsArray(tile, 2), RS_BandAsArray(tile, 4)), "
        "  RS_Add(RS_BandAsArray(tile, 2), RS_BandAsArray(tile, 4)))"
    ),
)

display(tiled_df_new.limit(1))

In [0]:
# Classify every pixel of the 2025 tiles from its (ndvi, ndwi) pair:
#   1   -> both indices lie within [-0.5, 0.5]
#   2   -> ndvi > 0.6 and ndwi < -0.6
#   3   -> ndvi < -0.6 and ndwi > 0.6
#   999 -> anything else (this value is later declared as NoData)
# Pixels whose indices fall between the thresholds (e.g. 0.5 < ndvi <= 0.6)
# match no rule and therefore also end up as 999.
tiled_df_new = tiled_df_new.withColumn(
    "classification",
    expr("""
        transform(
            arrays_zip(ndvi, ndwi),
            x -> CASE
                WHEN x.ndvi >= -0.5 AND x.ndvi <= 0.5 AND 
                     x.ndwi >= -0.5 AND x.ndwi <= 0.5 THEN 1
                WHEN x.ndvi > 0.6 AND x.ndwi < -0.6 THEN 2
                WHEN x.ndvi < -0.6 AND x.ndwi > 0.6 THEN 3
                ELSE 999
            END
        )
    """)
)



In [0]:
# Peek at two classified 2025 tiles.
display(tiled_df_new.limit(2))

In [0]:
# Build a new raster per 2025 tile from the classification array (band data
# type 'I', using the original tile as reference), then declare 999 as the
# NoData value of band 1 — first with replace=false, then again with replace=true.
classification_df_new = (
    tiled_df_new
    .select(
        "tile_x", "tile_y", "rn", "year",
        expr("RS_MakeRaster(tile, 'I', classification) AS tile").alias("tile"),
    )
    .select(
        "tile_x", "tile_y", "rn", "year",
        expr("RS_SetBandNoDataValue(tile,1, 999, false)").alias("tile"),
    )
    .select(
        "tile_x", "tile_y", "rn", "year",
        expr("RS_SetBandNoDataValue(tile,1, 999, true)").alias("tile"),
    )
)
display(classification_df_new.limit(10))

In [0]:
# Inspect the pixel values of band 1 of one classified 2025 tile.
classification_df_new.createOrReplaceTempView("test_df_new_vw")
display(sedona.sql("SELECT RS_BandAsArray(tile,1) FROM test_df_new_vw").limit(1))

In [0]:
# Apply RS_Interpolate to each classified 2025 tile; only the tile coordinates
# and the resulting raster are kept (rn and year are dropped).
# NOTE(review): the meaning of the numeric arguments (2.0, 12.0, 2.0) and of
# 'variable' should be confirmed against the Sedona RS_Interpolate docs.
interpolated_df_new = classification_df_new.select(
    "tile_x",
    "tile_y",
    expr("RS_Interpolate(tile, 2.0, 'variable', 12.0, 2.0)").alias("tile"),
)


In [0]:
# Preview the interpolated 2025 tiles.
display(interpolated_df_new.limit(10))

In [0]:
# Redistribute the interpolated tiles by tile coordinates and mark the frame
# for caching (the cache is populated by the next action on it).
interpolated_df_new = interpolated_df_new.repartition("tile_x", "tile_y")
interpolated_df_new.cache()

In [0]:
# Per-pixel normalised difference of bands 4 and 1 for the 2022 tiles:
#   ndvi = (band4 - band1) / (band4 + band1)
# NOTE(review): assumes band 4 is near-infrared and band 1 is red — confirm
# against the imagery's band order.
tiled_df_old = tiled_df_old.withColumn(
    "ndvi",
    F.expr(
        "RS_Divide(  RS_Subtract(RS_BandAsArray(tile, 4), RS_BandAsArray(tile, 1)), "
        "  RS_Add(RS_BandAsArray(tile, 4), RS_BandAsArray(tile, 1)))"
    ),
)

display(tiled_df_old.limit(1))

In [0]:
# Per-pixel normalised difference of bands 2 and 4 for the 2022 tiles:
#   ndwi = (band2 - band4) / (band2 + band4)
# NOTE(review): assumes band 2 is green and band 4 is near-infrared — confirm
# against the imagery's band order.
tiled_df_old = tiled_df_old.withColumn(
    "ndwi",
    F.expr(
        "RS_Divide(  RS_Subtract(RS_BandAsArray(tile, 2), RS_BandAsArray(tile, 4)), "
        "  RS_Add(RS_BandAsArray(tile, 2), RS_BandAsArray(tile, 4)))"
    ),
)

display(tiled_df_old.limit(1))

In [0]:
# Classify every pixel of the 2022 tiles from its (ndvi, ndwi) pair:
#   1   -> both indices lie within [-0.5, 0.5]
#   2   -> ndvi > 0.6 and ndwi < -0.6
#   3   -> ndvi < -0.6 and ndwi > 0.6
#   999 -> anything else (this value is later declared as NoData)
# Pixels whose indices fall between the thresholds (e.g. 0.5 < ndvi <= 0.6)
# match no rule and therefore also end up as 999.
tiled_df_old = tiled_df_old.withColumn(
    "classification",
    expr("""
        transform(
            arrays_zip(ndvi, ndwi),
            x -> CASE
                WHEN x.ndvi >= -0.5 AND x.ndvi <= 0.5 AND 
                     x.ndwi >= -0.5 AND x.ndwi <= 0.5 THEN 1
                WHEN x.ndvi > 0.6 AND x.ndwi < -0.6 THEN 2
                WHEN x.ndvi < -0.6 AND x.ndwi > 0.6 THEN 3
                ELSE 999
            END
        )
    """)
)

# Peek at two classified 2022 tiles.
display(tiled_df_old.limit(2))

In [0]:
# Build a new raster per 2022 tile from the classification array (band data
# type 'I', using the original tile as reference), then declare 999 as the
# NoData value of band 1 — first with replace=false, then again with replace=true.
classification_df_old = (
    tiled_df_old
    .select(
        "tile_x", "tile_y", "rn", "year",
        expr("RS_MakeRaster(tile, 'I', classification) AS tile").alias("tile"),
    )
    .select(
        "tile_x", "tile_y", "rn", "year",
        expr("RS_SetBandNoDataValue(tile,1, 999, false)").alias("tile"),
    )
    .select(
        "tile_x", "tile_y", "rn", "year",
        expr("RS_SetBandNoDataValue(tile,1, 999, true)").alias("tile"),
    )
)
display(classification_df_old.limit(10))

In [0]:
# Append the classification array to each 2022 tile as an additional band and
# keep only tile_x, tile_y and the extended tile.
# Caution: the result is assigned back to `tiled_df_old`, which afterwards no
# longer has a `classification` column (nor rn / year / ndvi / ndwi), so
# running this cell a second time will fail.
tiled_df_old = tiled_df_old.select("tile_x", "tile_y", expr("RS_AddBandFromArray(tile, classification) AS tile").alias("tile"))
display(tiled_df_old.limit(1))

In [0]:
# Re-number the 2022 tiles (ordered by tile_x, tile_y), tag them with their
# year and keep only one band of each tile.
# `lit` is referenced through the imported `F` alias because it is not among
# the names imported explicitly from pyspark.sql.functions.
# NOTE(review): band 6 is selected here, but the only band this notebook shows
# being appended to the tile is the classification band — confirm the index.
window_spec = Window.orderBy("tile_x", "tile_y")
tile_indexed_old = (
    tiled_df_old
    .withColumn("rn", F.row_number().over(window_spec))
    .withColumn("year", F.lit(2022))
    .select(
        "tile_x", "tile_y", "rn", "year",
        expr("RS_Band(tile, Array(6)) as tile"),
    )
)
display(tile_indexed_old.limit(10))

In [0]:
# Stack the 2022 and 2025 classification tiles into one frame. Columns are
# matched by name and both sides must have exactly the same columns.
union_raster = classification_df_old.unionByName(classification_df_new, allowMissingColumns=False)
display(union_raster.limit(10))

In [0]:
# Within each tile number `rn`, rank the rows by year, newest first:
# index 1 is the 2025 tile and index 2 the 2022 tile.
window_spec = Window.partitionBy("rn").orderBy(F.desc("year"))
union_raster_indexed = union_raster.withColumn("index", F.row_number().over(window_spec))
display(union_raster_indexed.limit(10))

In [0]:
# Register the ranked tiles for SQL access and inspect band 1 of a single row.
union_raster_indexed.createOrReplaceTempView("union_raster_indexed_vw")

# The SQL already limits the result to one row, so the outer .limit(10) has no further effect.
display(sedona.sql("""
    SELECT RS_BandAsArray(tile, 1) AS raster
    FROM union_raster_indexed_vw
    limit 1 
""").limit(10))

In [0]:
# Combine the rasters that share the same `rn` into one raster per tile number,
# passing `index` to RS_Union_Aggr alongside each tile.
# NOTE(review): presumably `index` decides the band order of the merged raster
# (1 = 2025, 2 = 2022) — confirm against the Sedona RS_Union_Aggr docs.
merged_raster = sedona.sql("""
    SELECT rn, RS_Union_Aggr(tile, index) AS raster
    FROM union_raster_indexed_vw
    GROUP BY rn
""")

display(merged_raster.limit(10))

In [0]:
# Register the merged rasters for SQL access and inspect band 2 of a few rows.
merged_raster.createOrReplaceTempView("merged_raster_vw")
display(sedona.sql("""
    SELECT RS_BandAsArray(raster, 2) AS raster
    FROM merged_raster_vw
""").limit(10))

In [0]:
# Compare band 1 and band 2 of every merged raster pixel by pixel with
# RS_LogicalDifference and store the resulting array as `diff_band`.
result_raster = merged_raster.withColumn(
    "diff_band",
    F.expr("RS_LogicalDifference(RS_BandAsArray(raster, 1), RS_BandAsArray(raster, 2))"),
)

display(result_raster.limit(5))

In [0]:
# Append the difference array to each merged raster as an additional band.
df = result_raster.select("rn", expr("RS_AddBandFromArray(raster, diff_band) AS raster").alias("raster"))
display(df.limit(5))

In [0]:
# Register the result for SQL access and inspect band 3 of a few rasters.
# NOTE(review): band 3 is presumably the appended difference band — confirm.
df.createOrReplaceTempView("df_vw")
display(sedona.sql("""
    SELECT rn, RS_BandAsArray(raster, 3) AS raster
    FROM df_vw
""").limit(10))

In [0]:
# Encode every result raster as GeoTIFF bytes and write them with Sedona's
# "raster" writer to the result2/ folder on S3, overwriting previous output.
(
    df.withColumn("raster_binary", expr("RS_AsGeoTiff(raster)"))
    .write.format("raster")
    .option("fileExtension", ".tiff")
    .mode("overwrite")
    .save(f"s3://{dataset_bucket_name}/geospatial-dataset/raster/orthophoto/soma/result2/")
)

In [0]:
# Same GeoTIFF encoding as above, kept in a DataFrame for inspection and for
# collecting the bytes to the driver later on.
df2 = df.withColumn("raster_binary", expr("RS_AsGeoTiff(raster)"))
display(df2.limit(5))

In [0]:
# Expose AWS credentials to this Python process through environment variables.
# The values come from the Databricks secret scope "aws_geospatial_s3"; they
# are never written into the notebook itself.
import os
os.environ['AWS_ACCESS_KEY_ID'] = dbutils.secrets.get(scope="aws_geospatial_s3", key="access_key")
os.environ['AWS_SECRET_ACCESS_KEY'] = dbutils.secrets.get(scope="aws_geospatial_s3", key="secret_key")
os.environ['AWS_DEFAULT_REGION'] = 'eu-west-2'      # Must match the region of the S3 bucket

In [0]:
# Check that the access-key secret resolves WITHOUT revealing it.
# The previous version printed the secret one character per line, which exposes
# the full credential in the notebook output. Only report whether a non-empty
# value was returned.
a = dbutils.secrets.get(scope="aws_geospatial_s3", key="access_key")
mykey = ""
print(f"access_key secret resolved: {bool(a)}")


In [0]:
# Gather the band-1 array of every raster into a single list, ordered by `rn`.
# The window spans all rows, so each row carries the complete list.
all_rows_by_rn = Window.orderBy("rn").rowsBetween(
    Window.unboundedPreceding, Window.unboundedFollowing
)
merged_df = (
    df
    .withColumn("band1", F.expr("RS_BandAsArray(raster, 1)"))
    .withColumn("merged_array", F.collect_list("band1").over(all_rows_by_rn))
)

# Any one row is enough, since every row holds the same complete list.
final_result1 = merged_df.limit(1).select("merged_array")
display(final_result1.limit(1))

In [0]:
# Pull the encoded GeoTIFF bytes of every result raster to the driver.
column_list = [row[0] for row in df2.select("raster_binary").collect()]

# Report only the number of blobs and their total size: printing the list
# itself would dump raw binary GeoTIFF content into the notebook output.
total_bytes = sum(len(blob) for blob in column_list if blob is not None)
print(f"Collected {len(column_list)} GeoTIFF blobs ({total_bytes} bytes in total)")

In [0]:
from osgeo import gdal

# `column_list` holds raw GeoTIFF bytes, whereas gdal.BuildVRT expects dataset
# paths (or opened datasets). Stage every blob as an in-memory file under
# GDAL's /vsimem/ virtual file system so it can be referenced by path.
tile_paths = []
for tile_idx, tile_bytes in enumerate(column_list):
    if tile_bytes is None:
        continue  # skip rows whose raster could not be encoded
    tile_path = f"/vsimem/result_tile_{tile_idx}.tif"
    gdal.FileFromMemBuffer(tile_path, bytes(tile_bytes))
    tile_paths.append(tile_path)

if not tile_paths:
    raise ValueError("column_list contains no GeoTIFF data to merge")

# Build VRT first (virtual mosaic)
vrt_file = "temp.vrt"
vrt_ds = gdal.BuildVRT(vrt_file, tile_paths)
if vrt_ds is None:
    raise RuntimeError("gdal.BuildVRT failed to build the mosaic")

# Then translate to physical file; dropping the dataset references flushes
# and closes the files.
merged_ds = gdal.Translate("merged.tif", vrt_ds)
merged_ds = None
vrt_ds = None

# Release the in-memory tiles. temp.vrt refers to them, so it is only usable
# as an intermediate while they exist.
for tile_path in tile_paths:
    gdal.Unlink(tile_path)

In [0]:
%sql
-- Schema that receives the classification_<year> tables written below.
-- The comment text previously contained an unformatted "{item}" placeholder.
CREATE SCHEMA IF NOT EXISTS geospatial.soma
COMMENT 'This schema contains classification data of the South of Market (SoMa) neighborhood in San Francisco';

In [0]:
# Vectorise the 2025 tiles: one polygon per pixel, dissolved per
# (tile_x, tile_y, classification), then persisted as a table and as GeoJSON.
#
# All repartition calls use (tile_x, tile_y); the earlier version passed
# tile_x twice, so tile_y never contributed to the partitioning.
#
# NOTE(review): band 6 is read here, but this notebook never shows the `tile`
# column of tiled_df_new gaining a sixth band — confirm the band index (or
# whether classification_df_new should be the input).
all_polygons_new = tiled_df_new.select("tile_x", "tile_y",
  expr("explode(RS_PixelAsPolygons(tile, 6)) as ndvi")
).selectExpr(
  "tile_x", "tile_y",
  "ndvi.geom as geom",
  "ndvi.value as classification"
).repartition("tile_x", "tile_y")

display(all_polygons_new.limit(100))

num_partitions = all_polygons_new.rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")

# Dissolve the pixel polygons of each class within each tile.
all_polygons_new.createOrReplaceTempView("all_polygons_new_vw")
union_polygons_new = spark.sql("""
SELECT tile_x, tile_y, 
    classification,
  ST_Union_Aggr(geom) as geom
FROM all_polygons_new_vw
GROUP BY tile_x, tile_y, classification""")


display(union_polygons_new.limit(10))

union_polygons_new = union_polygons_new.repartition("tile_x", "tile_y")
union_polygons_new.cache()
union_polygons_new.createOrReplaceTempView("union_polygons_new_vw")

# NOTE(review): ST_Dump presumably yields an array of geometries per row, so
# `geom` would be an array column here — confirm that the table and GeoJSON
# consumers expect that (an explode may be intended).
result_new = spark.sql("""
SELECT tile_x, tile_y, classification, ST_Dump(geom) AS geom FROM union_polygons_new_vw
""").repartition("tile_x", "tile_y")

result_new.count()
display(result_new.limit(10))


# Persist as a managed table and as GeoJSON next to the source imagery.
result_new.write.mode("overwrite").option("mergeSchema", "true").saveAsTable("geospatial.soma.classification_2025")

result_new.write.format("geojson").mode("overwrite").save(f"s3://{dataset_bucket_name}/geospatial-dataset/raster/orthophoto/soma/2025/classification_2025.geojson")

In [0]:
# Vectorise the 2022 tiles: one polygon per pixel, dissolved per
# (tile_x, tile_y, classification), then persisted as a table and as GeoJSON.
#
# All repartition calls use (tile_x, tile_y); the earlier version passed
# tile_x twice, so tile_y never contributed to the partitioning.
#
# NOTE(review): band 6 is read here, but the only band this notebook shows
# being appended to the tiles of tiled_df_old is the classification band —
# confirm the band index.
all_polygons_old = tiled_df_old.select("tile_x", "tile_y",
  expr("explode(RS_PixelAsPolygons(tile, 6)) as ndvi")
).selectExpr(
  "tile_x", "tile_y",
  "ndvi.geom as geom",
  "ndvi.value as classification"
).repartition("tile_x", "tile_y")

display(all_polygons_old.limit(100))


num_partitions = all_polygons_old.rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")

# Dissolve the pixel polygons of each class within each tile.
all_polygons_old.createOrReplaceTempView("all_polygons_old_vw")

union_polygons_old = spark.sql("""
SELECT tile_x, tile_y, 
    classification,
  ST_Union_Aggr(geom) as geom
FROM all_polygons_old_vw
GROUP BY tile_x, tile_y, classification""")


display(union_polygons_old.limit(10))

union_polygons_old = union_polygons_old.repartition("tile_x", "tile_y")
union_polygons_old.cache()
union_polygons_old.createOrReplaceTempView("union_polygons_old_vw")

# NOTE(review): ST_Dump presumably yields an array of geometries per row, so
# `geom` would be an array column here — confirm that the table and GeoJSON
# consumers expect that (an explode may be intended).
result_old = spark.sql("""
SELECT tile_x, tile_y, classification, ST_Dump(geom) AS geom FROM union_polygons_old_vw
""").repartition("tile_x", "tile_y")

result_old.count()
display(result_old.limit(10))


# Persist as a managed table and as GeoJSON next to the source imagery.
result_old.write.mode("overwrite").option("mergeSchema", "true").saveAsTable("geospatial.soma.classification_2022")

result_old.write.format("geojson").mode("overwrite").save(f"s3://{dataset_bucket_name}/geospatial-dataset/raster/orthophoto/soma/2022/classification_2022.geojson")