In [0]:
%run ../get_user

In [0]:
# Resolve who is running the notebook and derive a short username from the
# e-mail address (helper comes from the notebook pulled in via %run).
current_user_rows = spark.sql("SELECT current_user()").collect()
user_email = current_user_rows[0][0]
username = get_username_from_email(user_email)
print(username)

In [0]:
from sedona.spark import *
from pyspark.sql.functions import expr
import numpy as np
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F

In [0]:
# Packages handed to Spark through spark.jars.packages: the Sedona shaded
# jar and the matching GeoTools wrapper.
sedona_packages = (
    'org.apache.sedona:sedona-spark-shaded-3.3_2.12:1.7.1,'
    'org.datasyslab:geotools-wrapper:1.7.1-28.5'
)

config = (
    SedonaContext.builder()
    .config('spark.jars.packages', sedona_packages)
    .getOrCreate()
)

# Wrap the session so Sedona's readers and RS_* SQL functions are available.
sedona = SedonaContext.create(config)

In [0]:
# S3 location of the input NetCDF file.
dataset_bucket_name = "revodata-databricks-geospatial"
dataset_dir = "geospatial-dataset/netcdf/global"
dataset_file = "tos_O1_2001-2002.nc"

# Name of the NetCDF variable that is turned into a raster below.
nc_variable = "tos"

In [0]:
# Read the NetCDF file as a single binary record; its bytes end up in the
# `content` column that the RS_* functions below consume.
netcdf_path = f"s3://{dataset_bucket_name}/{dataset_dir}/{dataset_file}"
df = sedona.read.format("binaryFile").load(netcdf_path)

In [0]:
# Have Sedona describe the NetCDF payload and print the resulting Row.
record_info_row = (
    df.select(expr("RS_NetCDFInfo(content) as record_info"))
      .first()
)
print(record_info_row)

In [0]:
# Build a raster column from the file bytes for the configured NetCDF variable.
raster_expr = expr(f"RS_FromNetCDF(content, '{nc_variable}')")
df_raster = df.withColumn("raster", raster_expr)
display(df_raster)

In [0]:
# Convert the raster to an image column and render it inline.
raster_image_col = expr("RS_AsImage(raster, 500) as raster_image")
htmlDF = df_raster.select(raster_image_col)
SedonaUtils.display_image(htmlDF)

In [0]:
# Count the raster's bands; num_bands drives the per-band loop further down.
df_num = df_raster.withColumn("num_raster", expr("RS_NumBands(raster)"))
band_count_row = df_num.select("num_raster").first()
num_bands = band_count_row[0]
print(num_bands)

In [0]:
# Spot-check a single raster value via RS_Value(raster, 45, 65, 1)
# (presumably grid position (45, 65) in band 1 - confirm against Sedona docs).
sample_value_expr = "explode(array(RS_Value(raster, 45, 65, 1))) as exploded"
df_raster.selectExpr(sample_value_expr).show()


In [0]:
# Explode band 1 into one row per pixel centroid, then flatten the struct
# into plain geom / x / y columns.
pixel_centroids = df_raster.selectExpr("explode(RS_PixelAsCentroids(raster, 1)) as exploded")
df_pixels_as_geom = pixel_centroids.selectExpr(
    "exploded.geom as geom",
    "exploded.x as x",
    "exploded.y as y",
)

df_pixels_as_geom.show()

In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col

def process_band(
    df_raster: DataFrame,
    band: int,
    nodata_value=100000002004087734272,
    num_partitions: int = 100,
) -> DataFrame:
    """Explode one raster band into one row per pixel polygon.

    Args:
        df_raster: DataFrame with a `raster` column.
        band: Band index passed to RS_PixelAsPolygons.
        nodata_value: Pixel value to drop from the result. The default is the
            constant this notebook has always filtered on (presumably the
            dataset's fill value, ~1e20 - confirm against the NetCDF metadata).
        num_partitions: Number of partitions the result is repartitioned to.

    Returns:
        DataFrame with columns `geom`, `band_value`, `x`, `y`, excluding rows
        whose value equals `nodata_value`. Rows with a null value are dropped
        as well, because the SQL comparison evaluates to null for them.
    """
    pixels = df_raster.selectExpr(f"explode_outer(RS_PixelAsPolygons(raster, {band})) as band")
    flattened = pixels.selectExpr("band.geom as geom", "band.value as band_value", "band.x as x", "band.y as y")
    valid = flattened.where(f"band_value != {nodata_value}")
    return valid.repartition(num_partitions)

# Band 1 seeds the wide table; it is the only band whose geometry is kept.
bands_df = process_band(df_raster, 1).withColumnRenamed("band_value", "band_1")

# Attach every remaining band (2..num_bands) as its own band_<n> column,
# matching pixels on their (x, y) position.
for band in range(2, num_bands + 1):
    band_df = process_band(df_raster, band).drop("geom")
    joined = bands_df.join(band_df, on=["x", "y"], how="inner")
    bands_df = joined.withColumnRenamed("band_value", f"band_{band}")

display(bands_df)

In [0]:
band_cols = [f"band_{i}" for i in range(1, num_bands+1)]

# Pieces shared by both SQL expressions below.
band_list_sql = ', '.join(band_cols)
band_count = len(band_cols)

# Per-pixel mean: fold the band values into a double sum, divide by the band count.
mean_expr = F.expr(f"aggregate(array({band_list_sql}), 0D, (acc, x) -> acc + x) / {band_count}")

# Per-pixel population standard deviation. The expression reads the band_mean
# column, so it has to be added after band_mean exists.
stddev_expr = F.expr(f"""sqrt(
                 aggregate(array({band_list_sql}), 0D, (acc, x) -> acc + pow(x - band_mean, 2)) / {band_count}
             )""")

df_with_mean = bands_df.withColumn("band_mean", mean_expr)
df_with_stddev = df_with_mean.withColumn("band_stddev", stddev_expr)
df_with_min = df_with_stddev.withColumn("band_min", F.least(*band_cols))
df_stats = df_with_min.withColumn("band_max", F.greatest(*band_cols))

# Action: runs the pipeline and shows the number of pixel rows.
df_stats.count()


In [0]:
def compute_percentile(values, percentile):
    """Return the `percentile`-th percentile of `values` as a Python float.

    Args:
        values: Sequence of numbers (for example the list a Spark array column
            is handed to a Python UDF as). May be None or contain None entries.
        percentile: Percentile to compute, on the 0-100 scale used by
            `numpy.percentile`.

    Returns:
        The percentile over the non-null entries (NumPy's default
        interpolation), or None when there are no values to compute on.
        Returning None instead of raising keeps a single null pixel from
        failing the whole Spark job when this runs inside a UDF.
    """
    if values is None:
        return None
    present = [v for v in values if v is not None]
    if not present:
        return None
    return float(np.percentile(present, percentile))

# Add band_p25 / band_p50 / band_p75: the quartiles of each pixel's values
# across all bands, computed by compute_percentile inside a Python UDF.
band_values = F.array(*band_cols)

for pct in (25, 50, 75):
    # Bind pct as a default argument so each UDF keeps its own percentile
    # rather than all of them seeing the loop variable's final value.
    percentile_udf = udf(lambda row, pct=pct: compute_percentile(row, pct), DoubleType())
    df_stats = df_stats.withColumn(f"band_p{pct}", percentile_udf(band_values))

# Collapse to one partition (presumably so the later write emits a single
# output file - confirm this is intended for the data volume).
df_stats = df_stats.repartition(1)

In [0]:
# Print the column names and types of the statistics DataFrame.
df_stats.printSchema()

In [0]:

# The output path embeds the NetCDF variable name and the current username.
output_path = f"s3://{dataset_bucket_name}/{dataset_dir}/{nc_variable}_{username}.geojson"

# Replace any previous output at that path with the new GeoJSON result.
geojson_writer = df_stats.write.mode("overwrite").format("geojson")
geojson_writer.save(output_path)