In [0]:
# Dump every key/value pair of the active Spark configuration, one per line.
# Relies on a `spark` session already being available in the kernel.
for conf_key, conf_value in spark.sparkContext.getConf().getAll():
    print(f"{conf_key} = {conf_value}")

In [0]:
from pyspark.sql import SparkSession
# Build (or reuse) the session named "NetCDF Processing". Driver and executors
# are each asked for "4g" memory, "1g" memory overhead and "4" cores.
spark = (
    SparkSession.builder
    .appName("NetCDF Processing")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memoryOverhead", "1g")
    .config("spark.executor.memoryOverhead", "1g")
    .config("spark.driver.cores", "4")
    .config("spark.executor.cores", "4")
    .getOrCreate()
)

In [0]:
from sedona.spark import *

# Request the Sedona and GeoTools-wrapper packages through spark.jars.packages.
# `config` holds whatever getOrCreate() returns; it is handed to
# SedonaContext.create() in a later cell.
config = (
    SedonaContext.builder()
    .config(
        'spark.jars.packages',
        'org.apache.sedona:sedona-spark-shaded-3.3_2.12:1.7.1,'
        'org.datasyslab:geotools-wrapper:1.7.1-28.5',
    )
    .getOrCreate()
)

In [0]:
from sedona.spark import *

# Create the Sedona context from `config`; later cells read data through `sedona`.
sedona = SedonaContext.create(config)

In [0]:
# Where the NetCDF input lives (used to build the abfss:// URI further down).
dataset_storage_account_name = "melikadatabricksstorage"
dataset_container_name = "geospatial-dataset"
dataset_dir = "raster/netcdf/global"

# File to process. Alternative file names kept for reference:
#   pr_day_UKESM1-0-LL_ssp126_r1i1p1f2_gn_2015.nc
#   IMERG_land_sea_mask.nc
#   timeseries-tas-annual-mean_era5-x0.25_era5-x0.25-historical_timeseries_mean_1950-2022.nc
dataset_file = "tos_O1_2001-2002.nc"

# NetCDF variable that is loaded as a raster.
nc_variable = "tos"

In [0]:
# Load the NetCDF file through Spark's "binaryFile" source and show the row count.
dataset_uri = (
    f"abfss://{dataset_container_name}@{dataset_storage_account_name}"
    f".dfs.core.windows.net/{dataset_dir}/{dataset_file}"
)
df = sedona.read.format("binaryFile").load(dataset_uri)
df.count()


In [0]:
# Run RS_NetCDFInfo over the file content and print the first resulting row.
record_info_df = df.selectExpr("RS_NetCDFInfo(content) as record_info")
record_info_row = record_info_df.first()
print(record_info_row)

In [0]:
from pyspark.sql.functions import expr

# Add a `raster` column built by RS_FromNetCDF from the selected variable,
# then render the resulting frame.
raster_sql = f"RS_FromNetCDF(content, '{nc_variable}')"
df_raster = df.withColumn("raster", expr(raster_sql))
display(df_raster)

In [0]:
# Convert the raster to an image via RS_AsImage (second argument 500) and show it.
image_sql = "RS_AsImage(raster, 500) as raster_image"
htmlDF = df_raster.selectExpr(image_sql)
SedonaUtils.display_image(htmlDF)

In [0]:
# Read the band count of the raster from the first row; `num_bands` is used by
# the per-band processing further down.
df_num = df_raster.withColumn("num_raster", expr("RS_NumBands(raster)"))
band_count_row = df_num.select("num_raster").first()
num_bands = band_count_row[0]
print(num_bands)

In [0]:
# Look up a single raster value with RS_Value(raster, 45, 65, 1).
# NOTE(review): the arguments are presumably column, row and band — confirm
# against the Sedona documentation.
value_sql = "explode(array(RS_Value(raster, 45, 65, 1))) as exploded"
df_raster.selectExpr(value_sql).show()


In [0]:
# Explode the RS_PixelAsCentroids result for band 1 into one row per entry,
# then flatten the struct into geom / x / y columns and preview them.
centroid_rows = df_raster.selectExpr(
    "explode(RS_PixelAsCentroids(raster, 1)) as exploded"
)
df_pixels_as_geom = centroid_rows.selectExpr(
    "exploded.geom as geom", "exploded.x as x", "exploded.y as y"
)

df_pixels_as_geom.show()

In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col

# Turn one raster band into a DataFrame with one row per pixel polygon.
def process_band(df_raster: "DataFrame", band: int,
                 nodata_value=100000002004087734272,
                 num_partitions: int = 100) -> "DataFrame":
    """Explode one band of the `raster` column into per-pixel rows.

    Args:
        df_raster: DataFrame with a `raster` column.
        band: Band number passed as the second argument of RS_PixelAsPolygons.
        nodata_value: Pixel value to discard. It is interpolated verbatim into
            the SQL filter. The default is the literal that used to be
            hard-coded here (presumably the dataset's missing-data fill value,
            about 1e20 — confirm against the file's metadata).
        num_partitions: Number of partitions of the returned DataFrame.

    Returns:
        DataFrame with columns `geom`, `band_value`, `x` and `y`, containing
        only rows whose `band_value != nodata_value` is true (rows with a NULL
        `band_value` are therefore dropped as well).
    """
    # One row per element of the array returned by RS_PixelAsPolygons.
    pixels = df_raster.selectExpr(
        f"explode_outer(RS_PixelAsPolygons(raster, {band})) as band"
    )
    # Flatten the struct into plain columns.
    flattened = pixels.selectExpr(
        "band.geom as geom", "band.value as band_value", "band.x as x", "band.y as y"
    )
    return flattened.where(f"band_value != {nodata_value}").repartition(num_partitions)

# Start from band 1; its value column becomes `band_1`.
band1_df = process_band(df_raster, 1).withColumnRenamed("band_value", "band_1")

# For every remaining band (2 .. num_bands) attach its value as `band_<n>` by
# inner-joining on the pixel coordinates. Geometry is kept from band 1 only.
for band_no in range(2, num_bands + 1):
    next_band_df = process_band(df_raster, band_no).drop("geom")
    joined_df = band1_df.join(next_band_df, on=["x", "y"], how="inner")
    band1_df = joined_df.withColumnRenamed("band_value", f"band_{band_no}")

display(band1_df)

In [0]:
# Print the column names and types of the joined per-band table.
band1_df.printSchema()

In [0]:
from pyspark.sql import functions as F

# Per-pixel summary statistics across the band columns band_1 .. band_<num_bands>.
band_cols = [f"band_{i}" for i in range(1, num_bands+1)]
band_array_sql = f"array({', '.join(band_cols)})"
band_count = len(band_cols)

# Sum of the band values, and sum of squared deviations from `band_mean`.
# The lambda parameter is named `v` so it cannot be confused with the pixel
# coordinate column `x` that exists on the same frame.
sum_sql = f"aggregate({band_array_sql}, 0D, (acc, v) -> acc + v)"
sq_dev_sql = f"aggregate({band_array_sql}, 0D, (acc, v) -> acc + pow(v - band_mean, 2))"

# The statistics are computed on `band1_df`, the joined per-band table. This
# cell used to read `filtered_df`, which is not defined anywhere in the
# notebook and raised NameError on a fresh kernel.
# NOTE(review): if an extra filter was intended here, apply it to band1_df first.
df_stats = (
    band1_df
    .withColumn("band_mean", F.expr(f"{sum_sql} / {band_count}"))
    # Divides by the number of bands (N), i.e. population standard deviation.
    .withColumn("band_stddev", F.expr(f"sqrt({sq_dev_sql} / {band_count})"))
    .withColumn("band_min", F.least(*band_cols))
    .withColumn("band_max", F.greatest(*band_cols))
)

# Triggers evaluation and shows the number of rows.
df_stats.count()


In [0]:
import numpy as np
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf

def compute_percentile(values, percentile):
    """Return the given percentile of `values` as a Python float.

    Args:
        values: Sequence of numbers, or None. None elements are ignored;
            Spark hands SQL NULLs to Python UDFs as None.
        percentile: Percentile to compute, forwarded to `np.percentile`.

    Returns:
        The percentile as a float, or None when `values` is None or holds no
        non-None element (so a UDF built on this yields NULL instead of
        failing the task).
    """
    if values is None:
        return None
    present = [v for v in values if v is not None]
    if not present:
        return None
    # float() converts the NumPy scalar into a plain Python float.
    return float(np.percentile(present, percentile))

def _make_percentile_udf(q):
    """Build a Spark UDF returning the q-th percentile of an array column."""
    return udf(lambda row: compute_percentile(row, q), DoubleType())


# Add one column per percentile, each computed across all band columns.
band_values = F.array(*band_cols)
for pct_col, pct in (("band_p25", 25), ("band_p50", 50), ("band_p75", 75)):
    percentile_udf = _make_percentile_udf(pct)
    df_stats = df_stats.withColumn(pct_col, percentile_udf(band_values))

# Collapse to a single partition.
# NOTE(review): presumably so the write below produces one output file — confirm.
df_stats = df_stats.repartition(1)

In [0]:
# Print the final column layout (bands plus the derived statistics).
df_stats.printSchema()

In [0]:


# Write the per-pixel statistics with the "geojson" writer, next to the input data.
# NOTE(review): no save mode is set, so the writer's default applies — confirm
# whether re-runs are expected to overwrite an existing output.
geojson_uri = (
    f"abfss://{dataset_container_name}@{dataset_storage_account_name}"
    f".dfs.core.windows.net/{dataset_dir}/{nc_variable}.geojson"
)
df_stats.write.format("geojson").save(geojson_uri)