In [0]:
# Create the DBFS directory (a `dbfs:/` path, not the driver's local disk)
# that will hold the extracted KNMI files.
knmi_dbfs_dir = "dbfs:/tmp/knmi_data/"
dbutils.fs.mkdirs(knmi_dbfs_dir)

In [0]:
%sh
# Abort the cell as soon as a command fails.
set -e

# Download the KNMI archive to the driver's local disk.
#   --fail     : exit non-zero on an HTTP error (e.g. 403/404) instead of
#                saving the server's error body as /tmp/data.tgz, which would
#                only surface later as a confusing tar failure.
#   --location : follow redirects (same as the short flag -L).
#
# NOTE(review): the URL embeds a signed access token (the `sig=` parameter,
# with `se=2026-12-31` apparently being its expiry date). Credentials should
# not live in notebook source; consider loading the URL from a secret scope.
curl --fail --location "https://gddassesmentdata.blob.core.windows.net/knmi-data/data.tgz?se=2026-12-31&sp=r&spr=https&sv=2022-11-02&sr=c&sig=%2BKhfrZ5jd%2BmtdfdFQXFl05ymk7w2botn3EEuqYqbshg%3D" -o /tmp/data.tgz

In [0]:
%sh
# Stop at the first failing command so a broken extraction is never
# published to DBFS.
set -e

# 1. Start from a clean scratch folder on the driver's local disk (not DBFS),
#    so re-running this cell does not mix files from different runs.
rm -rf /tmp/knmi_data_local
mkdir -p /tmp/knmi_data_local

# 2. Extract the archive locally rather than straight onto the /dbfs mount
#    (per the original author's note, extracting directly to DBFS raised an
#    I/O error). Verbose listing is omitted to keep the cell output short.
tar -xzf /tmp/data.tgz -C /tmp/knmi_data_local/

# 3. Copy the extracted *contents* into the DBFS directory via the /dbfs mount.
#    `mv /tmp/knmi_data_local /dbfs/tmp/knmi_data` is wrong when the target
#    directory already exists: the files end up nested under
#    /dbfs/tmp/knmi_data/knmi_data_local/, and a second run fails because that
#    nested directory is not empty. `cp -r <src>/. <dst>/` merges the contents
#    into the existing directory, so the data lands at /dbfs/tmp/knmi_data/...
mkdir -p /dbfs/tmp/knmi_data
cp -r /tmp/knmi_data_local/. /dbfs/tmp/knmi_data/

# 4. Remove the scratch copy to free the driver's local disk.
rm -rf /tmp/knmi_data_local

In [0]:
# List the extracted files in DBFS (equivalent to the `%fs ls` magic).
display(dbutils.fs.ls("dbfs:/tmp/knmi_data/data"))

In [0]:
# Check the column order declared in the files: take the last "#"-prefixed
# line of every file (presumably the column header row — TODO confirm) and
# show the distinct variants found across all files.

from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Read every file as raw text lines, tagging each line with its source path.
df = spark.read.text("dbfs:/tmp/knmi_data/data/").select(
    "value",
    F.col("_metadata.file_path").alias("file_id"),
)

# Keep only the lines that start with "#".
comment_lines = df.where(F.col("value").startswith("#"))

# Within each file, order lines from last to first.
# NOTE(review): monotonically_increasing_id() is used as a proxy for the
# line's position in the file; this assumes ids grow in read order — confirm.
window_spec = (
    Window
    .partitionBy("file_id")
    .orderBy(F.monotonically_increasing_id().desc())
)

# rank == 1 marks the last "#" line of each file.
last_comment_df = comment_lines.select(
    "*",
    F.row_number().over(window_spec).alias("rank"),
)

# One row per distinct "last comment line" across all files.
final_unique_comments = (
    last_comment_df
    .where(F.col("rank") == 1)
    .select("value")
    .dropDuplicates()
)

display(final_unique_comments)