In [0]:
from pyspark.sql.functions import input_file_name
from pyspark.sql.functions import to_timestamp, col, year, month

# SECURITY: a read-only, container-scoped SAS token (valid 2025-01-20 .. 2026-01-01) was
# previously pasted here in plain text. Treat that token as compromised: revoke/rotate it
# and keep any replacement in the Databricks secret scope, never in source.
# Container URL: https://experimentcidt.blob.core.windows.net/databricks?<SAS query string redacted>

# --- Storage location and identity settings ---------------------------------
# Blob container and storage account that hold the CSV files read further down.
container = "databricks"
storage_account = "experimentcidt"
# Azure AD tenant; used to build the OAuth token endpoint URL below.
tenant_id = "80a5cb6b-ae21-4ea8-bd3f-25e005cefc5b"
# NOTE(review): not referenced anywhere else in the visible code — confirm it is still needed.
managed_identity_client_id = "33f7c0cd-4fa6-432d-9be6-225da9c1768b"

# The account key is read from the Databricks secret scope instead of being hard-coded.
storage_account_key = dbutils.secrets.get(scope="Experiments3", key="storage-account-key")

# Session-level Spark settings, applied one by one in the order listed.
for _conf_name, _conf_value in (
    # Account-key authentication for the storage account's blob endpoint.
    (f"fs.azure.account.key.{storage_account}.blob.core.windows.net", storage_account_key),
    # Switch datetime parsing to the legacy parser.
    # NOTE(review): presumably required by the 'EEE MMM dd HH:mm:ss yyyy' pattern used
    # when _time is parsed later in this script — confirm.
    ("spark.sql.legacy.timeParserPolicy", "LEGACY"),
    # OAuth (client-credentials) settings.
    # NOTE(review): no client id / client secret is configured in the visible code and the
    # CSV read uses wasbs:// — confirm whether these three settings are actually needed.
    ("fs.azure.account.auth.type", "OAuth"),
    (
        "fs.azure.account.oauth.provider.type",
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    ),
    (
        "fs.azure.account.oauth2.client.endpoint",
        f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
    ),
):
    spark.conf.set(_conf_name, _conf_value)



# Load every *.csv at the root of the container into a single DataFrame.
#   header      - take column names from the first line of each file
#   inferSchema - let Spark infer the column types from the data
#   quote/escape - both set to the double-quote character
#   multiLine   - allow a record to span more than one physical line
# A `source_file` column records which input file each row was read from.
df_with_source = (
    spark.read
    .options(
        header="true",
        inferSchema="true",
        quote="\"",
        escape="\"",
        multiLine="true",
    )
    .csv(f"wasbs://{container}@{storage_account}.blob.core.windows.net/*.csv")
    .withColumn("source_file", input_file_name())
)

# Clean-up pass:
#   1. keep only rows whose `user` and `cpd_name` both differ from the literal string 'NULL'
#      (NOTE(review): under Spark's null comparison semantics, rows where either column is
#      a real null do not satisfy `!=` and are dropped as well — confirm this is intended);
#   2. collapse rows that share the same (_time, user, cpd_name) into a single row.
df_with_source = (
    df_with_source
    .filter(
        (df_with_source["user"] != "NULL")
        & (df_with_source["cpd_name"] != "NULL")
    )
    .dropDuplicates(["_time", "user", "cpd_name"])
)

# Report how many rows survive the clean-up (this triggers a Spark job).
print(f"Total number of rows: {df_with_source.count()}")

# Derive time columns from the raw `_time` string (e.g. 'Sun Jan 19 14:20:55 2025'):
#   iso_timestamp - `_time` parsed into a proper timestamp
#   year, month   - calendar parts of iso_timestamp, used below as partition columns
# NOTE(review): this pattern presumably depends on spark.sql.legacy.timeParserPolicy being
# set to LEGACY earlier in the script — confirm before changing either.
df_with_source = (
    df_with_source
    .withColumn("iso_timestamp", to_timestamp(col("_time"), "EEE MMM dd HH:mm:ss yyyy"))
    .withColumn("year", year("iso_timestamp"))
    .withColumn("month", month("iso_timestamp"))
)

# Preview the first 10 rows of the three key columns without truncating long values.
df_with_source.select("iso_timestamp", "user", "cpd_name").show(n=10, truncate=False)

# Persist the combined data as table experiments3.logging.combined, replacing any
# existing contents and partitioning the output by year and month.
(
    df_with_source.write
    .mode("overwrite")
    .partitionBy("year", "month")
    .saveAsTable("experiments3.logging.combined")
)


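# --- Alternative write strategies kept for reference (commented out, not executed) ---

# 1) Explicit Delta format, overwriting the same table without partitioning:
#df_with_source.write.format("delta").mode("overwrite").saveAsTable("experiments3.logging.combined")

# 2) Append to a path-based Delta table.
#    NOTE(review): refers to a `df` variable and "jaar"/"maand" (Dutch for year/month)
#    partition columns, none of which exist in this script — update before re-enabling.
#df.write \
#    .format("delta") \
#    .partitionBy("jaar", "maand") \
#    .mode("append") \
#    .save("/mnt/delta/combined_table")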