In [3]:
from pyspark.sql import functions as F

# Bronze ingest for email messages: read the landed JSON files, project them
# onto the Bronze column set, de-duplicate by message id, and overwrite the
# Bronze Delta table.

# 1) Define the path to the email JSON files
email_glob = "Files/landing/email/yyyy=*/mm=*/dd=*/partition=*/part-*.json"

# 2) Read the raw email data
# multiLine=True makes Spark parse each file as a whole JSON document instead
# of expecting one record per line.
raw_emails = spark.read.option("multiLine", True).json(email_glob)

# 3) Transform to Bronze schema
# Columns kept in the Bronze table, in output order.
bronze_columns = [
    "internet_message_id",
    "thread_id",
    "from_name",
    "from_email",
    "received_utc",
    "subject",
    "body_text",
    "customer_org",
]

emails_df = (
    raw_emails
    # Only received_utc needs a conversion; every other column passes through
    # unchanged, so no per-column withColumn is required.
    # The zone designator is parsed with the offset pattern (XXX accepts "Z")
    # rather than matched as a quoted literal: a literal 'Z' is discarded and
    # the value would then be interpreted in the Spark session time zone,
    # shifting every timestamp whenever that zone is not UTC.
    .withColumn(
        "received_utc",
        F.to_timestamp(F.col("received_utc"), "yyyy-MM-dd'T'HH:mm:ssXXX"),
    )
    .select(*bronze_columns)
    # One row per message id. NOTE(review): when several rows share an id,
    # which one is kept is not specified by dropDuplicates -- confirm that
    # duplicates are exact copies, or add an explicit ordering if not.
    .dropDuplicates(["internet_message_id"])
)

# 4) Save as Delta table in Bronze layer
# mode("overwrite") replaces the table contents on every run, and
# overwriteSchema lets the stored schema be replaced along with the data.
(
    emails_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("Tables/Bronze/emails")
)

# 5) Display the result
display(emails_df)

StatementMeta(, 90a5a190-72c0-477c-95ed-80fcdcf0add5, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 9458dea9-8f5b-41c7-9acf-d65b22d0f928)