In [ ]:
# Bronze load for Microsoft Teams messages: read the landed JSON files,
# flatten the fields of interest, de-duplicate by message id, and write a Delta table.
from pyspark.sql import functions as F

# Glob matching every part file under the yyyy=/mm=/dd=/partition= folders
# of the Teams messages landing area.
landing_glob = "Files/landing/m365/teams/messages/yyyy=*/mm=*/dd=*/partition=*/part-*.json"
raw = spark.read.json(landing_glob)

# Flatten the nested message payload into the bronze column set.
bronze_df = (
    raw
    .select(
        F.col("id").alias("message_id"),
        F.col("chatId").alias("conversation_id"),
        F.col("from.user.displayName").alias("sender_name"),
        # Prefer the explicit e-mail; fall back to the UPN when it is null.
        F.coalesce(
            F.col("from.user.email"),
            F.col("from.user.userPrincipalName"),
        ).alias("sender_email"),
        # Replace every markup tag with a single space to get plain text.
        F.regexp_replace(F.col("body.content"), "<[^>]+>", " ").alias("message_text"),
        # No explicit pattern on purpose: a fixed "...HH:mm:ss'Z'" pattern
        # cannot parse values that carry fractional seconds
        # (e.g. 2024-01-01T12:00:00.123Z) and treats 'Z' as a literal
        # character instead of a UTC designator, so the result would depend
        # on the session time zone. The default string-to-timestamp cast
        # accepts ISO-8601 with optional fractions and a zone suffix.
        F.to_timestamp(F.col("createdDateTime")).alias("timestamp_utc"),
        F.col("extensions.customer_org").alias("customer_org"),
    )
    # Keep a single row per message id (which duplicate survives is arbitrary).
    .dropDuplicates(["message_id"])
)

# Replace the whole bronze table on every run, letting the schema be
# overwritten as well.
(
    bronze_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("Tables/Bronze/teams_messages")
)
display(bronze_df)