In [0]:
# Storage location and the DBFS path it should be mounted at
storage_account_name = "voc43cdmqlsgj6nostorage"
container_name = "data"
mount_point = "/mnt/data"

# Read the storage account key from the secret scope; report and re-raise
# if the lookup fails so the cell stops here
try:
    secret_key = dbutils.secrets.get(
        scope="voc_project_scope",
        key="storage-account-key",
    )
except Exception as e:
    print("Looks like your secret scope or key isn't set up.")
    raise e

# Config key under which the account key is handed to the mount call
config = f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net"

# Only mount when the mount point is not already present, so re-running
# this cell does not attempt a second mount
existing_mount_points = [mount.mountPoint for mount in dbutils.fs.mounts()]
if mount_point in existing_mount_points:
    print(f"{mount_point} is already mounted.")
else:
    print(f"Mounting {mount_point}...")
    dbutils.fs.mount(
        source=f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net",
        mount_point=mount_point,
        extra_configs={config: secret_key},
    )
    print(f"Successfully mounted {mount_point}!")

# List the mounted directory as a quick verification
print("--- Files in mount point ---")
display(dbutils.fs.ls(mount_point))

In [0]:
# Import all the functions we'll need
from pyspark.sql.functions import col, explode, split, udf, lit, regexp_replace, length, trim
from pyspark.sql.types import StructType, StructField, StringType
import mailbox
import email

# -----------------------------------------------------------------
# 1. --- Drop the Old Table for a Fresh Start ---
# -----------------------------------------------------------------
# Removing any previous table first lets this cell be re-run from scratch
print("Dropping old table (if it exists) to prevent schema errors...")
spark.sql("DROP TABLE IF EXISTS voc_bronze_layer")

# -----------------------------------------------------------------
# 2. --- Read the Data ---
# -----------------------------------------------------------------
# wholeTextFiles yields one (path, full file content) pair per .mbox file
print("Reading raw .mbox files...")
rdd = spark.sparkContext.wholeTextFiles("/mnt/data/*.mbox")
df_raw_files = rdd.toDF(["filepath", "raw_text"])

# -----------------------------------------------------------------
# 3. --- Split Files into Individual Emails ---
# -----------------------------------------------------------------
# Cut each file at every newline followed by "From " and emit one row per
# piece; pieces of 10 characters or fewer are discarded
email_pieces = explode(split(col("raw_text"), "\nFrom "))
df_split_emails = (
    df_raw_files
    .select("filepath", email_pieces.alias("raw_email_text"))
    .filter(length(col("raw_email_text")) > 10)
)

# -----------------------------------------------------------------
# 4. --- Define the Robust Parsing UDF ---
# -----------------------------------------------------------------
def _decode_text_part(part):
    """Decode the payload of a single message part to ``str``.

    Returns ``None`` when the part has no byte payload to decode (for
    example a multipart container). Undecodable bytes are dropped. If the
    declared charset is not a codec Python knows, UTF-8 is used instead so
    the text is kept rather than discarded.
    """
    raw = part.get_payload(decode=True)
    if not isinstance(raw, bytes):
        return None
    charset = part.get_content_charset() or 'utf-8'
    try:
        return raw.decode(charset, 'ignore')
    except (LookupError, ValueError):
        # Unknown or malformed charset label (e.g. "x-unknown"): fall back
        # to UTF-8 instead of losing the whole body.
        return raw.decode('utf-8', 'ignore')


def get_plain_text_payload(msg):
    """Return the text of the first decodable ``text/plain`` part of *msg*.

    Args:
        msg: A parsed email message (``email.message.Message`` or subclass).

    Returns:
        The decoded body as ``str``, or ``None`` when the message contains
        no ``text/plain`` part with a decodable payload.
    """
    # walk() yields the message itself first, then every nested sub-part,
    # so single-part and multipart messages are handled by the same loop.
    for part in msg.walk():
        if part.get_content_type() != 'text/plain':
            continue
        text = _decode_text_part(part)
        if text is not None:
            return text
        # Otherwise keep looking: a later text/plain part may be usable.
    return None

def parse_email_text(raw_text):
    """Parse one raw email chunk into a 5-tuple of strings.

    "From " is prepended to the chunk before it is parsed. The result is
    (Message-ID, From, Subject, Date, body); a missing header becomes
    'N/A' and a message without a plain-text body gets the
    "N/A_NO_PLAIN_TEXT" marker. Any exception raised along the way yields
    a tuple of five 'PARSE_ERROR' values instead of propagating.
    """
    failure_row = ('PARSE_ERROR',) * 5
    try:
        parsed = mailbox.Message("From " + raw_text)
        text = get_plain_text_payload(parsed)
        header_values = tuple(
            parsed.get(header_name, 'N/A')
            for header_name in ('Message-ID', 'From', 'Subject', 'Date')
        )
        return header_values + ("N/A_NO_PLAIN_TEXT" if text is None else text,)
    except Exception:
        return failure_row

# Struct returned by the parsing UDF: five nullable string fields, listed in
# the same order as the tuple that parse_email_text returns
email_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("message_id", "sender", "subject", "date_str", "body")
])

# Wrap the Python parser as a Spark UDF producing the struct above
parse_email_udf = udf(parse_email_text, email_schema)
print("UDF is defined and registered.")

# -----------------------------------------------------------------
# 5. --- Apply UDF, Clean, and Save ---
# -----------------------------------------------------------------
print("Applying UDF...")
df_parsed = df_split_emails.withColumn(
    "parsed_email",
    parse_email_udf(col("raw_email_text")),
)

print("Cleaning and filtering...")
# Flatten the parsed struct into top-level columns; "body" is renamed to
# "body_raw" because later steps derive cleaned versions from it
parsed_field_columns = [
    col(f"parsed_email.{struct_field}").alias(output_name)
    for struct_field, output_name in (
        ("message_id", "message_id"),
        ("sender", "sender"),
        ("subject", "subject"),
        ("date_str", "date_str"),
        ("body", "body_raw"),
    )
]

# Drop rows carrying the parser's failure / no-plain-text sentinel values
parsed_ok = col("message_id") != "PARSE_ERROR"
has_plain_text = col("body_raw") != "N/A_NO_PLAIN_TEXT"

df_bronze_clean = (
    df_parsed
    .select("filepath", *parsed_field_columns)
    .where(parsed_ok)
    .where(has_plain_text)
)

# --- Text cleaning: quoted replies, signatures, blank-line runs, outer whitespace ---
print("Applying final text cleaning (quotes, signatures, newlines)...")
df_final_clean = df_bronze_clean.withColumn(
    "body_no_quotes",
    # 1. Blank out every line that starts with ">" (quoted reply text)
    regexp_replace(col("body_raw"), r"(?m)^\>.*", "")
).withColumn(
    "body_no_sigs",
    # 2. Remove the signature: everything from a line consisting only of
    #    "--" (optionally followed by spaces/tabs, as in the conventional
    #    "-- " delimiter) to the end of the body. The line anchors keep a
    #    line that merely ends in "--" (e.g. a "-----" divider) from
    #    truncating the rest of the message.
    regexp_replace(col("body_no_quotes"), r"(?ms)^--[ \t]*$.*", "")
).withColumn(
    "body_squashed",
    # 3. Squash runs of 3+ newlines down to 2
    regexp_replace(col("body_no_sigs"), r"\n{3,}", "\n\n")
).withColumn(
    "body_clean",
    # 4. Strip ALL leading/trailing whitespace. A regex is used because
    #    Spark's trim() removes only space characters and would leave the
    #    newlines that steps 1-2 tend to expose at both ends of the body.
    regexp_replace(col("body_squashed"), r"^\s+|\s+$", "")
)

# Select only the columns we want to save
df_to_save = df_final_clean.select(
    "message_id",
    "sender",
    "subject",
    "date_str",
    "body_clean"  # the fully cleaned body
)

# -----------------------------------------------------------------
# 6. --- Save as "Bronze" Delta Lake Table ---
# -----------------------------------------------------------------
print("Saving to Delta table 'voc_bronze_layer'...")
# Configure a Delta writer in overwrite mode, then persist it as a table
bronze_writer = df_to_save.write.format("delta").mode("overwrite")
bronze_writer.saveAsTable("voc_bronze_layer")

print("--- Successfully REBUILT voc_bronze_layer Delta table! ---")