In [0]:
# Storage settings for the blob container this notebook mounts.
storage_account_name = "voc43cdmqlsgj6nostorage"
container_name = "data"
mount_point = "/mnt/data"

# Pull the account key out of the Databricks secret scope so it never
# appears in the notebook. Any failure is reported with a hint and re-raised.
try:
    secret_key = dbutils.secrets.get(
        scope="voc_project_scope",
        key="storage-account-key",
    )
except Exception:
    print("Looks like your secret scope or key isn't set up.")
    raise

# Spark configuration key under which the account key is passed to the mount.
config = f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net"

# Only mount when the mount point is not present yet, so the cell can be
# re-run without failing on an existing mount.
existing_mount_points = {mount.mountPoint for mount in dbutils.fs.mounts()}
if mount_point in existing_mount_points:
    print(f"{mount_point} is already mounted.")
else:
    print(f"Mounting {mount_point}...")
    dbutils.fs.mount(
        source=f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net",
        mount_point=mount_point,
        extra_configs={config: secret_key},
    )
    print(f"Successfully mounted {mount_point}!")

# Sanity check: show what is visible under the mount point.
print("--- Files in mount point ---")
display(dbutils.fs.ls(mount_point))

In [0]:
from pyspark.sql.functions import col, explode, split, udf, lit, regexp_replace
from pyspark.sql.types import StructType, StructField, StringType
import mailbox
import email # We need the 'email' library to handle parsing exceptions

# --- Read the Data ---
# wholeTextFiles yields one (path, content) pair per file, so every .mbox
# file arrives as a single string rather than line by line.
rdd = spark.sparkContext.wholeTextFiles("/mnt/data/*.mbox")
df_raw_files = rdd.toDF(schema=["filepath", "raw_text"])

# --- Split Files into Individual Emails ---
# Messages in an .mbox file are separated by a line that begins with
# "From ". Splitting on a newline followed by "From " turns each file into
# an array of raw message strings; explode() then emits one row per message.
# NOTE: the delimiter is consumed by the split, so every chunk except the
# first one in a file no longer starts with "From ".
raw_email_chunks = split(col("raw_text"), "\nFrom ")  # newline + "From" + space

df_split_emails = df_raw_files.select(
    col("filepath"),
    explode(raw_email_chunks).alias("raw_email_text"),
)

# --- Define the Parsing UDF (User-Defined Function) ---
# This is the "brain" of our parser. We use Python's 'mailbox' library
# to robustly parse the raw email string.
def _decode_text_part(part):
    """Return the payload of one non-multipart MIME part as a ``str``.

    Text parts sent as base64 or quoted-printable are decoded using the
    part's declared charset (UTF-8 when none is declared or the charset is
    unknown). Every other payload is returned exactly as the parser stored
    it, so plain 7bit/8bit bodies come back untouched.
    """
    raw = part.get_payload()
    if not isinstance(raw, str):
        # Defensive: a leaf part is expected to hold a string payload.
        return '' if raw is None else str(raw)

    transfer_encoding = str(part.get('Content-Transfer-Encoding', '')).strip().lower()
    if (part.get_content_maintype() != 'text'
            or transfer_encoding not in ('base64', 'quoted-printable')):
        return raw

    decoded = part.get_payload(decode=True)
    if decoded is None:
        return raw

    charset = part.get_content_charset() or 'utf-8'
    try:
        return decoded.decode(charset, errors='replace')
    except LookupError:
        # The message declared a charset Python does not know.
        return decoded.decode('utf-8', errors='replace')


def _extract_body_text(msg):
    """Return the readable body of ``msg`` as a single string.

    For multipart messages ``get_payload()`` is a list of sub-messages, not
    text, so the text parts are collected instead: all non-attachment
    ``text/plain`` parts, or any other ``text/*`` parts when no plain-text
    part exists. Parts are joined with a newline.
    """
    if not msg.is_multipart():
        return _decode_text_part(msg)

    plain_parts = []
    other_text_parts = []
    for part in msg.walk():
        if part.is_multipart() or part.get_content_maintype() != 'text':
            continue
        if part.get_content_disposition() == 'attachment':
            continue
        if part.get_content_subtype() == 'plain':
            plain_parts.append(_decode_text_part(part))
        else:
            other_text_parts.append(_decode_text_part(part))
    return "\n".join(plain_parts or other_text_parts)


def parse_email_text(raw_text):
    """
    Parse one raw mbox message chunk into its main fields.

    Args:
        raw_text: The message text with the leading "From " separator
            removed (it is re-added here so the first line is treated as
            the mbox envelope line). May be None.

    Returns:
        A 5-tuple of strings ``(message_id, sender, subject, date, body)``.
        Missing headers are reported as 'N/A'. The body is always a string,
        including for multipart messages. If the input is None or the
        message cannot be parsed, every element is 'PARSE_ERROR'.
    """
    parse_failure = ('PARSE_ERROR',) * 5

    # A null input cannot be parsed; report it like any other bad message
    # instead of failing on the string concatenation below.
    if raw_text is None:
        return parse_failure

    try:
        # Re-add the separator that was consumed when the file was split.
        msg = mailbox.Message("From " + raw_text)

        # .get() with a default avoids errors when a header is missing.
        return (
            msg.get('Message-ID', 'N/A'),
            msg.get('From', 'N/A'),
            msg.get('Subject', 'N/A'),
            msg.get('Date', 'N/A'),
            _extract_body_text(msg),
        )
    except email.errors.MessageParseError:
        # Handle malformed emails gracefully
        return parse_failure

# Spark has to know the UDF's return type up front. The parser returns five
# values, so the schema is a struct of five nullable string fields, listed
# in the same order as the tuple returned by parse_email_text.
email_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("message_id", "sender", "subject", "date_str", "body")
])

# Wrap the plain Python function so it can be applied to DataFrame columns.
parse_email_udf = udf(parse_email_text, returnType=email_schema)

# --- Apply the UDF and Clean the Data ---
# Parse every raw email chunk; the result lands in one struct column.
df_parsed = df_split_emails.withColumn(
    "parsed_email",
    parse_email_udf(col("raw_email_text")),
)

# Body with quoted-reply lines blanked out: in multiline mode, any line that
# starts with ">" is replaced by an empty string. Signatures are handled in
# a separate step further down.
body_without_quotes = regexp_replace(col("parsed_email.body"), r"(?m)^\>.*", "")

# Promote the struct fields to top-level columns, keep the raw text around
# for debugging, and drop rows the parser flagged as unparseable.
df_bronze_clean = (
    df_parsed
    .select(
        col("filepath"),
        col("parsed_email.message_id").alias("message_id"),
        col("parsed_email.sender").alias("sender"),
        col("parsed_email.subject").alias("subject"),
        col("parsed_email.date_str").alias("date_str"),
        body_without_quotes.alias("body_no_quotes"),
        col("raw_email_text"),
    )
    .filter(col("message_id") != "PARSE_ERROR")
)

# Strip signatures *after* the quoted lines are gone, then trim the result.
#
# Inner replace: drop everything from the signature delimiter to the end of
# the body. The conventional delimiter is "-- " (dash dash space) on its own
# line, so the optional trailing space and an optional carriage return
# (CRLF files) are accepted in addition to the bare "--\n" form.
# Outer replace: remove leading and trailing whitespace left behind by the
# quote and signature removal.
df_bronze_clean = df_bronze_clean.withColumn(
    "body_clean",
    regexp_replace(
        regexp_replace(col("body_no_quotes"), r"(?s)-- ?\r?\n.*", ""),
        r"^\s+|\s+$",
        "",
    ),
)

# --- Save as "Bronze" Delta Lake Table ---
# Persist the cleaned DataFrame as a managed Delta table so it can be
# queried by name. "overwrite" replaces any existing table contents, which
# suits a one-time load.
df_bronze_clean.write.saveAsTable(
    "voc_bronze_layer",
    format="delta",
    mode="overwrite",
)

print("--- Successfully created voc_bronze_layer Delta table! ---")