In [0]:
from pyspark.sql.functions import col, explode, split, udf, lit, regexp_replace, length
# Load the "Bronze" table created by the Spark job in bronze_etl.
# (Per the original notebook, this is the read from the Delta Lake.)
df_bronze = spark.table("voc_bronze_layer")

# Tunables for the training sample.
MIN_BODY_LENGTH = 100   # bodies with this many characters or fewer are dropped as tiny/empty
SAMPLE_SIZE = 1000      # number of rows pulled onto the driver for labeling

# Keep only the columns needed for training: the id and the cleaned body text.
df_body_text = (
    df_bronze
    .select("message_id", "body_clean")
    .where(col("body_clean").isNotNull())
    .where(length(col("body_clean")) > MIN_BODY_LENGTH)
)

# Take a small, non-random sample.
# A bare .limit() gives no guarantee about WHICH rows come back, so a re-run
# could return a different sample. Cells further down refer to examples by
# their position in data derived from this sample, so the selection must be
# stable: sort first, then limit. "body_clean" is only a tie-breaker in case
# message_id is not unique (uniqueness is not visible from this notebook).
df_sample = df_body_text.orderBy("message_id", "body_clean").limit(SAMPLE_SIZE)

# Convert to pandas.
# .toPandas() collects every row of the (already limited) Spark DataFrame onto
# the driver as one in-memory object, which is easier to feed to spaCy and
# similar single-machine libraries.
pdf_sample = df_sample.toPandas()

# Display the result
print(f"Loaded {len(pdf_sample)} rows into a Pandas DataFrame.")
pdf_sample.head()

In [0]:
# Sanity-check the collected sample: column names, dtypes, non-null counts
# and memory usage of the pandas DataFrame.
pdf_sample.info()

In [0]:
# Print one complete cleaned body (the row at position 4) to eyeball the text.
print(pdf_sample['body_clean'].iat[4])


In [0]:
# Let's look at a few clean emails from our sample to get started.
# Each body is printed under a header carrying its position in the pandas
# DataFrame, so it can easily be copy-pasted.

PREVIEW_COUNT = 3  # how many emails to show

# Bound the loop by the sample size so a sample smaller than PREVIEW_COUNT
# does not raise an IndexError.
for position in range(min(PREVIEW_COUNT, len(pdf_sample))):
    # Every header after the first is preceded by a blank line.
    separator = "" if position == 0 else "\n"
    print(f"{separator}--- Email at index {position} ---")
    print(pdf_sample['body_clean'].iloc[position])

In [0]:
# Our list of "seed" keywords to bootstrap the labeling.
#
# Each entry is searched for as a literal, case-insensitive phrase, so an entry
# must be text that can actually appear verbatim in an email. The acronym and
# its expansion are therefore listed as two separate entries ("AQE" and
# "Adaptive Query Execution") instead of one combined
# "AQE (Adaptive Query Execution)" string, which would only ever match that
# exact parenthesised wording.
#
# NOTE: changing this list changes which emails get pre-labeled, so any
# positions previously picked out of the pre-labeled draft must be re-reviewed.
ASPECT_KEYWORDS = [
    # Core Components
    "Spark Core", "scheduler", "task scheduling", "dynamic allocation", "shuffle",

    # Spark SQL & DataFrames
    "Spark SQL", "DataFrame", "Dataset API", "query optimizer", "Catalyst",
    "AQE", "Adaptive Query Execution", "query execution", "data source",

    # Streaming
    "Structured Streaming", "micro-batch", "streaming query", "DStream",
    "streaming performance", "latency", "watermarking", "stateful streaming",

    # Performance & ML
    "performance tuning", "MLlib", "caching", "memory management", "data skew",

    # Other
    "Spark Connect", "PySpark", "data source API", "connector"
]

print(f"Loaded {len(ASPECT_KEYWORDS)} aspect keywords.")

In [0]:
import re

# Pre-label the pandas sample with seed-keyword matches.

ENTITY_LABEL = "ASPECT"


def build_keyword_pattern(keywords):
    """Compile one case-insensitive regex that matches any of ``keywords``.

    Keywords are escaped, so they are matched as literal text, and are tried
    longest-first: when two keywords could start at the same position (e.g.
    "data source" and "data source API") the longer one wins.

    Args:
        keywords: iterable of keyword strings; empty strings are ignored.

    Returns:
        A compiled pattern, or ``None`` when there is no usable keyword (an
        empty alternation would match the empty string at every position).
    """
    usable = sorted((kw for kw in keywords if kw), key=len, reverse=True)
    if not usable:
        return None
    return re.compile("|".join(re.escape(kw) for kw in usable), re.IGNORECASE)


def find_aspect_entities(text, pattern, label=ENTITY_LABEL):
    """Return the keyword hits in ``text`` as ``(start, end, label)`` tuples.

    ``finditer`` scans left to right and resumes after each match, so the
    returned spans are sorted by start offset and never overlap.

    Args:
        text: the string to search.
        pattern: compiled pattern from ``build_keyword_pattern``, or ``None``.
        label: label stored in every returned tuple.

    Returns:
        A list of ``(start, end, label)`` tuples; empty when nothing matched.
    """
    if pattern is None:
        return []
    return [(hit.start(), hit.end(), label) for hit in pattern.finditer(text)]


# Compiled once up front instead of once per keyword per email.
ASPECT_PATTERN = build_keyword_pattern(ASPECT_KEYWORDS)

# This will hold our "draft" of the training data
TRAINING_DATA_DRAFT = []

# Walk the email bodies in DataFrame order.
for text in pdf_sample['body_clean']:
    entities = find_aspect_entities(text, ASPECT_PATTERN)

    # CRITICAL: We only add the email if we found at least one entity.
    # This creates a "biased" dataset, which we must be aware of.
    if entities:
        # Spans are already overlap-free here; overlapping entity spans are
        # not accepted by spaCy's NER, which this draft is being prepared for.
        TRAINING_DATA_DRAFT.append((text, {"entities": entities}))

print("--- Pre-Labeling Complete ---")
print(f"Found {len(TRAINING_DATA_DRAFT)} emails with at least one aspect.")
print("Here's a sample of what we found:")
# Show the matched phrases of the first few drafts so the line above is
# actually followed by a sample.
for sample_text, sample_annotations in TRAINING_DATA_DRAFT[:3]:
    print([sample_text[start:end] for start, end, _ in sample_annotations["entities"]])

In [0]:
# Helper to review our draft.
# Change 'i' to look at a different example; valid values run from 0 to
# len(TRAINING_DATA_DRAFT) - 1.

i = 100  # <-- CHANGE THIS NUMBER TO SEE THE NEXT EXAMPLE

# Fail with an explicit message (still an IndexError) when 'i' does not point
# at an existing draft entry, e.g. because this run matched fewer emails.
if not 0 <= i < len(TRAINING_DATA_DRAFT):
    raise IndexError(
        f"i={i} is out of range: TRAINING_DATA_DRAFT has "
        f"{len(TRAINING_DATA_DRAFT)} examples"
    )

(text, data) = TRAINING_DATA_DRAFT[i]

print(f"--- Reviewing Example {i} ---")
print("\nFULL TEXT:\n")
print(text)
print("\n------------------------------")
print(f"\nFOUND LABELS: {data['entities']}")

# Raw offsets are hard to judge by eye, so also show the text each span covers.
for start, end, label in data['entities']:
    print(f"  {label} [{start}:{end}] -> {text[start:end]!r}")

In [0]:
# This is our FINAL, clean training data, based on our review.
#
# Reviewing every draft example by hand is tedious, so the selection was
# limited to the 25 examples below that looked good.
#
# The numbers are positions in TRAINING_DATA_DRAFT, so they are only meaningful
# for the exact draft list that was reviewed.
REVIEWED_DRAFT_INDICES = (
    0,    # This one looked good
    1,    # This one also looked good
    10,   # This one was good too
    14, 21, 23, 25, 26, 29,
    35, 39, 42, 44, 45, 46, 49,
    51, 52, 58, 65, 70, 78, 79,
    92, 100,
)

TRAINING_DATA = [TRAINING_DATA_DRAFT[position] for position in REVIEWED_DRAFT_INDICES]

print(f"Final, reviewed TRAINING_DATA contains {len(TRAINING_DATA)} examples.")