In [0]:
%pip install torch
%pip install transformers
%pip install spacy

In [0]:
%restart_python

In [0]:
from pyspark.sql.functions import col, explode, split, udf, lit, regexp_replace, length
import re
import spacy
import random
import os
from spacy.training import Example
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

In [0]:
# Read the "Bronze" Delta table produced by the Spark job in bronze_etl.
df_bronze = spark.table("voc_bronze_layer")

# Keep only the message id and the cleaned body text, and drop rows whose
# body is null or not longer than 100 characters (tiny/empty emails).
df_body_text = (
    df_bronze
    .select("message_id", "body_clean")
    .where(col("body_clean").isNotNull())
    .where(length(col("body_clean")) > 100)
)

# Cap the sample at 1000 rows, which is plenty for the initial training pass.
# `.limit()` is a quick, non-random sample.
# NOTE(review): no ordering is applied before `.limit()`, so the rows (and
# therefore any positional indices used later) may differ between runs — confirm.
df_sample = df_body_text.limit(1000)

# Hand the sample over to pandas: spaCy and friends are easier to drive from
# a pandas DataFrame. `.toPandas()` collects every row from the cluster onto
# the driver node as one in-memory object.
pdf_sample = df_sample.toPandas()

# Report how much we loaded and preview the first rows.
print(f"Loaded {len(pdf_sample)} rows into a Pandas DataFrame.")
pdf_sample.head()

In [0]:
# Print pandas' structural summary of the sampled frame (columns, dtypes,
# non-null counts) as a quick sanity check on what was loaded.
pdf_sample.info()

In [0]:
# Eyeball one cleaned email body (the row at position 4 of the sample) to get
# a feel for the text the keyword matcher will see.
print(pdf_sample['body_clean'].iloc[4])


In [0]:
# Seed vocabulary used to bootstrap the labeling.
# The keywords are grouped by topic for readability and then flattened, in
# declaration order, into the single ASPECT_KEYWORDS list.
# Note that some entries are substrings of others (e.g. "data source" and
# "data source API"), so one passage of text can match more than one keyword.
_KEYWORDS_BY_TOPIC = {
    "Core Components": (
        "Spark Core",
        "scheduler",
        "task scheduling",
        "dynamic allocation",
        "shuffle",
    ),
    "Spark SQL & DataFrames": (
        "Spark SQL",
        "DataFrame",
        "Dataset API",
        "query optimizer",
        "Catalyst",
        "AQE (Adaptive Query Execution)",
        "query execution",
        "data source",
    ),
    "Streaming": (
        "Structured Streaming",
        "micro-batch",
        "streaming query",
        "DStream",
        "streaming performance",
        "latency",
        "watermarking",
        "stateful streaming",
    ),
    "Performance & ML": (
        "performance tuning",
        "MLlib",
        "caching",
        "memory management",
        "data skew",
    ),
    "Other": (
        "Spark Connect",
        "PySpark",
        "data source API",
        "connector",
    ),
}

ASPECT_KEYWORDS = [
    keyword
    for topic_keywords in _KEYWORDS_BY_TOPIC.values()
    for keyword in topic_keywords
]

print(f"Loaded {len(ASPECT_KEYWORDS)} aspect keywords.")

In [0]:
# Draft training data: one (text, {"entities": [...]}) tuple per email in
# which at least one seed keyword was found.
TRAINING_DATA_DRAFT = []

# Compile every keyword once, up front, instead of once per email.
# 're.escape' makes the keyword match literally (e.g. the parentheses in the
# AQE entry); 're.IGNORECASE' lets "spark sql" match "Spark SQL".
_keyword_patterns = [
    re.compile(re.escape(keyword), re.IGNORECASE) for keyword in ASPECT_KEYWORDS
]


def _non_overlapping(spans):
    """Return `spans` with overlapping entries removed, sorted by position.

    `spans` is an iterable of (start, end, label) tuples. Longer spans win
    over shorter ones they overlap with (so "data source API" is kept rather
    than the "data source" inside it); ties go to the earlier span. Exact
    duplicates are collapsed.
    """
    kept = []
    # Visit longest spans first (start - end is the negated length).
    for start, end, label in sorted(set(spans), key=lambda s: (s[0] - s[1], s[0])):
        if all(end <= kept_start or start >= kept_end for kept_start, kept_end, _ in kept):
            kept.append((start, end, label))
    return sorted(kept)


# Scan every email body in the Pandas sample, in row order.
for text in pdf_sample['body_clean']:
    # Every (start, end, "ASPECT") hit of every keyword in this one email.
    candidate_spans = [
        (*match.span(), "ASPECT")
        for pattern in _keyword_patterns
        for match in pattern.finditer(text)
    ]

    # CRITICAL: We only add the email if we found at least one entity.
    # This creates a "biased" dataset, which we must be aware of.
    if candidate_spans:
        # Overlapping spans are resolved here because spaCy refuses to build
        # a training Example from conflicting entity offsets; left as-is,
        # such emails would be unusable for training.
        entities = _non_overlapping(candidate_spans)
        TRAINING_DATA_DRAFT.append((text, {"entities": entities}))

print("--- Pre-Labeling Complete ---")
print(f"Found {len(TRAINING_DATA_DRAFT)} emails with at least one aspect.")
print("Here's a sample of what we found:")
# Show a short preview of the first few drafted examples and their labels.
for sample_text, sample_annotations in TRAINING_DATA_DRAFT[:3]:
    print(f"  {sample_text[:80]!r} -> {sample_annotations['entities']}")

In [0]:
# Manual review helper: show one pre-labeled draft example at a time.
# Edit 'i' and re-run the cell to step through the draft
# (valid values are 0 .. len(TRAINING_DATA_DRAFT) - 1).

i = 100  # <-- CHANGE THIS NUMBER TO SEE THE NEXT EXAMPLE

text, data = TRAINING_DATA_DRAFT[i]

# Assemble the report pieces first, then emit them one per line.
review_lines = [
    f"--- Reviewing Example {i} ---",
    "\nFULL TEXT:\n",
    text,
    "\n------------------------------",
    f"\nFOUND LABELS: {data['entities']}",
]
print(*review_lines, sep="\n")

In [0]:
# Final, hand-reviewed training set.
#
# After going through the draft, these are the examples whose pre-labels
# looked good. The selection stops at 25 because reviewing them by hand is
# tedious. Each number is a position in TRAINING_DATA_DRAFT.
REVIEWED_DRAFT_INDICES = (
    0, 1, 10, 14, 21,
    23, 25, 26, 29, 35,
    39, 42, 44, 45, 46,
    49, 51, 52, 58, 65,
    70, 78, 79, 92, 100,
)

# Pull the approved (text, annotations) tuples out of the draft, in order.
TRAINING_DATA = [TRAINING_DATA_DRAFT[position] for position in REVIEWED_DRAFT_INDICES]

print(f"Final, reviewed TRAINING_DATA contains {len(TRAINING_DATA)} examples.")

In [0]:
# --- Create a Blank "Newborn" Model ---
# We start with a blank English pipeline that has no trained components.
nlp = spacy.blank("en")

# --- Add the "Aspect" Label to the Model ---
# Attach a Named Entity Recognition (NER) component, reusing it if present.
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# The NER component learns exactly one label: "ASPECT".
ner.add_label("ASPECT")

# --- Build the training examples once ---
# Each Example pairs a tokenized text with its gold annotations. Building them
# up front (instead of on every epoch) means bad data is reported exactly once
# rather than being silently dropped on every pass.
train_examples = []
skipped_examples = 0
for text, annotations in TRAINING_DATA:
    try:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    except Exception as e:
        # Best-effort: bad data (e.g. overlapping entities) is skipped, but
        # visibly, so a shrinking training set does not go unnoticed.
        skipped_examples += 1
        print(f"Skipping bad data: {e}")

print(f"Usable training examples: {len(train_examples)} (skipped {skipped_examples})")
if not train_examples:
    # Training on nothing would "succeed" with a loss of 0.0 and produce an
    # untrained model, so stop here instead.
    raise ValueError("No usable training examples; check TRAINING_DATA for invalid annotations.")

# --- Start the Training ---
print("Starting training...")
n_iter = 20 # We will show the model our textbook 20 times

# Initialize the pipeline's weights and get the optimizer.
# 'nlp.initialize' is the spaCy v3 replacement for the deprecated
# 'nlp.begin_training'.
optimizer = nlp.initialize(lambda: train_examples)

for itn in range(n_iter):
    # Shuffle each epoch so the model doesn't just memorize the order.
    # We shuffle our private list of examples, leaving TRAINING_DATA untouched
    # so re-running other cells sees it in its original order.
    random.shuffle(train_examples)
    losses = {}

    # --- This is the main training loop ---
    for example in train_examples:
        # One update step per example:
        # 1. Model "guesses" on the example
        # 2. Compares its guess to the correct answer
        # 3. Updates its weights to get better
        # Errors raised here are real training failures and are not swallowed.
        nlp.update([example], drop=0.5, losses=losses, sgd=optimizer)

    # Print the "loss" (mistake score). We want this number to go DOWN.
    print(f"Iteration {itn+1}/{n_iter}  |  Loss: {losses.get('ner', 0.0)}")

print("--- Training Complete ---")

# --- Save the Final, Trained Model ---
# The model goes into the 'models/' folder inside the Git repo. The path is
# relative: '../models/' goes "up one level" from the notebook's folder and
# "into models/".
output_dir = "../models/ner_model"

# Create the directory (and any missing parents) if needed. 'exist_ok=True'
# makes this a no-op when it is already there, without a separate
# check-then-create step.
os.makedirs(output_dir, exist_ok=True)

# Write all of the pipeline's files to that folder.
nlp.to_disk(output_dir)

print(f"Model saved to: {output_dir}")

In [0]:
# Which pretrained sentiment model to fetch, and where to store a local copy.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"
OUTPUT_DIR = "../models/sentiment_model"

# Create the directory (and any missing parents) if needed. 'exist_ok=True'
# makes this a no-op when it is already there, without a separate
# check-then-create step.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Downloading model '{MODEL_NAME}'...")

# Download the tokenizer and the sequence-classification model from
# Hugging Face. The model is what performs the sentiment classification.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

print("Download complete.")

# Save the tokenizer and model files locally
# so that the app can use them offline.
tokenizer.save_pretrained(OUTPUT_DIR)
model.save_pretrained(OUTPUT_DIR)

print(f"Model and tokenizer saved to: {OUTPUT_DIR}")