In [0]:
from pyspark.sql.functions import col, explode, split, udf, lit, regexp_replace, length
# --- Sampling configuration --------------------------------------------------
# Bodies with this many characters or fewer are treated as tiny/empty emails
# and excluded from the training sample.
MIN_BODY_LENGTH = 100
# Number of rows collected onto the driver for the initial training sample.
SAMPLE_SIZE = 1000

# Load the "Bronze" table created by the Spark job in bronze_etl
# (this is the read from the Delta Lake).
df_bronze = spark.table("voc_bronze_layer")

# Keep only the columns needed for training, and drop rows whose cleaned body
# text is missing or too short to be useful.
df_body_text = (
    df_bronze
    .select("message_id", "body_clean")
    .where(
        col("body_clean").isNotNull()
        & (length(col("body_clean")) > MIN_BODY_LENGTH)
    )
)

# Take a fixed-size, non-random sample.
# limit() on an unordered DataFrame gives no guarantee about WHICH rows are
# returned, so the sample could change between runs. Sorting on message_id
# first makes "Restart & Run All" reproduce the same rows.
# NOTE(review): this assumes message_id is unique per row; ties on the sort key
# could still be broken arbitrarily - confirm against the bronze schema.
# The sort means every qualifying row has to be examined; drop the orderBy()
# if that cost ever outweighs reproducibility.
df_sample = df_body_text.orderBy("message_id").limit(SAMPLE_SIZE)

# Convert to pandas.
# spaCy and similar libraries are easier to drive from a pandas DataFrame.
# toPandas() collects every row of df_sample from the cluster onto the driver
# node as a single in-memory object, which is why the row count is capped above.
pdf_sample = df_sample.toPandas()

# Report how many rows were actually loaded (this can be fewer than SAMPLE_SIZE
# if not enough rows pass the filter) and preview the first few.
print(f"Loaded {len(pdf_sample)} rows into a Pandas DataFrame.")
pdf_sample.head()

In [0]:
# Print a concise summary of the sampled pandas DataFrame (columns, dtypes,
# non-null counts, memory usage) as a quick sanity check on the collected data.
pdf_sample.info()

In [0]:
# Print the full cleaned body of the first sampled email for a visual check.
# Guard against an empty sample: .iloc[0] on an empty frame raises a bare
# IndexError, which hides the real cause (no rows passed the upstream filter).
# NOTE(review): this output may contain real email content; clear the cell
# output before sharing or committing the notebook.
if pdf_sample.empty:
    print("pdf_sample is empty - no email body to preview.")
else:
    print(pdf_sample['body_clean'].iloc[0])
