In [0]:
# Gold table: split each silver ticket's LLM text into fixed-size character
# chunks (one row per chunk), tag every chunk with an index / id / length,
# and persist the result for downstream embedding.

from pyspark.sql import functions as F
from pyspark.sql import Window

SILVER_TABLE = "genai_support_pipeline.silver_support_tickets_clean"
GOLD_TABLE = "genai_support_pipeline.gold_support_tickets_chunks"

# Load silver table
silver = spark.table(SILVER_TABLE)
print("silver_count:", silver.count())

# Chunk text for LLM
CHUNK_SIZE = 500  # maximum number of characters per chunk

# Rows with NULL text yield no chunks. They are dropped explicitly here because
# greatest() skips NULLs, so without the filter they would surface as a single
# NULL chunk instead of disappearing.
#
# sequence() includes its stop value, so the upper bound is `length - 1`:
# using `length` would append an empty trailing chunk whenever the text length
# is an exact multiple of CHUNK_SIZE. greatest(..., 0) keeps an empty string as
# one empty chunk instead of handing sequence() a negative bound.
df = silver.filter(F.col("text_for_llm").isNotNull()).withColumn(
    "chunk_array",
    F.expr(f"""
        transform(
            sequence(0, greatest(length(text_for_llm) - 1, 0), {CHUNK_SIZE}),
            i -> substring(text_for_llm, i + 1, {CHUNK_SIZE})
        )
    """)
)

# Explode into one-row-per-chunk.
# posexplode returns each element's 0-based position alongside its value, so
# the chunk order comes straight from the array rather than from a window
# ordered by monotonically_increasing_id().
df_chunks = df.select(
    "ticket_key",
    "ticket_status_fixed",
    "ticket_priority_fixed",
    "ticket_channel_fixed",
    "customer_gender_fixed",
    F.posexplode("chunk_array").alias("chunk_pos", "text_chunk"),
)

# Add chunk index (1-based, integer) and drop the helper position column so the
# output column order stays: ..., text_chunk, chunk_index.
df_chunks = df_chunks.withColumn(
    "chunk_index",
    (F.col("chunk_pos") + 1).cast("int")
).drop("chunk_pos")

# Add embedding-ready metadata
df_chunks = df_chunks.withColumn(
    "chunk_id",
    F.concat_ws("_", F.col("ticket_key"), F.col("chunk_index"))
).withColumn(
    "chunk_length",
    F.length("text_chunk")
)

# Write Gold Table
df_chunks.write.mode("overwrite").saveAsTable(GOLD_TABLE)

# Count the persisted table: this reports what was actually written and avoids
# re-running the whole chunking pipeline a second time.
print("gold_count:", spark.table(GOLD_TABLE).count())



silver_count: 9119
gold_count: 9119


In [0]:
# Sanity check on the chunk size: show the longest and the average character
# length of the text column that gets chunked.
text_len = F.length("text_for_llm")

silver.agg(
    F.max(text_len).alias("max_len"),
    F.avg(text_len).alias("avg_len"),
).show()


+-------+-----------------+
|max_len|          avg_len|
+-------+-----------------+
|    437|178.5810944182476|
+-------+-----------------+

