Create generic 3PL Accessorial Sample Dataset

In [0]:
# Sample records for the bronze table: each entry is an (id, attribute) tuple.
# `attribute` is a free-text description followed by " - "-separated tags; the
# tags look like type / provider / region / expedite flag / temperature flag
# (TODO confirm the intended field meanings). Spelling, casing and word order
# deliberately vary between rows, and some rows omit a tag.
accessorial_data = [
    # ids 1-31: shipment accessorial descriptions (liftgate, residential
    # delivery, expedited shipping, pallet jack, temperature control, billing).
    (1,  "Liftgate Service - Handling - CarrierA - Midwest - NoExp - NoTemp"),
    (2,  "lift gate service - Handling - CarrierB - Midwest - NoExp - NoTemp"),
    (3,  "Lift-Gate Service Charge - Handling - CarrierC - Midwest - NoExp - NoTemp"),
    (4,  "Service Liftgate Handling - CarrierA - Midwest - NoExp - NoTemp"),
    (5,  "Handling CarrierB Lift Gate Service - Midwest - NoExp - NoTemp"),
    (6,  "LIFtgate - Handling - CarrierA - MidWest - NoExp - NoTemp"),
    (7,  "Lifgtate Servic - Handling - CarrierB - Midwest - NoExp - NoTemp"),
    (8,  "Residential Delivery - Delivery - CarrierA - Northeast - NoExp - NoTemp"),
    (9,  "res. delivery charge - Delivery - CarrierC - Northeast - NoExp - NoTemp"),
    (10, "Delivery Fee Residential - CarrierB - Northeast - NoExp - NoTemp"),
    (11, "Residential Delivery Fee - Delivery - CarrierB - Northeast - NoExp - NoTemp"),
    (12, "Expedited Shipping - Delivery - CarrierA - South - Exp - NoTemp"),
    (13, "Ship Fee expedited - Delivery - CarrierB - South - Exp - NoTemp"),
    (14, "Exp Ship Fee - Delivery - CarrierA - South - Exp - NoTemp"),
    (15, "Pallet Jack Usage - Handling - Warehouse1 - Midwest - NoExp - NoTemp"),
    (16, "Pallet Jack Use Fee - Handling - Warehouse1 - Midwest - NoExp - NoTemp"),
    (17, "Usage Pallet Jack Fee - Warehouse1 - Midwest - NoExp - NoTemp"),
    (18, "Temperature Controlled Fee - Handling - CarrierC - West - NoExp - Temp"),
    (19, "Temp Ctrl Fee - Handling - CarrierC - West - NoExp - Temp"),
    (20, "Temp Controlled Handling Fee - CarrierC - West - NoExp - Temp"),
    (21, "Freight Bill Correction - Adjustment - BillingSys - All - NoExp - NoTemp"),
    (22, "Freight Bill Corr. Fee - Adjustment - BillingSys - All - NoExp - NoTemp"),
    (23, "Bill Correction Freight - Adjustment - BillingSys - All - NoExp - NoTemp"),
    (24, "Lift Gate Delivery - Delivery - CarrierB - Midwest - NoExp - NoTemp"),
    (25, "Delivery Lift Gate - CarrierB - Midwest - NoExp - NoTemp"),
    (26, "Lift Gate Delvry Service - Delivery - CarrierB - Midwest - NoExp - NoTemp"),
    (27, "Residential Delvry Fee - Delivery - CarrierA - Northeast - NoExp - NoTemp"),
    (28, "Residential Delivery Extra Charge - Delivery - CarrierA - Northeast - NoExp - NoTemp"),
    (29, "Express Shipping Service - Delivery - CarrierB - South - Exp - NoTemp"),
    (30, "Express Ship Fee - Delivery - CarrierC - South - Exp - NoTemp"),
    (31, "Temperature Control Charge - Handling - CarrierC - West - NoExp - Temp"),
    # ids 32-50: food and beverage descriptions mixed into the same dataset.
    (32, "Seasonal Salad - Appetizer - OutletA - Downtown - NoExp - NoTemp"),
    (33, "Salad Mix with Nuts - Appetizer - OutletB - Uptown - NoExp - NoTemp"),
    (34, "Fresh Green Salad with Dressings - Appetizer - OutletA - Downtown - NoExp - NoTemp"),
    (35, "Classic Tomato Pie - Main - OutletC - Midtown - NoExp - NoTemp"),
    (36, "Tomato Flatbread - Main - OutletC - Midtown - NoExp - NoTemp"),
    (37, "Cheese Flatbread - Main - OutletD - Midtown - NoExp - NoTemp"),
    (38, "Grilled Chicken Wrap - Main - OutletE - Downtown - NoExp - NoTemp"),
    (39, "Chicken Wrap Grilled - Main - OutletE - Downtown - NoExp - NoTemp"),
    (40, "Veggie Wrap - Main - OutletE - Downtown - NoExp - NoTemp"),
    (41, "Vegan Wrap Deluxe - Main - OutletE - Downtown - NoExp - NoTemp"),
    (42, "Chocolate Dessert Cake - Dessert - OutletF - Suburb - NoExp - NoTemp"),
    (43, "Chocolate Fudge Cake - Dessert - OutletF - Suburb - NoExp - NoTemp"),
    (44, "Caramel Ice Dessert - Dessert - OutletF - Suburb - NoExp - NoTemp"),
    (45, "Vanilla Ice Dessert - Dessert - OutletF - Suburb - NoExp - NoTemp"),
    (46, "Lemon Drink - Beverage - OutletA - Downtown - NoExp - NoTemp"),
    (47, "Lemon Citrus Juice - Beverage - OutletA - Downtown - NoExp - NoTemp"),
    (48, "Fresh Lemon Drink - Beverage - OutletA - Downtown - NoExp - NoTemp"),
    (49, "Iced Herbal Tea - Beverage - OutletA - Downtown - NoExp - NoTemp"),
    (50, "Herbal Tea Iced - Beverage - OutletA - Downtown - NoExp - NoTemp"),
]

# Column names applied to the (id, attribute) sample tuples.
columns = ["id", "attribute"]

# Make sure the target schema exists before anything is saved into it.
spark.sql(
    """
    CREATE SCHEMA IF NOT EXISTS sandbox.bronze
    """
)

# Build the DataFrame and replace the bronze Delta table with its contents.
df_accessorial = spark.createDataFrame(accessorial_data, schema=columns)
bronze_writer = df_accessorial.write.format("delta").mode("overwrite")
bronze_writer.saveAsTable("sandbox.bronze.accessorials")

# Render the sample records in the notebook output.
display(df_accessorial)

- Compose Prompt Instructions
- Query AI Model
- Parse & Save Response
(without MLflow)

In [0]:
import json
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Step 1: Collect the id/attribute pairs of the bronze table onto the driver
bronze_rows = (
    spark.table("sandbox.bronze.accessorials")
    .select("id", "attribute")
    .collect()
)

# Step 2: Render one "<id>: <attribute>" line per record for the prompt body
data_str = "\n".join(f"{row.id}: {row.attribute}" for row in bronze_rows)

# Step 3: Build the instructions, including the "Snack" category rule
prompt = f"""
You are given a list of shipment accessorial service descriptions, each with a unique ID.

Your task:
1. Compute semantic similarity and fuzzy matches between descriptions.
2. Group similar items into clusters representing the same entity despite typos or rearrangement.
3. Assign each cluster a numeric entity_group_id starting from 1.
4. Select a canonical representative description per cluster (shortest/clearest).
5. For each item, if the description or grouped attribute relates to any kind of food or beverage, assign the category "Snack".
   Otherwise, assign the category as the grouped_attribute value.
6. Output a JSON array of objects with fields: id, attribute, entity_group_id, grouped_attribute, category.

Provide ONLY the JSON array of objects.

Data:
{data_str}
"""

# Step 4: Expose the prompt to SQL through a single-row temp view
prompt_df = spark.createDataFrame([(prompt,)], ["prompt"])
prompt_df.createOrReplaceTempView("prompt_table")

# Step 5: Send the prompt to the model and keep the text of the single result row
ai_query_sql = """
SELECT ai_query(
    'databricks-meta-llama-3-3-70b-instruct',
    prompt
) AS raw_json
FROM prompt_table
"""
response_rows = spark.sql(ai_query_sql).collect()

raw_json = response_rows[0]["raw_json"]

# Step 6: Parse JSON response safely and create Spark DataFrame
if raw_json and raw_json.strip():
    # A Markdown code fence is three backticks; the literal is built by
    # repetition so it cannot be swallowed by Markdown-based export tooling.
    code_fence = "`" * 3

    raw_json_clean = raw_json.strip()
    # Unwrap the payload only when it really starts with a fence. Checking
    # against an empty prefix/suffix would always match and would cut the
    # last three characters off every response.
    if raw_json_clean.startswith(code_fence):
        raw_json_clean = raw_json_clean[len(code_fence):]
        # Remove the optional language tag as an exact prefix; str.lstrip
        # treats its argument as a set of characters, not as a word.
        if raw_json_clean[:4].lower() == "json":
            raw_json_clean = raw_json_clean[4:]
        raw_json_clean = raw_json_clean.strip()
        if raw_json_clean.endswith(code_fence):
            raw_json_clean = raw_json_clean[:-len(code_fence)].strip()

    try:
        parsed_list = json.loads(raw_json_clean)

        # Only a non-empty JSON array can be turned into rows
        if isinstance(parsed_list, list) and len(parsed_list) > 0:
            final_df = spark.createDataFrame(parsed_list)

            # Ensure the schema exists before saving the table
            spark.sql("CREATE SCHEMA IF NOT EXISTS sandbox.gold")

            # Persist DataFrame to Gold Delta table with overwrite mode
            final_df.write.mode("overwrite").format("delta").saveAsTable("sandbox.gold.accessorials")

            # Query and display Gold table in tabular format
            gold_df = spark.sql("SELECT * FROM sandbox.gold.accessorials")
            display(gold_df)
        else:
            print("Parsed JSON is not a valid list or is empty.")

    except json.JSONDecodeError as e:
        # Show the cleaned payload so the malformed part can be inspected
        print("Error parsing JSON from LLM response:")
        print(e)
        print("Raw JSON output was:")
        print(raw_json_clean)
else:
    print("LLM response empty or only whitespace - check prompt or model output.")

- Compose Prompt Instructions
- Query AI Model
- Parse & Save Response
(with MLflow)

In [0]:
import json
import re
import mlflow
from pyspark.sql import SparkSession

# Reuse the active Spark session, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()

def strip_markdown_fences(text):
    """Remove Markdown code-fence markers from ``text``.

    Every run of three or more backticks is deleted together with a language
    tag written directly after it (for example the ``json`` of an opening
    fence). All other content is kept, including ordinary lines that merely
    begin with the word "json".

    Args:
        text: Raw model output; ``None`` or an empty string is accepted.

    Returns:
        The text without fence markers, stripped of leading and trailing
        whitespace. Returns ``""`` when ``text`` is ``None`` or empty.
    """
    if not text:
        return ""
    # `{3,} matches the fence itself; the character class consumes an optional
    # language tag attached to it (letters, digits, "_", "+", "-").
    return re.sub(r"`{3,}[A-Za-z0-9_+-]*", "", text).strip()

def extract_json_and_reasoning(text):
    """Split model output into its JSON array and the reasoning that precedes it.

    The array is assumed to end at the last ``]`` in ``text``. Its start is the
    first ``[`` from which the slice up to that ``]`` parses as a JSON list, so
    brackets that appear inside the reasoning prose are skipped. If no such
    slice parses, the slice from the very first ``[`` is returned unchanged so
    the caller can report the parse error on it.

    Args:
        text: Model output with any code fences already removed; ``None`` or an
            empty string is accepted.

    Returns:
        A ``(json_part, reasoning_part)`` tuple. ``json_part`` is ``None`` when
        ``text`` has no ``[`` or no ``]``; in that case ``reasoning_part`` is the
        whole input (``""`` for ``None``).
    """
    if not text:
        return None, ""

    first = text.find('[')
    end = text.rfind(']')
    if first == -1 or end == -1:
        return None, text

    # Try each '[' in order; the reasoning may contain brackets of its own.
    start = first
    while start != -1 and start < end:
        candidate = text[start:end + 1]
        try:
            if isinstance(json.loads(candidate), list):
                return candidate, text[:start].strip()
        except ValueError:
            # Not valid JSON from this bracket; move on to the next one.
            pass
        start = text.find('[', start + 1, end)

    # Nothing parsed: hand back the widest slice, as the plain first/last
    # bracket search would, and let the caller deal with the decode error.
    return text[first:end + 1], text[:first].strip()

# Step 1: Collect the id/attribute pairs of the bronze table onto the driver
bronze_rows = (
    spark.table("sandbox.bronze.accessorials")
    .select("id", "attribute")
    .collect()
)

# Step 2: Render one "<id>: <attribute>" line per record for the prompt body
data_str = "\n".join(f"{row.id}: {row.attribute}" for row in bronze_rows)

# Step 3: Build the instructions; the model is asked to explain itself before the JSON
prompt = f"""
You are given a list of shipment accessorial service descriptions, each with a unique ID.

Your task:
1. Compute semantic similarity and fuzzy matches between descriptions.
2. Group similar items into clusters representing the same entity despite typos or rearrangement.
3. Assign each cluster a numeric entity_group_id starting from 1.
4. Select a canonical representative description per cluster (shortest/clearest).
5. For each item, if the description or grouped attribute relates to any kind of food or beverage, assign the category "Snack".
   Otherwise, assign the category as the grouped_attribute value.
6. Output a JSON array of objects with fields: id, attribute, entity_group_id, grouped_attribute, category.

Additionally, provide a detailed explanation of your reasoning and the steps you took to group and categorize the items, including examples of how you handled typos, rearrangements, and category assignments. Place your reasoning before the JSON output.

Provide your reasoning first, then the JSON array of objects.

Data:
{data_str}
"""

# Expose the prompt to SQL through a single-row temp view
prompt_df = spark.createDataFrame([(prompt,)], ["prompt"])
prompt_df.createOrReplaceTempView("prompt_table")

# Step 4: Send the prompt to the model and keep the text of the single result row
ai_query_sql = """
SELECT ai_query(
    'databricks-meta-llama-3-3-70b-instruct',
    prompt
) AS raw_output
FROM prompt_table
"""
response_rows = spark.sql(ai_query_sql).collect()

raw_output = response_rows[0]["raw_output"]

# Step 5: extract reasoning and JSON parts
# A missing (None) response is replaced by "" so that it reaches the
# "response empty" branch below instead of raising inside the helpers.
clean_output = strip_markdown_fences(raw_output or "")
json_str, reasoning = extract_json_and_reasoning(clean_output)

# Step 6: Log reasoning and prompt in MLflow experiment but do not print them
with mlflow.start_run(run_name="llm_accessorials_reasoning_log_only"):
    mlflow.log_text(f"PROMPT:\n{prompt}\n\nREASONING:\n{reasoning or ''}", "llm_reasoning.txt")

# Step 7: Parse the JSON part and persist it to the Gold table
if json_str and json_str.strip():
    try:
        parsed_list = json.loads(json_str)
        # Only a non-empty JSON array can be turned into rows
        if isinstance(parsed_list, list) and len(parsed_list) > 0:
            final_df = spark.createDataFrame(parsed_list)

            # The schema must exist before the table is saved into it
            spark.sql("CREATE SCHEMA IF NOT EXISTS sandbox.gold")
            final_df.write.mode("overwrite").format("delta").saveAsTable("sandbox.gold.accessorials")

            # Display the Gold table interactively
            gold_df = spark.sql("SELECT * FROM sandbox.gold.accessorials")
            display(gold_df)
        else:
            print("Parsed JSON is not a valid list or is empty.")
    except json.JSONDecodeError as e:
        # Show the extracted payload so the malformed part can be inspected
        print("Error parsing JSON from LLM response:")
        print(e)
        print("Raw JSON output was:")
        print(json_str)
else:
    print("LLM response empty or only whitespace - check prompt or model output.")