In [0]:
# Model identifier that reviews_structured() interpolates as the first
# argument of the ai_query() SQL call.
llm = "llama-3-3-70b-instruct-pt"

# Instruction text that reviews_structured() prepends to every review (via SQL
# CONCAT) to form the ai_query() request.
# NOTE(review): this text is spliced into a SQL expression there -- check how
# that expression quotes it before adding apostrophes or backslashes here.
prompt = """Extract the following information from the review: 
Location (city name, street name, whatever may be mentioned in the review), Service Score (1-5), Product Score (1-5), Product Name (if mentioned) (comma separated if multiple), Atmosphere Score [cleanliness, accessibility, location, etc] (1-5), Urgency (high, low) [set to high if the review is actionable and due to exceptionally good or poor service]. 
If the review doesnt contain an element, leave it blank or set it to zero. For instance, if the review does not mention service, then set service_score = 0. Urgency should always have a value. All scores should be 1-5 (if they are not 0), with 1 being the worst and 5 being the best.
Review: """

# JSON document passed as the responseFormat argument of ai_query() in
# reviews_structured(). The property names declared here are the keys that
# reviews_structured_gold() extracts from the model output, so the two must
# stay in sync.
response_schema = """
{
    "type": "json_schema",
    "json_schema": {
        "name": "review_extraction",
        "schema": {
            "type": "object",
            "properties": {
                "location": { "type": "string" },
                "service_score": { "type": "integer" },
                "product_score": { "type": "integer" },
                "product_name": { "type": "string" },
                "atmosphere_score": { "type": "integer" },
                "urgency": { "type": "string" ,
                        "enum": ["high", "low"] }
            }
        },
        "strict": true
    }
}
"""

import dlt
from pyspark.sql.functions import col, expr, split, trim, regexp_replace

def _sql_string_literal(value):
    """Return *value* wrapped as a single-quoted Spark SQL string literal.

    Backslashes and single quotes are backslash-escaped so that the text can
    neither terminate the literal early nor be altered when the SQL parser
    unescapes it. Text without those characters is emitted unchanged.

    NOTE(review): assumes the default parser setting
    (spark.sql.parser.escapedStringLiterals = false) -- confirm for this
    pipeline.
    """
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


@dlt.table
def reviews_structured():
    """Attach the model's structured extraction to every raw review.

    Reads ``retail_prod.media.reviews`` and adds a ``structured_review``
    column holding the result of ``ai_query`` called with the module-level
    ``llm`` model name, ``prompt`` concatenated with the ``review`` column,
    and ``response_schema`` as the ``responseFormat``.

    Returns:
        The source DataFrame with the extra ``structured_review`` column.
    """
    # Every Python value is escaped before being spliced into the SQL text;
    # plain interpolation would break on the first apostrophe in the prompt.
    ai_call = (
        f"ai_query({_sql_string_literal(llm)}, "
        f"CONCAT({_sql_string_literal(prompt)}, review), "
        f"responseFormat => {_sql_string_literal(response_schema)})"
    )
    return (
        spark.table("retail_prod.media.reviews")
        .withColumn("structured_review", expr(ai_call))
    )

@dlt.table
def reviews_structured_gold():
    """Flatten the JSON in ``structured_review`` into typed, cleaned columns.

    Reads the ``reviews_structured`` dataset, extracts each field of the
    model's JSON answer and normalises the "missing" placeholders:

    * an empty ``location`` becomes NULL;
    * a score of 0 becomes NULL;
    * ``product_name`` becomes an array of trimmed, non-empty names with the
      word "cookie"/"cookies" (any case) removed.

    Returns:
        DataFrame with columns ``location``, ``service_score``,
        ``product_score``, ``atmosphere_score``, ``urgency``,
        ``product_name``, ``review``, ``franchiseID`` (cast to string) and
        ``review_date``.
    """
    # (JSON key, SQL type) for every field pulled out of the model output.
    json_fields = (
        ("location", "string"),
        ("service_score", "int"),
        ("product_score", "int"),
        ("atmosphere_score", "int"),
        ("urgency", "string"),
        ("product_name", "string"),
    )
    parsed_reviews = dlt.read("reviews_structured").selectExpr(
        *[
            f"parse_json(structured_review):{name}::{sql_type} AS {name}"
            for name, sql_type in json_fields
        ],
        "*",
    )

    # The regexp is applied through the DataFrame API rather than inside
    # expr(): within a SQL string literal the parser unescapes "\b" to a
    # backspace character, so the word-boundary pattern would never match and
    # "cookie(s)" would be left in the product names.
    product_name_parts = split(
        trim(regexp_replace(col("product_name"), r"(?i)\bcookies?\b", "")),
        ",",
    )

    return (
        parsed_reviews
        # Helper column so the SQL higher-order functions below can refer to
        # the split array by name; it is not part of the final projection.
        .withColumn("_product_name_parts", product_name_parts)
        .select(
            expr("CASE WHEN location = '' THEN NULL ELSE location END").alias("location"),
            expr("NULLIF(service_score, 0)").alias("service_score"),
            expr("NULLIF(product_score, 0)").alias("product_score"),
            expr("NULLIF(atmosphere_score, 0)").alias("atmosphere_score"),
            col("urgency"),
            # Drop entries left blank by the removal above, then trim the rest.
            expr(
                "TRANSFORM("
                "FILTER(_product_name_parts, x -> TRIM(x) != ''), "
                "x -> TRIM(x)"
                ")"
            ).alias("product_name"),
            col("review"),
            col("franchiseID").cast("string").alias("franchiseID"),
            col("review_date"),
        )
    )
