In [0]:
# Name passed as the first argument to ai_query (the model to call).
llm = "databricks-meta-llama-3-3-70b-instruct"
# Instruction text that is concatenated in front of each review before it is sent
# to the model. It asks for location, three 1-5 scores (0 when not mentioned),
# product name(s) and an urgency flag.
# This text is embedded in a SQL string literal inside the ai_query expression
# built later in the notebook, so take care with single quotes and backslashes
# when editing it.
prompt = """Extract the following information from the review: 
Location (city name, street name, whatever may be mentioned in the review), Service Score (1-5), Product Score (1-5), Product Name (if mentioned) (comma separated if multiple), Atmosphere Score [cleanliness, accessibility, location, etc] (1-5), Urgency (high, low) [set to high if the review is actionable and due to exceptionally good or poor service]. 
If the review doesnt contain an element, leave it blank or set it to zero. For instance, if the review does not mention service, then set service_score = 0. Urgency should always have a value. All scores should be 1-5 (if they are not 0), with 1 being the worst and 5 being the best.
Review: """

In [0]:
# JSON response-format specification handed to ai_query through responseFormat.
# It declares the object the model should return: location and product_name as
# strings, service/product/atmosphere scores as integers, and urgency restricted
# to "high" or "low".
# The property names and types here should stay in sync with the StructType that
# from_json uses to parse the model response.
response_schema = """
{
    "type": "json_schema",
    "json_schema": {
        "name": "review_extraction",
        "schema": {
            "type": "object",
            "properties": {
                "location": { "type": "string" },
                "service_score": { "type": "integer" },
                "product_score": { "type": "integer" },
                "product_name": { "type": "string" },
                "atmosphere_score": { "type": "integer" },
                "urgency": { "type": "string" ,
                        "enum": ["high", "low"] }
            }
        },
        "strict": true
    }
}
"""

In [0]:
from pyspark.sql.functions import col, expr, split, trim, regexp_replace, from_json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

def _sql_string_literal(text):
    """Escape ``text`` for embedding inside a single-quoted Spark SQL string literal.

    Backslashes and single quotes are backslash-escaped so the SQL parser
    reads the value back unchanged instead of ending the literal early.

    Args:
        text: Raw Python string to embed.

    Returns:
        The escaped string, without the surrounding quotes.
    """
    return text.replace("\\", "\\\\").replace("'", "\\'")


# Regex that removes the standalone word "cookie"/"cookies" (case-insensitive)
# from product names.
COOKIE_WORD_PATTERN = r"(?i)\bcookies?\b"

# Spark-side schema used to parse the JSON string returned by ai_query.
# Field names and types mirror the properties declared in response_schema.
schema = StructType([
    StructField("location", StringType(), True),
    StructField("service_score", IntegerType(), True),
    StructField("product_score", IntegerType(), True),
    StructField("product_name", StringType(), True),
    StructField("atmosphere_score", IntegerType(), True),
    StructField("urgency", StringType(), True)
])

# Read the streaming data from the source table
reviews_stream = (
    spark.readStream
    .table("retail_prod.media.reviews")
)

# Build the ai_query call. The model name, prompt and response format are
# escaped so that a quote or backslash in any of them cannot break the SQL.
ai_query_sql = (
    f"ai_query('{_sql_string_literal(llm)}', "
    f"CONCAT('{_sql_string_literal(prompt)}', review), "
    f"responseFormat => '{_sql_string_literal(response_schema)}')"
)

# Apply the AI query to extract structured information
structured_reviews = (
    reviews_stream
    .withColumn("structured_review", expr(ai_query_sql))
)

# Parse the structured review JSON and flatten it next to the source columns
parsed_reviews = (
    structured_reviews
    .withColumn("structured_review", from_json(col("structured_review"), schema))
    .select(
        col("structured_review.location").alias("location"),
        col("structured_review.service_score").alias("service_score"),
        col("structured_review.product_score").alias("product_score"),
        col("structured_review.atmosphere_score").alias("atmosphere_score"),
        col("structured_review.urgency").alias("urgency"),
        col("structured_review.product_name").alias("product_name"),
        col("review"),
        col("franchiseID").cast("string").alias("franchiseID"),
        col("review_date")
    )
)

# Transform the parsed reviews:
#   * empty location and zero scores become NULL ("not mentioned"),
#   * product_name becomes an array of trimmed, non-empty names with the word
#     "cookie(s)" removed.
# The regex is applied through the column functions rather than inside expr():
# the Spark SQL parser unescapes backslashes in string literals, so a '\b'
# written in SQL text reaches the regex engine as a backspace character
# instead of a word boundary and the pattern never matches.
transformed_reviews = (
    parsed_reviews
    .withColumn(
        "product_name",
        split(
            trim(regexp_replace(col("product_name"), COOKIE_WORD_PATTERN, "")),
            ","
        )
    )
    .select(
        expr("CASE WHEN location = '' THEN NULL ELSE location END").alias("location"),
        expr("NULLIF(service_score, 0)").alias("service_score"),
        expr("NULLIF(product_score, 0)").alias("product_score"),
        expr("NULLIF(atmosphere_score, 0)").alias("atmosphere_score"),
        col("urgency"),
        # Drop blank entries, then trim whitespace around each remaining name.
        expr(
            "TRANSFORM("
            "FILTER(product_name, x -> TRIM(x) != ''), "
            "x -> TRIM(x)"
            ")"
        ).alias("product_name"),
        col("review"),
        col("franchiseID"),
        col("review_date")
    )
)

In [0]:
# Render the transformed reviews in the notebook for inspection.
# NOTE(review): transformed_reviews is built from spark.readStream, so this
# presumably starts a streaming query that also evaluates ai_query for the
# displayed rows — confirm this preview is wanted in addition to the write below.
display(transformed_reviews)

In [0]:

# Persist the structured reviews by appending them to the gold Delta table.
# NOTE(review): "/path/to/checkpoint" looks like a placeholder — confirm the
# real checkpoint location before running.
gold_writer = (
    transformed_reviews.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/path/to/checkpoint")
    .trigger(availableNow=True)
)

# Start the streaming write into the target table.
query = gold_writer.table("retail_prod.media.reviews_structured_gold")

# Block the cell until the streaming query terminates.
query.awaitTermination()