In [2]:
import pandas as pd
import numpy as np
import json
import requests
import re
import os
import time

from tqdm import tqdm
from jsonschema import validate
from sklearn.metrics import accuracy_score


In [3]:
DATA_PATH = "/kaggle/input/yelp-reviews-dataset/yelp.csv"

df = pd.read_csv(DATA_PATH)

# Keep only required columns
df = df[['text', 'stars']]

df.head()


Unnamed: 0,text,stars
0,My wife took me here on my birthday for breakf...,5
1,I have no idea why some people give bad review...,5
2,love the gyro plate. Rice is so good and I als...,4
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5
4,General Manager Scott Petello is a good egg!!!...,5


In [4]:
df_sample = (
    df.groupby('stars', group_keys=False)
      .apply(lambda x: x.sample(min(len(x), 40), random_state=42))
      .sample(frac=1, random_state=42)
      .reset_index(drop=True)
)

df_sample['stars'].value_counts()


  .apply(lambda x: x.sample(min(len(x), 40), random_state=42))


stars
3    40
1    40
4    40
2    40
5    40
Name: count, dtype: int64

In [5]:
PROMPTS = {
    "v1_zero_shot": """
Classify the following Yelp review into a 1â€“5 star rating.

You MUST respond with ONLY a valid JSON object.
Do NOT include any text before or after the JSON.
Do NOT use markdown or code blocks.

The JSON MUST be:
{{
  "predicted_stars": number,
  "explanation": string
}}

Review:
{review}
""",

    "v2_rubric_based": """
You are a sentiment classification system.

Use this rubric:
1 = Very negative experience
2 = Mostly negative
3 = Mixed or neutral experience
4 = Mostly positive experience
5 = Very positive experience

You MUST respond with ONLY a valid JSON object.
Do NOT include any text before or after the JSON.
Do NOT use markdown or code blocks.

The JSON MUST be:
{{
  "predicted_stars": number,
  "explanation": string
}}

Review:
{review}
""",

    "v3_step_based": """
You are a strict Yelp review rating classifier.

Internally:
1. Identify sentiment (positive, neutral, negative).
2. Map sentiment to a 1â€“5 star rating.
3. Ensure explanation matches the rating.

You MUST respond with ONLY a valid JSON object.
Do NOT include any text before or after the JSON.
Do NOT use markdown or code blocks.

The JSON MUST be:
{{
  "predicted_stars": number,
  "explanation": string
}}

Review:
{review}
"""
}


In [8]:
from getpass import getpass

os.environ["OPENROUTER_API_KEY"] = getpass("Enter OpenRouter API key: ")


Enter OpenRouter API key:  Â·Â·Â·Â·Â·Â·Â·Â·


In [9]:
def call_llm(prompt):
    headers = {
        "Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://kaggle.com",
        "X-Title": "Yelp Rating Prediction"
    }

    payload = {  # IMPORTANT CHANGE
        "model": "openai/gpt-3.5-turbo",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0
    }

    response = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=30
    )

    if response.status_code != 200:
        raise RuntimeError(response.text)

    text = response.json()["choices"][0]["message"]["content"]

    # Safe JSON extraction
    match = re.search(r"\{[\s\S]*?\}", text)
    if not match:
        raise ValueError(f"No JSON found:\n{text}")

    return json.loads(match.group())


In [10]:
output_schema = {
    "type": "object",
    "properties": {
        "predicted_stars": {"type": "number"},
        "explanation": {"type": "string"}
    },
    "required": ["predicted_stars", "explanation"]
}

def evaluate_prompt(prompt_template, df):
    predictions = []
    valid_json_count = 0

    for _, row in tqdm(df.iterrows(), total=len(df)):
        prompt = prompt_template.format(
            review=row["text"][:2000]
        )

        try:
            response = call_llm(prompt)

            # ðŸ”§ Normalize star value
            response["predicted_stars"] = int(float(response["predicted_stars"]))
            response["predicted_stars"] = min(5, max(1, response["predicted_stars"]))

            validate(instance=response, schema=output_schema)

            predictions.append(response["predicted_stars"])
            valid_json_count += 1

        except Exception:
            predictions.append(None)

        time.sleep(0.5)

    df_result = df.copy()
    df_result["predicted_stars"] = predictions

    valid_rows = df_result.dropna(subset=["predicted_stars"])

    accuracy = (
        accuracy_score(valid_rows["stars"], valid_rows["predicted_stars"])
        if len(valid_rows) > 0 else np.nan
    )

    return {
        "accuracy": accuracy,
        "json_validity_rate": valid_json_count / len(df),
        "results": df_result
    }


In [11]:
results = {}

for name, prompt in PROMPTS.items():
    print(f"\nEvaluating {name}")
    results[name] = evaluate_prompt(prompt, df_sample)



Evaluating v1_zero_shot


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 200/200 [05:09<00:00,  1.55s/it]



Evaluating v2_rubric_based


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 200/200 [03:16<00:00,  1.02it/s]



Evaluating v3_step_based


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 200/200 [02:10<00:00,  1.54it/s]


In [12]:
comparison_table = pd.DataFrame([
    {
        "Prompt Version": name,
        "Accuracy": round(res["accuracy"], 3),
        "JSON Validity Rate": round(res["json_validity_rate"], 3)
    }
    for name, res in results.items()
])

comparison_table


Unnamed: 0,Prompt Version,Accuracy,JSON Validity Rate
0,v1_zero_shot,0.635,1.0
1,v2_rubric_based,0.595,0.37
2,v3_step_based,,0.0


In [24]:
comparison_table.to_csv("prompt_comparison_results.csv", index=False)
results["v3_step_based"]["results"].to_csv(
    "v3_step_based_predictions.csv", index=False
)
