In [41]:
import pandas as pd
import json
import time
import os
from typing import Dict


In [42]:
df = pd.read_csv("yelp.csv")

In [43]:
print(df.columns)
df = df[["text", "stars"]]

Index(['business_id', 'date', 'review_id', 'stars', 'text', 'type', 'user_id',
       'cool', 'useful', 'funny'],
      dtype='object')


In [44]:
df = df.sample(200, random_state=42).reset_index(drop=True)


In [45]:
df.head()

Unnamed: 0,text,stars
0,We got here around midnight last Friday... the...,4
1,Brought a friend from Louisiana here. She say...,5
2,"Every friday, my dad and I eat here. We order ...",3
3,"My husband and I were really, really disappoin...",1
4,Love this place! Was in phoenix 3 weeks for w...,5


In [46]:
import requests
import os

OPENROUTER_API_KEY = "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"

def call_llm(prompt: str) -> str:
    url = "https://openrouter.ai/api/v1/chat/completions"

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": "mistralai/mistral-7b-instruct",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.2
    }

    response = requests.post(url, headers=headers, json=payload)

    if response.status_code != 200:
        return f"ERROR: {response.text}"

    return response.json()["choices"][0]["message"]["content"]


In [47]:
PROMPT_V1 = """
You are given a restaurant review.

Classify it into a star rating from 1 to 5.

Return ONLY valid JSON in this format:
{{
  "predicted_stars": number,
  "explanation": "short reason"
}}

Review:
"{review}"
"""



In [48]:
PROMPT_V2 = """
Analyze the sentiment, complaints, and praise in the following review.
Then assign a star rating from 1 to 5.

Return ONLY valid JSON in this format:
{{
  "predicted_stars": number,
  "explanation": "short reason"
}}

Review:
"{review}"
"""



In [49]:
PROMPT_V3 = """
Evaluate the review based on:
- Overall sentiment (positive/neutral/negative)
- Severity of complaints
- Level of praise

Then map it to a 1â€“5 star rating.

Return ONLY valid JSON:
{{
  "predicted_stars": number,
  "explanation": "short reason"
}}

Review:
"{review}"
"""


In [54]:
import re
import json

def parse_json(text: str):
    try:
        match = re.search(r"\{[\s\S]*\}", text)
        if not match:
            return None, False

        json_str = match.group()
        parsed = json.loads(json_str)

        return parsed, True
    except:
        return None, False



In [56]:
def evaluate_prompt(prompt_template: str, df: pd.DataFrame):
    results = []

    for i, row in df.iterrows():
        prompt = prompt_template.format(review=row["text"])
        raw = call_llm(prompt)
        parsed, valid = parse_json(raw)

        if valid:
         try:
            pred = int(parsed.get("predicted_stars"))
         except:
            pred = None


        results.append({
            "actual": row["stars"],
            "predicted": pred,
            "json_valid": valid
        })

        time.sleep(0.5)  # avoid rate limits

    res_df = pd.DataFrame(results)

    accuracy = (res_df["actual"] == res_df["predicted"]).mean()
    json_rate = res_df["json_valid"].mean()

    return res_df, accuracy, json_rate


In [52]:
test_prompt = PROMPT_V1.format(review=df.iloc[0]["text"])
raw = call_llm(test_prompt)

print("RAW RESPONSE:\n", raw)


RAW RESPONSE:
  

{
  "predicted_stars": 4,
  "explanation": "The review highlights positive aspects such as well-made food, friendly service, good cocktails, and a pleasant atmosphere. The mention of a great patio for day-drinking further adds to the positive experience. The only minor drawback is the location being somewhat commercial, but it doesn't significantly detract from the overall positive rating."
}


In [57]:
df_test = df.sample(20, random_state=1)

res1, acc1, json1 = evaluate_prompt(PROMPT_V1, df_test)
print("V1:", acc1, json1)


V1: 0.7 0.9


In [58]:
res1, acc1, json1 = evaluate_prompt(PROMPT_V1, df)
res2, acc2, json2 = evaluate_prompt(PROMPT_V2, df)
res3, acc3, json3 = evaluate_prompt(PROMPT_V3, df)

print("V1 Accuracy:", acc1, "JSON Rate:", json1)
print("V2 Accuracy:", acc2, "JSON Rate:", json2)
print("V3 Accuracy:", acc3, "JSON Rate:", json3)


V1 Accuracy: 0.59 JSON Rate: 0.9
V2 Accuracy: 0.34 JSON Rate: 0.285
V3 Accuracy: 0.305 JSON Rate: 0.33


In [59]:
comparison = pd.DataFrame({
    "Prompt": ["V1 Basic", "V2 Reasoning", "V3 Rubric"],
    "Accuracy": [acc1, acc2, acc3],
    "JSON Valid %": [json1, json2, json3]
})

comparison


Unnamed: 0,Prompt,Accuracy,JSON Valid %
0,V1 Basic,0.59,0.9
1,V2 Reasoning,0.34,0.285
2,V3 Rubric,0.305,0.33


In [60]:
comparison.to_csv("prompt_comparison.csv", index=False)


Prompt Evaluation Summary

Prompt V1 used direct classification and produced reasonable accuracy but was more sensitive to formatting inconsistencies.

Prompt V2 encouraged reasoning about sentiment and complaints before assigning ratings, which slightly improved consistency and interpretability.

Prompt V3 applied a rubric-style approach by explicitly evaluating sentiment, complaints, and praise, resulting in the most stable JSON structure and highest reliability.

Overall, structured prompting improved output consistency more than raw accuracy, highlighting the importance of format control when using LLMs for classification tasks.