In [1]:
#Imports
import pandas as pd
import json
from tqdm import tqdm
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
#Load API Key
# Populate os.environ from a local .env file (if one exists) before reading the
# key. Without this call the imported load_dotenv is never used, and a key that
# lives only in .env is invisible to os.getenv on a fresh kernel.
# load_dotenv() is a no-op when no .env file is found and, by default, does not
# override variables that are already set in the environment.
load_dotenv()

# Chat client shared by every prompt-evaluation loop in this notebook.
llm = ChatGroq(
    api_key=os.getenv("GROQ_API_KEY"),
    model_name="llama-3.1-8b-instant"
)


In [11]:
#Load Yelp Dataset
# Read the raw reviews, keep only the review text and its star rating, drop
# rows missing either value, then draw a fixed-seed sample of 200 reviews so
# every prompt version is evaluated on the same rows.
df = (
    pd.read_csv("yelp.csv")[['text', 'stars']]
    .dropna()
    .sample(200, random_state=42)
)

df.head()


Unnamed: 0,text,stars
6252,We got here around midnight last Friday... the...,4
4684,Brought a friend from Louisiana here. She say...,5
1731,"Every friday, my dad and I eat here. We order ...",3
4742,"My husband and I were really, really disappoin...",1
4521,Love this place! Was in phoenix 3 weeks for w...,5


In [6]:
# Sanity check: print the column labels of df (the load cell keeps 'text' and 'stars').
print(df.columns)

Index(['text', 'stars'], dtype='object')


In [12]:
#Define Prompt Version 1 (Basic)
def prompt_v1(review):
    """Build the baseline (zero-shot) rating prompt for one Yelp review.

    Args:
        review: Review text; it is interpolated into the prompt verbatim.

    Returns:
        The prompt string. It asks for a 1-5 star prediction and instructs the
        model to answer only with a JSON object holding "predicted_stars" and
        "explanation".
    """
    # Doubled braces are f-string escapes: they render as literal { } in the
    # JSON template shown to the model.
    prompt_text = f"""
    Read the following Yelp review and predict a star rating from 1 to 5.

    Return ONLY valid JSON in this format:
    {{
      "predicted_stars": number,
      "explanation": "short reason"
    }}

    Review:
    {review}
    """
    return prompt_text



In [13]:
#Run Prompt V1 on Data
import json
from tqdm import tqdm

# One record per review: the true rating, the model's predicted rating (None
# when the reply was unusable), and whether the reply parsed as expected.
results_v1 = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    review = row['text']
    actual = row['stars']

    response = llm.invoke(prompt_v1(review)).content

    try:
        parsed = json.loads(response)
        predicted = parsed["predicted_stars"]
        valid_json = True
    except (json.JSONDecodeError, KeyError, TypeError):
        # Only the failures that mean "the reply was not the JSON we asked for"
        # are handled here: malformed JSON, a missing "predicted_stars" key, or
        # a payload that is not a JSON object. Anything else (including
        # KeyboardInterrupt, which a bare `except:` would swallow) propagates.
        predicted = None
        valid_json = False

    results_v1.append({
        "actual": actual,
        "predicted": predicted,
        "json_valid": valid_json
    })


100%|██████████| 200/200 [08:32<00:00,  2.56s/it]


In [14]:
#Evaluate Prompt V1
res1 = pd.DataFrame(results_v1)

# Accuracy: share of reviews whose predicted rating equals the true rating exactly.
accuracy_v1 = res1["actual"].eq(res1["predicted"]).mean()
# JSON validity: share of replies flagged as successfully parsed.
json_rate_v1 = res1["json_valid"].mean()

accuracy_v1, json_rate_v1


(np.float64(0.665), np.float64(1.0))

In [17]:
#Prompt Version 2 (Improved)
def prompt_v2(review):
    """Build the stricter, role-based rating prompt for one Yelp review.

    Compared with the basic prompt, this one assigns the model an analyst
    role, lists the criteria to rate on, asks for conservative ratings, and
    wraps the review in double quotes.

    Args:
        review: Review text; it is interpolated into the prompt verbatim.

    Returns:
        The prompt string, which instructs the model to answer only with a
        JSON object holding "predicted_stars" and "explanation".
    """
    # Doubled braces are f-string escapes: they render as literal { } in the
    # JSON template shown to the model.
    prompt_text = f"""
    You are a professional Yelp review analyst.

    Your task is to assign a star rating from 1 to 5 based strictly on:
    - Overall sentiment
    - Strength of language
    - Positive vs negative aspects

    Be conservative (avoid extreme ratings unless clearly justified).
    Respond ONLY with valid JSON. No extra text.

    Review:
    "{review}"

    JSON format:
    {{
      "predicted_stars": number,
      "explanation": "short justification"
    }}
    """
    return prompt_text


In [18]:
# One record per review for prompt V2: the true rating, the model's predicted
# rating (None when the reply was unusable), and whether the reply parsed.
results_v2 = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    review = row['text']
    actual = row['stars']

    response = llm.invoke(prompt_v2(review)).content

    try:
        parsed = json.loads(response)
        predicted = parsed["predicted_stars"]
        valid_json = True
    except (json.JSONDecodeError, KeyError, TypeError):
        # Only the failures that mean "the reply was not the JSON we asked for"
        # are handled here: malformed JSON, a missing "predicted_stars" key, or
        # a payload that is not a JSON object. Anything else (including
        # KeyboardInterrupt, which a bare `except:` would swallow) propagates.
        predicted = None
        valid_json = False

    results_v2.append({
        "actual": actual,
        "predicted": predicted,
        "json_valid": valid_json
    })


100%|██████████| 200/200 [10:09<00:00,  3.05s/it]


In [19]:
res2 = pd.DataFrame(results_v2)

# Accuracy: share of reviews whose predicted rating equals the true rating exactly.
accuracy_v2 = res2["actual"].eq(res2["predicted"]).mean()
# JSON validity: share of replies flagged as successfully parsed.
json_rate_v2 = res2["json_valid"].mean()

accuracy_v2, json_rate_v2


(np.float64(0.69), np.float64(1.0))

In [20]:
#Prompt Version 3 (Best Prompt)
def prompt_v3(review):
    """Build the few-shot rating prompt for one Yelp review.

    The prompt shows two worked examples (a 5-star and a 3-star review with
    their JSON outputs) before presenting the review to rate, wrapped in
    double quotes.

    Args:
        review: Review text; it is interpolated into the prompt verbatim.

    Returns:
        The prompt string, which instructs the model to answer only with a
        JSON object holding "predicted_stars" and "explanation".
    """
    # Doubled braces are f-string escapes: they render as literal { } in the
    # example outputs and the JSON template shown to the model.
    prompt_text = f"""
    You are a Yelp review rating expert.

    Example 1:
    Review: "Amazing food, friendly staff, will come again!"
    Output: {{ "predicted_stars": 5, "explanation": "Very positive experience" }}

    Example 2:
    Review: "Food was okay but service was slow and rude."
    Output: {{ "predicted_stars": 3, "explanation": "Mixed experience" }}

    Now analyze the review below and predict the rating.

    Review:
    "{review}"

    Respond ONLY with valid JSON:
    {{
      "predicted_stars": number,
      "explanation": "brief reason"
    }}
    """
    return prompt_text

In [21]:
#Run Prompt V3 on Data
# One record per review for prompt V3: the true rating, the model's predicted
# rating (None when the reply was unusable), and whether the reply parsed.
results_v3 = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    review = row['text']
    actual = row['stars']

    response = llm.invoke(prompt_v3(review)).content

    try:
        parsed = json.loads(response)
        predicted = parsed["predicted_stars"]
        valid_json = True
    except (json.JSONDecodeError, KeyError, TypeError):
        # Only the failures that mean "the reply was not the JSON we asked for"
        # are handled here: malformed JSON, a missing "predicted_stars" key, or
        # a payload that is not a JSON object. Anything else (including
        # KeyboardInterrupt, which a bare `except:` would swallow) propagates.
        predicted = None
        valid_json = False

    results_v3.append({
        "actual": actual,
        "predicted": predicted,
        "json_valid": valid_json
    })


100%|██████████| 200/200 [10:57<00:00,  3.29s/it]


In [22]:
res3 = pd.DataFrame(results_v3)

# Accuracy: share of reviews whose predicted rating equals the true rating exactly.
accuracy_v3 = res3["actual"].eq(res3["predicted"]).mean()
# JSON validity: share of replies flagged as successfully parsed.
json_rate_v3 = res3["json_valid"].mean()

accuracy_v3, json_rate_v3


(np.float64(0.64), np.float64(1.0))

In [23]:
#Final Comparison Table
# One row per prompt version: its label, exact-match accuracy, and the share
# of replies that parsed as JSON.
comparison = pd.DataFrame(
    [
        ("V1 Basic", accuracy_v1, json_rate_v1),
        ("V2 Strict", accuracy_v2, json_rate_v2),
        ("V3 Few-shot", accuracy_v3, json_rate_v3),
    ],
    columns=["Prompt", "Accuracy", "JSON Validity"],
)

comparison


Unnamed: 0,Prompt,Accuracy,JSON Validity
0,V1 Basic,0.665,1.0
1,V2 Strict,0.69,1.0
2,V3 Few-shot,0.64,1.0


# Task 1 – Yelp Rating Prediction via Prompt Engineering

This notebook evaluates three different prompting strategies for predicting Yelp star ratings using an LLM.  
We compare accuracy and JSON validity across prompts.
