In [7]:
pip install google-generativeai


Collecting google-generativeai
  Using cached google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Using cached google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai)
  Using cached google_api_core-2.28.1-py3-none-any.whl.metadata (3.3 kB)
Collecting google-api-python-client (from google-generativeai)
  Using cached google_api_python_client-2.187.0-py3-none-any.whl.metadata (7.0 kB)
Collecting google-auth>=2.15.0 (from google-generativeai)
  Using cached google_auth-2.43.0-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google-generativeai)
  Using cached proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting googleapis-common-protos<2.0.0,>=1.56.2 (from google-api-core->google-generativeai)
  Using cached googleapis_common_protos-1.72.0-py3-none-any.

In [9]:
import os
import re
import json
import time
import random
import pandas as pd
import numpy as np

from collections import Counter

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
       
import google.generativeai as genai
genai.configure(api_key="AIzaSyB5h_IhiTedRz2gaXxzik0oEKOixKnWOc4")


In [10]:
df = pd.read_csv(r"C:\Users\delll\Downloads\yelp.csv")

df = df[["text", "stars"]].dropna()
df["stars"] = df["stars"].astype(int)

df_sample = df.sample(200, random_state=42).reset_index(drop=True)
df_sample.head()


Unnamed: 0,text,stars
0,We got here around midnight last Friday... the...,4
1,Brought a friend from Louisiana here. She say...,5
2,"Every friday, my dad and I eat here. We order ...",3
3,"My husband and I were really, really disappoin...",1
4,Love this place! Was in phoenix 3 weeks for w...,5


In [11]:
PROMPT_V1 = """
You are a simple review classifier.

Task:
Given a Yelp-style customer review, classify its star rating from 1 to 5:
1 = very bad
2 = bad
3 = neutral / mixed
4 = good
5 = excellent

Return ONLY a JSON object with this exact schema:
{{
  "predicted_stars": <integer from 1 to 5>,
  "explanation": "<short explanation>"
}}

Review:
"{review_text}"
"""


In [12]:
PROMPT_V2 = """
You are an expert sentiment analyst for a restaurant review platform.

Read the following review and reason step by step about each aspect:
- Food quality
- Service quality
- Ambience
- Price/value
- Overall tone (positive / neutral / negative)

Then assign a star rating from 1 to 5 using these rules:
1 = strongly negative, major issues
2 = negative, clear dissatisfaction
3 = mixed or neutral, as many positives as negatives
4 = positive with minor complaints
5 = strongly positive, very satisfied

Return ONLY a valid JSON object with this exact schema:
{{
  "predicted_stars": <integer 1-5>,
  "explanation": "<one or two sentences explaining the rating>"
}}

Do NOT include any extra text before or after the JSON. No markdown.

Review:
"{review_text}"
"""


In [13]:
PROMPT_V3 = """
You are a reliable, production-grade rating engine for Yelp-style reviews.

Your output will be consumed by an automated system, so you MUST:
- Always return STRICTLY valid JSON
- Use the exact field names: "predicted_stars" and "explanation"
- Return only one JSON object and nothing else.

Rating guidelines:
1 = very bad experience, mostly negative
2 = bad experience, more negatives than positives
3 = mixed/average, balanced positives and negatives
4 = good experience with minor issues
5 = excellent experience, strongly positive

Examples:

Review: "Terrible service and the food was cold."
Output:
{{
  "predicted_stars": 1,
  "explanation": "The customer is very unhappy with both food and service."
}}

Review: "Great food and friendly staff, but the wait time was a bit long."
Output:
{{
  "predicted_stars": 4,
  "explanation": "Overall positive experience with a small complaint about waiting."
}}

Now classify the following review. Use the same JSON format.

Review:
"{review_text}"
"""


In [14]:
def call_llm(prompt: str) -> str:
    """Send prompt to chosen LLM provider and return raw text."""
    if USE_OPENAI:
        resp = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
        )
        return resp.choices[0].message.content
    else:
        model = genai.GenerativeModel("gemini-1.5-flash")
        resp = model.generate_content(prompt)
        return resp.text

JSON_RE = re.compile(r"\{.*\}", re.DOTALL)

def extract_json(text: str):
    """Try to extract valid JSON object from model output."""
    if text is None:
        return None
    # Try to find {...}
    match = JSON_RE.search(text)
    candidate = match.group(0) if match else text.strip()
    try:
        obj = json.loads(candidate)
        return obj
    except Exception:
        # attempt simple fixes
        candidate = candidate.replace("```json", "").replace("```", "").strip()
        try:
            obj = json.loads(candidate)
            return obj
        except Exception:
            return None


def safe_predict(review_text: str, prompt_template: str, max_retries: int = 2):
    """Call LLM and always return (predicted_stars or None, explanation or '')"""
    for attempt in range(max_retries + 1):
        raw = call_llm(prompt_template.format(review_text=review_text))
        data = extract_json(raw)
        if data and "predicted_stars" in data:
            try:
                rating = int(data["predicted_stars"])
                if 1 <= rating <= 5:
                    explanation = str(data.get("explanation", "")).strip()
                    return rating, explanation, True
            except Exception:
                pass
        # backoff a bit
        time.sleep(0.3 + 0.2 * attempt)
    return None, "", False


In [19]:
def run_experiment(df, prompt_template, label: str):
    preds = []
    exps = []
    valids = []

    for i, row in df.iterrows():
        true_label = int(row["stars"])
        pred, explanation, valid_json = safe_predict(row["text"], prompt_template)
        preds.append(pred)
        exps.append(explanation)
        valids.append(valid_json)

        # Optional: light progress print
        if (i + 1) % 20 == 0:
            print(f"{label}: processed {i+1}/{len(df)}")

    result_df = df.copy()
    result_df[f"{label}_pred"] = preds
    result_df[f"{label}_exp"] = exps
    result_df[f"{label}_valid_json"] = valids

    # Filter out None predictions for accuracy
    mask = result_df[f"{label}_pred"].notnull()
    acc = accuracy_score(result_df.loc[mask, "stars"], result_df.loc[mask, f"{label}_pred"])
    json_valid_rate = np.mean(result_df[f"{label}_valid_json"])

    print(f"\n=== {label} RESULTS ===")
    print(f"Accuracy (on valid preds): {acc:.3f}")
    print(f"JSON validity rate:        {json_valid_rate:.3f}")
    print()

    return result_df, acc, json_valid_rate


In [20]:
v1_df, v1_acc, v1_json = run_experiment(df_sample, PROMPT_V1, "v1")
v2_df, v2_acc, v2_json = run_experiment(df_sample, PROMPT_V2, "v2")
v3_df, v3_acc, v3_json = run_experiment(df_sample, PROMPT_V3, "v3")


v1: processed 20/200
v1: processed 40/200
v1: processed 60/200
v1: processed 80/200
v1: processed 100/200
v1: processed 120/200
v1: processed 140/200
v1: processed 160/200
v1: processed 180/200
v1: processed 200/200


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)



=== v1 RESULTS ===
Accuracy (on valid preds): nan
JSON validity rate:        0.000

v2: processed 20/200
v2: processed 40/200
v2: processed 60/200
v2: processed 80/200
v2: processed 100/200
v2: processed 120/200
v2: processed 140/200
v2: processed 160/200
v2: processed 180/200
v2: processed 200/200

=== v2 RESULTS ===
Accuracy (on valid preds): nan
JSON validity rate:        0.000



  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


v3: processed 20/200
v3: processed 40/200
v3: processed 60/200
v3: processed 80/200
v3: processed 100/200
v3: processed 120/200
v3: processed 140/200
v3: processed 160/200
v3: processed 180/200
v3: processed 200/200

=== v3 RESULTS ===
Accuracy (on valid preds): nan
JSON validity rate:        0.000



  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


In [23]:
import os
import re
import json

USE_OPENAI = False
USE_GEMINI = True

def call_llm(prompt):
    if USE_OPENAI:   
        ...


In [24]:
comparison = pd.DataFrame([
    {"Prompt": "V1 (baseline)", "Accuracy": v1_acc, "JSON_Validity": v1_json},
    {"Prompt": "V2 (structured)", "Accuracy": v2_acc, "JSON_Validity": v2_json},
    {"Prompt": "V3 (few-shot + strict)", "Accuracy": v3_acc, "JSON_Validity": v3_json},
])
comparison


Unnamed: 0,Prompt,Accuracy,JSON_Validity
0,V1 (baseline),,0.0
1,V2 (structured),,0.0
2,V3 (few-shot + strict),,0.0
