In [1]:
# Mount Google Drive at /content/drive so the dataset CSV read in a later cell is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -q transformers accelerate bitsandbytes


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import google.generativeai as genai
from google.api_core.exceptions import ResourceExhausted
from tqdm import tqdm
import json
%matplotlib inline

In [4]:
# Load the Yelp reviews CSV from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/yelp.csv')

In [5]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [6]:
import re

def clean_text(text):
    """Normalize a review string for simple text processing.

    Lower-cases the input, removes every character that is not an ASCII
    letter or whitespace, collapses whitespace runs to a single space and
    trims both ends.

    Args:
        text: Raw review text. ``None``/NaN and other non-string values are
            accepted so that one missing cell cannot abort a whole
            ``Series.apply`` call.

    Returns:
        The cleaned string; ``""`` when the input is missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Drops digits and punctuation; note this also removes apostrophes ("it's" -> "its").
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Add a normalized copy of each review as a new `clean_text` column (mutates df in place).
df['clean_text'] = df['text'].apply(clean_text)

In [7]:
# Preview again to confirm the `clean_text` column was added.
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny,clean_text
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0,my wife took me here on my birthday for breakf...
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0,i have no idea why some people give bad review...
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0,love the gyro plate rice is so good and i also...
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0,rosie dakota and i love chaparral dog park its...
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0,general manager scott petello is a good egg no...


In [8]:
# Draw a reproducible 200-row sample (fixed random_state).
# NOTE(review): sampled_df is not referenced again in the visible cells — confirm it is still needed.
sampled_df = df.sample(n=200, random_state=42)
print("Sampled DataFrame created with shape:", sampled_df.shape)

Sampled DataFrame created with shape: (200, 11)


In [9]:

# Zero-shot prompt: asks for a 1-5 star rating plus an explanation as a JSON object.
# `{review_text}` is the str.format placeholder; doubled braces render as literal braces.
prompt_template_1 = """
You are a helpful assistant. Classify the sentiment of this Yelp review into a star rating (1-5).
Return ONLY a valid JSON object.

Review: "{review_text}"

JSON Schema:
{{
  "predicted_stars": int,
  "explanation": "string"
}}
"""
print("Prompt Template 1 defined.")

Prompt Template 1 defined.


In [10]:
# Few-shot prompt: three worked examples (1, 3 and 5 stars) followed by the review to rate.
# `{review_text}` is the str.format placeholder; doubled braces render as literal braces.
prompt_template_2 = """
Classify the following Yelp review into a 1-5 star rating based on these examples.

Example 1:
Review: "Horrible service, food was cold."
Output: {{ "predicted_stars": 1, "explanation": "Negative service and food quality." }}

Example 2:
Review: "Food was good but the wait was too long."
Output: {{ "predicted_stars": 3, "explanation": "Good product but bad experience." }}

Example 3:
Review: "Absolutely loved it! Best pizza in town."
Output: {{ "predicted_stars": 5, "explanation": "Enthusiastic positive sentiment." }}

Current Review: "{review_text}"

Return ONLY the JSON object for the Current Review.
"""

print("Prompt Template 2 defined.")

Prompt Template 2 defined.


In [11]:
# Step-by-step prompt: per-aspect sentiment and a sarcasm check before the final rating.
# `{review_text}` is the str.format placeholder; doubled braces render as literal braces.
# The schema lists every aspect that step 1 asks for (food, service, price).
prompt_template_3 = """
Analyze the Yelp review below deeply. Follow these steps:
1. Identify sentiment regarding Food, Service, and Price separately.
2. Check for sarcasm (e.g., "Great, another hour wait").
3. Assign a 1-5 star rating based on the overall balance.

Review: "{review_text}"

Return strictly valid JSON:
{{
  "reasoning_steps": {{
    "food": "string",
    "service": "string",
    "price": "string",
    "sarcasm_detected": bool
  }},
  "predicted_stars": int,
  "explanation": "Final summary"
}}
"""

print("Prompt Template 3 defined.")

Prompt Template 3 defined.


In [12]:
def extract_json(text):
    """Return the first JSON object or array embedded in ``text``.

    Model replies often wrap the payload in extra prose or repeat a schema
    with placeholders, so every ``{`` or ``[`` is tried as a possible start
    and the first position that decodes cleanly wins. Trailing text after
    the decoded value is ignored.

    Args:
        text: Raw model output. Non-string input yields ``None``.

    Returns:
        The decoded ``dict`` or ``list``, or ``None`` when nothing parses.
    """
    if not isinstance(text, str):
        return None

    decoder = json.JSONDecoder()
    for start, char in enumerate(text):
        if char not in "{[":
            continue
        try:
            # raw_decode stops at the end of the first complete value, so
            # text that follows the JSON does not break parsing.
            value, _ = decoder.raw_decode(text, start)
        except ValueError:
            # Not valid JSON from this position (e.g. a schema such as
            # `{ "predicted_stars": int }`); keep scanning.
            continue
        return value

    return None

In [25]:
def generate_response(prompt, max_new_tokens=100):
    """Greedy-decode a completion for ``prompt`` and return only the new text.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        prompt: Full prompt string sent to the model.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation, without the prompt and without special
        tokens.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False
        )

    # For decoder-only models generate() returns prompt + continuation.
    # Decoding the prompt too would hand the schema / few-shot JSON inside
    # the prompt to the JSON parser instead of the model's answer.
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)


In [14]:
def evaluate_prompt(prompt_template, df, runs=1):
    """Run ``prompt_template`` over every review in ``df``, one call per review.

    Args:
        prompt_template: ``str.format`` template with a ``{review_text}``
            placeholder.
        df: Frame with ``review_text`` and ``true_stars`` columns.
        runs: Number of full passes over ``df``.

    Returns:
        DataFrame with one row per (run, review) and the columns
        ``review_text``, ``true_stars``, ``predicted_stars``, ``json_valid``,
        ``raw_output`` and ``run``.
    """
    results = []

    for run in range(runs):
        for _, row in tqdm(df.iterrows(), total=len(df)):
            review_text = row["review_text"]
            prompt = prompt_template.format(review_text=review_text)

            text = generate_response(prompt)
            parsed = extract_json(text)
            # Only a JSON object can carry "predicted_stars"; anything else
            # (None, an array, ...) is recorded as an invalid answer.
            is_object = isinstance(parsed, dict)

            results.append({
                # Kept so the per-review consistency metrics can group on it.
                "review_text": review_text,
                "true_stars": row["true_stars"],
                "predicted_stars": parsed.get("predicted_stars") if is_object else None,
                "json_valid": is_object,
                "raw_output": text,
                "run": run
            })

    return pd.DataFrame(results)


In [15]:
def accuracy_metrics(df):
    """Compute exact-match accuracy and mean absolute error of the predictions.

    Rows without a usable numeric prediction are excluded from both metrics.

    Args:
        df: Frame with ``predicted_stars`` and ``true_stars`` columns.

    Returns:
        Dict with ``exact_match_accuracy`` and ``mae``. Both are NaN when no
        row has a usable prediction.
    """
    # Model output is free-form: accept "4"-style strings and turn anything
    # non-numeric into NaN instead of failing on the subtraction below.
    predicted = pd.to_numeric(df["predicted_stars"], errors="coerce")
    scored = predicted.notna()

    predicted = predicted[scored]
    actual = df.loc[scored, "true_stars"]

    exact_accuracy = (predicted == actual).mean()
    mae = (predicted - actual).abs().mean()

    return {
        "exact_match_accuracy": exact_accuracy,
        "mae": mae
    }


In [16]:
def json_validity_rate(df):
    """Return the mean of the ``json_valid`` column, i.e. the share of valid rows."""
    validity_flags = df["json_valid"]
    return validity_flags.mean()


In [17]:
def prediction_consistency(df):
    """Fraction of repeatedly-predicted reviews whose predictions all agree.

    Rows are grouped by ``review_text``. A review counts only when it has more
    than one non-null ``predicted_stars`` value; it is consistent when all of
    those values are equal. Returns 0 when no review qualifies.
    """
    repeated = 0
    agreeing = 0

    for _key, rows in df.groupby(["review_text"]):
        observed = list(rows["predicted_stars"].dropna())
        if len(observed) <= 1:
            continue
        repeated += 1
        if len(set(observed)) == 1:
            agreeing += 1

    if repeated == 0:
        return 0
    return agreeing / repeated


In [18]:
def json_consistency(df):
    """Fraction of repeated reviews whose ``json_valid`` flag never changes.

    Rows are grouped by ``review_text``. A review counts only when it has more
    than one row; it is consistent when its flags are all true or all false.
    Returns 0 when no review has more than one row.
    """
    verdicts = []

    for _key, rows in df.groupby(["review_text"]):
        flags = rows["json_valid"].tolist()
        if len(flags) > 1:
            verdicts.append(bool(all(flags) or not any(flags)))

    if not verdicts:
        return 0
    return sum(verdicts) / len(verdicts)


In [30]:
def full_evaluation(prompt_template, df, batch_size=10, runs=1):
    """Run the batched evaluation and summarize it as a dict of metrics.

    Args:
        prompt_template: Template forwarded to ``evaluate_prompt_batched``.
        df: Frame with ``review_text`` and ``true_stars`` columns.
        batch_size: Number of reviews handled per batch.
        runs: Number of full passes over ``df``. With ``runs=1`` a review has
            more than one prediction only if its text is duplicated in
            ``df``; otherwise ``prediction_consistency`` has nothing to
            compare and is 0.

    Returns:
        Dict with ``exact_match_accuracy``, ``mae``, ``json_validity_rate``
        and ``prediction_consistency``.
    """
    res = evaluate_prompt_batched(
        prompt_template,
        df,
        batch_size=batch_size,
        runs=runs
    )

    # accuracy_metrics returns a dict; tuple-unpacking it would bind the key
    # names (strings) instead of the metric values.
    metrics = accuracy_metrics(res)

    return {
        "exact_match_accuracy": metrics["exact_match_accuracy"],
        "mae": metrics["mae"],
        "json_validity_rate": json_validity_rate(res),
        "prediction_consistency": prediction_consistency(res)
    }


In [20]:
# Use the column names the evaluation helpers expect ("review_text", "true_stars").
df = df.rename(columns=dict(text="review_text", stars="true_stars"))


In [21]:
# Reproducible 500-row evaluation sample (fixed random_state).
# NOTE(review): df_eval is reassigned to a 100-row sample in the final cell before it is used.
df_eval = df.sample(500, random_state=42)

In [22]:
# Hugging Face model id passed to the tokenizer and model `from_pretrained` calls.
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"


In [23]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the tokenizer and the half-precision model; device_map="auto" lets
# accelerate decide where the weights are placed.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    # `torch_dtype` is deprecated in the installed transformers release
    # (it warns "Use `dtype` instead!"), so use the current keyword.
    dtype=torch.float16
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [27]:
def build_batch_prompt(reviews):
    """Build one prompt that asks the model to rate several reviews at once.

    Args:
        reviews: Iterable of review strings; they are numbered from 1 in the
            order given.

    Returns:
        Prompt text listing the numbered reviews and requesting a JSON array
        of ``index`` / ``predicted_stars`` entries.
    """
    numbered_lines = (
        f"{position}. {review}"
        for position, review in enumerate(reviews, start=1)
    )
    numbered = "\n".join(numbered_lines)

    return f"""
Rate the sentiment of each review from 1 to 5 stars.

Reviews:
{numbered}

Return STRICT JSON only as an array:
[
  {{ "index": int, "predicted_stars": int }}
]
"""


In [29]:
from tqdm import tqdm
import pandas as pd

def evaluate_prompt_batched(prompt_template, df, batch_size=10, runs=1, max_new_tokens=150):
    """Score every review in ``df`` with ``prompt_template``, generating in batches.

    Each review is formatted into its own prompt from ``prompt_template``, and
    the prompts of one batch are generated together in a single padded
    ``model.generate`` call. Every input row produces exactly one result row
    per run, so a failed parse lowers the validity rate instead of silently
    dropping the review.

    Uses the module-level ``tokenizer`` and ``model``; note that it sets
    ``tokenizer.padding_side`` (and ``pad_token`` when missing) on that
    shared tokenizer.

    Args:
        prompt_template: ``str.format`` template with a ``{review_text}``
            placeholder.
        df: Frame with ``review_text`` and ``true_stars`` columns.
        batch_size: Number of prompts generated per ``model.generate`` call.
        runs: Number of full passes over ``df``.
        max_new_tokens: Upper bound on generated tokens per review.

    Returns:
        DataFrame with the columns ``review_text``, ``true_stars``,
        ``predicted_stars``, ``json_valid``, ``raw_output`` and ``run``
        (present even when ``df`` is empty).
    """
    columns = [
        "review_text", "true_stars", "predicted_stars",
        "json_valid", "raw_output", "run",
    ]
    all_results = []

    # Decoder-only models continue from the last input position, so batches
    # must be padded on the left for generation to start right after the prompt.
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    for run in range(runs):
        for i in tqdm(range(0, len(df), batch_size)):
            batch = df.iloc[i:i+batch_size]
            prompts = [
                prompt_template.format(review_text=review)
                for review in batch["review_text"]
            ]

            inputs = tokenizer(
                prompts, return_tensors="pt", padding=True
            ).to(model.device)
            prompt_length = inputs["input_ids"].shape[1]

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False
                )

            # generate() returns prompt + continuation; keep only the new
            # tokens so JSON inside the prompt is never mistaken for the answer.
            completions = tokenizer.batch_decode(
                outputs[:, prompt_length:], skip_special_tokens=True
            )

            for (_, row), text in zip(batch.iterrows(), completions):
                parsed = extract_json(text)
                # Only a JSON object can carry "predicted_stars".
                is_object = isinstance(parsed, dict)
                all_results.append({
                    "review_text": row["review_text"],
                    "true_stars": row["true_stars"],
                    "predicted_stars": parsed.get("predicted_stars") if is_object else None,
                    "json_valid": is_object,
                    "raw_output": text,
                    "run": run
                })

    return pd.DataFrame(all_results, columns=columns)


In [None]:
# Evaluate each prompt template on the same reproducible 100-review sample
# and show one row of metrics per prompt.
df_eval = df.sample(100, random_state=42)
results = {
    label: full_evaluation(template, df_eval, batch_size=10)
    for label, template in (
        ("prompt_1", prompt_template_1),
        ("prompt_2", prompt_template_2),
        ("prompt_3", prompt_template_3),
    )
}

pd.DataFrame(results).T


 40%|████      | 4/10 [2:20:51<4:03:46, 2437.69s/it]