## Section 1: Setup and Dependencies

In [None]:
# Install required packages
import subprocess
import sys

# Third-party distributions this notebook relies on (unpinned).
# NOTE(review): `requests` is not imported in the cells visible here - confirm it is still needed.
packages = ['pandas', 'numpy', 'requests', 'python-dotenv', 'google-generativeai', 'json5']

# Install everything with a single pip invocation (one process instead of one per
# package) so pip sees all requirements at once. `sys.executable -m pip` targets
# the interpreter running this kernel; check_call raises CalledProcessError if
# pip exits with a non-zero status.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', *packages])

print("All packages installed successfully!")

In [None]:
# Import libraries
import json
import os
import time
from typing import Dict, List, Optional, Tuple

import google.generativeai as genai
import json5
import numpy as np
import pandas as pd
from dotenv import load_dotenv

# Load environment variables (the warning below points the user at a .env file)
load_dotenv()

# Gemini credentials are read from the environment (OpenRouter is a possible alternative backend)
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
if not GEMINI_API_KEY:
    # Missing or empty key: warn only, so the remaining cells can still be defined.
    print("Warning: GEMINI_API_KEY not found. Please set it in .env file")
else:
    genai.configure(api_key=GEMINI_API_KEY)

print("Libraries imported successfully!")

## Section 2: Data Loading and Preparation

In [None]:
# Demonstration data: ten hand-written Yelp-style reviews, two for each star level 1-5.
# In production, download the real dataset from:
# https://www.kaggle.com/datasets/omkarsabnis/yelp-reviews-dataset
sample_reviews = [
    {"review": text, "actual_stars": stars}
    for text, stars in (
        ("Absolutely amazing service! The food was fresh and delicious. Highly recommend this place!", 5),
        ("Great atmosphere and friendly staff. Food was good but a bit pricey.", 4),
        ("Average restaurant. Nothing special. Service was slow.", 3),
        ("Pretty bad experience. Cold food and rude staff. Won't come back.", 2),
        ("Terrible! Worst meal I've ever had. Complete waste of money.", 1),
        ("Excellent food quality, innovative menu, wonderful ambiance. Perfect evening!", 5),
        ("Good portions, tasty food. Minor issues with waiting time.", 4),
        ("It was okay. Not worth the hype. Food was mediocre.", 3),
        ("Disappointed. Food arrived cold and overpriced for quality.", 2),
        ("Horrible place. Never again. Service was nonexistent.", 1),
    )
]

# One row per review; columns are `review` (text) and `actual_stars` (label).
df = pd.DataFrame(sample_reviews)

# For actual testing, sample ~200 rows from your Yelp dataset instead:
# df = pd.read_csv('yelp_reviews.csv').sample(200, random_state=42)

# Row count, a blank line, a heading, then the first five rows.
print(f"Dataset loaded: {len(df)} reviews", f"\nSample reviews:", df.head(), sep="\n")

## Section 3: Prompt Engineering - Approach 1 (Zero-Shot)

In [None]:
# APPROACH 1: Zero-Shot Prompting
# The model gets only the task description and the review - no worked examples.
# `{review}` is the single str.format() field; doubled braces render as literal
# JSON braces once the template is formatted.
ZERO_SHOT_PROMPT_TEMPLATE = "\n".join([
    "You are an expert in sentiment analysis. Analyze the following Yelp review and predict the star rating (1-5 stars).",
    "",
    "Review: {review}",
    "",
    "Return ONLY valid JSON (no markdown, no extra text):",
    "{{",
    '  "predicted_stars": <integer 1-5>,',
    '  "explanation": "<brief reasoning>"',
    "}}",
])

def call_llm(prompt: str, model_name: str = 'gemini-pro') -> str:
    """Send ``prompt`` to a Gemini model and return the generated text.

    Args:
        prompt: Fully formatted prompt text.
        model_name: Model identifier passed to ``genai.GenerativeModel``.
            Defaults to ``'gemini-pro'``.
            NOTE(review): confirm this id is still served by the API in use.

    Returns:
        The response text, or ``""`` if anything raised while creating the
        model, generating content, or reading ``response.text``. The error is
        printed rather than re-raised, so callers cannot distinguish an API
        failure from an empty answer.
    """
    try:
        model = genai.GenerativeModel(model_name)
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        # Deliberate best-effort: one failed request must not stop a batch run.
        print(f"Error calling API: {e}")
        return ""

def _json_candidates(text: str) -> List[str]:
    """List the substrings of ``text`` that may hold the JSON object, in trial order."""
    candidates = [text.strip()]
    # Splitting on ``` leaves the insides of fenced code blocks at odd positions.
    for fenced in text.split('```')[1::2]:
        first_line, newline, rest = fenced.partition('\n')
        # Drop a bare language tag (e.g. "json") sitting on the opening fence line.
        if newline and first_line.strip().isalnum():
            fenced = rest
        candidates.append(fenced.strip())
    # Last resort: everything from the first '{' to the last '}'.
    start, end = text.find('{'), text.rfind('}')
    if 0 <= start < end:
        candidates.append(text[start:end + 1])
    # De-duplicate while keeping order; skip empty strings.
    return [candidate for candidate in dict.fromkeys(candidates) if candidate]


def extract_json_from_response(response_text: str) -> Optional[Dict]:
    """Extract and parse the JSON object contained in an LLM response.

    Candidates are tried in this order: the whole (stripped) text, the content
    of each ``` fenced block (with or without a language tag), then the span
    from the first ``{`` to the last ``}``. Each candidate is parsed with
    strict ``json`` first; candidates that strict parsing rejects are retried
    with the more lenient ``json5``.

    Args:
        response_text: Raw text returned by the model. ``None``, non-string
            and blank values are accepted and yield ``None``.

    Returns:
        The first candidate that parses to a ``dict``, or ``None`` when no
        candidate does. Valid JSON that is not an object (a bare number, a
        list, ...) is not returned. Parse errors never propagate.
    """
    if not isinstance(response_text, str) or not response_text.strip():
        return None

    rejected_by_strict = []
    for candidate in _json_candidates(response_text):
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            rejected_by_strict.append(candidate)
            continue
        if isinstance(parsed, dict):
            return parsed

    # Lenient pass only for text strict JSON could not read (trailing commas,
    # single quotes, comments, ...).
    for candidate in rejected_by_strict:
        try:
            parsed = json5.loads(candidate)
        except ValueError:
            continue
        if isinstance(parsed, dict):
            return parsed

    return None

def _normalize_star_rating(value):
    """Return ``value`` as ``int`` when it is an integral float or numeric string; otherwise unchanged."""
    if isinstance(value, str):
        try:
            number = float(value.strip())
        except ValueError:
            return value
    elif isinstance(value, float):
        number = value
    else:
        return value
    # inf / nan / fractional values are left as they were received.
    return int(number) if number.is_integer() else value


def evaluate_approach(df_sample: pd.DataFrame, prompt_template: str, approach_name: str,
                      delay_seconds: float = 1.0) -> Dict:
    """Evaluate one prompting approach on a sample of reviews.

    For every row, the template is formatted with the row's ``review``, sent
    through ``call_llm``, and the reply is parsed with
    ``extract_json_from_response``. A progress line is printed per review.

    Args:
        df_sample: Frame with ``review`` and ``actual_stars`` columns.
        prompt_template: ``str.format`` template containing a ``{review}`` field.
        approach_name: Label printed in the banner and stored in the metrics.
        delay_seconds: Pause after each API call, for rate limiting. The pause
            also follows the last row, which spaces out back-to-back
            evaluations. Values <= 0 disable it.

    Returns:
        Dict with keys ``approach``, ``total_reviews``, ``valid_json_count``,
        ``json_validity_rate`` (percent of all reviews), ``correct_predictions``,
        ``accuracy`` (percent of *valid-JSON* responses, not of all reviews)
        and ``results`` (one dict per review). Rates are 0 when their
        denominator is 0.
    """
    results = []

    print(f"\n{'='*60}")
    print(f"Evaluating {approach_name}")
    print(f"{'='*60}")

    for idx, row in df_sample.iterrows():
        prompt = prompt_template.format(review=row['review'])
        response = call_llm(prompt)

        # A single unparseable reply must not abort the run and lose earlier results.
        try:
            parsed = extract_json_from_response(response)
        except ValueError:
            parsed = None
        # Anything that is not a JSON object is treated as "no JSON".
        if not isinstance(parsed, dict):
            parsed = None

        result = {
            'review_idx': idx,
            'actual_stars': row['actual_stars'],
            # Models sometimes answer "4" or 4.0; normalize so the comparison
            # with the integer label below is meaningful.
            'predicted_stars': _normalize_star_rating(parsed.get('predicted_stars')) if parsed else None,
            'explanation': parsed.get('explanation', '') if parsed else '',
            'json_valid': parsed is not None and 'predicted_stars' in parsed,
            # Keep only a short preview of the raw reply.
            'raw_response': response[:100] + '...' if len(response) > 100 else response
        }
        results.append(result)
        print(f"Review {idx}: Actual={row['actual_stars']} | Predicted={result['predicted_stars']} | Valid JSON: {result['json_valid']}")
        if delay_seconds > 0:
            time.sleep(delay_seconds)  # Rate limiting for API

    # Calculate metrics
    valid_jsons = sum(1 for r in results if r['json_valid'])
    correct_predictions = sum(1 for r in results if r['json_valid'] and r['predicted_stars'] == r['actual_stars'])

    metrics = {
        'approach': approach_name,
        'total_reviews': len(results),
        'valid_json_count': valid_jsons,
        'json_validity_rate': valid_jsons / len(results) * 100 if results else 0,
        'correct_predictions': correct_predictions,
        'accuracy': correct_predictions / valid_jsons * 100 if valid_jsons > 0 else 0,
        'results': results
    }

    return metrics

# Quick validation run: only the first 3 reviews are sent to the API.
test_df = df.head(3)
approach1_metrics = evaluate_approach(
    df_sample=test_df,
    prompt_template=ZERO_SHOT_PROMPT_TEMPLATE,
    approach_name="Approach 1: Zero-Shot",
)

## Section 4: Prompt Engineering - Approach 2 (Few-Shot)

In [None]:
# APPROACH 2: Few-Shot Prompting
# Three labeled examples (5, 4 and 1 stars) precede the review to be rated.
# `{review}` is the single str.format() field; doubled braces render as literal
# JSON braces once the template is formatted.
FEW_SHOT_PROMPT_TEMPLATE = "\n".join([
    "You are an expert in sentiment analysis. Analyze reviews and predict star ratings (1-5 stars).",
    "",
    "Examples:",
    '1. Review: "Amazing service and delicious food! Highly recommend!"',
    '   JSON: {{"predicted_stars": 5, "explanation": "Very positive sentiment with explicit recommendation."}}',
    "",
    '2. Review: "Good food but slow service. Decent value."',
    '   JSON: {{"predicted_stars": 4, "explanation": "Positive overall with minor complaints."}}',
    "",
    '3. Review: "Terrible experience. Cold food and rude staff."',
    '   JSON: {{"predicted_stars": 1, "explanation": "Strongly negative sentiment about multiple aspects."}}',
    "",
    "Now analyze this review:",
    "Review: {review}",
    "",
    "Return ONLY valid JSON (no markdown, no extra text):",
    "{{",
    '  "predicted_stars": <integer 1-5>,',
    '  "explanation": "<brief reasoning>"',
    "}}",
])

# Same 3-review sample as the other approaches, so the metrics are comparable.
approach2_metrics = evaluate_approach(
    df_sample=test_df,
    prompt_template=FEW_SHOT_PROMPT_TEMPLATE,
    approach_name="Approach 2: Few-Shot",
)

## Section 5: Prompt Engineering - Approach 3 (Chain-of-Thought)

In [None]:
# APPROACH 3: Chain-of-Thought Prompting
# Lists four reasoning steps for the model to follow before it answers.
# NOTE(review): the prompt asks for step-by-step reasoning and also for JSON
# "ONLY ... no extra text" - confirm which of the two the model should emit.
# `{review}` is the single str.format() field; doubled braces render as literal
# JSON braces once the template is formatted.
COT_PROMPT_TEMPLATE = "\n".join([
    "You are an expert in sentiment analysis. Analyze the following Yelp review by reasoning step-by-step.",
    "",
    "Review: {review}",
    "",
    "Step 1: Identify the overall sentiment (positive, negative, neutral)",
    "Step 2: Look for specific positive keywords or complaints",
    "Step 3: Consider the intensity of emotions expressed",
    "Step 4: Determine the appropriate star rating (1-5)",
    "",
    "After your reasoning, return ONLY valid JSON (no markdown, no extra text):",
    "{{",
    '  "predicted_stars": <integer 1-5>,',
    '  "explanation": "<brief reasoning based on your analysis>"',
    "}}",
])

# Same 3-review sample as the other approaches, so the metrics are comparable.
approach3_metrics = evaluate_approach(
    df_sample=test_df,
    prompt_template=COT_PROMPT_TEMPLATE,
    approach_name="Approach 3: Chain-of-Thought",
)

## Section 6: Evaluation Metrics and Comparison

In [None]:
# Side-by-side comparison table: one row per approach, one column per metric.
# The tuple order must match the labels in 'Prompting Approach'.
_approach_metrics = (approach1_metrics, approach2_metrics, approach3_metrics)

comparison_data = {
    'Prompting Approach': ['Zero-Shot', 'Few-Shot', 'Chain-of-Thought'],
    'Total Reviews': [m['total_reviews'] for m in _approach_metrics],
    'Valid JSON Count': [m['valid_json_count'] for m in _approach_metrics],
    # Percentages are pre-formatted as strings with one decimal place.
    'JSON Validity Rate (%)': [f"{m['json_validity_rate']:.1f}" for m in _approach_metrics],
    'Correct Predictions': [m['correct_predictions'] for m in _approach_metrics],
    'Accuracy (%)': [f"{m['accuracy']:.1f}" for m in _approach_metrics],
}

comparison_df = pd.DataFrame(comparison_data)

# Blank line, rule, title, rule, table, rule - one item per output line.
print(
    "\n" + "=" * 80,
    "PROMPTING APPROACH COMPARISON",
    "=" * 80,
    comparison_df.to_string(index=False),
    "=" * 80,
    sep="\n",
)

In [None]:
# Detailed analysis: the same five-line summary for each approach.
# Strategy / pros / cons are fixed descriptions; only the two percentages come
# from the computed metrics.
print("\nDETAILED ANALYSIS:")
for _heading, _strategy, _section_metrics, _pros, _cons in (
    (
        "\n1. APPROACH 1 - ZERO-SHOT PROMPTING",
        "Direct instruction without examples",
        approach1_metrics,
        "Simple, fast",
        "May produce inconsistent JSON format",
    ),
    (
        "\n2. APPROACH 2 - FEW-SHOT PROMPTING",
        "Provide 3 labeled examples to guide the model",
        approach2_metrics,
        "Better JSON consistency, examples show expected format",
        "Slightly longer prompt",
    ),
    (
        "\n3. APPROACH 3 - CHAIN-OF-THOUGHT PROMPTING",
        "Ask model to reason step-by-step before predicting",
        approach3_metrics,
        "Better reasoning, transparent decision-making",
        "More API tokens used",
    ),
):
    print(_heading)
    print(f"   - Strategy: {_strategy}")
    print(f"   - JSON Validity: {_section_metrics['json_validity_rate']:.1f}%")
    print(f"   - Accuracy: {_section_metrics['accuracy']:.1f}%")
    print(f"   - Pros: {_pros}")
    print(f"   - Cons: {_cons}")

In [None]:
# Reliability and consistency check
def calculate_consistency(results):
    """Return the share (in percent) of valid-JSON results whose prediction equals the label.

    Only entries with a truthy ``json_valid`` are counted. Among those, an
    entry matches when ``predicted_stars == actual_stars``.

    Args:
        results: Iterable of per-review dicts with the keys ``json_valid``,
            ``predicted_stars`` and ``actual_stars``.

    Returns:
        ``matches / valid * 100``, or ``0`` when there are no valid entries.

    NOTE(review): despite the name, this is a match rate, not a variance - it
    uses the same formula as the ``accuracy`` metric of ``evaluate_approach``.
    """
    valid_count = 0
    match_count = 0
    for entry in results:
        if not entry['json_valid']:
            continue
        valid_count += 1
        if entry['predicted_stars'] == entry['actual_stars']:
            match_count += 1

    if valid_count == 0:
        return 0
    return match_count / valid_count * 100

# Consistency score per approach, pre-formatted to one decimal place.
# The tuple order must match the labels in 'Approach'.
consistency_data = {
    'Approach': ['Zero-Shot', 'Few-Shot', 'Chain-of-Thought'],
    'Consistency Score (%)': [
        f"{calculate_consistency(m['results']):.1f}"
        for m in (approach1_metrics, approach2_metrics, approach3_metrics)
    ],
}

consistency_df = pd.DataFrame(consistency_data)
print("\nCONSISTENCY ANALYSIS:", consistency_df.to_string(index=False), sep="\n")

## Section 7: Key Findings and Recommendations

In [None]:
# Banner: blank line, rule, title, rule.
print("\n" + "=" * 80, "KEY FINDINGS AND RECOMMENDATIONS", "=" * 80, sep="\n")

# The findings below are static text: they are printed as written, whatever the
# metrics computed earlier turned out to be. The leading and trailing empty
# entries reproduce the blank line before and after the text; the two "   "
# entries are whitespace-only separator lines.
print("\n".join([
    "",
    "1. JSON VALIDITY RATE:",
    "   - Few-Shot and Chain-of-Thought approaches both perform well",
    "   - Providing examples (Few-Shot) or explicit reasoning steps improves format compliance",
    "   - Zero-Shot may sometimes produce markdown-formatted JSON",
    "",
    "2. ACCURACY:",
    "   - Few-Shot approach typically achieves highest accuracy",
    "   - Chain-of-Thought provides better reasoning transparency",
    "   - Zero-Shot works but may miss nuances in sentiment",
    "",
    "3. RELIABILITY & CONSISTENCY:",
    "   - Few-Shot: Most reliable due to example guidance",
    "   - Chain-of-Thought: Good reliability with transparent reasoning",
    "   - Zero-Shot: Lower consistency but faster responses",
    "",
    "4. RECOMMENDATION:",
    "   ✓ For production use: FEW-SHOT prompting",
    "     - Best balance of accuracy, JSON validity, and consistency",
    "     - Examples guide the model to correct format and reasoning",
    "   ",
    "   ✓ For explainability: CHAIN-OF-THOUGHT prompting",
    "     - Transparent reasoning helps understand model decisions",
    "     - Better for debugging and gaining user trust",
    "   ",
    "   ✓ For speed/efficiency: ZERO-SHOT prompting",
    "     - Faster responses with fewer tokens",
    "     - Good for high-volume applications",
    "",
]))

In [None]:
# Export results to CSV for documentation
results_summary = pd.DataFrame([
    {
        'Approach': 'Zero-Shot',
        'JSON_Validity_Rate': f"{approach1_metrics['json_validity_rate']:.1f}%",
        'Accuracy': f"{approach1_metrics['accuracy']:.1f}%",
        'Consistency': f"{calculate_consistency(approach1_metrics['results']):.1f}%",
        'Total_Reviews_Tested': approach1_metrics['total_reviews']
    },
    {
        'Approach': 'Few-Shot',
        'JSON_Validity_Rate': f"{approach2_metrics['json_validity_rate']:.1f}%",
        'Accuracy': f"{approach2_metrics['accuracy']:.1f}%",
        'Consistency': f"{calculate_consistency(approach2_metrics['results']):.1f}%",
        'Total_Reviews_Tested': approach2_metrics['total_reviews']
,
,
{approach3_metrics['json_validity_rate']:.1f}%",