<a href="https://colab.research.google.com/github/slkreddy/misc-utilities/blob/main/Comparative_Analysis_of_Prompt_Engineering_Scenarios_(Gemini_API).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
import os
import json
import random
import pandas as pd
import numpy as np
import statistics
from google import genai
from google.genai import types
from sklearn.metrics import accuracy_score
from IPython.display import display, Markdown
from google.colab import userdata


# --- Gemini Client Initialization ---
# 1. Get your API Key from Google AI Studio.
# 2. In Google Colab, go to the "Secrets" tab (lock icon on the left panel).
# 3. Add a new secret named 'GEMINI_API_KEY' and paste your key there.
# 4. Check the box to "Notebook access" for this secret.

try:
    # Use the environment variable to access the key securely
    API_KEY = "API_KEY"
    if not API_KEY:
        raise ValueError("GOOGLE_API_KEY secret not found or is empty.")

    # Initialize the client
    client = genai.Client(api_key=API_KEY)
    BASE_MODEL = 'gemini-2.5-flash'
    # Ensure consistent experiments
    np.random.seed(42)
    random.seed(42)
    print(f"Client initialized successfully using base model: {BASE_MODEL}")

except Exception as e:
    print(f"Error during setup: {e}")
    print("Please ensure you have set the 'GEMINI_API_KEY' in Colab Secrets.")

# Required for installation if running in a fresh environment
# !pip install google-genai pandas scikit-learn --quiet
# print("Installation complete.")

Client initialized successfully using base model: gemini-2.5-flash


**2. Dataset Definition**

In [37]:
TEST_DATASET = [
    {"document_text": "Annual revenue reached $1.5M with profit growth of 12%.", "true_label": "Financial Report", "true_category": "Finance"},
    {"document_text": "Balance sheet shows liabilities exceeding current assets.", "true_label": "Financial Report", "true_category": "Finance"},
    {"document_text": "Quarterly cash flow analysis demonstrates strong liquidity.", "true_label": "Financial Report", "true_category": "Finance"},
    {"document_text": "Investor update reports decrease in overhead costs.", "true_label": "Financial Report", "true_category": "Finance"},
    {"document_text": "Financial summary reveals Q4 net margin improvement.", "true_label": "Financial Report", "true_category": "Finance"},

    {"document_text": "This contract is subject to arbitration under state law.", "true_label": "Legal Contract", "true_category": "Legal"},
    {"document_text": "The party agrees to comply with all clauses stated herein.", "true_label": "Legal Contract", "true_category": "Legal"},
    {"document_text": "This agreement becomes effective on the signing date.", "true_label": "Legal Contract", "true_category": "Legal"},
    {"document_text": "The undersigned agrees to confidentiality terms specified below.", "true_label": "Legal Contract", "true_category": "Legal"},
    {"document_text": "The contract remains valid until mutual termination.", "true_label": "Legal Contract", "true_category": "Legal"},

    {"document_text": "Follow these instructions to configure the API endpoint securely.", "true_label": "Technical Manual", "true_category": "Technology"},
    {"document_text": "Deploy the Docker container using the following YAML config.", "true_label": "Technical Manual", "true_category": "Technology"},
    {"document_text": "Install Python dependencies using the provided requirements.txt file.", "true_label": "Technical Manual", "true_category": "Technology"},
    {"document_text": "Use 'kubectl apply' to deploy cluster applications.", "true_label": "Technical Manual", "true_category": "Technology"},
    {"document_text": "Execute the shell script to initialize system startup.", "true_label": "Technical Manual", "true_category": "Technology"},

    {"document_text": "The cafe serves artisanal coffee and handmade pastries.", "true_label": "Other", "true_category": "Lifestyle"},
    {"document_text": "The football team’s performance improved this season.", "true_label": "Other", "true_category": "Lifestyle"},
    {"document_text": "The movie offers a thrilling plot with unexpected twists.", "true_label": "Other", "true_category": "Lifestyle"},
    {"document_text": "The new art exhibit explores modern environmental themes.", "true_label": "Other", "true_category": "Lifestyle"},
    {"document_text": "The garden blooms beautifully during monsoon season.", "true_label": "Other", "true_category": "Lifestyle"}
]

VALID_LABELS = ["Financial Report", "Legal Contract", "Technical Manual", "Other"]



3. Scenario Prompts & Configuration

In [38]:
JSON_SCHEMA = types.Schema(
    type=types.Type.OBJECT,
    properties={
        "predicted_label": types.Schema(type=types.Type.STRING, enum=VALID_LABELS),
        "reasoning": types.Schema(type=types.Type.STRING)
    },
    required=["predicted_label", "reasoning"]
)

SCENARIOS = {
    "Scenario 1: Few-Shot (Plain Text)": {
        "prompt": (
            "You are an expert document classifier. Classify the following text into one of the categories:\n"
            "Categories: Financial Report, Legal Contract, Technical Manual, Other.\n"
            "Examples:\n"
            "Text: 'The company revenue increased quarter-over-quarter.' → Financial Report\n"
            "Text: 'This agreement terminates upon written notice.' → Legal Contract\n"
            "Text: 'Run the build command with make all.' → Technical Manual\n"
            "Text: 'The singer released a new album last month.' → Other\n"
            "Now classify:\nText: {document_text}\nAnswer STRICTLY with only the category name, nothing else."
        ),
        "temperature": 0.7,
        "json_schema": None,
        "system_instruction": "You are a document classifier. Output only the category name."
    },
    "Scenario 2: Structured Output (T=0)": {
        "prompt": (
            "Classify the following text strictly as JSON with keys 'predicted_label' and 'reasoning'.\n"
            "Text: {document_text}"
        ),
        "temperature": 0.0,
        "json_schema": JSON_SCHEMA,
        "system_instruction": "You are an expert document classifier. Respond only with the requested JSON object."
    },
    "Scenario 3: Structured Output (T=0) + CoT Prompting": {
        "prompt": (
            "First, think step-by-step to determine the correct category for this document. "
            "Then, output the final result strictly as JSON with keys 'predicted_label' and 'reasoning'.\n"
            "Document: {document_text}"
        ),
        "temperature": 0.0,
        "json_schema": JSON_SCHEMA,
        "system_instruction": "You are an expert document classifier. Respond only with the requested JSON object."
    },
    "Scenario 4: CoT (Plain Text)": {
        "prompt": (
            "Use chain-of-thought reasoning to classify this text into one of the categories: "
            "Financial Report, Legal Contract, Technical Manual, Other. "
            "Explain your reasoning clearly first, then output the final classification label on a new line.\n"
            "Document: {document_text}"
        ),
        "temperature": 0.8,
        "json_schema": None,
        "system_instruction": None
    },
    "Scenario 5: High-Temp Single Model (Simulated Checkpoints)": {
        "prompt": (
            "Classify the following text into one of the categories: "
            "Financial Report, Legal Contract, Technical Manual, Other.\n"
            "Text: {document_text}\nAnswer with the category name only."
        ),
        # High temperature increases variance, simulating different model 'perspectives'
        "temperature": 0.9,
        "json_schema": None,
        "system_instruction": "You are a creative classifier. Output only the category name."
    }
}


**3. Scenario Prompts & Configuration**

In [39]:
# Define the reusable JSON schema for structured output
JSON_SCHEMA = types.Schema(
    type=types.Type.OBJECT,
    properties={
        "predicted_label": types.Schema(type=types.Type.STRING, enum=VALID_LABELS),
        "reasoning": types.Schema(type=types.Type.STRING)
    },
    required=["predicted_label", "reasoning"]
)

SCENARIOS = {
    "Few-Shot Prompting": {
        "prompt": (
            "You are an expert document classifier. Classify the following text into one of the categories:\n"
            "Categories: Financial Report, Legal Contract, Technical Manual, Other.\n"
            "Examples:\n"
            "Text: 'The company revenue increased quarter-over-quarter.' → Financial Report\n"
            "Text: 'This agreement terminates upon written notice.' → Legal Contract\n"
            "Text: 'Run the build command with make all.' → Technical Manual\n"
            "Text: 'The singer released a new album last month.' → Other\n"
            "Now classify:\nText: {document_text}\nAnswer STRICTLY with only the category name, nothing else."
        ),
        "temperature": 0.7,
        "json_schema": None,
        "system_instruction": "You are a document classifier. Output only the category name."
    },
    "Structured Output (T=0)": {
        "prompt": (
            "Classify the following text strictly as JSON with keys 'predicted_label' and 'reasoning'.\n"
            "Text: {document_text}"
        ),
        "temperature": 0.0,
        "json_schema": JSON_SCHEMA,
        "system_instruction": "You are an expert document classifier. Respond only with the requested JSON object."
    },
    "Structured Output + Self-Consistent CoT": {
        "prompt": (
            "First, think step-by-step to determine the correct category for this document. "
            "Then, output the final result strictly as JSON with keys 'predicted_label' and 'reasoning'.\n"
            "Document: {document_text}"
        ),
        "temperature": 0.0,
        "json_schema": JSON_SCHEMA,
        "system_instruction": "You are an expert document classifier. Respond only with the requested JSON object."
    },
    "Self-Consistent CoT": {
        "prompt": (
            "Use chain-of-thought reasoning to classify this text into one of the categories: "
            "Financial Report, Legal Contract, Technical Manual, Other. "
            "Explain your reasoning clearly first, then output the final classification label on a new line.\n"
            "Document: {document_text}"
        ),
        "temperature": 0.8,
        "json_schema": None,
        "system_instruction": None
    },
    "Majority Vote (5 LLM Checkpoints)": {
        "prompt": (
            "Classify the following text into one of the categories: "
            "Financial Report, Legal Contract, Technical Manual, Other.\n"
            "Text: {document_text}\nAnswer with the category name only."
        ),
        # High temperature increases variance, simulating different model 'perspectives'
        "temperature": 0.9,
        "json_schema": None,
        "system_instruction": "You are a creative classifier. Output only the category name."
    }
}


**4. Experiment Execution Functions**

In [40]:
def call_gemini(prompt, temperature, model=BASE_MODEL, system_instruction=None, json_schema=None):
    """Handles the Gemini API call with dynamic configuration."""
    config_params = {"temperature": temperature}

    if system_instruction:
        config_params["system_instruction"] = system_instruction
    if json_schema:
        config_params["response_mime_type"] = "application/json"
        config_params["response_schema"] = json_schema

    config = types.GenerateContentConfig(**config_params)

    try:
        response = client.models.generate_content(
            model=model,
            contents=prompt,
            config=config
        )
        return response.text.strip()
    except Exception as e:
        # Fallback for errors
        if json_schema:
            return json.dumps({"predicted_label": "Other", "reasoning": f"API Error: {e}"})
        return f"API Error: {e}"

def parse_response(response_text, is_json_expected):
    """Parses the model's output for the predicted label."""
    if is_json_expected:
        try:
            data = json.loads(response_text)
            pred_label = data.get("predicted_label", "Other")
            reasoning = data.get("reasoning", "")
            # Ensure the parsed label is one of the valid ones, if not, treat as failure
            if pred_label in VALID_LABELS:
                return pred_label, reasoning, True
            else:
                return "Other", f"JSON label invalid: {pred_label}", False
        except Exception as e:
            return "Other", f"JSON Parse Error: {e}", False
    else:
        # For plain text output, search for the label within the response
        response_text_lower = response_text.lower()
        for label in VALID_LABELS:
            # Check for exact or close match to extract the label
            if label.lower() in response_text_lower and (len(response_text_lower) < 50 or response_text_lower.startswith(label.lower())):
                return label, response_text, True
        return "Other", response_text, False

def run_experiment(scenario_name, details, dataset):
    """Runs a single scenario against the entire dataset."""
    results = []
    print(f"Running {scenario_name} ...")

    is_json_expected = details["json_schema"] is not None

    for item in dataset:
        formatted_prompt = details["prompt"].format(document_text=item["document_text"])

        raw_response = call_gemini(
            prompt=formatted_prompt,
            temperature=details["temperature"],
            system_instruction=details.get("system_instruction"),
            json_schema=details["json_schema"]
        )

        pred_label, reasoning, success = parse_response(raw_response, is_json_expected)

        results.append({
            "document_text": item["document_text"],
            "true_label": item["true_label"],
            "predicted_label": pred_label,
            "reasoning": reasoning,
            "success": success
        })

    df = pd.DataFrame(results)
    return df


# **5. Scenario Loop and Measurement**



In [41]:
scenario_results = {}
accuracy_summary = []

for scenario_name, details in SCENARIOS.items():
    df = run_experiment(scenario_name, details, TEST_DATASET)
    acc = accuracy_score(df["true_label"], df["predicted_label"])
    # JSON parse success rate is only meaningful for Scenarios 2 and 3
    json_success_rate = df["success"].mean() if "Structured" in scenario_name else 1.0

    scenario_results[scenario_name] = df
    accuracy_summary.append({
        "Scenario": scenario_name,
        "Accuracy": round(acc * 100, 2),
        "JSON Parse Success Rate": round(json_success_rate * 100, 2)
    })


Running Few-Shot Prompting ...
Running Structured Output (T=0) ...
Running Structured Output + Self-Consistent CoT ...
Running Self-Consistent CoT ...
Running Majority Vote (5 LLM Checkpoints) ...


# **6. Ensemble Voting of Distinct LLMs (Scenario 6)**

In [42]:
# Use a mix of available Gemini models for the ensemble
VOTE_MODELS = [
    "gemini-2.5-flash",
    "gemini-2.5-pro",
    "gemini-2.5-flash",
    "gemini-2.5-pro",
    "gemini-2.5-flash"
]

def majority_vote_ensemble(document_text, temperature=0.6):
    """Runs classification across multiple models and takes a majority vote."""
    predictions = []
    reasoning_traces = []
    for model in VOTE_MODELS:
        # Use a simple, reliable prompt for voting
        prompt = (
            f"Classify the following document as one of: {', '.join(VALID_LABELS)}.\n"
            f"Text: {document_text}\n"
            "Answer with the category name only."
        )

        raw_response = call_gemini(
            prompt=prompt,
            temperature=temperature,
            model=model
        )

        pred_label, reason, _ = parse_response(raw_response, is_json_expected=False)
        predictions.append(pred_label)
        reasoning_traces.append(f"{model} predicted: {pred_label}")

    try:
        # Determine the most frequent prediction
        voted_label = statistics.mode(predictions)
    except statistics.StatisticsError:
        # If there's a tie, randomly select one of the predictions
        voted_label = random.choice(predictions)

    reason_concat = " | ".join(reasoning_traces)
    return voted_label, reason_concat, True

def run_majority_voting_llms(dataset):
    results = []
    print("Running Majority Vote (5 Diff LLMs) ...")
    for item in dataset:
        pred_label, reasoning, success = majority_vote_ensemble(item["document_text"])
        results.append({
            "document_text": item["document_text"],
            "true_label": item["true_label"],
            "predicted_label": pred_label,
            "reasoning": reasoning,
            "success": success
        })
    df = pd.DataFrame(results)

    print("Completed Running Majority Vote (5 Diff LLMs) ...", df.head())
    return df

df_majority = run_majority_voting_llms(TEST_DATASET)
acc_majority = accuracy_score(df_majority["true_label"], df_majority["predicted_label"])
# Since no JSON is expected, success rate is 1.0
success_rate = 1.0
accuracy_summary.append({
    "Scenario": "Majority Vote (5 Diff LLMs)",
    "Accuracy": round(acc_majority * 100, 2),
    "JSON Parse Success Rate": round(success_rate * 100, 2)
})
scenario_results["Majority Vote (5 Diff LLMs)"] = df_majority





Running Majority Vote (5 Diff LLMs) ...
Completed Running Majority Vote (5 Diff LLMs) ...                                        document_text        true_label  \
0  Annual revenue reached $1.5M with profit growt...  Financial Report   
1  Balance sheet shows liabilities exceeding curr...  Financial Report   
2  Quarterly cash flow analysis demonstrates stro...  Financial Report   
3  Investor update reports decrease in overhead c...  Financial Report   
4  Financial summary reveals Q4 net margin improv...  Financial Report   

    predicted_label                                          reasoning  \
0             Other  gemini-2.5-flash predicted: Other | gemini-2.5...   
1  Financial Report  gemini-2.5-flash predicted: Other | gemini-2.5...   
2  Financial Report  gemini-2.5-flash predicted: Financial Report |...   
3  Financial Report  gemini-2.5-flash predicted: Financial Report |...   
4  Financial Report  gemini-2.5-flash predicted: Financial Report |...   

   success  
0     T

# **7. Results Visualization**

In [49]:
summary_df = pd.DataFrame(accuracy_summary)
display(Markdown("### Comparative Performance Summary"))
# Reset pandas display options to ensure all columns are shown
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

display(Markdown(summary_df.to_markdown(index=False)))

display(Markdown("\n### Preview Detailed Results (Scenario 3: Structured Output + CoT)"))
# Preview detailed results for one of the structured output scenarios
display(scenario_results["Structured Output + Self-Consistent CoT"].head())


### Comparative Performance Summary

| Scenario                                |   Accuracy |   JSON Parse Success Rate |
|:----------------------------------------|-----------:|--------------------------:|
| Few-Shot Prompting                      |         80 |                       100 |
| Structured Output (T=0)                 |         25 |                       100 |
| Structured Output + Self-Consistent CoT |         25 |                       100 |
| Self-Consistent CoT                     |         25 |                       100 |
| Majority Vote (5 LLM Checkpoints)       |         25 |                       100 |
| Majority Vote (5 Diff LLMs)             |         45 |                       100 |


### Preview Detailed Results (Scenario 3: Structured Output + CoT)

Unnamed: 0,document_text,true_label,predicted_label,reasoning,success
0,Annual revenue reached $1.5M with profit growt...,Financial Report,Other,API Error: 429 RESOURCE_EXHAUSTED. {'error': {...,True
1,Balance sheet shows liabilities exceeding curr...,Financial Report,Other,API Error: 429 RESOURCE_EXHAUSTED. {'error': {...,True
2,Quarterly cash flow analysis demonstrates stro...,Financial Report,Other,API Error: 429 RESOURCE_EXHAUSTED. {'error': {...,True
3,Investor update reports decrease in overhead c...,Financial Report,Other,API Error: 429 RESOURCE_EXHAUSTED. {'error': {...,True
4,Financial summary reveals Q4 net margin improv...,Financial Report,Other,API Error: 429 RESOURCE_EXHAUSTED. {'error': {...,True
