<a href="https://colab.research.google.com/github/slkreddy/misc-utilities/blob/main/Comparative_Analysis_of_Prompt_Engineering_Scenarios_(Open_AI)_11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import json
import random
import pandas as pd
import numpy as np
import statistics
# from google import genai # Remove Gemini import
# from google.genai import types # Remove Gemini types import
from openai import OpenAI # Import OpenAI library
from sklearn.metrics import accuracy_score
from IPython.display import display, Markdown
from google.colab import userdata


# --- API Client Initialization ---
# 1. Create an API key in your OpenAI account.
# 2. In Google Colab, open the "Secrets" tab in the left panel.
# 3. Add a new secret named 'OPENAI_API_KEY' and paste your key there.
# 4. Enable "Notebook access" for this secret.
#
# Never paste a key into the notebook itself, not even inside a comment: notebooks are
# shared and committed as-is.
# NOTE(review): an earlier revision of this cell carried a literal Gemini key in a comment;
# treat that key as compromised and revoke it.
#
# Fresh environment only (run in its own cell):
# %pip install -q openai pandas scikit-learn tiktoken tabulate

# Defined outside the try block so that later cells (e.g. function defaults that reference
# BASE_MODEL) still load even when the client setup below fails.
BASE_MODEL = 'gpt-3.5-turbo'

# Seed both RNGs so dataset sampling is repeatable.
np.random.seed(42)
random.seed(42)

# Stays None when setup fails; API helpers then report the failure instead of hitting a NameError.
client = None

try:
    # Read the key from Colab Secrets rather than from source code.
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
    if not OPENAI_API_KEY:
        raise ValueError("OPENAI_API_KEY secret not found or is empty.")

    client = OpenAI(api_key=OPENAI_API_KEY)
    print(f"Client initialized successfully using base model: {BASE_MODEL}")

except Exception as e:
    # Best-effort setup: report the problem and keep the notebook loadable.
    print(f"Error during setup: {e}")
    print("Please ensure you have set the 'OPENAI_API_KEY' in Colab Secrets.")

Client initialized successfully using base model: gpt-3.5-turbo


# **2. Dataset Definition**

In [None]:
# Evaluation corpus: one dict per document.
#   document_text - the snippet shown to the model
#   true_label    - the label predictions are scored against (each value also appears in VALID_LABELS)
#   true_category - a coarser domain tag; not read by any code in this notebook section
TEST_DATASET = [
    # Finance/Business/Legal/Technical nuances
    {"document_text": "Partial Q2 revenue reported amidst ongoing merger talks. References to FASB standard revisions complicate net asset categorization.", "true_label": "Financial Report", "true_category": "Finance"},
    {"document_text": "Profit & loss consolidation shows overlapping obligations with regional subsidiaries, according to regulation SEC-405.", "true_label": "Financial Report", "true_category": "Finance"},
    {"document_text": "Annual audit memo contains both forward-looking statements and non-cash adjustments due to IFRS-9.", "true_label": "Financial Report", "true_category": "Finance"},
    {"document_text": "The quarterly review includes embedded legal disclaimers and warranty breaches.", "true_label": "Legal Analysis", "true_category": "Legal"},
    {"document_text": "Balance sheet reconciliations are challenged by pending compliance litigation.", "true_label": "Financial Report", "true_category": "Finance"},
    {"document_text": "The investment committee’s update references amendments to fiduciary duty, but also risk tolerance thresholds.", "true_label": "Legal Analysis", "true_category": "Finance/Legal"},

    # Contracts and HR policies
    {"document_text": "The employment agreement outlines IP assignment, opt-out provisions for remote work, and mandatory arbitration.", "true_label": "Employment Contract", "true_category": "Legal/HR"},
    {"document_text": "Supplier contract drafts include escalation, exclusivity, and NDA terms, reviewed for antitrust compatibility.", "true_label": "Supply Chain Agreement", "true_category": "Legal/Procurement"},
    {"document_text": "Data processing addendum includes cross-jurisdiction GDPR compliance, retention schedules, and DSR workflow.", "true_label": "Data Policy Addendum", "true_category": "Legal/Data"},
    {"document_text": "Consulting contract with variable deliverables and rolling renewal clause, includes non-compete protocol.", "true_label": "Consulting Contract", "true_category": "Legal/Business"},
    {"document_text": "The freelance agreement budgets for milestone payments with liability caps, but omits exclusivity.", "true_label": "Freelance Agreement", "true_category": "Legal/Finance"},
    {"document_text": "Board resolution covering indemnity provisions and deferred compensation for key management.", "true_label": "Board Resolution", "true_category": "Legal/Governance"},

    # Technical docs with ambiguous context
    {"document_text": "Docker installation guide includes Kubernetes pod affinity configuration and references legacy Swarm CLI compatibility.", "true_label": "Tech Manual", "true_category": "Technology"},
    {"document_text": "API spec discusses OAuth2, custom error classes, and layered RBAC, mixed with product requirement annex.", "true_label": "API Specification", "true_category": "Technology/Product"},
    {"document_text": "The troubleshooting FAQ interleaves networking topology illustrations with compliance checkpoint reminders.", "true_label": "Support FAQ", "true_category": "Technology/Compliance"},
    {"document_text": "System deployment manual covers cloud failover, secret rotations, and hybrid on-prem protocol fallback.", "true_label": "Deployment Manual", "true_category": "Technology/IT"},
    {"document_text": "SDLC workflow diagrams embedded within SRE best practices and incident response flow.", "true_label": "SRE Protocol", "true_category": "Technology/Operations"},
    {"document_text": "Data pipeline whitepaper mixes ETL transformations, GDPR recommendations, and machine learning benchmarks.", "true_label": "Data Engineering Report", "true_category": "Technology/Data"},

    # News/Media/Entertainment ambiguity
    {"document_text": "The editorial critiques both policy shifts in monetary regulation and pop culture’s response.", "true_label": "Editorial Article", "true_category": "Media/Economics"},
    {"document_text": "Feature story blending investigative reporting on antitrust law and music rights disputes.", "true_label": "Feature Article", "true_category": "Media/Legal"},
    {"document_text": "Concert review discusses artist’s social activism and new streaming platform policy changes.", "true_label": "Music Review", "true_category": "Media/Entertainment"},
    {"document_text": "Documentary summary that weaves together climate policy, local business impact, and visual arts.", "true_label": "Documentary Synopsis", "true_category": "Media/Environment"},
    {"document_text": "Film festival guide covers both jury procedures and independent funding rules.", "true_label": "Festival Guide", "true_category": "Media/Legal"},
    {"document_text": "Opinion piece explores blockchain application in journalism and copyright reform.", "true_label": "Opinion Article", "true_category": "Media/Tech"},

    # Science/Healthcare/Research tricky boundaries
    {"document_text": "Research note reviews meta-analysis methods in biostatistics and clinical ethics regulation.", "true_label": "Research Note", "true_category": "Science/Healthcare"},
    {"document_text": "Hospital operations report addresses staff scheduling algorithm and patient confidentiality safeguards.", "true_label": "Healthcare Operations", "true_category": "Healthcare/IT"},
    {"document_text": "Clinical trial paper discusses phase II recruitment and decentralized identity management.", "true_label": "Clinical Trial Report", "true_category": "Healthcare/Technology"},
    {"document_text": "Nutrition guide includes molecular composition tables and local regulatory approvals.", "true_label": "Nutrition Guide", "true_category": "Science/Policy"},
    {"document_text": "Genetics study uses multi-omics visualization and legal status of bio-data sovereignty.", "true_label": "Genomics Study", "true_category": "Science/Legal"},
    {"document_text": "Medical software manual integrates privacy compliance and algorithm explainability.", "true_label": "Medical Software Manual", "true_category": "Healthcare/Technology"},

    # Lifestyle/Culture/Education/Commerce complexity
    {"document_text": "Travelogue evaluates urban sustainability metrics and experiential hospitality reviews.", "true_label": "Travelogue", "true_category": "Lifestyle/Environment"},
    {"document_text": "University program handbook mixes admission benchmarks with anti-discrimination code.", "true_label": "Program Handbook", "true_category": "Education/Policy"},
    {"document_text": "Entrepreneur profile highlights cross-border trade innovation and startup governance challenges.", "true_label": "Entrepreneur Profile", "true_category": "Business/Culture"},
    {"document_text": "Home appliance review presents IoT integrations and regional energy standards.", "true_label": "Appliance Review", "true_category": "Lifestyle/Tech"},
    {"document_text": "Culinary event brochure covers nutrition data and food licensing.", "true_label": "Event Brochure", "true_category": "Lifestyle/Health"},
    {"document_text": "Textbook chapter surveys both empirical math models and UNESCO best practices.", "true_label": "Textbook Chapter", "true_category": "Education/Science"},

    # Social, automation, and policy overlaps
    {"document_text": "Smart home assessment integrates n8n workflow automation and housing equity impact analysis.", "true_label": "Home Automation Review", "true_category": "Lifestyle/Tech/Social"},
    {"document_text": "Civic technology bulletin spotlights open source agentic platforms and government adoption barriers.", "true_label": "Civic Tech Bulletin", "true_category": "Tech/Policy"},
    {"document_text": "LinkedIn strategy memo details global time zone engagement and post optimization compliance.", "true_label": "Social Media Memo", "true_category": "Business/Tech"},
    {"document_text": "Podcast recap covers moon landing video discussions and AI influencer metrics.", "true_label": "Podcast Recap", "true_category": "Media/Tech"},
    {"document_text": "Book summary with semantic graph illustrations and PII guardrails.", "true_label": "Book Summary", "true_category": "Education/Tech/Security"},
    {"document_text": "Consumer electronics announcement mixes supply chain forecasts and E-waste recycling mandates.", "true_label": "Product Announcement", "true_category": "Business/Policy"}
]

# Closed set of labels a prediction may resolve to. Response parsing matches model output
# against this list, and anything that matches none of them is reported as "Other".
# Each row below groups the labels of one thematic section of TEST_DATASET.
VALID_LABELS = [
    "Financial Report", "Legal Analysis", "Employment Contract", "Supply Chain Agreement", "Data Policy Addendum", "Consulting Contract", "Freelance Agreement", "Board Resolution",
    "Tech Manual", "API Specification", "Support FAQ", "Deployment Manual", "SRE Protocol", "Data Engineering Report",
    "Editorial Article", "Feature Article", "Music Review", "Documentary Synopsis", "Festival Guide", "Opinion Article",
    "Research Note", "Healthcare Operations", "Clinical Trial Report", "Nutrition Guide", "Genomics Study", "Medical Software Manual",
    "Travelogue", "Program Handbook", "Entrepreneur Profile", "Appliance Review", "Event Brochure", "Textbook Chapter",
    "Home Automation Review", "Civic Tech Bulletin", "Social Media Memo", "Podcast Recap", "Book Summary", "Product Announcement"
]




# 3. Scenario Prompts & Configuration




In [None]:
# JSON Schema (as a plain dict) describing the object the structured-output scenarios ask for:
#   predicted_label - must be one of VALID_LABELS
#   reasoning       - free-text justification
# Scenarios reference it through their "json_schema" entry; run_experiment treats a
# non-None value as "expect JSON" when parsing the response.
JSON_SCHEMA = { # Define a dictionary that represents the schema
    "type": "object",
    "properties": {
        "predicted_label": {"type": "string", "enum": VALID_LABELS},
        "reasoning": {"type": "string"}
    },
    "required": ["predicted_label", "reasoning"]
}


# Label list shown to the model in every prompt. It is built from VALID_LABELS because that
# is the set predictions are parsed and scored against; offering the model any other
# taxonomy makes correct answers unparseable. Braces are escaped because the prompt
# templates are later filled with str.format().
_LABEL_CHOICES = ", ".join(VALID_LABELS).replace("{", "{{").replace("}", "}}")

# One entry per prompting strategy. Keys of each entry:
#   prompt        - str.format template with a single {document_text} placeholder
#   temperature   - sampling temperature passed to the API
#   json_schema   - JSON_SCHEMA when the reply must be JSON, otherwise None
#   system_prompt - optional system message (None to send no system message)
# NOTE(review): run_experiment issues one API call per document for every scenario, so the
# "Self-Consistent" and "Majority Vote (5 LLM Checkpoints)" entries do not currently sample
# or vote over several completions - confirm whether that is intended.
SCENARIOS = {
    "Few-Shot Prompting": {
        "prompt": (
            "You are an expert document classifier. Classify the following text into one of the categories:\n"
            "Categories: " + _LABEL_CHOICES + ".\n"
            "Examples:\n"
            "Text: 'The company revenue increased quarter-over-quarter.' → Financial Report\n"
            "Text: 'This employment agreement terminates upon written notice.' → Employment Contract\n"
            "Text: 'Run the build command with make all.' → Tech Manual\n"
            "Text: 'The singer released a new album last month.' → Music Review\n"
            "Now classify:\nText: {document_text}\nAnswer STRICTLY with only the category name, nothing else."
        ),
        "temperature": 0.7,
        "json_schema": None,
        "system_prompt": "You are a document classifier. Output only the category name."
    },
    "Structured Output (T=0)": {
        "prompt": (
            "Classify the following text strictly as JSON with keys 'predicted_label' and 'reasoning'.\n"
            "'predicted_label' must be exactly one of: " + _LABEL_CHOICES + ".\n"
            "Text: {document_text}"
        ),
        "temperature": 0.0,
        "json_schema": JSON_SCHEMA,
        "system_prompt": "You are an expert document classifier. Respond only with the requested JSON object."
    },
    "Structured Output + Self-Consistent CoT": {
        "prompt": (
            "First, think step-by-step to determine the correct category for this document. "
            "Then, output the final result strictly as JSON with keys 'predicted_label' and 'reasoning'.\n"
            "'predicted_label' must be exactly one of: " + _LABEL_CHOICES + ".\n"
            "Document: {document_text}"
        ),
        "temperature": 0.0,
        "json_schema": JSON_SCHEMA,
        "system_prompt": "You are an expert document classifier. Respond only with the requested JSON object."
    },
    "Self-Consistent CoT": {
        "prompt": (
            "Use chain-of-thought reasoning to classify this text into one of the categories: "
            + _LABEL_CHOICES + ". "
            "Explain your reasoning clearly first, then output the final classification label on a new line.\n"
            "Document: {document_text}"
        ),
        "temperature": 0.8,
        "json_schema": None,
        "system_prompt": None  # No system prompt for this scenario
    },
    "Majority Vote (5 LLM Checkpoints)": {
        "prompt": (
            "Classify the following text into one of the categories: "
            + _LABEL_CHOICES + ".\n"
            "Text: {document_text}\nAnswer with the category name only."
        ),
        # High temperature increases variance, simulating different model 'perspectives'
        "temperature": 0.9,
        "json_schema": None,
        "system_prompt": "You are a creative classifier. Output only the category name."
    }
}

# **4. Experiment Execution Functions**


In [None]:
def call_openai(prompt, temperature, model=BASE_MODEL, system_prompt=None, json_schema=None):
    """Send one chat-completion request and return the reply text.

    Args:
        prompt: User message content.
        temperature: Sampling temperature forwarded to the API.
        model: Model name; defaults to BASE_MODEL.
        system_prompt: Optional system message.
        json_schema: Optional JSON Schema dict. When given, JSON mode is requested and
            the schema is appended to the system message.

    Returns:
        The stripped reply text. This function never raises: on any failure it returns
        "API Error: ..." for text requests, or a JSON string with predicted_label "Other"
        and the error in "reasoning" for JSON requests.
    """
    try:
        system_parts = [system_prompt] if system_prompt else []
        if json_schema:
            # JSON mode only guarantees syntactically valid JSON; it does not enforce a
            # schema. The allowed keys/values therefore have to be spelled out in the
            # messages, otherwise the model is free to invent labels.
            system_parts.append(
                "Respond with a single JSON object that conforms to this JSON Schema:\n"
                + json.dumps(json_schema)
            )

        messages = []
        if system_parts:
            messages.append({"role": "system", "content": "\n\n".join(system_parts)})
        messages.append({"role": "user", "content": prompt})

        response_format = {"type": "json_object"} if json_schema else {"type": "text"}

        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            response_format=response_format
        )
        # content can be None (e.g. no text was produced); normalise to an empty string.
        content = response.choices[0].message.content
        return (content or "").strip()
    except Exception as e:
        # Fallback for errors: keep the return shape the caller's parser expects.
        if json_schema:
            return json.dumps({"predicted_label": "Other", "reasoning": f"API Error: {e}"})
        return f"API Error: {e}"


def call_gemini(prompt, temperature, model=BASE_MODEL, system_instruction=None, json_schema=None):
    """Send one generate_content request through a Gemini client, if one is active.

    Returns the stripped response text. When no `genai` module is loaded or `client` is
    not a `genai.Client`, a fixed error string is returned without calling any API.
    Request failures are reported as "API Error: ..." (wrapped in a JSON string with
    predicted_label "Other" when `json_schema` is given).
    """
    # Guard clause: only proceed when the Gemini SDK is present and `client` belongs to it.
    gemini_ready = 'genai' in globals() and genai.Client and isinstance(client, genai.Client)
    if not gemini_ready:
        return "API Error: Gemini client not initialized."

    # Assemble the generation config; optional settings are added only when supplied.
    config_params = {"temperature": temperature}
    if system_instruction:
        config_params["system_instruction"] = system_instruction
    if json_schema:
        config_params.update(
            response_mime_type="application/json",
            response_schema=json_schema,
        )

    try:
        response = client.models.generate_content(
            model=model,
            contents=prompt,
            config=config_params,
        )
    except Exception as e:
        error_text = f"API Error: {e}"
        if not json_schema:
            return error_text
        # JSON callers get the error in the same shape as a normal JSON reply.
        return json.dumps({"predicted_label": "Other", "reasoning": error_text})

    try:
        return response.text.strip()
    except Exception as e:
        error_text = f"API Error: {e}"
        if not json_schema:
            return error_text
        return json.dumps({"predicted_label": "Other", "reasoning": error_text})



def _match_valid_label(candidate):
    """Resolve a model-supplied label to an entry of VALID_LABELS, or return None.

    An exact (case-insensitive) match wins. Otherwise the first valid label that
    contains, or is contained in, the candidate is used. Empty or non-string candidates
    never match: an empty string is a substring of every label and would otherwise
    resolve to the first label in the list.
    """
    if not isinstance(candidate, str):
        return None
    needle = candidate.strip().lower()
    if not needle:
        return None

    for valid_label in VALID_LABELS:
        if valid_label.lower() == needle:
            return valid_label

    for valid_label in VALID_LABELS:
        lowered = valid_label.lower()
        if lowered in needle or needle in lowered:
            return valid_label
    return None


def parse_response(response_text, is_json_expected):
    """Extract the predicted label from a model reply.

    Args:
        response_text: Raw reply text from the model.
        is_json_expected: True when the reply should be a JSON object with
            'predicted_label' and 'reasoning' keys; False for free text.

    Returns:
        (label, reasoning, success). `label` is an entry of VALID_LABELS, or "Other"
        when nothing could be matched, in which case `success` is False. For JSON
        replies `reasoning` is the model's 'reasoning' field, or a description of the
        parse problem. For text replies it is the reply itself.
    """
    if is_json_expected:
        try:
            data = json.loads(response_text)
            pred_label_raw = data.get("predicted_label", "Other")
            reasoning = data.get("reasoning", "")
        except Exception as e:
            # Invalid JSON, or valid JSON that is not an object.
            return "Other", f"JSON Parse Error: {e}", False

        pred_label = _match_valid_label(pred_label_raw)
        if pred_label is None:
            return "Other", f"JSON label invalid: {pred_label_raw}", False
        return pred_label, reasoning, True

    # Plain text: a reply may mention several labels while reasoning and state its
    # answer at the end, so take the label whose last mention is furthest to the right.
    response_text_lower = (response_text or "").lower()
    best_label = None
    best_position = -1
    for label in VALID_LABELS:
        position = response_text_lower.rfind(label.lower())
        if position > best_position:
            best_label = label
            best_position = position

    if best_label is None:
        return "Other", response_text, False
    return best_label, response_text, True


def run_experiment(scenario_name, details, dataset):
    """Classify every document in `dataset` with one scenario's settings.

    `details` supplies the prompt template, temperature and, optionally, a system
    prompt and JSON schema. Returns a DataFrame with one row per document holding the
    text, true label, parsed prediction, reasoning and parse-success flag.
    """
    rows = []
    print(f"Running {scenario_name} ...")

    # A schema on the scenario means the reply has to be parsed as JSON.
    schema = details.get("json_schema")
    expects_json = schema is not None

    for record in dataset:
        text = record["document_text"]

        answer = call_openai(
            prompt=details["prompt"].format(document_text=text),
            temperature=details["temperature"],
            system_prompt=details.get("system_prompt"),
            json_schema=schema,
        )
        label, why, ok = parse_response(answer, expects_json)

        rows.append(
            {
                "document_text": text,
                "true_label": record["true_label"],
                "predicted_label": label,
                "reasoning": why,
                "success": ok,
            }
        )

    return pd.DataFrame(rows)

In [None]:
# Preview of the ensemble results: the per-model votes are packed into the 'reasoning' column.
# NOTE: `df_majority` is created by `run_majority_voting_llms(TEST_DATASET)` in the
# ensemble-voting section further down, so that cell has to be executed before this one.
display(Markdown("### Individual Model Predictions in Majority Vote (5 Diff LLMs)"))
display(df_majority[['document_text', 'true_label', 'predicted_label', 'reasoning']].head())

### Individual Model Predictions in Majority Vote (5 Diff LLMs)

Unnamed: 0,document_text,true_label,predicted_label,reasoning
0,Partial Q2 revenue reported amidst ongoing mer...,Financial Report,Financial Report,gpt-3.5-turbo predicted: Financial Report | gp...
1,Profit & loss consolidation shows overlapping ...,Financial Report,Financial Report,gpt-3.5-turbo predicted: Financial Report | gp...
2,Annual audit memo contains both forward-lookin...,Financial Report,Financial Report,gpt-3.5-turbo predicted: Financial Report | gp...
3,The quarterly review includes embedded legal d...,Legal Analysis,Financial Report,gpt-3.5-turbo predicted: Financial Report | gp...
4,Balance sheet reconciliations are challenged b...,Financial Report,Financial Report,gpt-3.5-turbo predicted: Financial Report | gp...


In [None]:
# Calculate and display the accuracy of each model that voted in the ensemble.
# NOTE: `df_majority` and `VOTE_MODELS` come from the ensemble-voting section further
# down; that cell has to be executed before this one.
individual_model_accuracies = {}

# Votes are recorded in 'reasoning' as "<model> predicted: <label>" joined by " | ".
# The recorded label is already canonical (or "Other"), so an exact case-insensitive
# lookup is enough to validate it.
labels_by_lowercase = {label.lower(): label for label in VALID_LABELS}

true_labels_by_model = {}
votes_by_model = {}

for _, row in df_majority.iterrows():
    # A model may appear several times in VOTE_MODELS. Each trace is consumed once, so
    # repeated entries are scored on their own votes rather than all on the first one.
    unclaimed_parts = row['reasoning'].split(" | ")
    for model_name in VOTE_MODELS:
        prefix = f"{model_name} predicted:"
        model_pred = "Other"  # Default if no trace is found for this vote
        for position, part in enumerate(unclaimed_parts):
            if part.startswith(prefix):
                # Slice by prefix length rather than splitting on ":" so that model
                # names containing a colon are handled.
                pred_text = part[len(prefix):].strip()
                model_pred = labels_by_lowercase.get(pred_text.lower(), "Other")
                del unclaimed_parts[position]
                break
        true_labels_by_model.setdefault(model_name, []).append(row["true_label"])
        votes_by_model.setdefault(model_name, []).append(model_pred)

# Accuracy per model, pooled over every vote that model cast.
for model_name, model_votes in votes_by_model.items():
    acc = accuracy_score(true_labels_by_model[model_name], model_votes)
    individual_model_accuracies[model_name] = round(acc * 100, 2)

display(Markdown("### Accuracy of Individual Models in Majority Vote (5 Diff LLMs)"))
for model, accuracy in individual_model_accuracies.items():
    print(f"{model}: {accuracy:.2f}%")

# Determine the most accurate model (skipped when there is nothing to compare).
if individual_model_accuracies:
    most_accurate_model = max(individual_model_accuracies, key=individual_model_accuracies.get)
    print(f"\nThe most accurate individual model is: {most_accurate_model} ({individual_model_accuracies[most_accurate_model]:.2f}%)")

### Accuracy of Individual Models in Majority Vote (5 Diff LLMs)

gpt-3.5-turbo: 76.19%
gpt-4o-mini: 88.10%
gpt-4-turbo: 90.48%
gpt-4o: 90.48%
gpt-4: 85.71%

The most accurate individual model is: gpt-4-turbo (90.48%)


# **5. Scenario Loop and Measurement**



In [None]:
# Per-scenario result frames and the summary rows built from them.
scenario_results = {}
accuracy_summary = []

# Upper bound on the number of documents evaluated per scenario (keeps API cost down).
subset_size = 10

# Draw the sample once so that every scenario is scored on the same documents.
random_subset = random.sample(TEST_DATASET, min(subset_size, len(TEST_DATASET)))


for scenario_name, details in SCENARIOS.items():
    df = run_experiment(scenario_name, details, random_subset)
    acc = accuracy_score(df["true_label"], df["predicted_label"])

    # The parse-success rate only carries information for JSON scenarios;
    # plain-text scenarios are reported as fully successful.
    if details.get("json_schema") is not None:
        json_success_rate = df["success"].mean()
    else:
        json_success_rate = 1.0

    scenario_results[scenario_name] = df
    accuracy_summary.append(
        {
            "Scenario": scenario_name,
            "Accuracy": round(acc * 100, 2),
            "JSON Parse Success Rate": round(json_success_rate * 100, 2),
        }
    )

Running Few-Shot Prompting ...
Running Structured Output (T=0) ...
Running Structured Output + Self-Consistent CoT ...
Running Self-Consistent CoT ...
Running Majority Vote (5 LLM Checkpoints) ...


# **6. Ensemble Voting of Distinct LLMs (Scenario 6)**

In [None]:
# Use a mix of available OpenAI models for the ensemble, prioritizing cheaper ones
# Each entry casts one vote per document; a model listed several times is queried
# several times and therefore carries proportionally more weight in the vote.
VOTE_MODELS = [
    "gpt-3.5-turbo",
    "gpt-4o-mini",
    "gpt-3.5-turbo",
    "gpt-4o-mini",
    "gpt-3.5-turbo" # Added more instances of cheaper models
]

def majority_vote_ensemble(document_text, temperature=0.6):
    """Classify one document with every entry of VOTE_MODELS and return the winning label.

    Args:
        document_text: Text to classify.
        temperature: Sampling temperature used for every vote.

    Returns:
        (voted_label, trace, success). `trace` lists each vote as
        "<model> predicted: <label>" joined by " | ". When several labels share the
        highest vote count, one of them is picked at random. With an empty VOTE_MODELS
        the result is ("Other", "", False).
    """
    if not VOTE_MODELS:
        # Nothing to vote on; report that instead of failing inside the tie-break.
        return "Other", "", False

    # The prompt is the same for every voter, so build it once.
    prompt = (
        f"Classify the following document as one of: {', '.join(VALID_LABELS)}.\n"
        f"Text: {document_text}\n"
        "Answer with the category name only."
    )

    predictions = []
    reasoning_traces = []
    for model in VOTE_MODELS:
        raw_response = call_openai(
            prompt=prompt,
            temperature=temperature,
            model=model
        )
        pred_label, _, _ = parse_response(raw_response, is_json_expected=False)
        predictions.append(pred_label)
        reasoning_traces.append(f"{model} predicted: {pred_label}")

    # Count votes by hand: statistics.mode() does not signal ties on current Python
    # versions (it returns the first mode encountered), so relying on its exception
    # would silently favour whichever model voted first.
    vote_counts = {}
    for label in predictions:
        vote_counts[label] = vote_counts.get(label, 0) + 1
    top_count = max(vote_counts.values())
    tied_labels = [label for label, count in vote_counts.items() if count == top_count]

    if len(tied_labels) == 1:
        voted_label = tied_labels[0]
    else:
        # Genuine tie: choose at random among the leading labels only.
        voted_label = random.choice(tied_labels)

    reason_concat = " | ".join(reasoning_traces)
    return voted_label, reason_concat, True

def run_majority_voting_llms(dataset):
    """Run the multi-model vote on every document and collect the outcomes.

    Returns a DataFrame with one row per document: text, true label, voted label,
    the per-model vote trace and the success flag reported by the ensemble.
    """
    rows = []
    print("Running Majority Vote (Different LLMs) ...") # Updated print message

    for record in dataset:
        text = record["document_text"]
        voted, trace, ok = majority_vote_ensemble(text)
        rows.append(
            {
                "document_text": text,
                "true_label": record["true_label"],
                "predicted_label": voted,
                "reasoning": trace,
                "success": ok,
            }
        )

    frame = pd.DataFrame(rows)
    print("Completed Running Majority Vote (Different LLMs) ...", frame.head()) # Updated print message
    return frame

# Score the multi-model ensemble on the complete dataset.
df_majority = run_majority_voting_llms(TEST_DATASET)
acc_majority = accuracy_score(df_majority["true_label"], df_majority["predicted_label"])

# The ensemble replies are plain text, so there is no JSON to parse; report 100 %.
success_rate = 1.0

# Register the run next to the prompt scenarios: first the summary row, then the frame.
majority_summary_row = {
    "Scenario": "Majority Vote (Different LLMs)",
    "Accuracy": round(acc_majority * 100, 2),
    "JSON Parse Success Rate": round(success_rate * 100, 2),
}
accuracy_summary.append(majority_summary_row)
scenario_results["Majority Vote (Different LLMs)"] = df_majority

Running Majority Vote (Different LLMs) ...
Completed Running Majority Vote (Different LLMs) ...                                        document_text        true_label  \
0  Partial Q2 revenue reported amidst ongoing mer...  Financial Report   
1  Profit & loss consolidation shows overlapping ...  Financial Report   
2  Annual audit memo contains both forward-lookin...  Financial Report   
3  The quarterly review includes embedded legal d...    Legal Analysis   
4  Balance sheet reconciliations are challenged b...  Financial Report   

    predicted_label                                          reasoning  \
0  Financial Report  gpt-3.5-turbo predicted: Financial Report | gp...   
1  Financial Report  gpt-3.5-turbo predicted: Financial Report | gp...   
2  Financial Report  gpt-3.5-turbo predicted: Financial Report | gp...   
3  Financial Report  gpt-3.5-turbo predicted: Financial Report | gp...   
4  Financial Report  gpt-3.5-turbo predicted: Financial Report | gp...   

   success  
0

# **6b. Scenario Loop on the Full Dataset (Scenarios 1–5)**

In [None]:
# Re-run every prompt scenario on the complete dataset.
# Results that do not belong to a SCENARIOS entry (for example the multi-model ensemble
# row added by an earlier cell) are carried over rather than wiped, so they still show
# up in the final summary. Re-running this cell replaces its own rows and never
# duplicates them.
scenario_results = dict(globals().get("scenario_results", {}))
preserved_summary_rows = [
    row for row in globals().get("accuracy_summary", [])
    if row["Scenario"] not in SCENARIOS
]

accuracy_summary = []
for scenario_name, details in SCENARIOS.items():
    df = run_experiment(scenario_name, details, TEST_DATASET)
    acc = accuracy_score(df["true_label"], df["predicted_label"])
    # JSON parse success rate is only meaningful for scenarios with a JSON schema
    json_success_rate = df["success"].mean() if details.get("json_schema") is not None else 1.0

    scenario_results[scenario_name] = df
    accuracy_summary.append({
        "Scenario": scenario_name,
        "Accuracy": round(acc * 100, 2),
        "JSON Parse Success Rate": round(json_success_rate * 100, 2)
    })

# Prompt scenarios first, then whatever other results were already recorded.
accuracy_summary.extend(preserved_summary_rows)

Running Few-Shot Prompting ...
Running Structured Output (T=0) ...
Running Structured Output + Self-Consistent CoT ...
Running Self-Consistent CoT ...
Running Majority Vote (5 LLM Checkpoints) ...


# **7. Results Visualization**

In [None]:
import tiktoken

# Initialize the tokenizer for the base model (gpt-3.5-turbo is a good default)
# `encoding` is a module-level object that count_tokens() below relies on.
try:
    encoding = tiktoken.encoding_for_model(BASE_MODEL)
except KeyError:
    # Fallback for models not directly supported by encoding_for_model
    # (the KeyError handled here is the lookup failure for an unrecognised model name).
    encoding = tiktoken.get_encoding("cl100k_base")


def count_tokens(text):
    """Return how many tokens the module-level `encoding` produces for `text`."""
    token_ids = encoding.encode(text)
    return len(token_ids)

# Running totals over every (scenario, document) pair.
total_tokens = 0
request_count = 0

# Only the text sent to the API is measured: the filled-in prompt plus the scenario's
# system prompt when it has one. Completions are not counted.
for scenario_name, details in SCENARIOS.items():
    for item in TEST_DATASET:
        formatted_prompt = details["prompt"].format(document_text=item["document_text"])
        system_prompt_text = details.get("system_prompt")

        prompt_tokens = count_tokens(formatted_prompt)
        if system_prompt_text:
            prompt_tokens = prompt_tokens + count_tokens(system_prompt_text)

        total_tokens += prompt_tokens
        request_count += 1

# Mean prompt size per request; falls back to 0 when nothing was counted.
if request_count > 0:
    average_tokens_per_request = total_tokens / request_count
else:
    average_tokens_per_request = 0

print(f"Average token count per request: {average_tokens_per_request:.2f}")

Average token count per request: 74.53


In [None]:
# Lift pandas' row and column display limits so no part of a table is elided.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# One row per scenario: accuracy and JSON parse-success rate, rendered as a markdown table.
summary_df = pd.DataFrame(accuracy_summary)
display(Markdown("### Comparative Performance Summary"))
display(Markdown(summary_df.to_markdown(index=False)))

# First rows of one structured-output scenario, to inspect labels and reasoning side by side.
preview_scenario = "Structured Output + Self-Consistent CoT"
display(Markdown("\n### Preview Detailed Results (Scenario 3: Structured Output + CoT)"))
display(scenario_results[preview_scenario].head())


### Comparative Performance Summary

| Scenario                                |   Accuracy |   JSON Parse Success Rate |
|:----------------------------------------|-----------:|--------------------------:|
| Few-Shot Prompting                      |       7.14 |                    100    |
| Structured Output (T=0)                 |      11.9  |                     16.67 |
| Structured Output + Self-Consistent CoT |      11.9  |                     23.81 |
| Self-Consistent CoT                     |      11.9  |                    100    |
| Majority Vote (5 LLM Checkpoints)       |       7.14 |                    100    |


### Preview Detailed Results (Scenario 3: Structured Output + CoT)

Unnamed: 0,document_text,true_label,predicted_label,reasoning,success
0,Partial Q2 revenue reported amidst ongoing mer...,Financial Report,Other,JSON label invalid: Finance,False
1,Profit & loss consolidation shows overlapping ...,Financial Report,Financial Report,The document discusses profit & loss consolida...,True
2,Annual audit memo contains both forward-lookin...,Financial Report,Financial Report,"The document mentions an annual audit memo, fo...",True
3,The quarterly review includes embedded legal d...,Legal Analysis,Legal Analysis,The document mentions legal disclaimers and wa...,True
4,Balance sheet reconciliations are challenged b...,Financial Report,Legal Analysis,The document mentions pending compliance litig...,True
