In [None]:
import os
import pandas as pd
import requests
import time

In [None]:
# Config
# The OpenRouter API key is taken from the OPENROUTER_API_KEY environment
# variable so the secret does not have to be written into (and committed with)
# this file. If the variable is unset or empty, the "YOUR_API_KEY" placeholder
# is used instead; you can still replace the placeholder by hand.
API_KEY = os.environ.get("OPENROUTER_API_KEY") or "YOUR_API_KEY"
API_URL = "https://openrouter.ai/api/v1/chat/completions"

# Set to True to run on only the first 2 prompts in the dataset.
# Set to False to run on the full dataset.
TEST_MODE = True

# List of models to benchmark.
MODELS_TO_BENCHMARK = [
    "meta-llama/llama-3.1-8b-instruct",
    "meta-llama/llama-3.1-70b-instruct",
    "mistralai/mistral-7b-instruct-v0.3",
    "mistralai/mixtral-8x22b-instruct",
    "qwen/qwen-2.5-7b-instruct",
    "qwen/qwen-2.5-72b-instruct",
]

INPUT_FILE = "dataset_final.csv"
OUTPUT_FILE = "benchmark_results_0_shot.csv"
REQUEST_TIMEOUT = 300  # Timeout for API requests in seconds

# System prompt we will send to each model.
system_prompt = (
    "You are an expert on UK government services. Your goal is to provide accurate and factual information."
)

# User prompt we will send to each model. "{question}" is filled in per request.
user_prompt_template = (
    "As a resident of the United Kingdom, I need some information about government services. "
    "Please provide a direct and factual answer to the following question. "
    "Do not include any conversational pleasantries in your response.\n\n"
    "Question: {question}"
)

# Sends a question to a specific model and returns its response.
def get_model_response(model_name, question):
    """Ask one model one question through the chat-completions endpoint.

    Args:
        model_name: Model identifier sent as the "model" field of the request.
        question: Question text substituted into ``user_prompt_template``.

    Returns:
        The model's reply with surrounding whitespace stripped, or the
        sentinel string "ERROR: No response" when the request fails or the
        reply cannot be read. Failures are printed rather than raised so a
        single bad call does not abort the whole run.
    """
    print(f"  - Querying model: {model_name}...")

    failure = "ERROR: No response"

    # Format the user prompt template with the actual question
    prompt = user_prompt_template.format(question=question)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    data = {
        "model": model_name,
        "messages": messages,
        "max_tokens": 1024,
    }
    headers = {"Authorization": f"Bearer {API_KEY}"}

    # Transport-level problems: connection errors, timeouts, non-2xx statuses
    # and bodies that are not valid JSON.
    try:
        response = requests.post(API_URL, headers=headers, json=data, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"    [ERROR] Failed to get response from {model_name}: {e}")
        return failure

    # A successful HTTP status can still carry an "error" object instead of
    # "choices"; report the API's own message rather than a bare KeyError.
    if isinstance(payload, dict) and payload.get("error"):
        print(f"    [ERROR] Failed to get response from {model_name}: API returned error: {payload['error']}")
        return failure

    try:
        content = payload["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as e:
        print(f"    [ERROR] Failed to get response from {model_name}: unexpected response format ({e!r})")
        return failure

    # The content field may be null/non-text; .strip() would fail on it.
    if not isinstance(content, str):
        print(f"    [ERROR] Failed to get response from {model_name}: response contained no text content")
        return failure

    return content.strip()
    
# Main function to run the benchmarking process.
def main():
    """Run every configured model over the questions and save the answers.

    Reads the 'prompt' column of INPUT_FILE, queries each model in
    MODELS_TO_BENCHMARK once per question, and writes a CSV to OUTPUT_FILE
    with a 'Question' column plus one 'Answer_<model>' column per model.
    Configuration problems (missing API key, missing input file, missing
    'prompt' column) are printed and cause an early return.
    """
    print("---- Starting Benchmarking ----")

    if TEST_MODE:
        print("RUNNING IN TEST MODE")

    if not API_KEY or API_KEY == "YOUR_API_KEY":
        print("FATAL ERROR: API_KEY is not set. Please edit the script and add your API key.")
        return

    if not os.path.exists(INPUT_FILE):
        print(f"FATAL ERROR: Input file not found at '{INPUT_FILE}'")
        return

    # Load the questions from the input CSV
    print(f"Loading questions from '{INPUT_FILE}'...")
    df = pd.read_csv(INPUT_FILE)

    if 'prompt' not in df.columns:
        print(f"FATAL ERROR: Input file '{INPUT_FILE}' has no 'prompt' column.")
        return

    # Empty cells are read as NaN (a float), which cannot be sliced for the
    # progress preview or sent as a question, so drop them. astype(str) also
    # covers cells that pandas parsed as numbers.
    prompts = df['prompt'].dropna().astype(str)
    skipped = len(df) - len(prompts)
    if skipped:
        print(f"Skipping {skipped} row(s) with an empty 'prompt' value.")

    # Logic for Test Mode
    if TEST_MODE:
        prompts = prompts.head(2)  # Get only the first two questions
    questions = prompts.tolist()

    print(f"Found {len(questions)} question(s) to benchmark.")

    # Create a new DataFrame to store results
    results_df = pd.DataFrame({'Question': questions})

    # Loop through each model and get its answers for all questions
    for model_id in MODELS_TO_BENCHMARK:
        print(f"\n--- Benchmarking Model: {model_id} ---")
        answers = []
        for i, question in enumerate(questions):
            print(f"  Processing question {i+1}/{len(questions)}: '{question[:60]}...'")
            answer = get_model_response(model_id, question)
            answers.append(answer)
            time.sleep(1)  # Be courteous to the API

        # Add the list of answers as a new column to the results DataFrame
        results_df[f"Answer_{model_id.replace('/', '_')}"] = answers

        # Checkpoint after every model so an interruption later in the run
        # does not discard the answers already collected.
        results_df.to_csv(OUTPUT_FILE, index=False)

    # Save the final results to a new CSV file
    print(f"\n--- Saving all results to '{OUTPUT_FILE}' ---")
    results_df.to_csv(OUTPUT_FILE, index=False)
    print("Benchmarking complete.")

# Run the benchmark only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

---- Starting Benchmarking ----
RUNNING IN TEST MODE
Loading questions from 'dataset_final.csv'...
Found 250 question(s) to benchmark.

--- Benchmarking Model: meta-llama/llama-3.1-8b-instruct ---
  Processing question 1/250: 'What types of concerns can I report as a whistleblower?...'
  - Querying model: meta-llama/llama-3.1-8b-instruct...
  Processing question 2/250: 'Can I still report a wrongdoing if it happened in the past?...'
  - Querying model: meta-llama/llama-3.1-8b-instruct...
  Processing question 3/250: 'Will I be protected if I report my personal grievance, like ...'
  - Querying model: meta-llama/llama-3.1-8b-instruct...
  Processing question 4/250: 'What does it mean if my employer is insolvent?...'
  - Querying model: meta-llama/llama-3.1-8b-instruct...
  Processing question 5/250: 'What are the different types of insolvency?...'
  - Querying model: meta-llama/llama-3.1-8b-instruct...
  Processing question 6/250: 'Can I apply for financial support if my employer is ins