In [None]:
from collections import defaultdict
from tqdm import tqdm
import rootutils
import os




def evaluate_prompts(dataset, client, system_prompt):
    """
    Evaluates different prompt strategies on the provided dataset.
    
    Every example is run once per entry in the module-level `prompt_functions`
    mapping; the answer extracted from the model output is compared with the
    answer extracted from the example's reference text.
    
    Parameters:
    - dataset: Iterable of dictionaries with 'question' and 'answer' keys.
    - client: The client object used by callGPT.
    - system_prompt: The system prompt to be used with callGPT.
    
    Returns:
    - accuracy_results: A dictionary mapping each prompt name to its accuracy
      (correct / total). Calls that raise are reported on stdout and are not
      counted, so a prompt whose every call failed has no entry.
    """
    # Tally locally so that each call starts from zero; accumulating into
    # shared module-level state would mix counts from earlier runs.
    metrics = defaultdict(lambda: {'correct': 0, 'total': 0})

    for example in tqdm(dataset, desc="Evaluating Prompts"):
        question = example.get('question', '').strip()
        true_answer_raw = example.get('answer', '')
        true_answer = clean_answer(extract_answer(true_answer_raw))
        
        # Skip examples without a question or without a '####'-delimited answer.
        if not question or not true_answer:
            continue
        
        for prompt_name, prompt_func in prompt_functions.items():
            try:
                prompt = prompt_func(question)
                model_output = callGPT(system_prompt, prompt, client)
                
                model_answer_raw = extract_answer(model_output)
                model_answer = clean_answer(model_answer_raw)
                
                metrics[prompt_name]['total'] += 1
                if model_answer == true_answer:
                    metrics[prompt_name]['correct'] += 1
            except Exception as e:
                # Best effort: report the failure and move on to the next prompt.
                print(f"Error with prompt '{prompt_name}' on question '{question}': {e}")
    
    accuracy_results = {}
    for prompt_name, data in metrics.items():
        if data['total'] > 0:
            accuracy_results[prompt_name] = data['correct'] / data['total']
        else:
            accuracy_results[prompt_name] = None  # No data for this prompt
    
    return accuracy_results

In [None]:

from concurrent.futures import ThreadPoolExecutor
import datetime
import json
from pathlib import Path
from typing import Any, Dict, List
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
from datetime import datetime
from pathlib import Path
import json
from tqdm import tqdm
from typing import Dict, Any, List
# Project-root bootstrap. `os` and `rootutils` are not imported in this cell;
# they come from the imports of an earlier cell, which must run first.
# Despite its name, `current_file` holds the current working directory.
current_file = os.getcwd()
# Find the project root identified by a ".project-root" marker, starting from the working directory.
root = rootutils.find_root(search_from=current_file, indicator=".project-root")
# NOTE(review): presumably pythonpath=True makes the `src.*` imports below resolvable — confirm against rootutils docs.
rootutils.setup_root(root, pythonpath=True)

from src.prompt_utils import getKojimaPrompt, getRegularPrompt, MATH_SYSTEM_PROMPT
from src.gpt_utils import callGPT

# Prompt builders under evaluation, keyed by the label used when reporting results.
prompt_functions = dict(
    Kojima=getKojimaPrompt,
    Regular=getRegularPrompt,
)

# Module-level tally: a label gets a fresh correct/total counter on first access.
metrics = defaultdict(lambda: dict(correct=0, total=0))

def extract_answer(text):
    """
    Return the final segment obtained by splitting `text` on '####',
    with surrounding whitespace removed.
    An empty string is returned when '####' does not occur in `text`.
    """
    marker = '####'
    if marker not in text:
        return ''
    # Only the last piece of the split is of interest.
    *_, tail = text.split(marker)
    return tail.strip()

def clean_answer(answer):
    """
    Normalize an extracted answer before comparison.
    Surrounding whitespace is removed first, then the text is lowercased.
    Extend here if the answer format needs further normalization.
    """
    trimmed = answer.strip()
    return trimmed.lower()

def process_example(args: tuple) -> Dict[str, Any]:
    """
    Run one dataset example through every registered prompt builder.

    `args` is an `(example, client, system_prompt)` tuple. The result holds the
    question, the raw reference answer, and one entry per prompt name under
    'prompt_results': either the prompt, model output, extracted answer and
    correctness flag, or an 'error' message when the call raised.

    Returns None when the question is empty or no reference answer could be
    extracted.
    """
    example, client, system_prompt = args

    question = example.get('question', '').strip()
    true_answer_raw = example.get('answer', '')
    true_answer = clean_answer(extract_answer(true_answer_raw))

    # Nothing to evaluate without both a question and a reference answer.
    if not (question and true_answer):
        return None

    per_prompt = {}
    for prompt_name, build_prompt in prompt_functions.items():
        try:
            prompt = build_prompt(question)
            reply = callGPT(system_prompt, prompt, client)
            extracted = extract_answer(reply)
            outcome = {
                'prompt': prompt,
                'model_output': reply,
                'extracted_answer': extracted,
                'is_correct': clean_answer(extracted) == true_answer,
            }
        except Exception as e:
            # Record the failure for this prompt and keep going with the others.
            print(f"Error with prompt '{prompt_name}' on question '{question}': {e}")
            outcome = {'error': str(e)}
        per_prompt[prompt_name] = outcome

    return {
        'question': question,
        'true_answer': true_answer_raw,
        'prompt_results': per_prompt,
    }

def evaluate_prompts(dataset: List[Dict], 
                    client, 
                    system_prompt: str,
                    output_path: str = "evaluation_results.json",
                    max_workers: int = 5) -> Dict[str, float]:
    """
    Score every prompt strategy over `dataset` using a thread pool and write a
    single JSON report.

    Each example is handed to `process_example` on a pool of `max_workers`
    threads. The report written to `output_path` contains run metadata, the
    per-example results, and the aggregated metrics. Parent directories of
    `output_path` are created if missing.

    Returns a dictionary mapping prompt name to accuracy (correct / total).
    Prompt entries without an 'is_correct' flag are left out of the totals.
    """
    tallies = defaultdict(lambda: {'correct': 0, 'total': 0})
    metadata = {
        'timestamp': datetime.now().isoformat(),
        'total_examples': len(dataset)
    }
    collected = []

    # One argument tuple per example, built up front before any work is submitted.
    process_args = [(example, client, system_prompt) for example in dataset]

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(process_example, job_args) for job_args in process_args]

        # Futures are read in submission order, so results keep the dataset order.
        for job in tqdm(pending, total=len(dataset), desc="Processing examples"):
            try:
                outcome = job.result()
                if outcome is None:
                    continue
                collected.append(outcome)

                for prompt_name, prompt_result in outcome['prompt_results'].items():
                    if 'is_correct' not in prompt_result:
                        continue
                    tallies[prompt_name]['total'] += 1
                    if prompt_result['is_correct']:
                        tallies[prompt_name]['correct'] += 1

            except Exception as e:
                print(f"Error processing example: {e}")

    # Accuracy per prompt; None when a prompt has no scored examples.
    accuracy_results = {
        prompt_name: (counts['correct'] / counts['total'] if counts['total'] > 0 else None)
        for prompt_name, counts in tallies.items()
    }

    all_results = {
        'metadata': metadata,
        'results': collected,
        'metrics': {
            'accuracy_results': accuracy_results,
            'detailed_metrics': {name: dict(counts) for name, counts in tallies.items()}
        }
    }

    # Persist the whole report as one JSON document.
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open('w', encoding='utf-8') as report_file:
        json.dump(all_results, report_file, ensure_ascii=False, indent=2)

    print(f"\nResults saved to: {output_path}")
    return accuracy_results

In [18]:
from src.gpt_utils import initialize_openai_model
from src.data_utils import CoTDataset

# Client object handed to evaluate_prompts in the cell that runs the evaluation.
client = initialize_openai_model()

# list(...) consumes the whole dataset before slicing; only the first 10 examples are kept.
dataset = list(CoTDataset("openai/gsm8k"))[:10]

Loaded dataset 'openai/gsm8k'.
Processed 'openai/gsm8k' dataset.


In [23]:
# Evaluate every prompt strategy on the loaded examples and write the JSON report.
results = evaluate_prompts(
    dataset=dataset,
    client=client,
    system_prompt=MATH_SYSTEM_PROMPT,
    output_path="evaluation_results.json",
    max_workers=5
)

print("\nAccuracy Results:")
for prompt_name, accuracy in results.items():
    # evaluate_prompts may report None for a prompt with no scored examples,
    # and the ".2%" format spec raises TypeError on None.
    shown = "n/a" if accuracy is None else f"{accuracy:.2%}"
    print(f"{prompt_name}: {shown}")

Processing examples: 100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


Results saved to: evaluation_results.json

Accuracy Results:
Kojima: 90.00%
Regular: 20.00%



