In [1]:
import json
import re
import time
from typing import Dict, List, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

# Constants
# URL of the generate endpoint on the local (localhost) Ollama server; this is
# the address query_model() posts each prompt to.
OLLAMA_API = "http://localhost:11434/api/generate"
# Sent as the "model" field of every request payload built by query_model().
MODEL_NAME = "phi"

# Load prompts
def load_prompt(filename: str) -> str:
    """Read a prompt template file and return its full contents.

    Args:
        filename: Name of the file inside the ``../prompts`` directory
            (resolved relative to the current working directory).

    Returns:
        The file's text, unmodified.

    Raises:
        OSError: If the file does not exist or cannot be read.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding (which is not UTF-8 everywhere).
    with open(f"../prompts/{filename}", "r", encoding="utf-8") as f:
        return f.read()

# Load test cases
def load_test_cases() -> List[Dict]:
    """Load the evaluation problems from ``../evaluation/input_queries.json``.

    The path is resolved relative to the current working directory.

    Returns:
        The list stored under the top-level ``"test_cases"`` key of the file.

    Raises:
        OSError: If the file does not exist or cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the JSON object has no ``"test_cases"`` key.
    """
    # JSON text is UTF-8; decode it explicitly rather than relying on the
    # platform's default encoding.
    with open("../evaluation/input_queries.json", "r", encoding="utf-8") as f:
        data = json.load(f)
    return data["test_cases"]

# Initialize prompt templates
# Each file is read once, when this cell runs. run_evaluation() later substitutes
# a test case's problem text for the "[PROBLEM]" placeholder in each template.
zero_shot_template = load_prompt("zero_shot.txt")
few_shot_template = load_prompt("few_shot.txt")
cot_template = load_prompt("cot_prompt.txt")
meta_template = load_prompt("meta_prompt.txt")


In [2]:
def query_model(prompt: str, max_tokens: int = 1000, timeout: float = 300.0) -> str:
    """Send a prompt to the Ollama generate endpoint and return the generated text.

    Args:
        prompt: Full prompt text to send to the model.
        max_tokens: Upper bound on the number of tokens to generate. It is sent
            as ``options.num_predict``, which is the field Ollama's generate
            endpoint reads for this limit; a top-level ``max_tokens`` field is
            not part of that API.
        timeout: Seconds to wait for the HTTP request before giving up, so an
            unresponsive server cannot stall the whole evaluation run. Because
            streaming is disabled, this must be long enough to cover the full
            generation time.

    Returns:
        The text stored under the ``"response"`` key of the JSON reply, or an
        empty string if the request fails for any reason (the error is
        printed rather than raised, so a single failed call does not abort a
        batch run).
    """
    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False,
        "options": {"num_predict": max_tokens},
    }

    try:
        reply = requests.post(OLLAMA_API, json=payload, timeout=timeout)
        reply.raise_for_status()
        return reply.json()["response"]
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller
        # score the empty response.
        print(f"Error querying model: {e}")
        return ""

# Integers and decimals with an optional leading minus sign. The lookbehind
# stops the "-" in "5-3" from being read as a sign and skips digits that are
# glued to an identifier or follow a dot (e.g. "x2", the trailing "3" of "1.2.3").
_NUMBER_PATTERN = re.compile(r"(?<![\w.])-?\d+(?:\.\d+)?")


def _extract_numbers(text: str) -> List[float]:
    """Return every number found in ``text``, in order of appearance."""
    return [float(token) for token in _NUMBER_PATTERN.findall(text)]


def evaluate_response(
    response: str,
    expected: Union[float, List[float]],
    tolerance: float = 0.1,
) -> Dict:
    """Score a model response against the expected numerical answer.

    Scoring rules (simple heuristics):

    * Single expected value: the *last* number in the response is taken as the
      model's final answer, since earlier numbers in a worked solution are
      usually restated inputs or intermediate results.
    * List of expected values (e.g. both roots of a quadratic): every expected
      value must appear somewhere among the numbers in the response. Extra
      numbers in the response are ignored. An empty expected list never scores
      as correct.
    * A response containing no numbers is always scored as incorrect.

    Args:
        response: Raw text returned by the model.
        expected: The correct answer, or a list of correct answers.
        tolerance: Maximum absolute difference for a number to count as a match.

    Returns:
        A dict with ``"accuracy"`` (1 or 0), ``"reasoning_quality"`` (number of
        non-blank lines, capped at 5, divided by 5) and ``"response_length"``
        (character count). If scoring raises, the error is printed and both
        scores are 0.
    """
    try:
        numbers = _extract_numbers(response)

        if isinstance(expected, list):
            # Each expected value needs at least one matching number; with no
            # extracted numbers the inner any() is False, so an empty response
            # can never be scored as correct.
            is_correct = bool(expected) and all(
                any(abs(found - target) < tolerance for found in numbers)
                for target in expected
            )
        else:
            is_correct = bool(numbers) and abs(numbers[-1] - expected) < tolerance

        # More steps = better reasoning. Blank lines are not steps, so an
        # empty response scores 0 rather than 1/5.
        step_count = sum(1 for line in response.splitlines() if line.strip())
        reasoning_score = min(5, step_count) / 5

        return {
            "accuracy": 1 if is_correct else 0,
            "reasoning_quality": reasoning_score,
            "response_length": len(response)
        }
    except Exception as e:
        # Best-effort: e.g. a non-numeric expected answer in the test data
        # should not abort the whole evaluation run.
        print(f"Error evaluating response: {e}")
        return {
            "accuracy": 0,
            "reasoning_quality": 0,
            "response_length": len(response)
        }


In [None]:
# Run evaluation for each prompt type
def run_evaluation():
    """Query the model with every test case under every prompt template.

    For each test case, each template has its "[PROBLEM]" placeholder replaced
    by the case's problem text, the model is queried, and the reply is scored.
    There is a one-second pause after every query.

    Returns:
        A DataFrame with one row per (test case, prompt type) pair holding the
        case's id, category and difficulty, the prompt type, and the metrics
        returned by evaluate_response().
    """
    cases = load_test_cases()
    rows = []

    templates_by_strategy = dict(
        zero_shot=zero_shot_template,
        few_shot=few_shot_template,
        chain_of_thought=cot_template,
        meta_prompt=meta_template,
    )

    for test_case in cases:
        print(f"Evaluating problem {test_case['id']}...")

        for strategy, template_text in templates_by_strategy.items():
            # Fill in the template, ask the model, then score its reply
            filled_prompt = template_text.replace("[PROBLEM]", test_case["problem"])
            model_reply = query_model(filled_prompt)
            scores = evaluate_response(model_reply, test_case["expected_answer"])

            row = {
                "problem_id": test_case["id"],
                "category": test_case["category"],
                "difficulty": test_case["difficulty"],
                "prompt_type": strategy,
            }
            row.update(scores)
            rows.append(row)

            # Pause between requests so the API is not overwhelmed
            time.sleep(1)

    return pd.DataFrame(rows)

# Run evaluation
# Runs every test case against every prompt template (one model query plus a
# one-second pause per pair) and keeps the resulting DataFrame for the
# plotting and summary cell below.
results_df = run_evaluation()


Evaluating problem 1...


In [None]:
# Analyze results
def plot_results(df: pd.DataFrame):
    """Draw a 2x2 grid of bar charts summarising the evaluation results.

    Panels: accuracy by prompt type, reasoning quality by prompt type,
    accuracy by difficulty (split by prompt type), and accuracy by category
    (split by prompt type). The figure is shown with ``plt.show()``.

    Args:
        df: Results frame with the columns ``prompt_type``, ``accuracy``,
            ``reasoning_quality``, ``difficulty`` and ``category``.
    """
    # matplotlib 3.6 renamed its bundled 'seaborn' style sheet to
    # 'seaborn-v0_8' and newer releases no longer accept the old name, so use
    # whichever one this installation provides (and fall back to the default
    # style if neither exists).
    for style_name in ("seaborn-v0_8", "seaborn"):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # (axes, x column, y column, hue column, title) for each panel
    panels = [
        (axes[0, 0], 'prompt_type', 'accuracy', None,
         'Accuracy by Prompt Type'),
        (axes[0, 1], 'prompt_type', 'reasoning_quality', None,
         'Reasoning Quality by Prompt Type'),
        (axes[1, 0], 'difficulty', 'accuracy', 'prompt_type',
         'Accuracy by Difficulty and Prompt Type'),
        (axes[1, 1], 'category', 'accuracy', 'prompt_type',
         'Accuracy by Category and Prompt Type'),
    ]

    for ax, x_col, y_col, hue_col, title in panels:
        sns.barplot(data=df, x=x_col, y=y_col, hue=hue_col, ax=ax)
        ax.set_title(title)
        # Rotate via tick_params instead of re-setting the tick labels, which
        # leaves the tick locations and label text untouched.
        ax.tick_params(axis='x', labelrotation=45)

    plt.tight_layout()
    plt.show()

# Generate plots
plot_results(results_df)

# Print summary statistics
print("\nSummary Statistics:")
# accuracy is stored as 0/1 per response, so its mean is the fraction of
# responses scored correct for each prompt type.
print("\nAccuracy by Prompt Type:")
print(results_df.groupby('prompt_type')['accuracy'].mean())

# Mean of the line-count-based reasoning heuristic for each prompt type.
print("\nReasoning Quality by Prompt Type:")
print(results_df.groupby('prompt_type')['reasoning_quality'].mean())

# unstack() pivots prompt_type into columns: one row per difficulty level,
# one column per prompt type.
print("\nAccuracy by Difficulty:")
print(results_df.groupby(['difficulty', 'prompt_type'])['accuracy'].mean().unstack())
