In [None]:
!pip install anthropic openai together backoff fuzzywuzzy python-levenshtein

In [None]:
import pandas as pd
import numpy as np
import time
import os
import re
from datetime import datetime
from fuzzywuzzy import fuzz
from concurrent.futures import ThreadPoolExecutor, as_completed
import backoff

import anthropic
import openai
from together import Together
from google.colab import userdata

In [None]:
# =========================================================================
# Configuration
# =========================================================================
# Persona
# Change this for different conditions (e.g., agreeableness, self-regulation personas from RQ3)
# See paper appendix for full persona variations
# Each persona set maps a condition name to a list of {"index", "content"}
# entries; "content" is appended to the system prompt of every request.
personas = {
    "baseline": [
        {
            "index": 1,
            "content": ""
        },
        {
            "index": 2,
            "content": "You are a helpful assistant."
        },
        {
            "index": 3,
            "content": "Respond to the instructions."
        }
    ]
}

# Sampling temperatures; every persona is run at each of them.
temperatures = [0.3, 0.7, 1.0]

# Number of repetitions per (persona, temperature) pair.
num_runs = 3

# How many questions are taken from each of the five difficulty bins.
questions_per_bin = 10

# Directory that receives all per-run and summary CSV files.
# exist_ok=True makes re-running this cell safe and avoids the
# check-then-create race of a separate os.path.exists() test.
output_dir = "confidence_results"
os.makedirs(output_dir, exist_ok=True)

# Models to evaluate. "provider" selects the API client used for the model
# and "max_tokens" caps the length of each completion.
model_configs = [
    {
        "name": "gpt-4o",
        "provider": "openai",
        "max_tokens": 32,
    }
]

# Request-level settings.
#   max_workers: size of the thread pool used for concurrent requests
#   batch_delay: pause in seconds between runs and between models
#   max_retries: NOTE(review): not read anywhere in this notebook; the retry
#                decorators on the call_*_api functions hard-code max_tries=3.
api_config = {
    "max_workers": 8,
    "batch_delay": 2.0,
    "max_retries": 3
}

In [3]:
# ============================================================================
# API Client Initialization
# ============================================================================

def initialize_clients():
    """Create one API client per supported provider from Colab secrets.

    Each provider is initialized independently, so a missing secret or a
    constructor failure for one provider does not prevent the others from
    being created. Failures are reported on stdout together with their cause.

    Returns:
        dict: Maps provider key ('anthropic', 'openai', 'together',
        'openrouter') to its client. Providers that failed to initialize are
        absent from the dict.
    """
    # (dict key, label used in messages, zero-argument factory). The factories
    # are lambdas so that the secret lookup happens inside the try block below.
    factories = [
        ('anthropic', "Anthropic",
         lambda: anthropic.Anthropic(api_key=userdata.get('Anthropic_key'))),
        ('openai', "OpenAI",
         lambda: openai.OpenAI(api_key=userdata.get('OpenAI_key'))),
        ('together', "Together AI",
         lambda: Together(api_key=userdata.get('Together_Key'))),
        ('openrouter', "OpenRouter",
         lambda: openai.OpenAI(
             base_url="https://openrouter.ai/api/v1",
             api_key=userdata.get('OpenRouter_key')
         )),
    ]

    clients = {}
    for key, label, factory in factories:
        try:
            clients[key] = factory()
            print(f"✓ {label} client initialized")
        except Exception as e:
            # `except Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate; the cause is printed so a missing or
            # misnamed secret is easy to diagnose.
            print(f"✗ Failed to initialize {label} client: {type(e).__name__}: {e}")

    return clients

In [4]:
# ============================================================================
# API Call Functions
# ============================================================================

@backoff.on_exception(backoff.expo, Exception, max_tries=3)
def call_anthropic_api(client, system_prompt, user_prompt, model_name, temperature=0.7, max_tokens=32):
    """Send one single-turn request through the Anthropic Messages API.

    The call is retried with exponential backoff (up to 3 tries) on any
    exception.

    Args:
        client: Anthropic client exposing ``messages.create``.
        system_prompt: System text; omitted from the request when it is empty
            or whitespace-only.
        user_prompt: Content of the single user message.
        model_name: Model identifier passed as ``model``.
        temperature: Sampling temperature.
        max_tokens: Completion length cap.

    Returns:
        str: Text of the first content block, stripped of surrounding whitespace.
    """
    request_kwargs = dict(
        model=model_name,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[{"role": "user", "content": user_prompt}],
    )

    # The system prompt is a top-level field here; it is only sent when it
    # contains something other than whitespace.
    if system_prompt.strip():
        request_kwargs["system"] = system_prompt

    reply = client.messages.create(**request_kwargs)
    first_block = reply.content[0]
    return first_block.text.strip()

@backoff.on_exception(backoff.expo, (openai.RateLimitError, openai.APIError), max_tries=3)
def call_openai_api(client, system_prompt, user_prompt, model_name, temperature=0.7, max_tokens=32):
    """Send one single-turn chat completion request through the OpenAI client.

    The call is retried with exponential backoff (up to 3 tries) on
    ``openai.RateLimitError`` and ``openai.APIError``.

    Args:
        client: OpenAI client exposing ``chat.completions.create``.
        system_prompt: System text; no system message is sent when it is empty
            or whitespace-only.
        user_prompt: Content of the user message.
        model_name: Model identifier passed as ``model``.
        temperature: Sampling temperature.
        max_tokens: Completion length cap.

    Returns:
        str: Content of the first choice, stripped of surrounding whitespace.
    """
    conversation = []
    if system_prompt.strip():
        conversation.append({"role": "system", "content": system_prompt})
    conversation.append({"role": "user", "content": user_prompt})

    completion = client.chat.completions.create(
        model=model_name,
        messages=conversation,
        temperature=temperature,
        max_tokens=max_tokens
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

@backoff.on_exception(backoff.expo, Exception, max_tries=3)
def call_together_api(client, system_prompt, user_prompt, model_name, temperature=0.7, max_tokens=32):
    """Send one single-turn chat completion request through the Together client.

    The call is retried with exponential backoff (up to 3 tries) on any
    exception.

    Args:
        client: Together client exposing ``chat.completions.create``.
        system_prompt: System text; no system message is sent when it is empty
            or whitespace-only.
        user_prompt: Content of the user message.
        model_name: Model identifier passed as ``model``.
        temperature: Sampling temperature.
        max_tokens: Completion length cap.

    Returns:
        str: Content of the first choice, stripped of surrounding whitespace.
    """
    system_part = (
        [{"role": "system", "content": system_prompt}]
        if system_prompt.strip()
        else []
    )
    conversation = system_part + [{"role": "user", "content": user_prompt}]

    completion = client.chat.completions.create(
        model=model_name,
        messages=conversation,
        temperature=temperature,
        max_tokens=max_tokens
    )
    reply_text = completion.choices[0].message.content
    return reply_text.strip()

@backoff.on_exception(backoff.expo, Exception, max_tries=3)
def call_openrouter_api(client, system_prompt, user_prompt, model_name, temperature=0.7, max_tokens=32):
    """Send one single-turn chat completion request through the OpenRouter client.

    The call is retried with exponential backoff (up to 3 tries) on any
    exception. Two extra HTTP headers (referer and title) are attached to the
    request.

    Args:
        client: Client exposing ``chat.completions.create``.
        system_prompt: System text; no system message is sent when it is empty
            or whitespace-only.
        user_prompt: Content of the user message.
        model_name: Model identifier passed as ``model``.
        temperature: Sampling temperature.
        max_tokens: Completion length cap.

    Returns:
        str: Content of the first choice, stripped of surrounding whitespace.
    """
    attribution_headers = {
        "HTTP-Referer": "https://yoursite.com",
        "X-Title": "Self-Regulation Research"
    }

    user_turn = {"role": "user", "content": user_prompt}
    if system_prompt.strip():
        conversation = [{"role": "system", "content": system_prompt}, user_turn]
    else:
        conversation = [user_turn]

    completion = client.chat.completions.create(
        extra_headers=attribution_headers,
        model=model_name,
        messages=conversation,
        temperature=temperature,
        max_tokens=max_tokens
    )
    return completion.choices[0].message.content.strip()

def route_api_call(clients, request):
    """Dispatch one request dict to the call function of its provider.

    Only the keyword arguments accepted by the ``call_*_api`` functions are
    forwarded. Request dicts may carry additional bookkeeping keys (for
    example 'step' and 'question_id'); forwarding those would make the call
    fail with ``TypeError: unexpected keyword argument``.

    Args:
        clients: Dict mapping provider key to an initialized client.
        request: Dict with a 'provider' key plus any of 'system_prompt',
            'user_prompt', 'model_name', 'temperature', 'max_tokens'.
            The dict is not modified.

    Returns:
        The value returned by the provider's call function.

    Raises:
        ValueError: If the provider is not one of the supported providers.
        KeyError: If ``clients`` has no client for the requested provider.
    """
    provider = request['provider']

    handlers = {
        'anthropic': call_anthropic_api,
        'openai': call_openai_api,
        'together': call_together_api,
        'openrouter': call_openrouter_api,
    }
    if provider not in handlers:
        raise ValueError(f"Unknown provider: {provider}")

    if provider not in clients:
        # Happens when the provider's client failed to initialize.
        raise KeyError(f"No initialized client for provider '{provider}'")

    # Whitelist rather than blacklist: metadata keys never reach the API call.
    accepted = ('system_prompt', 'user_prompt', 'model_name', 'temperature', 'max_tokens')
    call_kwargs = {key: request[key] for key in accepted if key in request}

    return handlers[provider](clients[provider], **call_kwargs)

def process_batch_requests(clients, batch_requests, max_workers=8):
    """Run a list of API requests on a thread pool and collect the responses.

    Args:
        clients: Dict of provider clients handed to ``route_api_call``.
        batch_requests: List of request dicts.
        max_workers: Thread pool size.

    Returns:
        list: One entry per request, in the order of ``batch_requests``. A
        request that raised is represented by the string "ERROR".
    """
    total = len(batch_requests)
    results = [None] * total

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which slot each future belongs to, since completion order
        # is arbitrary.
        slot_of = {}
        for slot, request in enumerate(batch_requests):
            slot_of[pool.submit(route_api_call, clients, request)] = slot

        succeeded = 0
        for finished in as_completed(slot_of):
            slot = slot_of[finished]
            try:
                outcome = finished.result()
            except Exception as e:
                print(f"Request {slot} failed: {e}")
                results[slot] = "ERROR"
                continue

            results[slot] = outcome
            succeeded += 1
            # Progress is reported every tenth successful request.
            if succeeded % 10 == 0:
                print(f"Completed {succeeded}/{total} requests")

    return results

In [None]:
# =========================================================================
# Confidence Calibration Experiment Class
# =========================================================================

class ConfidenceCalibrationExperiment:
    """Two-step confidence calibration experiment driven through provider APIs.

    Step 1 asks each trivia question and requests a one-word answer plus a
    0-10 confidence (C1). Step 2 shows the model the question together with
    its own answer and asks for a confidence again (C2).

    Confidences are stored on a 0-100 scale (the 0-10 reply times 10). The
    value -1 is a sentinel meaning "no valid confidence could be extracted".
    """

    def __init__(self, questions_file: str = "norm300.csv"):
        """Load the question set from ``questions_file``."""
        self.questions = self._load_questions(questions_file)

    def _load_questions(self, questions_file: str):
        """Read the questions CSV and pick the first N questions of bins 1-5.

        N is the module-level ``questions_per_bin``. The CSV must provide the
        columns 'Bin', 'ItemNumber', 'Question' and 'Answer'.

        Returns:
            list[dict]: One dict per selected question with keys 'id'
            (1-based running number), 'original_item_number',
            'difficulty_bin', 'question' and 'correct_answer' (upper-cased).
            An empty list is returned, after printing the error, when the file
            cannot be read or parsed.
        """
        try:
            df = pd.read_csv(questions_file)

            selected_questions = []
            for bin_num in range(1, 6):
                bin_questions = df[df['Bin'] == bin_num].head(questions_per_bin)
                for _, row in bin_questions.iterrows():
                    question_data = {
                        "id": len(selected_questions) + 1,
                        "original_item_number": row['ItemNumber'],
                        "difficulty_bin": row['Bin'],
                        "question": row['Question'],
                        # str() guards against answers that pandas parsed as
                        # numbers; calling .strip() on those would raise and
                        # make the whole load fall through to the empty list.
                        "correct_answer": str(row['Answer']).strip().upper()
                    }
                    selected_questions.append(question_data)

            num_bins_loaded = len({q['difficulty_bin'] for q in selected_questions})
            print(f"Loaded {len(selected_questions)} questions across {num_bins_loaded} difficulty bins")
            return selected_questions

        except Exception as e:
            print(f"Error loading questions from {questions_file}: {e}")
            return []

    def _build_step1_prompt(self, question):
        """Build the Step 1 prompt: answer the question and state a confidence.

        Returns:
            dict: {'system': '', 'user': <prompt text>}.
        """
        system_prompt = ""
        user_prompt = f"""
        Question: {question["question"]}

        Please answer this question in one word, then give your confidence on a scale of 0-10, where 0 means you're completely unsure (random guess), and 10 means you're 100% absolutely certain.

        Your answer should be in this format:
        Answer: [ONE WORD]
        Confidence: [NUMBER 0-10]
        Strictly follow this format. Don't include any additional words or explanations.
        """
        return {"system": system_prompt, "user": user_prompt}


    def _build_step2_prompt(self, question, answer):
        """Build the Step 2 prompt: rate confidence in a given answer.

        Returns:
            dict: {'system': '', 'user': <prompt text>}.
        """
        system_prompt = ""
        user_prompt = f"""
        Question: {question["question"]}
        Answer: {answer}

        On a scale of 0-10, how confident are you about this answer? 0 means you're completely unsure (random guess), and 10 means you're 100% absolutely certain.

        Only answer with a number between 0 to 10 without additional words.
        If you think you don't know, just answer 0. But please DO MAKE SURE your answer is a single number between 0 to 10.
        """
        return {"system": system_prompt, "user": user_prompt}

    def _extract_answer_and_confidence(self, response):
        """Parse a Step 1 response into (answer, confidence).

        The answer is the first word after "answer:" (upper-cased, punctuation
        removed). Without an "answer:" label, the first word longer than one
        character that is not a stop word is used, or "NO_ANSWER".

        The confidence is the integer after "confidence:" if it lies in 0-10,
        times 10. Without a "confidence:" label, the last standalone integer
        in 0-10 found in the response is used.

        Returns:
            tuple: (answer, confidence) with confidence in 0-100, or -1 when
            no valid confidence was found. ("ERROR", -1) is returned when the
            response cannot be processed at all (for example it is not a
            string).
        """
        try:
            response_lower = response.lower()

            answer_match = re.search(r'answer:\s*([^\n\r]+)', response_lower)
            if answer_match:
                answer = answer_match.group(1).strip().upper()
                answer = re.sub(r'[^\w\s]', '', answer).strip()
                answer = answer.split()[0] if answer.split() else "NO_ANSWER"
            else:
                words = response.split()
                answer = "NO_ANSWER"
                for word in words:
                    clean_word = re.sub(r'[^\w]', '', word).upper()
                    if len(clean_word) > 1 and clean_word not in ['ANSWER', 'CONFIDENCE', 'THE', 'IS', 'ARE']:
                        answer = clean_word
                        break

            confidence_match = re.search(r'confidence:\s*(\d+)', response_lower)
            if confidence_match:
                confidence = int(confidence_match.group(1))
                if 0 <= confidence <= 10:
                    confidence = confidence * 10
                else:
                    confidence = -1
            else:
                numbers = []
                for word in response.split():
                    word = word.strip('.:,;()[]{}')
                    try:
                        num = int(word)
                        if 0 <= num <= 10:
                            numbers.append(num * 10)
                    except ValueError:
                        continue
                confidence = numbers[-1] if numbers else -1

            return answer, confidence

        except Exception as e:
            print(f"Error extracting answer and confidence: {e}")
            # -1 (not 0) so a processing failure is never mistaken for a
            # genuine "0% confident" reply.
            return "ERROR", -1

    def _extract_confidence(self, response):
        """Parse a Step 2 response into a confidence on the 0-100 scale.

        Accepts a bare number in 0-10, otherwise the first run of digits in
        the response if it lies in 0-10. A response without a usable number
        that contains an "I don't know"-style phrase counts as 0.

        Returns:
            int: Confidence in 0-100, or -1 when nothing valid was found or
            the response could not be processed.
        """
        try:
            response = response.strip()
            if response.isdigit() and 0 <= int(response) <= 10:
                return int(response) * 10

            confidence_match = re.search(r'(\d+)', response)
            if confidence_match:
                confidence = int(confidence_match.group(1))
                if 0 <= confidence <= 10:
                    return confidence * 10

            if any(phrase in response.lower() for phrase in ["can't answer", "cannot answer", "don't know", "unsure"]):
                return 0

            return -1

        except Exception as e:
            print(f"Error extracting confidence: {e}")
            # Same sentinel as every other parse failure in this class.
            return -1

    def _check_answer_correctness(self, model_answer: str, correct_answer: str):
        """Score an answer by exact match and by fuzzy ratio thresholds.

        Returns:
            dict: {'em', 'fuzzy95', 'fuzzy90'} booleans. All False when the
            answer is empty, "NO_ANSWER" or "ERROR". Comparison is
            case-insensitive; the fuzzy flags use ``fuzz.ratio`` >= 95 / >= 90.
        """
        if not model_answer or model_answer in ["NO_ANSWER", "ERROR"]:
            return {"em": False, "fuzzy95": False, "fuzzy90": False}

        is_correct_em = model_answer.upper() == correct_answer.upper()

        fuzzy_score = fuzz.ratio(model_answer.upper(), correct_answer.upper())
        is_correct_fuzzy95 = fuzzy_score >= 95
        is_correct_fuzzy90 = fuzzy_score >= 90

        return {
            "em": is_correct_em,
            "fuzzy95": is_correct_fuzzy95,
            "fuzzy90": is_correct_fuzzy90
        }

    def calculate_calibration_metrics(self, results_df):
        """Compute ECE and Brier score for C1, overall and per difficulty bin.

        Rows whose confidence is outside 0-100 (the -1 sentinel or NaN) are
        excluded from both metrics; they carry no usable confidence.

        Args:
            results_df: DataFrame with columns 'confidence_c1' (0-100),
                'is_correct_em' (0/1) and 'difficulty_bin'.

        Returns:
            tuple: (overall_ece, overall_brier, bin_metrics). ECE and Brier
            are fractions in 0-1, or NaN when no row has a valid confidence.
            ``bin_metrics`` maps 'ece_bin{k}' / 'brier_bin{k}' to the values
            for each difficulty bin 1-5 that has at least one row.
        """
        def keep_valid(confidences, accuracies):
            confidences = np.asarray(confidences, dtype=float)
            accuracies = np.asarray(accuracies, dtype=float)
            valid = (confidences >= 0) & (confidences <= 100)
            return confidences[valid], accuracies[valid]

        def calculate_ece(confidences, accuracies, num_bins=10):
            confidences, accuracies = keep_valid(confidences, accuracies)
            if confidences.size == 0:
                return float('nan')

            bin_boundaries = np.linspace(0, 100, num_bins + 1)
            bin_lowers = bin_boundaries[:-1]
            bin_uppers = bin_boundaries[1:]

            ece = 0.0
            for position, (bin_lower, bin_upper) in enumerate(zip(bin_lowers, bin_uppers)):
                in_bin = (confidences > bin_lower) & (confidences <= bin_upper)
                if position == 0:
                    # Bins are (lower, upper]; close the first one on the left
                    # so that a confidence of exactly 0 is not dropped.
                    in_bin |= confidences == bin_lower
                prop_in_bin = in_bin.mean()

                if prop_in_bin > 0:
                    accuracy_in_bin = accuracies[in_bin].mean()
                    avg_confidence_in_bin = confidences[in_bin].mean()
                    ece += np.abs(avg_confidence_in_bin/100 - accuracy_in_bin) * prop_in_bin

            return ece

        def calculate_brier(confidences, accuracies):
            confidences, accuracies = keep_valid(confidences, accuracies)
            if confidences.size == 0:
                return float('nan')
            return np.mean((confidences/100 - accuracies) ** 2)

        overall_ece = calculate_ece(results_df['confidence_c1'].values, results_df['is_correct_em'].values)
        overall_brier = calculate_brier(results_df['confidence_c1'].values, results_df['is_correct_em'].values)

        bin_metrics = {}
        for bin_num in range(1, 6):
            bin_data = results_df[results_df['difficulty_bin'] == bin_num]
            if len(bin_data) > 0:
                bin_ece = calculate_ece(bin_data['confidence_c1'].values, bin_data['is_correct_em'].values)
                bin_brier = calculate_brier(bin_data['confidence_c1'].values, bin_data['is_correct_em'].values)
                bin_metrics[f'ece_bin{bin_num}'] = bin_ece
                bin_metrics[f'brier_bin{bin_num}'] = bin_brier

        return overall_ece, overall_brier, bin_metrics

    def _build_request(self, prompts, model_config, persona_content, step, question_id):
        """Assemble the request dict for one question and one step."""
        return {
            # NOTE(review): the step prompts use an empty system text, so the
            # system prompt sent is "\n" + persona_content (leading newline).
            # Kept as is to leave the experimental prompts unchanged.
            'system_prompt': prompts["system"] + "\n" + persona_content,
            'user_prompt': prompts["user"],
            'model_name': model_config['name'],
            'provider': model_config['provider'],
            'temperature': model_config.get('temperature', 0.7),
            'max_tokens': model_config.get('max_tokens', 32),
            'step': step,
            'question_id': question_id
        }

    def run_experiment_api(self, model_config, persona_content):
        """Run both steps for every loaded question and return per-question rows.

        Args:
            model_config: Dict with 'name' and 'provider', optionally
                'temperature' (default 0.7) and 'max_tokens' (default 32).
            persona_content: Persona text appended to the system prompt.

        Returns:
            pandas.DataFrame: One row per question with the parsed answer,
            correctness flags (0/1), C1 and C2 on the 0-100 scale (-1 when
            unparseable), derived calibration/consistency values and both raw
            responses.
        """
        results = []
        clients = initialize_clients()

        print(f"Running experiment with {model_config['name']} via {model_config['provider']} API")

        batch_requests = [
            self._build_request(self._build_step1_prompt(question), model_config,
                                persona_content, 1, question['id'])
            for question in self.questions
        ]

        print(f"Processing Step 1: {len(batch_requests)} questions...")
        step1_responses = process_batch_requests(clients, batch_requests, max_workers=api_config['max_workers'])

        step2_requests = []
        step1_results = {}

        for question, response in zip(self.questions, step1_responses):
            answer, confidence_c1 = self._extract_answer_and_confidence(response)
            step1_results[question['id']] = {
                'answer': answer,
                'confidence_c1': confidence_c1,
                'raw_response1': response
            }

            # Step 2 is asked about the answer parsed from Step 1.
            step2_prompts = self._build_step2_prompt(question, answer)
            step2_requests.append(
                self._build_request(step2_prompts, model_config, persona_content, 2, question['id'])
            )

        print(f"Processing Step 2: {len(step2_requests)} questions...")
        step2_responses = process_batch_requests(clients, step2_requests, max_workers=api_config['max_workers'])

        for i, (question, step2_response) in enumerate(zip(self.questions, step2_responses)):
            step1_data = step1_results[question['id']]
            answer = step1_data['answer']
            confidence_c1 = step1_data['confidence_c1']
            confidence_c2 = self._extract_confidence(step2_response)

            # Echo the first three questions as a quick sanity check.
            if i < 3:
                print(f"Q{question['id']}: {answer} (correct: {question['correct_answer']}) - C1: {confidence_c1}, C2: {confidence_c2}")

            correctness = self._check_answer_correctness(answer, question["correct_answer"])
            is_correct_em = 1 if correctness["em"] else 0
            is_correct_fuzzy95 = 1 if correctness["fuzzy95"] else 0
            is_correct_fuzzy90 = 1 if correctness["fuzzy90"] else 0

            result = {
                "model_name": model_config['name'],
                "provider": model_config['provider'],
                "temperature": model_config.get('temperature', 0.7),
                "question_id": question["id"],
                "original_item_number": question["original_item_number"],
                "difficulty_bin": question["difficulty_bin"],
                "question_text": question["question"],
                "answer": answer,
                "correct_answer": question["correct_answer"],
                "is_correct_em": is_correct_em,
                "is_correct_fuzzy95": is_correct_fuzzy95,
                "is_correct_fuzzy90": is_correct_fuzzy90,
                "confidence_c1": confidence_c1,
                "confidence_c2": confidence_c2,
                # NOTE: both derived values are computed from the raw
                # confidences, so they are not meaningful for rows where C1 or
                # C2 is the -1 sentinel.
                "c1_calibration": confidence_c1/100 - is_correct_em,
                "c1_c2_consistency": abs(confidence_c1 - confidence_c2)/100,
                "raw_response1": step1_data['raw_response1'],
                "raw_response2": step2_response
            }
            results.append(result)

        return pd.DataFrame(results)

In [None]:
# =========================================================================
# Run API Experiments
# =========================================================================

def _summarize_run(experiment, results_df, model_name, provider, persona_index,
                   persona_content, temperature, run):
    """Collapse the per-question rows of one run into a flat summary dict."""
    # NOTE: confidence means are taken over the raw column, so rows holding
    # the -1 "unparseable" sentinel are included in these averages.
    overconfidence = results_df['confidence_c1']/100 - results_df['is_correct_em']
    overall_ece, overall_brier, bin_metrics = experiment.calculate_calibration_metrics(results_df)

    summary = {
        'model': model_name,
        'provider': provider,
        'persona_index': persona_index,
        'persona_content': persona_content,
        'temperature': temperature,
        'run': run,
        'accuracy_em_overall': results_df['is_correct_em'].mean() * 100,
        'accuracy_fuzzy95_overall': results_df['is_correct_fuzzy95'].mean() * 100,
        'accuracy_fuzzy90_overall': results_df['is_correct_fuzzy90'].mean() * 100,
        'confidence_c1_overall': results_df['confidence_c1'].mean(),
        'confidence_c2_overall': results_df['confidence_c2'].mean(),
        'overconfidence_overall': overconfidence.mean() * 100,
        'consistency_overall': results_df['c1_c2_consistency'].mean() * 100,
        'ece_overall': overall_ece * 100,
        'brier_overall': overall_brier,
        'timestamp': datetime.now().isoformat()
    }

    # Per-bin values are collected per metric and merged metric by metric so
    # the summary columns stay grouped by metric rather than by bin.
    bin_accuracy_em = {}
    bin_accuracy_fuzzy95 = {}
    bin_accuracy_fuzzy90 = {}
    bin_confidence_c1 = {}
    bin_overconfidence = {}

    for bin_num in range(1, 6):
        bin_data = results_df[results_df['difficulty_bin'] == bin_num]
        if len(bin_data) > 0:
            bin_accuracy_em[f'accuracy_em_bin{bin_num}'] = bin_data['is_correct_em'].mean() * 100
            bin_accuracy_fuzzy95[f'accuracy_fuzzy95_bin{bin_num}'] = bin_data['is_correct_fuzzy95'].mean() * 100
            bin_accuracy_fuzzy90[f'accuracy_fuzzy90_bin{bin_num}'] = bin_data['is_correct_fuzzy90'].mean() * 100
            bin_confidence_c1[f'confidence_c1_bin{bin_num}'] = bin_data['confidence_c1'].mean()
            bin_overconfidence[f'overconfidence_bin{bin_num}'] = (bin_data['confidence_c1']/100 - bin_data['is_correct_em']).mean() * 100

    for per_bin in (bin_accuracy_em, bin_accuracy_fuzzy95, bin_accuracy_fuzzy90,
                    bin_confidence_c1, bin_overconfidence, bin_metrics):
        summary.update(per_bin)

    return summary


def _print_run_report(summary):
    """Print the headline numbers of one run summary."""
    print(f"\n--- Results for {summary['model']}, persona {summary['persona_index']}, temp={summary['temperature']}, run={summary['run']} ---")
    print(f"Overall Accuracy (EM): {summary['accuracy_em_overall']:.1f}%")
    print(f"Overall Accuracy (Fuzzy95): {summary['accuracy_fuzzy95_overall']:.1f}%")
    print(f"Overall Accuracy (Fuzzy90): {summary['accuracy_fuzzy90_overall']:.1f}%")
    print(f"Overall Confidence (C1): {summary['confidence_c1_overall']:.1f}%")
    print(f"Overall Overconfidence: {summary['overconfidence_overall']:.1f}%")
    print(f"Overall ECE: {summary['ece_overall']:.1f}%")
    print(f"Overall Brier Score: {summary['brier_overall']:.3f}")
    print(f"Overall Consistency (C1-C2): {summary['consistency_overall']:.1f}%")

    for bin_num in range(1, 6):
        bin_acc = summary.get(f'accuracy_em_bin{bin_num}', 0)
        print(f"Bin {bin_num} Accuracy (EM): {bin_acc:.1f}%")


def run_model_experiment_api(model_config, persona_type='baseline'):
    """Run every persona x temperature x repetition configuration for one model.

    Each successful configuration is written to its own CSV in ``output_dir``.
    When at least one configuration succeeded, a summary CSV and a CSV with
    all per-question rows are written as well; this also happens when the run
    is cut short, so finished work is not lost.

    Args:
        model_config: Dict with 'name' and 'provider' (plus optional request
            settings). It is copied per run; the caller's dict is not modified.
        persona_type: Key into the module-level ``personas`` dict.

    Returns:
        dict | None: {'summary_df': DataFrame, 'all_df': DataFrame} or None
        when no configuration succeeded.

    Raises:
        KeyError: If ``persona_type`` is not a key of ``personas``.
        KeyboardInterrupt / SystemExit: Propagated after partial results have
            been saved.
    """
    model_name = model_config['name'].replace('/', '_')
    print(f"\n{'='*50}")
    print(f"Starting {model_name} Confidence Calibration Experiment (API)")
    print(f"{'='*50}")

    all_results = []
    summary_data = []
    summary_df = None
    all_df = None

    persona_list = personas[persona_type]
    total_configs = len(persona_list) * len(temperatures) * num_runs
    config_count = 0

    try:
        # The experiment only holds the loaded questions, so one instance is
        # shared by all configurations instead of re-reading the CSV each time.
        experiment = ConfidenceCalibrationExperiment()

        for persona_data in persona_list:
            persona_index = persona_data["index"]
            persona_content = persona_data["content"]

            for temperature in temperatures:
                for run in range(1, num_runs + 1):
                    config_count += 1
                    print(f"\nProgress: {config_count}/{total_configs} configurations")
                    print(f"Running with persona '{persona_index}' at temperature {temperature} (run {run})...")

                    try:
                        model_config_run = model_config.copy()
                        model_config_run['temperature'] = temperature

                        results_df = experiment.run_experiment_api(model_config_run, persona_content)

                        results_df['persona_index'] = persona_index
                        results_df['persona_content'] = persona_content
                        results_df['run'] = run
                        results_df['timestamp'] = datetime.now().isoformat()

                        summary = _summarize_run(
                            experiment, results_df, model_name, model_config['provider'],
                            persona_index, persona_content, temperature, run
                        )

                        all_results.append(results_df)
                        summary_data.append(summary)

                        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                        results_df.to_csv(
                            f"{output_dir}/confidence_api_{model_name}_persona{persona_index}_temp{temperature}_run{run}_{timestamp}.csv",
                            index=False
                        )

                        _print_run_report(summary)

                        time.sleep(api_config["batch_delay"])

                    except Exception as e:
                        # One failed configuration must not abort the others.
                        print(f"Error running configuration: {e}")
                        continue

    except Exception as e:
        print(f"Error during experiment: {e}")

    finally:
        # Persist whatever finished. There is deliberately no `return` here:
        # returning from `finally` would swallow KeyboardInterrupt/SystemExit
        # and let the caller carry on with the next model.
        if summary_data:
            summary_df = pd.DataFrame(summary_data)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            summary_df.to_csv(
                f"{output_dir}/confidence_api_summary_{model_name}_{timestamp}.csv",
                index=False
            )

            if all_results:
                all_df = pd.concat(all_results, ignore_index=True)
                all_df.to_csv(
                    f"{output_dir}/confidence_api_all_results_{model_name}_{timestamp}.csv",
                    index=False
                )

    if not summary_data:
        print("No successful experiments to analyze")
        return None

    print("\n===== RESULTS SUMMARY =====")
    display_cols = ['model', 'provider', 'persona_index', 'temperature', 'run', 'accuracy_em_overall', 'accuracy_fuzzy95_overall', 'accuracy_fuzzy90_overall', 'overconfidence_overall', 'ece_overall', 'brier_overall']
    print(summary_df[display_cols])

    return {
        'summary_df': summary_df,
        'all_df': all_df
    }

def run_all_models_api(model_configs, persona_type='baseline'):
    """Run the experiment for each model and merge the per-model summaries.

    Args:
        model_configs: Iterable of model configuration dicts.
        persona_type: Persona set name passed through to each model run.

    Returns:
        pandas.DataFrame | None: Concatenated summary rows of all models that
        produced results (also written to a timestamped CSV in
        ``output_dir``), or None when no model produced a summary.
    """
    collected = []

    for cfg in model_configs:
        outcome = run_model_experiment_api(cfg, persona_type)
        if outcome and 'summary_df' in outcome:
            collected.append(outcome['summary_df'])

        # Pause between models, whether or not the last one succeeded.
        time.sleep(api_config["batch_delay"])

    if not collected:
        print("No successful experiments to combine")
        return None

    combined_df = pd.concat(collected, ignore_index=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    combined_df.to_csv(f"{output_dir}/confidence_api_all_models_summary_{stamp}.csv", index=False)

    print(f"\nCombined results for {len(model_configs)} models saved")
    return combined_df

In [None]:
# =========================================================================
# Main Execution
# =========================================================================

def run_multi_models_api():
    """Entry point: run all configured models and show the combined summary.

    Uses the module-level ``model_configs``. When results are available the
    headline columns are rendered with IPython's ``display`` and a short
    completion report is printed; otherwise a hint to check earlier errors is
    printed.
    """
    print("Starting Confidence Calibration API Experiments")
    print("=" * 60)

    combined_results = run_all_models_api(model_configs)

    if combined_results is None:
        print("No results to display - check for errors above")
        return

    display_cols = ['model', 'provider', 'persona_index', 'temperature', 'accuracy_em_overall', 'accuracy_fuzzy95_overall', 'accuracy_fuzzy90_overall', 'overconfidence_overall', 'ece_overall', 'brier_overall']
    from IPython.display import display
    display(combined_results[display_cols])

    n_models = combined_results['model'].nunique()
    n_providers = combined_results['provider'].nunique()
    print("\nExperiment completed!")
    print(f"Total configurations: {len(combined_results)}")
    print(f"Models tested: {n_models}")
    print(f"Providers used: {n_providers}")

In [None]:
# =========================================================================
# Main Experiments
# =========================================================================
# Print the prerequisites, then launch the full experiment sweep.
# The secret names listed here must match the names that initialize_clients()
# reads from Colab Secrets.
print("=== Confidence Calibration Experiment - API Version ===")
print("This version uses API calls instead of local models for faster execution")
print("Make sure you have:")
print("1. Uploaded norm300.csv file")
print("2. Set up API keys in Colab Secrets:")
print("   - Anthropic_key (for Claude)")
print("   - OpenAI_key (for GPT models)")
print("   - Together_Key (for Llama models)")
print("   - OpenRouter_key (for models served via OpenRouter)")
print("\nStarting experiments...")

run_multi_models_api()

=== Confidence Calibration Experiment - API Version ===
This version uses API calls instead of local models for faster execution
Make sure you have:
1. Uploaded norm300.csv file
2. Set up API keys in Colab Secrets:
   - Anthropic_key (for Claude)
   - OpenAI_key (for GPT models)
   - Together-Key (for Llama models)

Starting experiments...
Starting Confidence Calibration API Experiments

Starting claude-3-7-sonnet-20250219 Confidence Calibration Experiment (API)

Progress: 1/27 configurations
Running with persona '1' at temperature 0.3 (run 1)...
Loaded 50 questions across 5 difficulty bins
Running experiment with claude-3-7-sonnet-20250219 via anthropic API
Processing Step 1: 50 questions...
✓ Anthropic client initialized successfully
✓ OpenAI client initialized successfully
✓ Together AI client initialized successfully
✓ OpenRouter client initialized successfully
Completed 10/50 API calls
Completed 20/50 API calls
Completed 30/50 API calls
Completed 40/50 API calls
Completed 50/50 AP

Unnamed: 0,model,provider,persona_index,temperature,accuracy_em_overall,accuracy_fuzzy95_overall,accuracy_fuzzy90_overall,overconfidence_overall,ece_overall,brier_overall
0,claude-3-7-sonnet-20250219,anthropic,1,0.3,48.0,48.0,48.0,18.6,18.6,0.183
1,claude-3-7-sonnet-20250219,anthropic,1,0.3,48.0,48.0,48.0,19.2,20.0,0.1944
2,claude-3-7-sonnet-20250219,anthropic,1,0.3,48.0,48.0,48.0,19.4,19.4,0.1914
3,claude-3-7-sonnet-20250219,anthropic,1,0.7,48.0,48.0,48.0,19.8,19.8,0.1902
4,claude-3-7-sonnet-20250219,anthropic,1,0.7,48.0,48.0,48.0,17.2,17.2,0.1676
5,claude-3-7-sonnet-20250219,anthropic,1,0.7,48.0,48.0,48.0,18.8,18.8,0.1808
6,claude-3-7-sonnet-20250219,anthropic,1,1.0,48.0,48.0,48.0,17.8,19.0,0.185
7,claude-3-7-sonnet-20250219,anthropic,1,1.0,48.0,48.0,48.0,20.0,20.0,0.19
8,claude-3-7-sonnet-20250219,anthropic,1,1.0,50.0,50.0,50.0,20.2,20.2,0.1966
9,claude-3-7-sonnet-20250219,anthropic,2,0.3,48.0,48.0,48.0,14.4,16.4,0.1412



Experiment completed!
Total configurations: 27
Models tested: 1
Providers used: 1
