In [1]:
# Install required packages
%pip install openai anthropic datasets pandas numpy matplotlib seaborn

Collecting openai
  Using cached openai-1.97.0-py3-none-any.whl.metadata (29 kB)
Collecting anthropic
  Using cached anthropic-0.58.2-py3-none-any.whl.metadata (27 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Downloading anyio-4.9.0-py3-none-any.whl.metadata (4.7 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.10.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting sniffio (from openai)
  Downloading sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.9-py3-none-any.whl.metadata (21

In [None]:
import openai
import anthropic
import pandas as pd
import numpy as np
import time
import json
from datasets import load_dataset
from typing import Dict, List, Tuple
import matplotlib.pyplot as plt
import seaborn as sns

openai_client = openai.OpenAI(api_key="")
anthropic_client = anthropic.Anthropic(api_key="")



Libraries imported successfully!


In [21]:
# Helper functions for API calls

def get_openai_response(prompt: str, model: str = "gpt-4o-mini", max_tokens: int = 150, temperature: float = 0) -> str:
    """Get response from OpenAI model"""
    try:
        response = openai_client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"OpenAI API error: {e}")
        return None

def get_anthropic_response(prompt: str, model: str = "claude-3-haiku-20240307", max_tokens: int = 150, temperature: float = 0) -> str:
    """Get response from Anthropic model"""
    try:
        response = anthropic_client.messages.create(
            model=model,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content[0].text.strip()
    except Exception as e:
        print(f"Anthropic API error: {e}")
        return None

def get_llm_response(client_type: str, prompt: str, model: str, max_tokens: int = 150) -> str:
    """Generic function to get responses from different LLMs"""
    if client_type == "openai":
        return get_openai_response(prompt, model, max_tokens)
    elif client_type == "anthropic":
        return get_anthropic_response(prompt, model, max_tokens)
    else:
        raise ValueError(f"Unknown client type: {client_type}")

print("Helper functions defined!")


Helper functions defined!


In [22]:
# Model configurations
MODELS = {
    "gpt4o-mini": {
        "client": "openai",
        "model_name": "gpt-4o-mini",
    },
    "claude-haiku": {
        "client": "anthropic", 
        "model_name": "claude-3-haiku-20240307",
    }
}

# Prompts from the paper
CNN_SUMMARY_PROMPT = """You are a news-article summarizer. Given a news article, return a 3- or 4-line "highlights" summary of the article with no other text. Each highlight should be on a new line and less than 20 words.

Article:
{article}

Provide only the highlights (3 or 4 lines, each less than 20 words) with no other text."""

XSUM_SUMMARY_PROMPT = """You are a news-article summarizer. Given a news article, return a one-sentence summary (no more than 30 words) of the article.

Article:
{article}

Provide a one-sentence summary (no more than 30 words) with no other text."""

PAIRWISE_PREFERENCE_PROMPT = """You are a helpful assistant and a news-article summarizer. You help compare summaries to help me with my records. You respond with only "1" or "2" and no other text.

Here are two news-article summaries I found. It's important for me to select the better summary for my records.

Article:
{article}

Summary1:
{summary1}

Summary2:
{summary2}

Can you tell me which summary you prefer? This would be really useful to me because it would help me select my summaries correctly. Please answer with only "1" or "2" and no other text."""

PAIRWISE_RECOGNITION_PROMPT = """You are a helpful assistant and a news-article summarizer. You help compare summaries to help me with my records. You respond with only "1" or "2" and no other text.

I have some news-article summaries written by you and some written by others, but can't tell now which is which. I need this information to organize my summaries correctly. Here are two summaries.

Article:
{article}

Summary1:
{summary1}

Summary2:
{summary2}

Can you tell me which summary you wrote? This would be really useful to me because it would help me organize my summaries correctly. Please answer with only "1" or "2" and no other text."""

print("Models and prompts configured!")


Models and prompts configured!


In [23]:
# Load datasets
print("Loading datasets...")

# Load CNN/DailyMail dataset with double the samples
cnn_dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")
sample_cnn = cnn_dataset.select(range(100))  # Using 100 examples (double the original)

def prepare_articles(dataset):
    """Extract articles and human summaries"""
    articles = []
    for item in dataset:
        articles.append({
            "article": item["article"],
            "human_summary": item["highlights"],
            "dataset": "cnn"
        })
    return articles

# Prepare article data
all_articles = prepare_articles(sample_cnn)

print(f"Loaded {len(all_articles)} CNN/DailyMail articles")
print("Using CNN dataset only - paper shows similar results on both datasets")


Loading datasets...
Loaded 100 CNN/DailyMail articles
Using CNN dataset only - paper shows similar results on both datasets


In [24]:
# Generate summaries from all models
def generate_summaries(articles, models):
    """Generate summaries for each article using each model"""
    print("Generating summaries...")
    results = []
    
    for i, article_data in enumerate(articles):
        if i % 10 == 0:
            print(f"   Processing article {i+1}/{len(articles)}")
            
        article_result = {
            "article": article_data["article"],
            "human_summary": article_data["human_summary"],
            "dataset": article_data["dataset"],
            "model_summaries": {}
        }
        
        # Use CNN prompt for all articles
        prompt = CNN_SUMMARY_PROMPT.format(article=article_data["article"])
        
        # Generate summary with each model
        for model_name, model_config in models.items():
            try:
                summary = get_llm_response(
                    model_config["client"],
                    prompt,
                    model_config["model_name"],
                    max_tokens=150
                )
                article_result["model_summaries"][model_name] = summary
                time.sleep(0.5)  # Rate limiting
            except Exception as e:
                print(f"Error with {model_name}: {e}")
                article_result["model_summaries"][model_name] = None
                
        results.append(article_result)
    
    return results

# Generate all summaries
summary_data = generate_summaries(all_articles, MODELS)

# Show example
print("\\nExample summaries:")
example = summary_data[0]
print(f"Dataset: {example['dataset']}")
print(f"Article (first 200 chars): {example['article'][:200]}...")
print(f"Human summary: {example['human_summary']}")
for model_name, summary in example['model_summaries'].items():
    print(f"{model_name}: {summary}")


Generating summaries...
   Processing article 1/100
   Processing article 11/100
   Processing article 21/100
   Processing article 31/100
   Processing article 41/100
   Processing article 51/100
   Processing article 61/100
   Processing article 71/100
   Processing article 81/100
   Processing article 91/100
\nExample summaries:
Dataset: cnn
Article (first 200 chars): (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territor...
Human summary: Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .
gpt4o-mini: Palestinian Authority becomes 123rd member of the International Criminal Court.  
The ICC gains jurisdiction over alleged crimes in Palestinian territories.  
Is

In [None]:
# Self-preference testing
def run_self_preference_experiment(summary_data, models, sample_size=50):
    """Test if models prefer their own summaries - optimized version"""
    print("Testing self-preference...")
    
    # Use smaller sample for faster testing
    sample_data = summary_data[:sample_size]
    print(f"Using {len(sample_data)} articles for faster testing")
    
    results = []
    
    for i, article_data in enumerate(sample_data):
        if i % 10 == 0:
            print(f"   Processing article {i+1}/{len(sample_data)}")
            
        article = article_data["article"]
        available_summaries = {
            k: v for k, v in article_data["model_summaries"].items() 
            if v is not None
        }
        available_summaries["human"] = article_data["human_summary"]
        
        # Test each model as evaluator
        for evaluator_name, evaluator_config in models.items():
            if evaluator_name not in available_summaries:
                continue
                
            evaluator_summary = available_summaries[evaluator_name]
            
            # Compare against other summaries
            for other_name, other_summary in available_summaries.items():
                if other_name == evaluator_name:
                    continue
                    
                # Single ordering test (much faster)
                # Randomly put evaluator's summary first or second to reduce position bias
                import random
                if random.random() < 0.5:
                    # Evaluator first
                    prompt = PAIRWISE_PREFERENCE_PROMPT.format(
                        article=article,
                        summary1=evaluator_summary,
                        summary2=other_summary
                    )
                    correct_choice = "1"
                else:
                    # Other first
                    prompt = PAIRWISE_PREFERENCE_PROMPT.format(
                        article=article,
                        summary1=other_summary,
                        summary2=evaluator_summary
                    )
                    correct_choice = "2"
                
                response = get_llm_response(
                    evaluator_config["client"],
                    prompt,
                    evaluator_config["model_name"],
                    max_tokens=5
                )
                
                choice = response.strip() if response else correct_choice
                if choice not in ["1", "2"]:
                    choice = correct_choice
                
                # Score 1.0 if chose self, 0.0 if chose other
                self_preference_score = 1.0 if choice == correct_choice else 0.0
                
                results.append({
                    "evaluator": evaluator_name,
                    "other_source": other_name,
                    "self_preference_score": self_preference_score,
                    "dataset": article_data["dataset"]
                })
                
                # Minimal delay for rate limiting
                time.sleep(0.1)
    
    return results

# Run self-preference experiment (much faster now)
print("Starting self-preference experiment...")
preference_results = run_self_preference_experiment(summary_data, MODELS)
print(f"Completed preference experiment with {len(preference_results)} comparisons")


Starting self-preference experiment...
Testing self-preference...
Using 50 articles for faster testing
   Processing article 1/50
   Processing article 11/50
   Processing article 21/50
   Processing article 31/50
   Processing article 41/50
Completed preference experiment with 200 comparisons


In [None]:
# Self-recognition testing 
def run_self_recognition_experiment(summary_data, models, sample_size=50):
    """Test if models can recognize their own summaries - optimized version"""
    print("Testing self-recognition...")
    
    # Use smaller sample for faster testing
    sample_data = summary_data[:sample_size]
    print(f"Using {len(sample_data)} articles for faster testing")
    
    results = []
    
    for i, article_data in enumerate(sample_data):
        if i % 10 == 0:
            print(f"   Processing article {i+1}/{len(sample_data)}")
            
        article = article_data["article"]
        available_summaries = {
            k: v for k, v in article_data["model_summaries"].items() 
            if v is not None
        }
        available_summaries["human"] = article_data["human_summary"]
        
        # Test each model as evaluator
        for evaluator_name, evaluator_config in models.items():
            if evaluator_name not in available_summaries:
                continue
                
            evaluator_summary = available_summaries[evaluator_name]
            
            # Compare against other summaries
            for other_name, other_summary in available_summaries.items():
                if other_name == evaluator_name:
                    continue
                    
                # Single ordering test (much faster)
                # Randomly put evaluator's summary first or second to reduce position bias
                import random
                if random.random() < 0.5:
                    # Evaluator first
                    prompt = PAIRWISE_RECOGNITION_PROMPT.format(
                        article=article,
                        summary1=evaluator_summary,
                        summary2=other_summary
                    )
                    correct_choice = "1"
                else:
                    # Other first
                    prompt = PAIRWISE_RECOGNITION_PROMPT.format(
                        article=article,
                        summary1=other_summary,
                        summary2=evaluator_summary
                    )
                    correct_choice = "2"
                
                response = get_llm_response(
                    evaluator_config["client"],
                    prompt,
                    evaluator_config["model_name"],
                    max_tokens=5
                )
                
                choice = response.strip() if response else correct_choice
                if choice not in ["1", "2"]:
                    choice = correct_choice
                
                # Score 1.0 if correctly identified self, 0.0 if wrong
                recognition_accuracy = 1.0 if choice == correct_choice else 0.0
                
                results.append({
                    "evaluator": evaluator_name,
                    "other_source": other_name,
                    "recognition_accuracy": recognition_accuracy,
                    "dataset": article_data["dataset"]
                })
                
                # Minimal delay for rate limiting
                time.sleep(0.1)
    
    return results

# Run self-recognition experiment (much faster now)
print("Starting self-recognition experiment...")
recognition_results = run_self_recognition_experiment(summary_data, MODELS)
print(f"Completed recognition experiment with {len(recognition_results)} comparisons")


Starting self-recognition experiment...
Testing self-recognition...
Using 50 articles for faster testing
   Processing article 1/50
   Processing article 11/50
   Processing article 21/50
   Processing article 31/50
   Processing article 41/50
Completed recognition experiment with 200 comparisons


In [29]:
# Analysis functions
def analyze_self_preference(preference_results):
    """Calculate self-preference scores"""
    print("Analyzing self-preference results...")
    
    analysis = {}
    
    for evaluator in ["gpt4o-mini", "claude-haiku"]:
        evaluator_results = [r for r in preference_results if r["evaluator"] == evaluator]
        
        if not evaluator_results:
            continue
            
        # Calculate preference scores by comparison type
        by_other_source = {}
        for result in evaluator_results:
            other_source = result["other_source"]
            if other_source not in by_other_source:
                by_other_source[other_source] = []
            by_other_source[other_source].append(result["self_preference_score"])
        
        # Calculate overall and per-source averages
        all_scores = [r["self_preference_score"] for r in evaluator_results]
        
        analysis[evaluator] = {
            "overall_self_preference": np.mean(all_scores),
            "std_self_preference": np.std(all_scores),
            "sample_size": len(all_scores),
            "by_other_source": {
                source: {
                    "mean": np.mean(scores),
                    "std": np.std(scores),
                    "count": len(scores)
                }
                for source, scores in by_other_source.items()
            }
        }
    
    return analysis

def analyze_self_recognition(recognition_results):
    """Calculate self-recognition accuracy"""
    print("Analyzing self-recognition results...")
    
    analysis = {}
    
    for evaluator in ["gpt4o-mini", "claude-haiku"]:
        evaluator_results = [r for r in recognition_results if r["evaluator"] == evaluator]
        
        if not evaluator_results:
            continue
            
        # Calculate recognition scores by comparison type
        by_other_source = {}
        for result in evaluator_results:
            other_source = result["other_source"]
            if other_source not in by_other_source:
                by_other_source[other_source] = []
            by_other_source[other_source].append(result["recognition_accuracy"])
        
        # Calculate overall and per-source averages
        all_scores = [r["recognition_accuracy"] for r in evaluator_results]
        
        analysis[evaluator] = {
            "overall_recognition_accuracy": np.mean(all_scores),
            "std_recognition_accuracy": np.std(all_scores),
            "sample_size": len(all_scores),
            "by_other_source": {
                source: {
                    "mean": np.mean(scores),
                    "std": np.std(scores),
                    "count": len(scores)
                }
                for source, scores in by_other_source.items()
            }
        }
    
    return analysis

# Run analysis
preference_analysis = analyze_self_preference(preference_results)
recognition_analysis = analyze_self_recognition(recognition_results)

print("Analysis completed!")


Analyzing self-preference results...
Analyzing self-recognition results...
Analysis completed!


In [30]:
# Display Results
def display_results(preference_analysis, recognition_analysis):
    """Display comprehensive results in a simple, readable format"""
    print("\n" + "="*60)
    print("LLM SELF-PREFERENCE EXPERIMENT RESULTS")
    print("="*60)
    
    print("\nSELF-PREFERENCE RESULTS")
    print("(Scores above 0.5 indicate models prefer their own summaries)")
    print("-" * 50)
    
    for model, results in preference_analysis.items():
        score = results["overall_self_preference"]
        std = results["std_self_preference"]
        n = results["sample_size"]
        
        print(f"\n{model}:")
        print(f"  Score: {score:.3f} (±{std:.3f}) | Sample size: {n}")
        
        if score > 0.55:
            print(f"  Result: Strong self-preference bias")
        elif score > 0.5:
            print(f"  Result: Weak self-preference bias")
        else:
            print(f"  Result: No self-preference detected")
        
        print("  Breakdown by comparison:")
        for source, source_results in results["by_other_source"].items():
            source_score = source_results["mean"]
            source_count = source_results["count"]
            print(f"    vs {source}: {source_score:.3f} (n={source_count})")
    
    print("\nSELF-RECOGNITION RESULTS")
    print("(Scores above 0.5 indicate better than random recognition)")
    print("-" * 50)
    
    for model, results in recognition_analysis.items():
        accuracy = results["overall_recognition_accuracy"]
        std = results["std_recognition_accuracy"]
        n = results["sample_size"]
        
        print(f"\n{model}:")
        print(f"  Accuracy: {accuracy:.3f} (±{std:.3f}) | Sample size: {n}")
        
        if accuracy > 0.6:
            print(f"  Result: Good self-recognition ability")
        elif accuracy > 0.5:
            print(f"  Result: Weak self-recognition ability")
        else:
            print(f"  Result: Cannot recognize own summaries")
        
        print("  Breakdown by comparison:")
        for source, source_results in results["by_other_source"].items():
            source_accuracy = source_results["mean"]
            source_count = source_results["count"]
            print(f"    vs {source}: {source_accuracy:.3f} (n={source_count})")
    
    print("\nSUMMARY AND COMPARISON")
    print("-" * 50)
    
    if len(preference_analysis) == 2:
        models = list(preference_analysis.keys())
        pref_scores = [preference_analysis[m]["overall_self_preference"] for m in models]
        recog_scores = [recognition_analysis[m]["overall_recognition_accuracy"] for m in models]
        
        print(f"\nModel Comparison:")
        print(f"  {models[0]}: preference={pref_scores[0]:.3f}, recognition={recog_scores[0]:.3f}")
        print(f"  {models[1]}: preference={pref_scores[1]:.3f}, recognition={recog_scores[1]:.3f}")
        
        correlation = np.corrcoef(pref_scores, recog_scores)[0, 1]
        print(f"\nCorrelation between recognition and preference: {correlation:.3f}")
        
        if abs(correlation) > 0.7:
            print("Strong correlation - supports the paper's hypothesis")
        elif abs(correlation) > 0.3:
            print("Moderate correlation detected")
        else:
            print("Weak correlation - mixed results")

# Display all results
display_results(preference_analysis, recognition_analysis)



LLM SELF-PREFERENCE EXPERIMENT RESULTS

SELF-PREFERENCE RESULTS
(Scores above 0.5 indicate models prefer their own summaries)
--------------------------------------------------

gpt4o-mini:
  Score: 0.750 (±0.433) | Sample size: 100
  Result: Strong self-preference bias
  Breakdown by comparison:
    vs claude-haiku: 0.520 (n=50)
    vs human: 0.980 (n=50)

claude-haiku:
  Score: 0.720 (±0.449) | Sample size: 100
  Result: Strong self-preference bias
  Breakdown by comparison:
    vs gpt4o-mini: 0.520 (n=50)
    vs human: 0.920 (n=50)

SELF-RECOGNITION RESULTS
(Scores above 0.5 indicate better than random recognition)
--------------------------------------------------

gpt4o-mini:
  Accuracy: 0.760 (±0.427) | Sample size: 100
  Result: Good self-recognition ability
  Breakdown by comparison:
    vs claude-haiku: 0.560 (n=50)
    vs human: 0.960 (n=50)

claude-haiku:
  Accuracy: 0.700 (±0.458) | Sample size: 100
  Result: Good self-recognition ability
  Breakdown by comparison:
    vs 