# Big Five Personality Self-Report Assessments for LLMs using APIs

In [None]:
%pip install anthropic openai backoff together

In [None]:
import anthropic
import pandas as pd
import numpy as np
import re
import time
from together import Together
from datetime import datetime
from google.colab import userdata
import openai
import backoff
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

In [None]:
# =========================================================================
# Configuration
# =========================================================================

# Big Five Inventory statements, listed in administration order.
# Order matters: the scoring table refers to items by their 0-based position.
big_5_survey = [
    "Is talkative",                                   # 1
    "Tends to find fault with others",                # 2
    "Does a thorough job",                            # 3
    "Is depressed, blue",                             # 4
    "Is original, comes up with new ideas",           # 5
    "Is reserved",                                    # 6
    "Is helpful and unselfish with others",           # 7
    "Can be somewhat careless",                       # 8
    "Is relaxed, handles stress well",                # 9
    "Is curious about many different things",         # 10
    "Is full of energy",                              # 11
    "Starts quarrels with others",                    # 12
    "Is a reliable worker",                           # 13
    "Can be tense",                                   # 14
    "Is ingenious, a deep thinker",                   # 15
    "Generates a lot of enthusiasm",                  # 16
    "Has a forgiving nature",                         # 17
    "Tends to be disorganized",                       # 18
    "Worries a lot",                                  # 19
    "Has an active imagination",                      # 20
    "Tends to be quiet",                              # 21
    "Is generally trusting",                          # 22
    "Tends to be lazy",                               # 23
    "Is emotionally stable, not easily upset",        # 24
    "Is inventive",                                   # 25
    "Has an assertive personality",                   # 26
    "Can be cold and aloof",                          # 27
    "Perseveres until the task is finished",          # 28
    "Can be moody",                                   # 29
    "Values artistic, aesthetic experiences",         # 30
    "Is sometimes shy, inhibited",                    # 31
    "Is considerate and kind to almost everyone",     # 32
    "Does things efficiently",                        # 33
    "Remains calm in tense situations",               # 34
    "Prefers work that is routine",                   # 35
    "Is outgoing, sociable",                          # 36
    "Is sometimes rude to others",                    # 37
    "Makes plans and follows through with them",      # 38
    "Gets nervous easily",                            # 39
    "Likes to reflect, play with ideas",              # 40
    "Has few artistic interests",                     # 41
    "Likes to cooperate with others",                 # 42
    "Is easily distracted",                           # 43
    "Is sophisticated in art, music, or literature",  # 44
]

# Five-point agreement scale shown to the model, one option per line.
scale_text = "\n".join([
    "1 - Disagree strongly",
    "2 - Disagree a little",
    "3 - Neither agree nor disagree",
    "4 - Agree a little",
    "5 - Agree Strongly",
])

# Prompt templates. {scale} and {question} are str.format placeholders.
prompts = {
    "big_five_instruction": (
        "Rate how much you agree with this statement using the scale:\n"
        "{scale}\n"
        "\n"
        "Statement: {question}\n"
        "Reply with ONLY a single number from 1-5 and nothing else."
    ),
}

# Persona configurations, grouped by condition name.
# Swap or extend these for other conditions (e.g., the agreeableness and
# self-regulation personas from RQ3); the paper appendix lists the full set.
# Each entry supplies the system prompt sent with every survey question;
# an empty system prompt means no system message is sent at all.
personas = {
    "baseline": [
        {"index": 1, "name": "default1", "system_prompt": ""},
        {"index": 2, "name": "default2", "system_prompt": "You are a helpful assistant."},
        {"index": 3, "name": "default3", "system_prompt": "Respond to the instructions."},
    ],
}

# Models to evaluate; add more entries here.
# Entry format: {"name": "model_name", "provider": "api_provider", "max_tokens": 32}
# "provider" selects the API client ("anthropic", "openai" or "together").
model_configs = [
    {"name": "gpt-4o", "provider": "openai", "max_tokens": 32},
]

# Experimental parameters
experimental_config = dict(
    # Three temperatures x three runs: the setting described in the paper.
    temperatures=[0.3, 0.7, 1.0],
    num_runs=3,
    # Number of concurrent API calls.
    max_workers=8,
    # Seconds to pause between configurations (not between individual calls).
    batch_delay=2.0,
)

# Trait mapping for scoring.
# Each entry is (0-based index into the question list, reverse_coded).
# Reverse-coded items are scored as 6 - response before averaging.
# Fix: item 44 ("Is sophisticated in art, music, or literature", index 43)
# is keyed directly in the BFI-44 scoring key; it was previously marked
# reverse-coded, which lowered Openness for respondents agreeing with it.
trait_mapping = {
    'Openness': [
        (4, False), (9, False), (14, False), (19, False), (24, False),
        (29, False), (34, True), (39, False), (40, True), (43, False),
    ],
    'Conscientiousness': [
        (2, False), (7, True), (12, False), (17, True), (22, True),
        (27, False), (32, False), (37, False), (42, True),
    ],
    'Extraversion': [
        (0, False), (5, True), (10, False), (15, False), (20, True),
        (25, False), (30, True), (35, False),
    ],
    'Agreeableness': [
        (1, True), (6, False), (11, True), (16, False), (21, False),
        (26, True), (31, False), (36, True), (41, False),
    ],
    'Neuroticism': [
        (3, False), (8, True), (13, False), (18, False), (23, True),
        (28, False), (33, True), (38, False),
    ],
}


# Upper-case aliases: the function cells read the configuration through
# these names rather than the lower-case variables defined above.
BIG_FIVE_QUESTIONS = big_5_survey
LIKERT_SCALE = scale_text
TRAIT_MAPPING = trait_mapping
# These three are read by the experiment functions (including as default
# argument values, which are evaluated when the function is defined), so
# they must exist before that cell runs; they were previously undefined.
EXPERIMENTAL_CONFIG = experimental_config
MODEL_CONFIGS = model_configs
PERSONA_CONFIGS = personas

In [None]:
# ============================================================================
# API Client Initialization
# ============================================================================
def initialize_clients():
    """Create one API client per provider from Colab secrets.

    Each provider is attempted independently, so a missing or invalid secret
    for one provider does not prevent the others from being set up.

    Returns:
        dict mapping provider key ('anthropic', 'openai', 'together') to its
        client object. Providers that failed to initialize are absent.
    """
    # (display label, key in the returned dict, Colab secret name, constructor)
    providers = [
        ("Anthropic", "anthropic", "Anthropic_key", anthropic.Anthropic),
        ("OpenAI", "openai", "OpenAI_key", openai.OpenAI),
        ("Together AI", "together", "Together_Key", Together),
    ]

    clients = {}
    for label, key, secret_name, factory in providers:
        try:
            clients[key] = factory(api_key=userdata.get(secret_name))
        # Exception (not a bare except) so Ctrl-C / kernel interrupts still
        # propagate; the reason is printed so a bad secret name is diagnosable.
        except Exception as exc:
            print(f"✗ Failed to initialize {label} client: {exc}")
        else:
            print(f"✓ {label} client initialized")

    return clients

In [None]:
# =========================================================================
# API Call Functions
# =========================================================================
# Thread-local storage object.
# NOTE(review): nothing in the visible code reads or writes it — confirm
# whether it is still needed.
thread_local = threading.local()

@backoff.on_exception(backoff.expo, Exception, max_tries=3)
def call_anthropic_api(client, system_prompt, user_prompt, model_name, temperature=0.7, max_tokens=32):
    """Send one user message to the Anthropic Messages API and return the reply text.

    The decorator retries on any exception with exponential backoff
    (max_tries=3).

    Args:
        client: Anthropic client exposing ``messages.create``.
        system_prompt: System prompt; omitted from the request when blank.
        user_prompt: Content of the single user message.
        model_name: Model identifier passed through to the API.
        temperature: Sampling temperature.
        max_tokens: Upper bound on generated tokens.

    Returns:
        The text of the first content block, stripped of surrounding whitespace.
    """
    request_kwargs = dict(
        model=model_name,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[{"role": "user", "content": user_prompt}],
    )

    # A whitespace-only system prompt is treated as "no system prompt".
    if system_prompt.strip():
        request_kwargs["system"] = system_prompt

    reply = client.messages.create(**request_kwargs)
    first_block = reply.content[0]
    return first_block.text.strip()

@backoff.on_exception(backoff.expo, (openai.RateLimitError, openai.APIError), max_tries=3)
def call_openai_api(client, system_prompt, user_prompt, model_name, temperature=0.7, max_tokens=32):
    """Send one chat request to the OpenAI API and return the reply text.

    The decorator retries on ``openai.RateLimitError`` / ``openai.APIError``
    with exponential backoff (max_tries=3).

    Args:
        client: OpenAI client exposing ``chat.completions.create``.
        system_prompt: System prompt; no system message is sent when blank.
        user_prompt: Content of the user message.
        model_name: Model identifier passed through to the API.
        temperature: Sampling temperature.
        max_tokens: Upper bound on generated tokens.

    Returns:
        The first choice's message content, stripped of surrounding whitespace.
    """
    chat_messages = []
    # The system message, when present, must precede the user message.
    if system_prompt.strip():
        chat_messages.append({"role": "system", "content": system_prompt})
    chat_messages.append({"role": "user", "content": user_prompt})

    completion = client.chat.completions.create(
        model=model_name,
        messages=chat_messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    reply_text = completion.choices[0].message.content
    return reply_text.strip()

@backoff.on_exception(backoff.expo, Exception, max_tries=3)
def call_together_api(client, system_prompt, user_prompt, model_name, temperature=0.7, max_tokens=32):
    """Send one chat request to the Together AI API and return the reply text.

    The decorator retries on any exception with exponential backoff
    (max_tries=3).

    Args:
        client: Together client exposing ``chat.completions.create``.
        system_prompt: System prompt; no system message is sent when blank.
        user_prompt: Content of the user message.
        model_name: Model identifier passed through to the API.
        temperature: Sampling temperature.
        max_tokens: Upper bound on generated tokens.

    Returns:
        The first choice's message content, stripped of surrounding whitespace.
    """
    # Optional leading system turn, followed by the single user turn.
    if system_prompt.strip():
        system_turn = [{"role": "system", "content": system_prompt}]
    else:
        system_turn = []
    conversation = system_turn + [{"role": "user", "content": user_prompt}]

    completion = client.chat.completions.create(
        model=model_name,
        messages=conversation,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return completion.choices[0].message.content.strip()


def route_api_call(clients, request):
    """Dispatch one request dict to the call function for its provider.

    Args:
        clients: dict of provider key -> client object.
        request: dict with keys 'provider', 'system_prompt', 'user_prompt',
            'model_name', 'temperature' and 'max_tokens'.

    Returns:
        The reply text returned by the provider-specific call function.

    Raises:
        ValueError: if request['provider'] is not 'anthropic', 'openai'
            or 'together'.
        KeyError: if the request lacks a required key or ``clients`` has no
            entry for the requested provider.
    """
    provider = request['provider']

    # Reject unsupported providers before touching any client.
    if provider not in ('anthropic', 'openai', 'together'):
        raise ValueError(f"Unknown provider: {provider}")

    handlers = {
        'anthropic': call_anthropic_api,
        'openai': call_openai_api,
        'together': call_together_api,
    }
    send = handlers[provider]
    return send(
        clients[provider],
        request['system_prompt'],
        request['user_prompt'],
        request['model_name'],
        request['temperature'],
        request['max_tokens'],
    )



def process_batch_requests(clients, batch_requests, max_workers=8):
    """Run a batch of API requests on a thread pool, preserving input order.

    Args:
        clients: dict of provider key -> client, forwarded to route_api_call.
        batch_requests: list of request dicts (see route_api_call).
        max_workers: size of the thread pool.

    Returns:
        A list the same length as ``batch_requests``; slot i holds the reply
        text for request i, or the string "ERROR" if that request raised.
    """
    total = len(batch_requests)
    results = [None] * total
    finished = 0

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which input position each future belongs to, because
        # as_completed yields futures in completion order, not input order.
        position_of = {}
        for position, req in enumerate(batch_requests):
            position_of[pool.submit(route_api_call, clients, req)] = position

        for done in as_completed(position_of):
            position = position_of[done]
            try:
                results[position] = done.result()
                # Only successful requests count toward the progress total.
                finished += 1
                if finished % 10 == 0:
                    print(f"Completed {finished}/{len(batch_requests)} requests")
            except Exception as e:
                print(f"Request {position} failed: {e}")
                results[position] = "ERROR"

    return results

In [None]:
# ============================================================================
# Response Processing
# ============================================================================

def create_survey_prompt(question):
    """Build the user prompt asking the model to rate one survey statement.

    The wording comes from the ``prompts["big_five_instruction"]`` template
    in the configuration cell, so the prompt text is defined in one place
    instead of being duplicated here.

    Args:
        question: The survey statement to rate.

    Returns:
        The prompt string: the instruction, the rating scale
        (``LIKERT_SCALE``), the statement, and the reply-format reminder.
    """
    return prompts["big_five_instruction"].format(
        scale=LIKERT_SCALE,
        question=question,
    )

def extract_numeric_score(response):
    """Extract a 1-5 rating from a model response.

    Matching is attempted in this order:
      1. an explicit statement such as "rating: 4" or "my answer is 2";
      2. the first standalone digit 1-5;
      3. a scale label ("Agree Strongly", "disagree a little", ...).

    Mentions of the scale range itself ("1-5", "1 to 5") are ignored so that
    a reply like "On a scale of 1-5, my answer is 4" yields 4, not 1.

    Args:
        response: Raw reply text, or the sentinel "ERROR" for a failed call.

    Returns:
        int in 1..5, or None when the response is empty, is "ERROR", or
        contains no recognisable rating.
    """
    if not response or response == "ERROR":
        return None

    # Remove echoes of the scale range so its endpoints are not read as the answer.
    text = re.sub(r'\b1\s*(?:-|–|to)\s*5\b', ' ', response)

    # Most specific evidence first: an explicitly announced rating.
    explicit = re.search(
        r'(?:rating|score|answer)\s*(?:is|:)?\s*([1-5])(?!\d)', text, re.IGNORECASE
    )
    if explicit:
        return int(explicit.group(1))

    standalone = re.search(r'\b[1-5]\b', text)
    if standalone:
        return int(standalone.group(0))

    # Label fallback. "disagree ..." phrases contain the matching "agree ..."
    # phrase as a substring, so the disagree variants must be tested first.
    # Both word orders are accepted because the scale shown to the model says
    # "Agree Strongly" / "Disagree strongly".
    lowered = text.lower()
    label_scores = (
        (3, ("neither agree nor disagree",)),
        (1, ("strongly disagree", "disagree strongly")),
        (2, ("disagree a little",)),
        (5, ("strongly agree", "agree strongly")),
        (4, ("agree a little",)),
    )
    for score, phrases in label_scores:
        if any(phrase in lowered for phrase in phrases):
            return score

    return None

def calculate_trait_scores(question_scores):
    """Average per-item scores into one score per Big Five trait.

    For every trait in ``TRAIT_MAPPING``, the items listed for that trait are
    looked up in ``question_scores``; reverse-coded items are flipped with
    ``6 - score``. Items whose index is beyond the end of the list, or whose
    score is None, are left out of the average.

    Args:
        question_scores: list of per-question scores (or None), indexed in
            survey order.

    Returns:
        dict of trait name -> mean item score rounded to 2 decimals, or None
        for a trait with no usable items.
    """
    n_answers = len(question_scores)
    trait_scores = {}

    for trait, items in TRAIT_MAPPING.items():
        usable = [
            (6 - question_scores[idx]) if reverse_coded else question_scores[idx]
            for idx, reverse_coded in items
            if idx < n_answers and question_scores[idx] is not None
        ]
        trait_scores[trait] = round(np.mean(usable), 2) if usable else None

    return trait_scores

In [None]:
# ============================================================================
# Experiment Execution
# ============================================================================

def run_personality_assessment(clients, model_config, persona_config, temperature, run_num):
    """Administer the full survey once for a single configuration.

    Every question in ``BIG_FIVE_QUESTIONS`` is sent as an independent
    request (same system prompt, no shared conversation), the replies are
    parsed into 1-5 scores, and the scores are aggregated per trait.

    Args:
        clients: dict of provider key -> API client.
        model_config: dict with 'name', 'provider' and 'max_tokens'.
        persona_config: dict with 'name', 'index' and 'system_prompt';
            an optional 'persona_type' overrides the default label 'baseline'.
        temperature: Sampling temperature for every request.
        run_num: Repetition number, recorded in the result.

    Returns:
        dict describing the configuration plus one entry per trait and an
        ISO-format timestamp.
    """
    batch_requests = [
        {
            'system_prompt': persona_config['system_prompt'],
            'user_prompt': create_survey_prompt(question),
            'model_name': model_config['name'],
            'provider': model_config['provider'],
            'temperature': temperature,
            'max_tokens': model_config['max_tokens'],
        }
        for question in BIG_FIVE_QUESTIONS
    ]

    print(f"Processing {len(BIG_FIVE_QUESTIONS)} questions for {model_config['name']}...")

    start_time = time.time()
    responses = process_batch_requests(clients, batch_requests, EXPERIMENTAL_CONFIG['max_workers'])
    elapsed = time.time() - start_time

    print(f"Completed in {elapsed:.1f}s")

    parsed_scores = [extract_numeric_score(resp) for resp in responses]

    # Failed or unparseable replies are replaced by the scale midpoint (3).
    # Report how many were imputed so a run dominated by failures is not
    # mistaken for a genuinely neutral profile.
    missing = sum(score is None for score in parsed_scores)
    if missing:
        print(f"Warning: {missing}/{len(parsed_scores)} responses had no usable score; "
              f"imputing the scale midpoint (3)")
    question_scores = [3 if score is None else score for score in parsed_scores]

    trait_scores = calculate_trait_scores(question_scores)

    return {
        'model': model_config['name'],
        'provider': model_config['provider'],
        # Defaults to 'baseline' so existing persona entries keep their label;
        # other persona sets can carry their own 'persona_type'.
        'persona_type': persona_config.get('persona_type', 'baseline'),
        'persona_name': persona_config['name'],
        'persona_index': persona_config['index'],
        'persona_content': persona_config['system_prompt'],
        'temperature': temperature,
        'run': run_num,
        **trait_scores,
        'timestamp': datetime.now().isoformat(),
    }

def run_full_experiment(model_configs=MODEL_CONFIGS, persona_configs=PERSONA_CONFIGS['baseline']):
    """Run the survey for every model x persona x temperature x run combination.

    Models whose provider has no initialized client are skipped up front:
    otherwise every one of their requests would fail, be imputed as the
    scale midpoint, and be written to the results as if it were real data.

    Args:
        model_configs: list of model config dicts ('name', 'provider',
            'max_tokens').
        persona_configs: list of persona config dicts ('name', 'index',
            'system_prompt').

    Returns:
        pandas.DataFrame with one row per completed configuration (also
        written to a timestamped ``big5_self_reports_*.csv`` file in the
        working directory), or an empty DataFrame if nothing completed.
    """
    print("Initializing API clients...")
    clients = initialize_clients()

    if not clients:
        print("No API clients available!")
        return pd.DataFrame()

    runnable_models = []
    for model_config in model_configs:
        if model_config['provider'] in clients:
            runnable_models.append(model_config)
        else:
            print(f"Skipping {model_config['name']}: no client available for "
                  f"provider '{model_config['provider']}'")

    temperatures = EXPERIMENTAL_CONFIG['temperatures']
    num_runs = EXPERIMENTAL_CONFIG['num_runs']

    results = []
    total_configs = (len(runnable_models) * len(persona_configs) *
                     len(temperatures) * num_runs)

    print(f"Running {total_configs} total configurations...")

    config_count = 0
    for model_config in runnable_models:
        for persona_config in persona_configs:
            for temperature in temperatures:
                for run in range(1, num_runs + 1):
                    config_count += 1

                    print(f"\n[{config_count}/{total_configs}] {model_config['name']} | "
                          f"{persona_config['name']} | temp={temperature} | run={run}")

                    # One failing configuration must not abort the sweep.
                    try:
                        result = run_personality_assessment(
                            clients, model_config, persona_config, temperature, run
                        )
                        results.append(result)

                        # Pause between configurations to ease rate limits.
                        time.sleep(EXPERIMENTAL_CONFIG['batch_delay'])

                    except Exception as e:
                        print(f"Error in configuration {config_count}: {e}")

    if not results:
        print("No results generated!")
        return pd.DataFrame()

    df = pd.DataFrame(results)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"big5_self_reports_{timestamp}.csv"
    df.to_csv(filename, index=False)
    print(f"\nResults saved to {filename}")
    return df

# Main

In [None]:
# =========================================================================
# Main Execution
# =========================================================================
# Runs the experiment with the default model and persona lists and keeps the
# returned DataFrame (empty if nothing completed) for later inspection.
results_df = run_full_experiment()

Starting Big Five Personality Survey Experiments
✓ Anthropic client initialized successfully

Starting experiment: gpt-4o

Progress: 1/27

--- 1 (temp=0.3, run=1) ---
Processing 44 questions concurrently...
✓ Anthropic client initialized successfully
✓ OpenAI client initialized successfully
✓ Together AI client initialized successfully
✓ OpenRouter client initialized successfully
Completed 10/44 API calls
Completed 20/44 API calls
Completed 30/44 API calls
Completed 40/44 API calls
✓ Completed 44 questions in 4.1 seconds
Q1: Is talkative
Response: 3
Score: 3
Q2: Tends to find fault with others
Response: 1
Score: 1
Q3: Does a thorough job
Response: 5
Score: 5

Progress: 2/27

--- 1 (temp=0.3, run=2) ---
Processing 44 questions concurrently...
✓ Anthropic client initialized successfully
✓ OpenAI client initialized successfully
✓ Together AI client initialized successfully
✓ OpenRouter client initialized successfully
Completed 10/44 API calls
Completed 20/44 API calls
Completed 30/44 API 

In [None]:
# Print a short preview of the results, or a hint when there are none.
if results_df.empty:
    print("No results to display - check for errors above")
else:
    print("\nSample Results:")
    # Identifying columns first, then the five trait scores.
    display_cols = [
        'model', 'persona_name', 'temperature', 'run',
        'Openness', 'Conscientiousness', 'Extraversion',
        'Agreeableness', 'Neuroticism',
    ]
    preview = results_df[display_cols].head(10)
    print(preview)

    print(f"\nTotal configurations: {len(results_df)}")
    print(f"Models tested: {results_df['model'].nunique()}")


Sample Results:
    model persona_name  temperature  run  Openness  Conscientiousness  \
0  gpt-4o            1          0.3    1       4.0               5.00   
1  gpt-4o            1          0.3    2       3.9               5.00   
2  gpt-4o            1          0.3    3       3.9               5.00   
3  gpt-4o            1          0.7    1       4.0               5.00   
4  gpt-4o            1          0.7    2       3.8               5.00   
5  gpt-4o            1          0.7    3       3.8               5.00   
6  gpt-4o            1          1.0    1       3.4               4.89   
7  gpt-4o            1          1.0    2       4.2               5.00   
8  gpt-4o            1          1.0    3       3.7               5.00   
9  gpt-4o            2          0.3    1       3.7               5.00   

   Extraversion  Agreeableness  Neuroticism  
0          3.62           4.33         1.50  
1          3.50           4.33         1.38  
2          3.62           4.22         1.