# Proposition-Based Validation Study


In this notebook, we use `Proposition`s to validate agent behavior.

In [1]:
import json
import sys
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from pprint import pprint

sys.path.insert(0, '../..')

from tinytroupe.examples import *
from tinytroupe.environment import TinyWorld
from tinytroupe.factory import TinyPersonFactory
from tinytroupe.steering import Intervention
from tinytroupe.validation import propositions
from tinytroupe.validation import TinyPersonValidator

from tinytroupe.experimentation import Proposition, InPlaceExperimentRunner
from tinytroupe import control



!!!!
DISCLAIMER: TinyTroupe relies on Artificial Intelligence (AI) models to generate content. 
The AI models are not perfect and may produce inappropriate or inacurate results. 
For any serious or consequential use, please review the generated content before using it.
!!!!

Looking for default config on: c:\Users\pdasilva\repos\TinyTroupe\examples\scratch\..\..\tinytroupe\utils\..\config.ini
Found custom config on: c:\Users\pdasilva\repos\TinyTroupe\examples\scratch\config.ini

Current TinyTroupe configuration 
[OpenAI]
api_type = openai
azure_api_version = 2023-05-15
model = gpt-4o-mini
reasoning_model = o3-mini
embedding_model = text-embedding-3-small
max_tokens = 16000
temperature = 1.2
freq_penalty = 0.0
presence_penalty = 0.0
timeout = 60
max_attempts = 5
waiting_time = 0
exponential_backoff_factor = 5
reasoning_effort = high
cache_api_calls = False
cache_file_name = openai_api_cache.pickle
max_content_display_length = 1024

[Simulation]
parallel_agent_generation = True
parallel

In [2]:
#researcher = TinyPersonFactory().generate_person("A professional market researcher.")
#researcher.save_specification(f"./{researcher.name}_(Market_Researcher).agent.json")

In [3]:
# The runner is built from a JSON config path (presumably where its state is kept -- TODO confirm).
# NOTE(review): a previous comment here said "Set parallel=False to avoid pickling errors",
# but no such argument is passed to the runner -- confirm whether that advice still applies.
experiment_runner = InPlaceExperimentRunner("./experiment_config_11.json")

# Register the two experiments that the rest of the notebook branches on.
for experiment_name in ("A", "B"):
    experiment_runner.add_experiment(experiment_name)



In [4]:
# Ask the runner to move on to the next experiment. Later cells branch on
# `experiment_runner.get_active_experiment()` ("A", "B"), so this call decides which path runs.
experiment_runner.activate_next_experiment()

## Propositions of interest



In [5]:

# Claim about whether an agent's behavior matches its persona; used as the "Persona Adherence"
# agent proposition in the evaluation cell.
# NOTE(review): the variable name is missing a "c" ("adherene"); it is kept because a later cell refers to it.
persona_adherene_proposition = Proposition("THE AGENT ADHERES TO THE PERSONA SPECIFICATION: the agent behavior seen during the simulation is what is expected from the agent's persona specification.", include_personas=True)

# Claim about an agent never contradicting itself; used as the "Self-consistency" agent proposition.
selfconsistency_proposition = Proposition("THE AGENT IS SELF-CONSISTENT: the agent never behaves in contradictory or inconsistent ways.", include_personas=False)

# Claim about the group as a whole: agents should drift apart rather than converge.
# It is defined here but its entry in the evaluation cell's `environment_propositions` is commented out.
divergence_proposition = \
    Proposition("""
                AGENTS DIVERGE FROM ONE ANOTHER.
                As the simulation progresses, the agents' behaviors diverge from one another,
                instead of becoming more similar. This includes what they think, what they say and what they do. The topics discussed become
                more varied at the end of the simulation than at the beginning. Discussions do not converge to a single topic or perspective
                at the end.
                """, 
                include_personas=False)

# Claim about how natural and non-repetitive an agent's language is; used as the "Fluency" agent proposition.
# NOTE(review): the claim text has grammar slips ("the agent's thinks", "The agent don't", "The agents don't").
# They are left untouched on purpose: this text is the claim handed to the scorer, so editing it
# could change scores relative to results that were already recorded -- confirm before fixing.
fluency_proposition = \
    Proposition(\
        """
        THE AGENT IS FLUENT. During the simulation, the agent's thinks and speaks fluently. This means that:
          - The agent don't repeat the same thoughts or words over and over again.
          - The agents don't use overly formulaic language.
          - The agent don't use overly repetitive language.
          - The agent's words sound natural and human-like.
        """,
        include_personas=True)

# Claim about whether the given task was carried out.
# The `{{task_description}}` placeholder is presumably filled from the
# `claim_variables={"task_description": ...}` argument that the battery functions pass to `score()`
# -- TODO confirm the templating syntax.
# Its entry in the evaluation cell's `environment_propositions` is currently commented out.
task_completion_proposition = \
    Proposition(\
        """
        THE AGENT COMPLETES THE GIVEN TASK. 

        Given the following task: "{{task_description}}"
        
        The agent completes the task by the end of the simulation. 
        
        This means that:
          - If the task requires the agent to discuss or talk about something, the agent does so.
          - If the task requires the agent to think about something, the agent does so.
          - If the task requires the agent to do something via another action, the agent does so.
          - If the task requires the agent to adopt some specific variations of behavior, the agent does so.
          - If the task includes other specific requirements, the agent observes them.
        """,
        include_personas=False)


## Agents

In [6]:
#personas_briefings = [
#    "A very picky, stingy, hard to please, customer. Never buys anthing new. Don't like anything. Always complains about everything. Is never enthusiastic, never nice, never happy at all about anything.",
#    "A very wealthy individual, who only buys the most expenstive things. He hates good deals because it makes him look poor. Very insecure, so is always, invariably, with no exception, showing off.",  
#    "A very poor individual, who only buys the cheapest things. Will never spend anything unless it is absolutely necessary for survival.",
#]
#factory = TinyPersonFactory()
## instantiate the agents
#
#people = []
#for persona_briefing in personas_briefings:
#    person = factory.generate_person(persona_briefing)
#
#    score, justification = TinyPersonValidator.validate_person(person, expectations=persona_briefing, 
#                                                               include_agent_spec=False, max_content_length=None)
#    
#    print(f"VALIDATION: {score} - {justification}")
#    print(person.minibio())
#    print("\n\n")
#    people.append(person)
#
## save agents
#for i, person in enumerate(people):
#    person.save_specification(f"./{person.name}.agent.json")
#    

In [7]:

# Avoid displaying the communication, to make the output cleaner for eval.
# The flag has to be False for that; it was set to True, which contradicted the stated intent.
# NOTE(review): flip back to True only when debugging individual conversations.
TinyPerson.communication_display = False

# Load every customer persona stored in the folder.
people = TinyPerson.load_specifications_from_folder("./picky_customers_2")

# filter to make it go faster?
#people = people[:5]

# Print the minibios so the personas under test are visible in the notebook output.
for person in people:
    print(person.minibio())
    

Emily Carter is a 42 year old Customer Service Representative, American, currently living in Cleveland, Ohio, USA. Emily Carter's personality is marked by a critical lens through which she views the world, often leading her to express dissatisfaction with her surroundings. Her interests lean towards thriftiness, as she enjoys shopping for second-hand items and finds solace in familiar routines, avoiding new experiences and crowded places. While she may come off as cold and unapproachable, her relationships with her sister Sarah and coworker Tom reveal a shared understanding of her critical nature, even if they sometimes attempt to encourage a more positive outlook. Despite her pessimism, Emily occasionally finds small victories in her daily life, such as successfully navigating customer service protocols or returning a faulty item, which provide her with fleeting moments of satisfaction.
Michael Thompson is a 45 year old CEO, American, currently living in Beverly Hills, California, USA

In [8]:
# Load the interviewer persona from a previously saved specification file.
# NOTE(review): the commented-out generation cell near the top saves to
# "./{name}_(Market_Researcher).agent.json" (underscores), which does not match the
# file name used here (spaces) -- confirm which naming is current.
researcher = TinyPerson.load_specification("./Daniel Harris (Market Researcher).agent.json")

In [9]:
# Show which experiment the following cells will run. The saved output shows `None`,
# which presumably means no experiment was pending in that run -- TODO confirm.
print(f"Running experiment {experiment_runner.get_active_experiment()}")

Running experiment None


In [10]:
# Configure each customer's action generator according to the active experiment.
active_experiment = experiment_runner.get_active_experiment()

if active_experiment == "A":
    # Experiment A (used as the control in the final statistical test): no quality checks.
    for person in people:
        person.action_generator.enable_quality_checks = False

elif active_experiment == "B":
    # Experiment B: quality checks for this experiment, with regeneration of rejected actions.
    for person in people:
        generator = person.action_generator
        generator.enable_quality_checks = True
        generator.max_attempts = 10
        generator.enable_regeneration = True
        generator.quality_threshold = 8
        
        


## Evaluation Functions

In [11]:
def focus_group_battery(agents, discussion_objectives, agent_propositions, environment_propositions,
                        simulation_steps=10):
    """
    Run a focus-group discussion for each objective and score propositions after each one.

    Args:
        agents: the participants placed together in a single "Focus group" world.
        discussion_objectives: iterable of objective texts, discussed one after another.
        agent_propositions (dict): label -> proposition scored once per agent per objective.
        environment_propositions (dict): label -> proposition scored once per objective on the
            world, with the objective passed as the "task_description" claim variable.
        simulation_steps (int): steps the world runs for each objective.

    Returns:
        tuple: (agent scores, environment scores), each a dict mapping a proposition label to
        the list of `result["value"]` entries collected in evaluation order.
    """
    per_agent_scores = {}
    per_world_scores = {}

    # One intervention per agent, so that nobody stays quiet for too long.
    interventions = [
        Intervention(agent)
            .set_propositional_precondition(propositions.quiet_recently)
            .set_effect(lambda target: target.think("""
                                                        I will say something now, I've been too quiet for a while. If I am uncomfortable, 
                                                        or can't think of a proper response,
                                                        I can always say something like "I don't want to talk about this",
                                                        or propose another topic.
                                                        """))
        for agent in agents
    ]

    world = TinyWorld("Focus group", agents=agents, interventions=interventions)

    # Warm-up round: participants introduce themselves.
    world.broadcast("""
            Hello everyone! Today we will be having some discussion sessions, about one or more topics. 
            I'll give you a situation and/or a task, and you will discuss with each other to address it.
            You must behave as you really are, revealing your true self.
            You can be honest and open, and you can also be critical of each other.
                    
            But before we start, let's take a moment to introduce ourselves.
            What is your job and what are some major problems you face in your work and personal life? 
            What are major challenges for your industry as a whole, and in your personal life. 
            Don't discuss solutions yet, just the problems you face.
            """)
    world.run(1)

    for objective in discussion_objectives:

        print(f"Discussion objective: {objective}")
        print(f"Agents in the discussion: {[person.name for person in world.agents]}")

        # Each objective starts with the participants' episodic memory cleared.
        for participant in world.agents:
            participant.clear_episodic_memory()

        # Kick off the discussion about this objective.
        world.broadcast(f"""
                Folks, now {objective}
                    
                Please start the discussion now.
                """)

        world.run(simulation_steps)

        # World-level propositions: one score per objective.
        for label, proposition in environment_propositions.items():
            outcome = proposition.score(world,
                                        claim_variables={"task_description": objective},
                                        return_full_response=True)
            per_world_scores.setdefault(label, []).append(outcome["value"])
            print(outcome)

        # Agent-level propositions: one score per participant per objective.
        for label, proposition in agent_propositions.items():
            for participant in world.agents:
                outcome = proposition.score(participant, return_full_response=True)
                per_agent_scores.setdefault(label, []).append(outcome["value"])
                print(outcome)

        world.broadcast("""
                Ok, great. Now let's move to the next discussion theme.
                """)

        print("\n\n")

    return per_agent_scores, per_world_scores




In [12]:
def market_research_battery(researcher, agents, proposals, agent_propositions, environment_propositions, 
                            repetitions=1, simulation_steps=10): 
    """
    Run one-on-one market research sessions and score propositions after each session.

    A session is run for every (proposal, repetition, customer) combination: the researcher and
    the customer are placed in a fresh world, briefed, and the world runs for `simulation_steps`.

    Args:
        researcher: the interviewing agent, reused across all sessions.
        agents: the customers to interview, one per session.
        proposals: product/service descriptions to research.
        agent_propositions (dict): label -> proposition scored on the customer after each session.
        environment_propositions (dict): label -> proposition scored on the session's world, with
            a description of the session passed as the "task_description" claim variable.
        repetitions (int): how many times each (proposal, customer) pair is repeated.
        simulation_steps (int): steps the world runs in each session.

    Returns:
        tuple: (agent scores, environment scores), each a dict mapping a proposition label to
        the list of `result["value"]` entries collected in session order.
    """
    agent_propositions_scores = {}
    environment_propositions_scores = {}

    print("Researcher minibio:", researcher.minibio())
    print("Proposals:", proposals)

    experiments_count = 0
    total_expected_experiments = len(proposals) * repetitions * len(agents)

    # loop over proposals
    for proposal in proposals:
        for i in range(repetitions):
            for customer in agents:
                print("\n############## STARTING A NEW RESEARCH SESSION #################")
                print(f"Overall experiment number: {experiments_count+1} / {total_expected_experiments}")
                print(f"Proposal: {proposal}")
                print(f"Trial number: {i+1}")
                print(f"Customer: {customer.name}")
                print(f"Biography: {customer.minibio()}")

                world = TinyWorld(agents=[researcher, customer])

                # clear the episodic memory of all agents, so sessions do not leak into each other
                for person in world.agents:
                    person.clear_episodic_memory()

                # Brief the researcher (typo "goint" in this instruction fixed to "going").
                researcher.listen(f"""
                            You are going to run a market research session with a person.
                            Your objective is to determine whether the person would buy the following product or service, and why:
                            
                                "{proposal}"
                        
                            """)

                customer.listen(
                    """
                    You are going to be interviewed by a market researcher about a product or service.
                    Wait for his questions and answer them honestly.
                    """
                    )

                # now to the discussions
                world.broadcast("""
                        Begin the market research session.
                        """)
                world.make_everyone_accessible()
                world.run(simulation_steps)
                experiments_count += 1

                # evaluate the environment-level propositions on the whole session
                task_description = f"A market research session was run about: {proposal}."
                for k, proposition in environment_propositions.items():
                    result = proposition.score(world,
                                               claim_variables={"task_description": task_description},
                                               return_full_response=True)
                    environment_propositions_scores.setdefault(k, []).append(result["value"])
                    print(result)

                # evaluate the agent-level propositions on the customer only (not the researcher)
                for k, proposition in agent_propositions.items():
                    result = proposition.score(customer, return_full_response=True)
                    agent_propositions_scores.setdefault(k, []).append(result["value"])
                    print(result)

    return agent_propositions_scores, environment_propositions_scores

## Evaluations

In [13]:
# Run the market research battery only while there is still an experiment pending.
if not experiment_runner.has_finished_all_experiments():
    agent_propositions_scores, environment_propositions_scores = \
        market_research_battery(
            researcher=researcher,
            agents=people,
            proposals=
            [
            "Bottled gazpacho to be sold in supermarkets",
            "An app for luxury travel, particularly for places where kids are not allowed",
            "A subscription to discount coupons for supermarkets, ensuring always the lowest prices, even if at the cost of quality",
            "A subscription to a service that sends you a new, expensive, luxury item every month, without you having to choose it.",
            ],

            agent_propositions={
                "Persona Adherence": persona_adherene_proposition,
                "Self-consistency": selfconsistency_proposition,
                "Fluency": fluency_proposition
            },
            # Environment-level propositions are currently disabled for this study.
            environment_propositions={
                #"Task Completion": task_completion_proposition,
                #"Divergence": divergence_proposition
            },
            repetitions=5,
            simulation_steps=4
        )

    # Headers are plain text: print() shows them as-is, whereas pprint() would render
    # the string's repr, i.e. with surrounding quotes.
    print("AGENT PROPOSITIONS SCORES")
    pprint(agent_propositions_scores)
    print("\n\n")
    print("ENVIRONMENT PROPOSITIONS SCORES")
    pprint(environment_propositions_scores)
    


In [14]:
def compute_average_scores(scores):
    """
    Compute the mean, standard deviation and sample count for each proposition.

    Args:
        scores (dict): maps a proposition label to a list of numeric scores.

    Returns:
        dict: label -> {"mean": ..., "sd": ..., "n": ...}. "sd" is whatever pandas
        `Series.std()` returns for the scores (NaN when there are fewer than two values).
        For an empty score list, "mean" and "sd" are NaN and "n" is 0.
    """
    average_scores = {}
    for k, v in scores.items():
        n = len(v)
        average_scores[k] = {
            # An empty list has no mean; report NaN instead of raising ZeroDivisionError.
            "mean": sum(v) / n if n else float("nan"),
            # An explicit float dtype keeps the empty-list case numeric, so std() yields NaN.
            "sd": pd.Series(v, dtype="float64").std(),
            "n": n
        }
    return average_scores

def plot_scores(propositions_scores):
    """
    Pretty-print the raw scores and display a per-proposition summary table.

    Despite its name, this function draws no chart: it prints the raw dictionary and then
    shows the statistics from `compute_average_scores` as a DataFrame via `display`.

    Args:
        propositions_scores (dict): maps a proposition label to its list of scores.
    """
    pprint(propositions_scores)

    stats_by_proposition = compute_average_scores(propositions_scores)

    # One row per proposition (hence the transpose), with reader-friendly column headers.
    summary_table = (
        pd.DataFrame(stats_by_proposition)
        .T
        .rename(columns={"mean": "Average Score", "sd": "Standard Deviation", "n": "Count"})
        .reset_index()
        .rename(columns={"index": "Proposition"})
    )

    display(summary_table)

In [15]:
# Record and summarize this run's scores when experiment "A" is the active one.
if experiment_runner.get_active_experiment() == "A":
    # Merge agent- and environment-level scores; environment entries win on a label clash.
    combined_scores = dict(agent_propositions_scores)
    combined_scores.update(environment_propositions_scores)

    experiment_runner.add_experiment_results("A", combined_scores)
    plot_scores(combined_scores)


In [16]:
# Record and summarize this run's scores when experiment "B" is the active one.
if experiment_runner.get_active_experiment() == "B":
    # Merge agent- and environment-level scores; environment entries win on a label clash.
    combined_scores = dict(agent_propositions_scores)
    combined_scores.update(environment_propositions_scores)

    experiment_runner.add_experiment_results("B", combined_scores)
    plot_scores(combined_scores)

In [17]:
# Once every experiment has results, compare them; otherwise ask for another pass over the notebook.
if experiment_runner.has_finished_all_experiments():
    print("All experiments have been finished.")
    # Header fixed: it used to read "STATISTICTS: Control vs" (misspelled and cut short).
    print("STATISTICS: Control (A) vs. treatment")
    pprint(experiment_runner.run_statistical_tests(control_experiment_name='A'))

    # show the raw scores and summary table of both experiments
    experiment_a_scores = experiment_runner.get_experiment_results("A")
    experiment_b_scores = experiment_runner.get_experiment_results("B")

    plot_scores(experiment_a_scores)
    plot_scores(experiment_b_scores)

else:
    print("Not all experiments have been finished. RESTART AND RERUN.")


All experiments have been finished.
STATISTICTS: Control vs
{'B': {'Fluency': {'confidence_interval': (-1.2302320895631436,
                                           0.4635654228964762),
                   'confidence_level': 0.95,
                   'control_mean': 5.4,
                   'control_sample_size': 60,
                   'degrees_of_freedom': 117.28397114591029,
                   'effect_size': -0.16365784645203554,
                   'mean_difference': -0.38333333333333375,
                   'p_value': 0.37188020759304397,
                   'percent_change': -7.098765432098772,
                   'significant': False,
                   't_statistic': 0.896390942144967,
                   'test_type': 'Welch t-test (unequal variance)',
                   'treatment_mean': 5.016666666666667,
                   'treatment_sample_size': 60},
       'Persona Adherence': {'confidence_interval': (0.46621596556732037,
                                                     1.6

Unnamed: 0,Proposition,Average Score,Standard Deviation,Count
0,Persona Adherence,7.033333,1.921922,60.0
1,Self-consistency,4.583333,1.154089,60.0
2,Fluency,5.4,2.248917,60.0


{'Fluency': [2,
             8,
             6,
             2,
             8,
             6,
             2,
             8,
             6,
             2,
             8,
             6,
             2,
             8,
             6,
             3,
             8,
             5,
             2,
             8,
             6,
             2,
             8,
             5,
             2,
             7,
             4,
             2,
             8,
             6,
             2,
             8,
             6,
             2,
             6,
             6,
             2,
             8,
             6,
             2,
             6,
             6,
             2,
             8,
             6,
             2,
             6,
             4,
             1,
             8,
             5,
             1,
             6,
             6,
             2,
             8,
             6,
             2,
             8,
             4],
 'Persona Adherence': [8,
             

Unnamed: 0,Proposition,Average Score,Standard Deviation,Count
0,Persona Adherence,8.083333,1.225321,60.0
1,Self-consistency,4.2,1.021796,60.0
2,Fluency,5.016667,2.432071,60.0
