In [None]:
import json
import random

# Load every site's existing personas and draw a random seed sample per site.
# NOTE(review): the sample is unseeded, so each run picks different seed personas.
N_SEED_PERSONAS = 10  # number of existing personas sampled per site

# Use a context manager so the file handle is closed as soon as parsing is done.
with open("../src/agent/prompts/jsons/personas.json") as personas_file:
    all_personas = json.load(personas_file)

seed_personas = {}
for site, personas in all_personas.items():
    print(f"{site}: {len(personas)}")
    # random.sample raises a ValueError that does not mention the site when the
    # population is smaller than the sample size, so check first and name the site.
    if len(personas) < N_SEED_PERSONAS:
        raise ValueError(
            f"Expected at least {N_SEED_PERSONAS} personas for {site}, got {len(personas)}"
        )
    seed_personas[site] = random.sample(personas, N_SEED_PERSONAS)

In [None]:
from openai import OpenAI
from tqdm import tqdm
# Build the OpenAI client from the key stored under "openai" in the local secrets file.
# The context manager closes the secrets file right after it has been parsed.
with open("./secrets.json") as secrets_file:
    client = OpenAI(api_key=json.load(secrets_file)["openai"])
# Number of personas requested from the model for each site.
N_PERSONAS = 10

# System prompt template; {N_PERSONAS} is filled in with str.format, so the text
# must not contain any other literal curly braces.
# The output format asks for a JSON *object* with a "personas" key because the
# request uses response_format={"type": "json_object"} and the results are later
# read from the "personas" key of each response.
generate_persona_prompt = """
You are an expert at creating realistic user personas given a website. You are given the name of a website and some example outputs for the website, generate a diverse list of {N_PERSONAS} personas, each with a unique "persona" name and a 1-2 sentence "description" of their typical behavior, interests, or motivations on that site.

Format your output as a JSON object with a single top-level key "personas" whose value is a JSON array, where each element is an object with two fields:
- "persona": a short, descriptive name for the persona (e.g., "Casual Browser", "Tech Enthusiast")
- "description": a brief description of how this persona typically uses the website

Now, generate a list of personas in the specified JSON format. You are given the name of the website and some example outputs for the website.
"""

# Ask the model for new personas for every site, passing the sampled seed
# personas as example output in the user message.
# The system prompt does not depend on the site, so it is rendered once up front.
system_prompt = generate_persona_prompt.format(N_PERSONAS=N_PERSONAS)

generated_personas = {}
for site, personas in tqdm(seed_personas.items(), desc="Generating personas"):
    example_output = json.dumps(personas, indent=2)
    response = client.chat.completions.create(
        model="gpt-4.1-2025-04-14",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Website: {site}\nExample output: {example_output}"}],
        response_format={"type": "json_object"},
        temperature=1.0,
        top_p=0.95
    )
    raw_content = response.choices[0].message.content
    try:
        response_json = json.loads(raw_content)
    except (json.JSONDecodeError, TypeError) as parse_error:
        # Only parsing failures are handled here (TypeError covers content that is
        # not a string, e.g. None); anything else, including KeyboardInterrupt,
        # propagates. The raw content is kept as a best-effort fallback, and a
        # warning makes the failure visible without breaking the progress bar.
        tqdm.write(f"Warning: response for {site} is not valid JSON ({parse_error}); keeping raw content")
        response_json = raw_content
    generated_personas[site] = response_json

In [None]:
# Unwrap the persona list stored under the "personas" key of each site's response.
final_generated_personas = {}
for site, site_response in generated_personas.items():
    # A response may be an unparsed string (JSON parsing fallback) or an object
    # that lacks the expected key; fail with a message that names the site
    # instead of an opaque TypeError/KeyError from blind indexing.
    if not isinstance(site_response, dict) or "personas" not in site_response:
        raise ValueError(
            f"Response for {site} has no 'personas' key: {str(site_response)[:200]!r}"
        )
    final_generated_personas[site] = site_response["personas"]
    print(f"site: {site}\t length: {len(final_generated_personas[site])}")

In [None]:
# Persist the unwrapped personas to disk as JSON indented by two spaces.
with open("generated_personas.json", "w") as output_file:
    output_file.write(json.dumps(final_generated_personas, indent=2))

In [None]:
# Combine each site's existing personas with the newly generated ones
# (existing personas first) and report the merged count per site.
merged_personas = {}
for site, existing_personas in all_personas.items():
    combined = existing_personas + final_generated_personas[site]
    merged_personas[site] = combined
    print(f"site: {site}\t length: {len(combined)}")