In [4]:
# Multi-Provider LLM Generation with LangChain
import json
import numpy as np
import random

from tqdm import tqdm

# LLM utilities (clients, cost tracking, helpers)
from llm_utils import (
    get_client, make_langchain_messages, CostTracker,
    format_person, extract_json, sample_distribution
)

In [5]:
import dill

# Load the draft people records from disk.
# NOTE(review): dill, like pickle, can execute arbitrary code while loading;
# only open files from a trusted source.
with open('draft_people_v1.pkl', 'rb') as pickle_file:
    people = dill.load(pickle_file)

In [6]:
# Fields kept for each person record; every other key is dropped.
to_keep = [
    'Birth year',
    'Age at death',
    'Sex',
    'Lifestyle',
    'Birthplace latitude',
    'Birthplace longitude',
    'Birthplace modern region',
    'Birthplace modern country',
    'Birthplace modern address',
    'Birthplace altitude',
    'Birthplace map url',
    'Birthplace population density',
]

# One trimmed dict per person, with keys in the order listed in `to_keep`.
filtered_people = [
    {field: record[field] for field in to_keep}
    for record in people
]

## Prompts

In [7]:
# Shared preamble for every conversation. `initialize_messages` sends it as the
# first user message, followed by a blank line and the formatted person facts.
# It also fixes the reply format: brief reasoning, then a JSON object keyed by
# question number ("1", "2", ...).
CONTEXT = """This is part of a "random lives" project - simulating randomly selecting a person from human history. 
Try to answer the binary questions provided as best you can. If the question is a historical impossibility or certainty, answer 0 or 1.

Format: First provide brief reasoning (1-2 paragraphs), then give the probability distribution as:
{"1": probability1, "2": probability2, ...}
where 1 corresponds to the probability that the first question was true, 
2 corresponds to the probability that the second question was true, etc. Do not give any annotation or deviate from this format in any way.

Here are the basic facts for the person:"""

In [8]:
def initialize_messages(person):
    """Return the two opening turns of a conversation about ``person``.

    The first turn is a user message holding ``CONTEXT`` followed by a blank
    line and ``format_person(person)``; the second is a fixed assistant
    acknowledgement. A new list is built on every call.
    """
    opening_prompt = CONTEXT + "\n\n" + format_person(person)
    user_turn = {"role": "user", "content": opening_prompt}
    assistant_turn = {
        "role": "assistant",
        "content": "I'll help generate demographic information based on the historical context. Please ask your questions.",
    }
    return [user_turn, assistant_turn]

In [9]:
def answer_questions(person, questions, model="haiku", show_result=False, **kwargs):
    """Ask an LLM a batch of numbered probability questions about one person.

    The conversation is seeded with ``initialize_messages(person)``, the
    ``questions`` text is appended as a user turn, and the client returned by
    ``get_client`` is invoked once.

    Args:
        person: Person record; passed to ``initialize_messages``.
        questions: Text of the numbered questions, sent as the user message.
        model: Model name handed to ``get_client``.
        show_result: When true, print the raw (stripped) reply before parsing.
        **kwargs: Extra options forwarded to ``get_client``. The keys
            ``show_cost`` and ``quiet`` are removed first and are otherwise
            unused by this function.

    Returns:
        Whatever ``extract_json`` returns for the reply text. Presumably a
        dict mapping question numbers ("1", "2", ...) to probabilities, as
        requested in ``CONTEXT`` — confirm against ``llm_utils``.
    """
    # Strip flags that are not meant for the client factory.
    api_kwargs = {k: v for k, v in kwargs.items()
                  if k not in {'show_cost', 'quiet'}}

    client = get_client(model, **api_kwargs)

    # Person context first, then the actual questions as the final user turn.
    messages = initialize_messages(person)
    messages.append({"role": "user", "content": questions})

    response = client.invoke(make_langchain_messages(messages))
    content = response.content.strip()
    if show_result:
        print(content)

    # Only parsing happens here: no sampling and no cost tracking.
    return extract_json(content)

# Workspace

### Religion Example

In [10]:
# Religion questions. The numbering matters: CONTEXT asks the model to key its
# JSON reply by question number ("1" for the first question, and so on).
question = '''
    1. What is the probability this person was Christian?
    2. What is the probability this person was Muslim?
    3. What is the probability this person was Hindu?
    4. What is the probability this person was Buddhist?
    5. What is the probability this person followed Chinese non-Buddhist beliefs?'''

In [11]:
# Query the model once per person. Results are appended as the loop runs, so
# an interrupted run still leaves the answers gathered so far in `answers`.
answers = []
for person in filtered_people:
    answers.append(
        answer_questions(person, question, model='gpt-5.1', show_result=True)
    )

Christianity, Islam, Hinduism, Buddhism, and organized Chinese religious/ philosophical systems (like Confucianism, Daoism, and later folk-religious complexes) all arose many millennia after 8198 BC. At this time, in the Levant, people would have practiced prehistoric animist/ancestor/spirits-related beliefs, not any of these later organized religions. So for a child dying at age 3 in 8198 BC, all of the listed affiliations are effectively impossible.

{"1": 0, "2": 0, "3": 0, "4": 0, "5": 0}
Christianity in 3rd-century rural eastern India would have been nearly nonexistent; Christian communities were limited and much more plausible in the far southwest (Malabar) several centuries later, and there is no evidence for Christian presence in interior Odisha at this time. Islam did not yet exist (it began in the 7th century AD), so its probability is effectively 0. In contrast, this region was firmly within the cultural-religious sphere of early Hindu traditions (Brahmanical/Vedic religion 

KeyboardInterrupt: 

In [None]:
# Sum the returned value for each question number across all answers.
# Keys must match the five numbered religion questions; an unexpected key
# raises KeyError.
collated = dict.fromkeys(('1', '2', '3', '4', '5'), 0)
for probs in answers:
    for key in probs:
        collated[key] += probs[key]

In [None]:
# Display the per-question totals held in `collated`.
collated

### Records Example

In [14]:
# Records/descendants questions, numbered to line up with the JSON keys that
# CONTEXT asks the model to return.
question = (
    "\n"
    "    1. What is the probability that this person's existence was recorded in some form or another?\n"
    "    2. What is the probability that this person has living direct descendants?"
)

In [15]:
# Query the model once per person. Results are appended as the loop runs, so
# an interrupted run still leaves the answers gathered so far in `answers`.
answers = []
for person in filtered_people:
    answers.append(
        answer_questions(person, question, model='gpt-5.1', show_result=True)
    )

For someone born around 8198 BC in a small, mobile hunter-gatherer group in what is now Lebanon, there were no writing systems, and durable symbolic recording (e.g., rock art, ritual depictions) would overwhelmingly concern group-level or exceptional figures, not individual infants or toddlers, and certainly not with personal identity. At age 3, before any remarkable accomplishments and in a very small-scale society, the chance that this particular child's existence was individually "recorded" in any recoverable sense is effectively zero.

By contrast, the probability of having living direct descendants depends only on whether he reproduced before death. Dying at age 3 in a foraging society makes biological reproduction impossible; prehistoric child mortality was high, but reproduction did not occur at that age. Thus the probability that he has any direct descendants today is exactly zero.

{"1": 0, "2": 0}
For a male infant born and dying in the same year (252 AD) in a sparsely popula

In [16]:
# Sum the returned value for each question number across all answers.
# Keys must match the two numbered records questions; an unexpected key
# raises KeyError.
collated = dict.fromkeys(('1', '2'), 0)
for probs in answers:
    for key in probs:
        collated[key] += probs[key]

In [17]:
# Display the per-question totals held in `collated`.
collated

{'1': 27.9643121, '2': 48.45999999999996}