Notes:
- Please avoid running this pipeline multiple times, as there is a $5 budget set on the API keys of the two LLMs used here.
- Ensure you provide an instances.json file. There is an example file that you can find below.

In [None]:
!pip install anthropic

Collecting anthropic
  Downloading anthropic-0.75.0-py3-none-any.whl.metadata (28 kB)
Downloading anthropic-0.75.0-py3-none-any.whl (388 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.2/388.2 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.75.0


In [None]:
# Put your data payload in ./content/instances.json (the path INSTANCES_PATH points to).
# An example payload is shown below.
[
  {
    "instance_id": "inst1",
    "agents": ["A1", "A2", "A3"],
    "chores": ["C1", "C2", "C3"],
    "valuations": {
      "A1": { "C1": -4, "C2": -2, "C3": -6 },
      "A2": { "C1": -5, "C2": -1, "C3": -3 },
      "A3": { "C1": -3, "C2": -4, "C3": -2 }
    }
  },
  {
    "instance_id": "inst2",
    "agents": ["A1", "A2", "A3"],
    "chores": ["C1", "C2", "C3", "C4"],
    "valuations": {
      "A1": { "C1": -2, "C2": -6, "C3": -3, "C4": -5 },
      "A2": { "C1": -4, "C2": -3, "C3": -5, "C4": -1 },
      "A3": { "C1": -6, "C2": -2, "C3": -4, "C4": -3 }
    }
  },
  {
    "instance_id": "inst3",
    "agents": ["A1", "A2", "A3", "A4"],
    "chores": ["C1", "C2", "C3"],
    "valuations": {
      "A1": { "C1": -3, "C2": -5, "C3": -1 },
      "A2": { "C1": -1, "C2": -4, "C3": -6 },
      "A3": { "C1": -5, "C2": -2, "C3": -4 },
      "A4": { "C1": -2, "C2": -3, "C3": -5 }
    }
  },
  {
    "instance_id": "inst4",
    "agents": ["A1", "A2", "A3"],
    "chores": ["C1", "C2", "C3", "C4", "C5"],
    "valuations": {
      "A1": { "C1": -3, "C2": -1, "C3": -4, "C4": -6, "C5": -2 },
      "A2": { "C1": -2, "C2": -5, "C3": -3, "C4": -1, "C5": -4 },
      "A3": { "C1": -4, "C2": -2, "C3": -5, "C4": -3, "C5": -1 }
    }
  },
  {
    "instance_id": "inst5",
    "agents": ["A1", "A2", "A3", "A4"],
    "chores": ["C1", "C2", "C3", "C4"],
    "valuations": {
      "A1": { "C1": -6, "C2": -3, "C3": -2, "C4": -4 },
      "A2": { "C1": -3, "C2": -5, "C3": -4, "C4": -1 },
      "A3": { "C1": -2, "C2": -4, "C3": -6, "C4": -3 },
      "A4": { "C1": -4, "C2": -1, "C3": -3, "C4": -5 }
    }
  }
]


[{'instance_id': 'inst1',
  'agents': ['A1', 'A2', 'A3'],
  'chores': ['C1', 'C2', 'C3'],
  'valuations': {'A1': {'C1': -4, 'C2': -2, 'C3': -6},
   'A2': {'C1': -5, 'C2': -1, 'C3': -3},
   'A3': {'C1': -3, 'C2': -4, 'C3': -2}}},
 {'instance_id': 'inst2',
  'agents': ['A1', 'A2', 'A3'],
  'chores': ['C1', 'C2', 'C3', 'C4'],
  'valuations': {'A1': {'C1': -2, 'C2': -6, 'C3': -3, 'C4': -5},
   'A2': {'C1': -4, 'C2': -3, 'C3': -5, 'C4': -1},
   'A3': {'C1': -6, 'C2': -2, 'C3': -4, 'C4': -3}}},
 {'instance_id': 'inst3',
  'agents': ['A1', 'A2', 'A3', 'A4'],
  'chores': ['C1', 'C2', 'C3'],
  'valuations': {'A1': {'C1': -3, 'C2': -5, 'C3': -1},
   'A2': {'C1': -1, 'C2': -4, 'C3': -6},
   'A3': {'C1': -5, 'C2': -2, 'C3': -4},
   'A4': {'C1': -2, 'C2': -3, 'C3': -5}}},
 {'instance_id': 'inst4',
  'agents': ['A1', 'A2', 'A3'],
  'chores': ['C1', 'C2', 'C3', 'C4', 'C5'],
  'valuations': {'A1': {'C1': -3, 'C2': -1, 'C3': -4, 'C4': -6, 'C5': -2},
   'A2': {'C1': -2, 'C2': -5, 'C3': -3, 'C4': -1, 'C5

In [None]:
import os
import json
import math
import itertools
from dataclasses import dataclass
from typing import Dict, List, Optional, Any
import requests
import pandas as pd
from tqdm import tqdm

from google.colab import userdata;


# note: for the OpenAI models we call the REST API endpoint directly rather than using the SDK.
import anthropic




# Read the API keys from the Colab secrets store (google.colab.userdata).
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY');
ANTHROPIC_API_KEY = userdata.get('ANTHROPIC_API_KEY');



# Choose which LLMs you actually want to run.
# Hopefully, sharing this file means you have access to the keys we've added.
# Each entry maps a display name (used in progress bars and in the "llm" result column)
# to the provider that serves it, the model id sent to that provider, and an on/off switch.
# NOTE: the "gpt4o" entry is configured with the "gpt-4o-mini" model id.
LLM_CONFIG = {
    "gpt4o": {
        "provider": "openai",
        "model": "gpt-4o-mini",
        "enabled": True
    },
    "claude-sonnet-4-5": {
        "provider": "anthropic",
        "model": "claude-sonnet-4-5",
        "enabled": True
    }
}


# Path of the instances JSON file to load.
INSTANCES_PATH = "./content/instances.json"

# How many instances and samples to run?
N_INSTANCES = 24      # upper bound: the first N_INSTANCES entries of the file are used (fewer if the file is shorter)
N_SAMPLES_PER_INST = 10   # queries per (instance, LLM) pair; approx requirement from spec
TEMPERATURE = 0.7     # sampling temperature passed to every LLM call

## data structs + loaders

@dataclass
class ChoreInstance:
    """One chore-allocation problem: its agents, its chores and each agent's valuation of each chore."""
    instance_id: str  # identifier taken from the "instance_id" field of the JSON entry
    agents: List[str]  # agent names; these are the valid assignees in an allocation
    chores: List[str]  # chore names; these are the keys of an allocation
    valuations: Dict[str, Dict[str, float]]  # valuations[agent][chore]

def load_instances(path: str) -> List[ChoreInstance]:
    """Load allocation instances from a JSON file.

    The file must contain a JSON array whose entries each provide the keys
    "instance_id", "agents", "chores" and "valuations".

    Args:
        path: Location of the JSON file.

    Returns:
        One ChoreInstance per array entry, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an entry is missing one of the required keys.
    """
    # JSON text is UTF-8; be explicit instead of relying on the platform's
    # default text encoding.
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)

    return [
        ChoreInstance(
            instance_id=entry["instance_id"],
            agents=entry["agents"],
            chores=entry["chores"],
            valuations=entry["valuations"],
        )
        for entry in raw
    ]


# prompt gens
def build_prompt(instance: ChoreInstance,
                 mode: str = "generate",
                 phrasing: str = "negative_disutility") -> str:
    """
    Build a prompt for a single instance.

    mode: "generate" or "select" (project spec). Any value other than
        "generate" is treated as "select".
    phrasing:
        - "negative_disutility": valuations are shown exactly as stored
          (negative numbers, more negative = worse).
        - any other value (e.g. "positive_cost"): valuations are shown as
          positive costs (larger = worse). Stored negative numbers are
          converted to their magnitude so the table agrees with the
          explanation given to the model.

    Returns the prompt text with surrounding whitespace stripped.
    """
    negative_phrasing = phrasing == "negative_disutility"

    def shown(value):
        # Under the positive-cost phrasing the model is told "larger = worse",
        # so printing the raw negative number would invert the meaning.
        return value if negative_phrasing else abs(value)

    agents_str = ", ".join(instance.agents)
    chores_str = ", ".join(instance.chores)

    # table of valuations in text
    table_lines = [
        "Chore valuations (numbers indicate how bad each chore is for each agent):",
        "Agent | " + " | ".join(instance.chores),
        "-" * (7 + 4 * len(instance.chores)),
    ]
    for a in instance.agents:
        row_vals = [str(shown(instance.valuations[a][c])) for c in instance.chores]
        table_lines.append(f"{a} | " + " | ".join(row_vals))
    table_text = "\n".join(table_lines)

    goal_line = (
        "The goal is to allocate chores so that agents are treated fairly and total burden is not too high."
    )
    if negative_phrasing:
        meaning_text = (
            "The numbers are negative disutilities. A more negative number means the chore is worse for that agent.\n"
            + goal_line
        )
    else:
        meaning_text = (
            "The numbers are positive costs. A larger number means the chore is worse for that agent.\n"
            + goal_line
        )

    # The two modes differ only in their opening sentence; the output-format
    # instructions are shared.
    if mode == "generate":
        task_intro = "Your task is to PROPOSE a fair allocation of chores.\n"
    else:  # "select" mode
        task_intro = "Your task is to choose a fair allocation of chores.\n"
    task_text = task_intro + (
        "You may assign each chore to exactly one agent, or to 'None' if it is discarded.\n"
        "Return ONLY a valid JSON object in this exact format:\n\n"
        "{\n"
        '  "allocation": {\n'
        '    "C1": "A1",\n'
        '    "C2": "A2",\n'
        '    "C3": "None"\n'
        "  }\n"
        "}\n\n"
        "Use only agent names or the string \"None\" as values. Do not include any extra text."
    )

    prompt = f"""
We are allocating chores (negatively valued items) among agents.

Agents: {agents_str}
Chores: {chores_str}

{table_text}

{meaning_text}

{task_text}
"""
    return prompt.strip()

## llm clients
class BaseLLMClient:
    """Interface shared by the LLM clients: turn a prompt into the model's text reply."""

    def generate(self, prompt: str, temperature: float = 1.0) -> str:
        """Return the model's reply to `prompt`; subclasses must override this."""
        raise NotImplementedError()

## looking back, using the openai sdk would have simplified a lot of this...
class OpenAIChatClient(BaseLLMClient):
    """Client that posts prompts to the OpenAI chat-completions HTTP endpoint via `requests`."""

    def __init__(self, api_key: str, model: str):
        self.model = model
        self.api_key = api_key

    def generate(self, prompt: str, temperature: float = 1.0) -> str:
        """Send `prompt` as the user message and return the content of the first choice.

        Raises whatever `requests` raises for transport problems, a 60-second
        timeout, or a non-2xx status (via `raise_for_status`).
        """
        system_message = {
            "role": "system",
            "content": "You are a careful algorithm designer who strictly follows the output format.",
        }
        user_message = {"role": "user", "content": prompt}
        request_body = {
            "model": self.model,
            "messages": [system_message, user_message],
            "temperature": temperature,
        }
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json=request_body,
            timeout=60,
        )
        response.raise_for_status()
        first_choice = response.json()["choices"][0]
        return first_choice["message"]["content"]


class AnthropicChatClient(BaseLLMClient):
    """Client that sends prompts through the Anthropic Python SDK's Messages API."""

    def __init__(self, api_key: str, model: str):
        self.api_key = api_key
        self.model = model
        self.client = anthropic.Anthropic(api_key=self.api_key)

    def generate(self, prompt: str, temperature: float = 1.0) -> str:
        """
        Send `prompt` as a single user message (replies capped at 512 tokens)
        and return the concatenated text of the reply.
        """
        reply = self.client.messages.create(
            model=self.model,                        # e.g. "claude-sonnet-4-5"
            max_tokens=512,
            temperature=temperature,
            system="You are a careful algorithm designer who strictly follows the output format.",
            messages=[{"role": "user", "content": prompt}],
        )

        # Keep only the text blocks of the response, in order.
        return "".join(part.text for part in reply.content if part.type == "text")


# init clients + catch key related errors
def make_llm_clients(config: Dict[str, Any]) -> Dict[str, BaseLLMClient]:
    """Instantiate a client for every enabled entry of `config`.

    Entries whose provider key is missing (None, empty, or still the
    placeholder string) and entries with an unknown provider are skipped
    with a printed warning.

    Returns:
        Mapping from config name to the client built for it.
    """
    built = {}
    for name, cfg in config.items():
        if cfg.get("enabled", False):
            provider = cfg["provider"]
            if provider == "openai":
                key_missing = OPENAI_API_KEY in (None, "", "YOUR_OPENAI_KEY_HERE")
                if key_missing:
                    print(f"[WARN] OpenAI key missing; skipping {name}")
                else:
                    built[name] = OpenAIChatClient(OPENAI_API_KEY, cfg["model"])
            elif provider == "anthropic":
                key_missing = ANTHROPIC_API_KEY in (None, "", "YOUR_ANTHROPIC_KEY_HERE")
                if key_missing:
                    print(f"[WARN] Anthropic key missing; skipping {name}")
                else:
                    built[name] = AnthropicChatClient(ANTHROPIC_API_KEY, cfg["model"])
            else:
                print(f"[WARN] Unknown provider {provider}; skipping {name}")
    return built


# parsing allocs

def extract_json_from_text(text: str) -> Optional[dict]:
    """Robustly pull the first JSON object from a text response.

    Every "{" in `text` is tried, left to right, as the start of a JSON
    object; the first one that decodes is returned. Text after the object
    (explanations, code fences, stray braces) is ignored.

    Args:
        text: Raw model output.

    Returns:
        The decoded object as a dict, or None if `text` is not a string or
        contains no decodable JSON object.
    """
    if not isinstance(text, str):
        return None

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the first complete JSON value,
            # so trailing text does not break the parse.
            obj, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            obj = None
        if isinstance(obj, dict):
            return obj
        start = text.find("{", start + 1)
    return None

def parse_allocation(text: str, chores: List[str]) -> Optional[Dict[str, str]]:
    """Parse model output into an allocation dict: {chore -> agent_or_None}.

    Args:
        text: Raw model output, expected to contain a JSON object with an
            "allocation" object inside it.
        chores: Chore names that must all appear as keys of the allocation.

    Returns:
        A dict with exactly the entries for `chores`, each value converted
        with str() (so a JSON null becomes the string "None"). Returns None
        when no JSON object is found, when "allocation" is missing or is not
        an object, or when any chore is missing from it.
    """
    data = extract_json_from_text(text)
    if not isinstance(data, dict):
        return None

    alloc = data.get("allocation")
    # A null, list or string "allocation" would otherwise raise TypeError
    # below; treat it like any other unparseable answer.
    if not isinstance(alloc, dict):
        return None

    if any(c not in alloc for c in chores):
        return None
    return {c: str(alloc[c]) for c in chores}


# fairness and efficiency checks


def normalized_cost(v: float) -> float:
    """Map a valuation to a comparable cost: negative values are negated, others returned as-is."""
    if v < 0:
        return -v
    return v

def compute_agent_costs(instance: ChoreInstance,
                        allocation: Dict[str, str]) -> Dict[str, float]:
    """Sum, per agent, the normalized cost of the chores assigned to that agent.

    Every agent of the instance gets an entry (0.0 when it holds no chore).
    Chores whose assignee is not one of the instance's agents, such as the
    discard marker "None", contribute to nobody.
    """
    totals = dict.fromkeys(instance.agents, 0.0)
    for chore, assignee in allocation.items():
        if assignee not in instance.agents:
            continue
        totals[assignee] += normalized_cost(instance.valuations[assignee][chore])
    return totals

def is_envy_free(instance: ChoreInstance,
                 allocation: Dict[str, str]) -> bool:
    """Envy-freeness for chores.

    The allocation is envy-free when no agent would, by its own valuations,
    bear a strictly lower cost holding another agent's bundle than holding
    its own (compared with a 1e-9 tolerance). Chores assigned to anything
    other than an instance agent belong to no bundle.

    Args:
        instance: The problem, providing agents and valuations.
        allocation: Mapping chore -> assignee.

    Returns:
        True if no agent envies another, False otherwise.
    """
    # Build every agent's bundle in a single pass over the allocation.
    bundles = {a: [] for a in instance.agents}
    for chore, assignee in allocation.items():
        if assignee in bundles:
            bundles[assignee].append(chore)

    def cost_to(agent, bundle):
        # Cost of `bundle` as judged by `agent`'s own valuations.
        return sum(normalized_cost(instance.valuations[agent][c]) for c in bundle)

    for i in instance.agents:
        own_cost = cost_to(i, bundles[i])
        for j in instance.agents:
            if i == j:
                continue
            if own_cost > cost_to(i, bundles[j]) + 1e-9:
                return False
    return True

def is_equitable(instance: ChoreInstance,
                 allocation: Dict[str, str],
                 tol: float = 1e-9) -> bool:
    """Equitability: all agents have (approximately) equal cost.

    Args:
        instance: The problem, providing agents and valuations.
        allocation: Mapping chore -> assignee.
        tol: Largest allowed gap between the highest and the lowest agent cost.

    Returns:
        True when the spread of agent costs is at most `tol`. An instance
        without agents is reported as equitable.
    """
    costs = compute_agent_costs(instance, allocation)
    if not costs:
        # No agents: max()/min() would raise on an empty sequence.
        return True
    vals = list(costs.values())
    return max(vals) - min(vals) <= tol

def rawlsian_cost(instance: ChoreInstance,
                  allocation: Dict[str, str]) -> float:
    """Return the highest cost borne by any single agent (0.0 when there are no agents)."""
    per_agent = compute_agent_costs(instance, allocation)
    return max(per_agent.values(), default=0.0)

def total_cost(instance: ChoreInstance,
               allocation: Dict[str, str]) -> float:
    """Return the social cost of an allocation: the agents' costs added together."""
    return sum(compute_agent_costs(instance, allocation).values())

def all_allocations(instance: ChoreInstance,
                    allow_discard: bool = True):
    """Yield every chore -> assignee mapping, for brute-force checks.

    Each chore may go to any agent of the instance and, when `allow_discard`
    is true, also to the string "None". Mappings are produced in
    itertools.product order.
    """
    if allow_discard:
        assignees = [*instance.agents, "None"]
    else:
        assignees = list(instance.agents)

    n_chores = len(instance.chores)
    for picks in itertools.product(assignees, repeat=n_chores):
        yield dict(zip(instance.chores, picks))

def is_pareto_optimal(instance: ChoreInstance,
                      allocation: Dict[str, str],
                      allow_discard: bool = True) -> bool:
    """Brute-force Pareto-optimality test for chores.

    Returns False as soon as some other enumerated allocation leaves no
    agent with a higher cost and gives at least one agent a strictly lower
    cost (both compared with a 1e-9 tolerance). `allow_discard` controls
    whether the enumerated alternatives may assign chores to "None".
    """
    eps = 1e-9
    baseline = compute_agent_costs(instance, allocation)

    for candidate in all_allocations(instance, allow_discard=allow_discard):
        if candidate == allocation:
            continue
        candidate_costs = compute_agent_costs(instance, candidate)

        nobody_worse = True
        someone_better = False
        for a in instance.agents:
            if not (candidate_costs[a] <= baseline[a] + eps):
                nobody_worse = False
                break
            if candidate_costs[a] < baseline[a] - eps:
                someone_better = True

        if nobody_worse and someone_better:
            return False
    return True

def is_socially_optimal(instance: ChoreInstance,
                        allocation: Dict[str, str],
                        allow_discard: bool = True) -> bool:
    """Brute-force utilitarian check: True when no enumerated allocation has a lower total cost.

    Totals are compared with a 1e-9 tolerance. `allow_discard` controls
    whether the enumerated alternatives may assign chores to "None".
    """
    threshold = total_cost(instance, allocation) - 1e-9
    cheaper_exists = any(
        total_cost(instance, alt) < threshold
        for alt in all_allocations(instance, allow_discard=allow_discard)
        if alt != allocation
    )
    return not cheaper_exists


# running experiments

def run_single_query(instance: ChoreInstance,
                     llm_name: str,
                     llm_client: BaseLLMClient,
                     mode: str = "generate",
                     phrasing: str = "negative_disutility",
                     temperature: float = 1.0) -> Dict[str, Any]:
    """Query one LLM once for one instance and score the allocation it returns.

    Returns a flat record with the query metadata, the raw response and a
    "parsed_ok" flag. When the response parses, the record also carries the
    fairness/efficiency metrics and the allocation as a JSON string; when it
    does not, every metric is None and no "allocation_json" key is set.
    Pareto and social optimality are checked against allocations without
    discards (allow_discard=False).
    """
    prompt = build_prompt(instance, mode=mode, phrasing=phrasing)
    raw_response = llm_client.generate(prompt, temperature=temperature)
    allocation = parse_allocation(raw_response, instance.chores)
    parsed = allocation is not None

    record = {
        "instance_id": instance.instance_id,
        "llm": llm_name,
        "mode": mode,
        "phrasing": phrasing,
        "raw_response": raw_response,
        "parsed_ok": parsed
    }

    if not parsed:
        metrics = dict.fromkeys(
            ("envy_free", "equitable", "rawlsian_cost",
             "total_cost", "pareto_optimal", "socially_optimal"),
            None,
        )
    else:
        metrics = {
            "envy_free": is_envy_free(instance, allocation),
            "equitable": is_equitable(instance, allocation),
            "rawlsian_cost": rawlsian_cost(instance, allocation),
            "total_cost": total_cost(instance, allocation),
            "pareto_optimal": is_pareto_optimal(instance, allocation, allow_discard=False),
            "socially_optimal": is_socially_optimal(instance, allocation, allow_discard=False),
            "allocation_json": json.dumps(allocation)
        }

    record.update(metrics)
    return record


def run_milestone2_experiments():
    """Run every enabled LLM on the first N_INSTANCES instances and save the results.

    For each (instance, LLM) pair, N_SAMPLES_PER_INST queries are issued in
    "generate" mode with the "negative_disutility" phrasing at TEMPERATURE.
    All records are written to /content/milestone2_results.csv.

    If the run is cut short (API error, timeout, manual interrupt), the rows
    collected so far are written to /content/milestone2_results.partial.csv
    before the exception is re-raised, so completed API calls are not lost
    and an earlier complete results file is not overwritten.

    Returns:
        pandas.DataFrame with one row per query.

    Raises:
        ValueError: If no instances were loaded or no LLM client could be built.
    """
    results_path = "/content/milestone2_results.csv"
    partial_path = "/content/milestone2_results.partial.csv"

    # 1) load data
    instances = load_instances(INSTANCES_PATH)
    if not instances:
        raise ValueError("No instances loaded. Check INSTANCES_PATH and file format.")

    # 2) take first N_INSTANCES
    instances_subset = instances[:N_INSTANCES]

    # 3) build LLM clients
    llm_clients = make_llm_clients(LLM_CONFIG)
    if not llm_clients:
        raise ValueError("No LLM clients are enabled or properly configured.")

    print(f"Running Milestone 2 prototype on {len(instances_subset)} instances, "
          f"{N_SAMPLES_PER_INST} samples each, {len(llm_clients)} LLMs.")

    rows = []
    try:
        for inst in instances_subset:
            for llm_name, client in llm_clients.items():
                for _ in tqdm(range(N_SAMPLES_PER_INST),
                              desc=f"Instance {inst.instance_id}, LLM {llm_name}"):
                    out = run_single_query(
                        instance=inst,
                        llm_name=llm_name,
                        llm_client=client,
                        mode="generate",             # you can also test "select"
                        phrasing="negative_disutility",
                        temperature=TEMPERATURE
                    )
                    rows.append(out)
    except BaseException:
        # Every completed row was a real API call; keep those rows even when
        # the run dies (BaseException also covers a manual interrupt). The
        # exception is re-raised so callers still see the failure.
        if rows:
            pd.DataFrame(rows).to_csv(partial_path, index=False)
            print(f"[WARN] Run interrupted; saved {len(rows)} partial rows to {partial_path}")
        raise

    df = pd.DataFrame(rows)
    df.to_csv(results_path, index=False)
    print(f"Saved results to {results_path}")
    return df

# NOTE: the console will not show everything; please check the generated CSV file for the full results.

# Run the experiment (this issues the real LLM API calls) and preview the first rows of the results.
df_results = run_milestone2_experiments()
df_results.head()

Running Milestone 2 prototype on 5 instances, 10 samples each, 2 LLMs.


Instance inst1, LLM gpt4o: 100%|██████████| 10/10 [00:11<00:00,  1.14s/it]
Instance inst1, LLM claude-sonnet-4-5: 100%|██████████| 10/10 [00:28<00:00,  2.89s/it]
Instance inst2, LLM gpt4o: 100%|██████████| 10/10 [00:11<00:00,  1.18s/it]
Instance inst2, LLM claude-sonnet-4-5:  40%|████      | 4/10 [00:10<00:16,  2.69s/it]

In [None]:
# below is jake's code that was converted from R to Python
# It counts, among the rows with no missing values, how many allocations
# satisfy each of the four fairness/efficiency properties.
import pandas as pd

# load
data = pd.read_csv("/content/milestone2_results.csv")

# Drop rows with any missing value (e.g. responses that could not be parsed).
# .copy() gives data2 its own storage so the column assignments below modify
# it directly instead of triggering pandas' SettingWithCopyWarning.
data2 = data.dropna().copy()

print("nrow(data):", len(data))
print("nrow(data2):", len(data2))


# The CSV round-trip may yield real booleans, their string spellings or 0/1;
# normalise all four property columns to real booleans.
bool_cols = ["envy_free", "equitable", "pareto_optimal", "socially_optimal"]
true_set = {True, "True", "true", 1, 1.0}

for c in bool_cols:
    if c in data2.columns:
        data2[c] = data2[c].apply(lambda x: x in true_set)


# Rows where none of the four properties holds.
mask_all_false = (~data2["envy_free"]) & (~data2["equitable"]) & (~data2["pareto_optimal"]) & (~data2["socially_optimal"])
count_all_false = int(mask_all_false.sum())

print("count_all_false:", count_all_false)
print("nrow(data2) - count_all_false:", len(data2) - count_all_false)

# Rows where at least one property holds.
mask_any_true = data2["envy_free"] | data2["equitable"] | data2["pareto_optimal"] | data2["socially_optimal"]
data_true = data2.loc[mask_any_true].copy()

print("nrow(data_true):", len(data_true))


# Number of rows satisfying each property.
summary = {
    "envy_free": int(data_true["envy_free"].sum()),
    "equitable": int(data_true["equitable"].sum()),
    "pareto_optimal": int(data_true["pareto_optimal"].sum()),
    "socially_optimal": int(data_true["socially_optimal"].sum()),
}

print("summary:")
print(summary)


nrow(data): 100
nrow(data2): 92
count_all_false: 46
nrow(data2) - count_all_false: 46
nrow(data_true): 46
summary:
{'envy_free': 10, 'equitable': 0, 'pareto_optimal': 46, 'socially_optimal': 20}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2[c] = data2[c].apply(lambda x: x in true_set)
