In [1]:
from ai_council.council import *
from ai_council.generate_prompts import *
import ast
import json

# Show each configured model as an (LLM client, display name) pair.
[
    (entry["llm"], entry["name"])
    for entry in MODELS
]

[(OllamaLLM(model='gpt-oss:20b'), 'gpt-oss:20b'),
 (OllamaLLM(model='deepseek-r1:8b'), 'deepseek-r1:8b'),
 (OllamaLLM(model='ministral-3'), 'minstral-3'),
 (OllamaLLM(model='mistral:7b'), 'mistral:7b'),
 (OllamaLLM(model='phi3:mini'), 'phi3:mini'),
 (OllamaLLM(model='gemma2:9b'), 'gemma2:9b'),
 (OllamaLLM(model='starling-lm'), 'starling-1m')]

In [2]:
# Ask every non-evaluator entry in MODELS the same question and keep each
# answer, tagged with a response id and the entry's id.
responses = []
for i, k in enumerate(MODELS):
    # The entry whose id is 'evaluator' does not produce a candidate answer.
    if k['id'] == 'evaluator':
        continue
    # Single quotes inside the f-string: reusing the outer quote character is
    # only accepted from Python 3.12 on; this form parses on older kernels too.
    print(f"Response by {k['name']}")
    text = k["llm"].invoke('Tell me a joke')
    print(text)
    print("____________________________________________________")
    responses.append(
        {
            # `i` is the position in MODELS, so ids follow the MODELS ordering.
            "response_id": f"r_{i}",
            "model_id": k["id"],
            "text": text,
        }
    )

Response by gpt-oss:20b
Why don't scientists trust atoms?

Because they make up everything!
____________________________________________________
Response by deepseek-r1:8b
Sure, here's a joke for you:

Why don't scientists trust atoms?

Because they make up everything! ðŸ˜„
____________________________________________________
Response by minstral-3
Here's one for you:

**Why donâ€™t skeletons fight each other?**
*Because they donâ€™t have the guts!* ðŸ˜„

Want another? Iâ€™ve got plenty!
____________________________________________________
Response by mistral:7b
 Of course! Here's one for you:

Why don't scientists trust atoms?

Because they make up everything!

And here's another one, more tech-related:

Why did the computer cold at work?

It left its Windows open.

I hope these made you smile!
____________________________________________________
Response by phi3:mini
Why don't scientists trust atoms? Because they make up everything!
___________________________________________________

In [3]:
# Inspect the collected answers: a list of dicts with the keys
# 'response_id', 'model_id' and 'text'.
responses

[{'response_id': 'r_0',
  'model_id': 'expert_1',
  'text': "Why don't scientists trust atoms?\n\nBecause they make up everything!"},
 {'response_id': 'r_1',
  'model_id': 'expert_2',
  'text': "Sure, here's a joke for you:\n\nWhy don't scientists trust atoms?\n\nBecause they make up everything! ðŸ˜„"},
 {'response_id': 'r_2',
  'model_id': 'expert_3',
  'text': "Here's one for you:\n\n**Why donâ€™t skeletons fight each other?**\n*Because they donâ€™t have the guts!* ðŸ˜„\n\nWant another? Iâ€™ve got plenty!"},
 {'response_id': 'r_3',
  'model_id': 'expert_4',
  'text': " Of course! Here's one for you:\n\nWhy don't scientists trust atoms?\n\nBecause they make up everything!\n\nAnd here's another one, more tech-related:\n\nWhy did the computer cold at work?\n\nIt left its Windows open.\n\nI hope these made you smile!"},
 {'response_id': 'r_4',
  'model_id': 'expert_5',
  'text': "Why don't scientists trust atoms? Because they make up everything!"},
 {'response_id': 'r_5',
  'model_id': '

In [4]:
# One scoring prompt per collected answer. The whole response dict (ids and
# text) is stringified and handed to generate_scoring_prompt as the candidate.
scoring_prompt = [
    generate_scoring_prompt('Tell me a joke', str(candidate))
    for candidate in responses
]

In [5]:
# Print every generated scoring prompt in full so the exact text sent to the
# scoring models can be checked by eye.
for prompt in scoring_prompt:
    print(prompt)


    SYSTEM: You are an impartial evaluator that scores candidate answers to a user prompt. Use the rubric provided and be objective. 
    Return only the JSON object described below and nothing else.

    USER: Here is the ORIGINAL USER PROMPT:
    Tell me a joke

    Here is the CANDIDATE RESPONSE you must evaluate:
    {'response_id': 'r_0', 'model_id': 'expert_1', 'text': "Why don't scientists trust atoms?\n\nBecause they make up everything!"}

    RUBRIC (score each 1-5; 5 = best):
    - accuracy: Is the content factually correct given known, verifiable facts? (1 = many factual errors or hallucinations; 5 = fully accurate)
    - completeness: Does it address all parts of the prompt? (1 = misses core parts; 5 = full coverage)
    - grounding: Does the response cite or reference verifiable sources or show evidence/reasoning that can be checked? (1 = unsupported claims; 5 = well-grounded)
    - reasoning: Are the logical steps coherent and correct? (1 = flawed reasoning; 5 = sound st

In [6]:
# Have every non-evaluator model score every collected answer.
# Resulting layout: scoring_matrix[model name][response_id] -> parsed verdict
# dict with an extra weighted "total" entry.
scoring_matrix = {}
for k in MODELS:
    if k["id"] == "evaluator":
        continue
    temp_dict = {}
    print(f"Response by {k['name']}")
    # scoring_prompt was built from responses in order, so the two lists are
    # walked in lockstep instead of re-using (and shadowing) an index variable.
    for response, prompt in zip(responses, scoring_prompt):
        text = k["llm"].invoke(prompt)
        print(text)
        # As before, the extracted fragment is re-wrapped in braces before parsing.
        payload = '{' + extract_first_curly_balanced(text) + '}'
        try:
            # Models are asked for JSON, which may contain true/false/null;
            # ast.literal_eval cannot parse those literals.
            json_response = json.loads(payload)
        except json.JSONDecodeError:
            # Fall back to the previous behaviour for Python-style dict text
            # (e.g. single-quoted keys) that is not strict JSON.
            json_response = ast.literal_eval(payload)
        # Weighted sum over the rubric criteria listed in WEIGHTS.
        json_response["total"] = sum(
            WEIGHTS[j] * json_response['scores'][j] for j in WEIGHTS
        )
        temp_dict[response['response_id']] = json_response
    scoring_matrix[k["name"]] = temp_dict
    print("____________________________________________________")

Response by gpt-oss:20b
{
  "scores": {
    "accuracy": 5,
    "completeness": 5,
    "grounding": 1,
    "reasoning": 5,
    "clarity": 5
  },
  "confidence_estimate": 0.95,
  "justification": "The joke is factually correct, fully addresses the prompt, is clear and coherent, though it provides no explicit grounding."
}
{
  "scores": {
    "accuracy": 5,
    "completeness": 5,
    "grounding": 3,
    "reasoning": 5,
    "clarity": 5
  },
  "confidence_estimate": 0.95,
  "justification": "The answer provides a factually correct, complete, and clear joke that satisfies the prompt with no hallucinations."
}
{"scores":{"accuracy":5,"completeness":5,"grounding":3,"reasoning":5,"clarity":5},"confidence_estimate":0.95,"justification":"The response delivers a clear, pun-based joke that fully satisfies the prompt."}
{"scores":{"accuracy":5,"completeness":5,"grounding":1,"reasoning":1,"clarity":4},"confidence_estimate":0.95,"justification":"The reply provides accurate, complete jokes but lacks a

In [None]:
# Display the nested scores: scoring model name -> response id -> verdict
# (scores, confidence_estimate, justification, total).
pretty_print_json(scoring_matrix)

{
    "deepseek-r1:8b": {
        "r_0": {
            "confidence_estimate": 0.95,
            "justification": "The response is a coherent, accurate joke but lacks grounding.",
            "scores": {
                "accuracy": 5,
                "clarity": 5,
                "completeness": 5,
                "grounding": 1,
                "reasoning": 5
            },
            "total": 4.2
        },
        "r_1": {
            "confidence_estimate": 0.95,
            "justification": "The response is accurate, complete, well-grounded, logically sound, and clear.",
            "scores": {
                "accuracy": 5,
                "clarity": 5,
                "completeness": 5,
                "grounding": 4,
                "reasoning": 5
            },
            "total": 4.8
        },
        "r_2": {
            "confidence_estimate": 0.95,
            "justification": "The response is highly accurate, complete, clear, and logically structured as a joke, with minor