In [1]:
import ast
import json
import warnings
warnings.filterwarnings('ignore')

from ai_council.council import *
from ai_council.prompts import *
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser


# List every council member as an (LLM client, display name) pair.
# NOTE(review): the recorded output shows display names 'starling-1m' and 'minstral-3'
# next to model tags 'starling-lm' and 'ministral-3' - check the MODELS definition for typos.
[(member["llm"], member["name"]) for member in MODELS]

[(OllamaLLM(model='deepseek-r1:8b'), 'deepseek-r1:8b'),
 (OllamaLLM(model='starling-lm'), 'starling-1m'),
 (OllamaLLM(model='ministral-3'), 'minstral-3'),
 (OllamaLLM(model='mistral:7b'), 'mistral:7b'),
 (OllamaLLM(model='phi3:mini'), 'phi3:mini'),
 (OllamaLLM(model='gemma2:9b'), 'gemma2:9b'),
 (OllamaLLM(model='gpt-oss:20b'), 'gpt-oss:20b')]

In [2]:
# Put one question to the council; the second argument is passed as an empty string.
# NOTE(review): the recorded output has no r_5 entry (nothing from gemma2:9b) - confirm
# whether that model failed or was filtered out.
input_prompt = "Who was the first female mountaineer to climb Mount Everest"
responses, user_prompt = generate_expert_response(input_prompt, "")

# Print the response records one per line.
for response in responses:
    print(response)

Response from deepseek-r1:8b
Response generated by deepseek-r1:8b with id r_0
Response from starling-1m
Response generated by starling-1m with id r_1
Response from minstral-3
Response generated by minstral-3 with id r_2
Response from mistral:7b
Response generated by mistral:7b with id r_3
Response from phi3:mini
Response generated by phi3:mini with id r_4
Response from gpt-oss:20b
Response generated by gpt-oss:20b with id r_6
{'response_id': 'r_0', 'model_id': 'expert_1', 'text': '**Final answer:** Not in context.\n\n**Reasoning:** The provided context does not contain any information about female mountaineers or the first woman to climb Mount Everest.\n\n**Snippets used:** none'}
{'response_id': 'r_1', 'model_id': 'expert_2', 'text': '1. Junko Tabei was the first female mountaineer to climb Mount Everest.\n2. Reasoning: The context does not provide specific information about the first female mountaineer to climb Mount Everest. However, Junko Tabei is a well-known figure in mountaineer

In [3]:
# Have the council models score the collected responses. The result is indexed
# further down as scoring_matrix[scorer_name][response_id]['total'].
scoring_matrix = generate_scores(responses, user_prompt)

deepseek-r1:8b
Scoring complete for r_0 by deepseek-r1:8b
Scoring complete for r_1 by deepseek-r1:8b
Scoring complete for r_2 by deepseek-r1:8b
Scoring complete for r_3 by deepseek-r1:8b
Scoring complete for r_4 by deepseek-r1:8b
Scoring complete for r_6 by deepseek-r1:8b
starling-1m
Scoring complete for r_0 by starling-1m
Scoring complete for r_1 by starling-1m
Scoring complete for r_2 by starling-1m
Scoring complete for r_3 by starling-1m
Scoring complete for r_4 by starling-1m
Scoring complete for r_6 by starling-1m
minstral-3
Scoring complete for r_0 by minstral-3
Scoring complete for r_1 by minstral-3
Scoring complete for r_2 by minstral-3
Scoring complete for r_3 by minstral-3
Scoring complete for r_4 by minstral-3
Scoring complete for r_6 by minstral-3
mistral:7b
Scoring complete for r_0 by mistral:7b
Scoring complete for r_1 by mistral:7b
Scoring complete for r_2 by mistral:7b
Scoring complete for r_3 by mistral:7b
Scoring complete for r_4 by mistral:7b
Scoring complete for r_6

In [4]:
# Ask the auditor to review the peer scoring matrix. Note that the original
# input_prompt is passed here, not the user_prompt returned by generate_expert_response.
audit, audit_prompt = generate_audit_report(input_prompt, responses, scoring_matrix)

input_variables=['output_format', 'responses', 'scoring_matrix', 'user_prompt'] input_types={} partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['output_format', 'responses', 'scoring_matrix', 'user_prompt'], input_types={}, partial_variables={}, template='\n    SYSTEM: You are an independent auditor whose job is to inspect a scoring matrix produced by peer models and detect bias, collusion, or anomalous scoring patterns. Return only the JSON described below.\n\n    USER: We provide:\n    1) original_prompt: {user_prompt}\n    2) responses: a JSON list of response objects:\n    {responses}\n    3) scoring_matrix: a JSON object where keys are scorer_ids and values are dictionaries mapping response_id -> score_obj\n    e.g., {scoring_matrix}\n\n    Task:\n    1) Inspect scoring patterns for the following anomalies:\n    - Self-scoring or allowed self-favoring (scorer giving systematically higher scores to a single partner)\n    - Collusion: 

In [5]:
# Show the auditor's raw reply. In the recorded run it is a JSON object wrapped in a
# Markdown code fence, which is why it is extracted before parsing.
print(audit)

```json
{
  "audit_id": "e4673089-b7a2-415f-8d8c-95356d28165f",
  "flags": [
    {
      "scorer_id": "phi3:mini",
      "issue": "Extreme Scorer",
      "severity": "High"
    }
  ],
  "drops": [],
  "explanation": "The scorer 'phi3:mini' consistently awards extremely low scores (generally below 3.5) across multiple prompts, indicating a possible bias or misunderstanding of the scoring criteria. This warrants further investigation and potential normalization.",
  "normalization": {
    "phi3:mini": 1.5 
  }
}
```


In [7]:
# Display the text extracted from the auditor's reply. In the recorded output this is the
# content of the first {...} object without its enclosing braces.
print_with_bold(extract_first_curly_balanced(audit))


  "audit_id": "e4673089-b7a2-415f-8d8c-95356d28165f",
  "flags": [
    {
      "scorer_id": "phi3:mini",
      "issue": "Extreme Scorer",
      "severity": "High"
    }
  ],
  "drops": [],
  "explanation": "The scorer 'phi3:mini' consistently awards extremely low scores (generally below 3.5) across multiple prompts, indicating a possible bias or misunderstanding of the scoring criteria. This warrants further investigation and potential normalization.",
  "normalization": {
    "phi3:mini": 1.5 
  }



In [8]:
# Parse the auditor's reply into a dict. The extracted text lacks the outer braces
# (see the previous cell's output), so they are re-added before parsing.
audit_json_text = '{' + extract_first_curly_balanced(audit) + '}'
try:
    # The auditor is asked for JSON, so parse it as JSON: this accepts true/false/null,
    # which ast.literal_eval rejects.
    audit_report = json.loads(audit_json_text)
except json.JSONDecodeError:
    # Fall back for replies written as Python-style literals (single quotes, trailing commas).
    audit_report = ast.literal_eval(audit_json_text)
print(audit_report)

{'audit_id': 'e4673089-b7a2-415f-8d8c-95356d28165f', 'flags': [{'scorer_id': 'phi3:mini', 'issue': 'Extreme Scorer', 'severity': 'High'}], 'drops': [], 'explanation': "The scorer 'phi3:mini' consistently awards extremely low scores (generally below 3.5) across multiple prompts, indicating a possible bias or misunderstanding of the scoring criteria. This warrants further investigation and potential normalization.", 'normalization': {'phi3:mini': 1.5}}


In [9]:
# Per-scorer multipliers proposed by the auditor (scorer name -> factor); the cell
# below multiplies each listed scorer's totals by its factor.
audit_report['normalization']

{'phi3:mini': 1.5}

In [11]:
# Apply the auditor's per-scorer normalization factors to each score's 'total' and
# print the matrix before and after.
# NOTE: the totals are scaled in place, so re-running this cell without regenerating
# scoring_matrix first multiplies the factors in again.
pretty_print_json(scoring_matrix)
print("__________________________________________________________")
# The audit is model-generated, so 'normalization' may be absent or null.
normalization_factors = audit_report.get('normalization') or {}
for scorer_id, factor in normalization_factors.items():
    if scorer_id not in scoring_matrix:
        # The auditor can name a scorer that has no row in the matrix; skip it instead of raising KeyError.
        print(f"Skipping normalization for unknown scorer {scorer_id!r}")
        continue
    for score_entry in scoring_matrix[scorer_id].values():
        score_entry['total'] *= factor
pretty_print_json(scoring_matrix)

{
    "deepseek-r1:8b": {
        "r_0": {
            "confidence_estimate": 0.95,
            "justification": "The response is accurate, complete, grounded, reasoned, and clear, correctly handling the empty context without errors.",
            "scores": {
                "accuracy": 5,
                "clarity": 5,
                "completeness": 5,
                "grounding": 5,
                "reasoning": 5
            },
            "total": 5.0
        },
        "r_1": {
            "confidence_estimate": 0.9,
            "justification": "The response is factually accurate but lacks proper grounding in the context, with flawed reasoning for providing external information.",
            "scores": {
                "accuracy": 5,
                "clarity": 5,
                "completeness": 5,
                "grounding": 1,
                "reasoning": 1
            },
            "total": 3.6
        },
        "r_2": {
            "confidence_estimate": 0.9,
            "j