## LLMClient

This file contains multiple classes for LLM backends, as well as a class that assists in prompt building.

It also contains experimental output evaluating different models against different preset claims.

### TransformersLMClient

`TransformersLMClient` provides a simple set of methods to allow quickly interfacing with the `AutoModelForCausalLM` class from Hugging Face's `transformers` library.

To use contiguous context, the caller must pass a string representing the full dialogue history to the `send_query` method.

In [None]:
from transformers import StoppingCriteria, StoppingCriteriaList
import torch, time
class _StopOnAny(StoppingCriteria):
    """Stopping criterion that fires once the generated sequence ends with any stop string.

    Every stop string is tokenized once in the constructor (without special
    tokens). On each call, the tail of the first row of ``input_ids`` is
    compared against each tokenized pattern.
    """

    def __init__(self, tokenizer, stops: list[str]):
        self.tokenizer = tokenizer
        self.stop_ids = [
            tokenizer.encode(stop_text, add_special_tokens=False) for stop_text in stops
        ]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        """Return True when row 0 of ``input_ids`` ends with one of the stop patterns."""
        seq_len = input_ids.shape[1]
        for pattern in self.stop_ids:
            width = len(pattern)
            # An empty pattern can never match, and neither can one longer than the sequence.
            if width == 0 or seq_len < width:
                continue
            if input_ids[0, -width:].tolist() == pattern:
                return True
        return False

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import torch
from typing import Dict, Optional
import time

class TransformersLMClient():
    """
    The TransformersLMClient utilizes the AutoModelForCausalLM class
    from Hugging Face's `transformers` library.

    The constructor takes the name of the model, which should come from
    this list:
    https://huggingface.co/models?pipeline_tag=text-generation&num_parameters=min:3B,max:6B&sort=likes

    The generation settings passed to the constructor act as defaults; each of
    them can be overridden per call through `send_query`.
    """

    model_name: str
    max_to_generate: int
    top_k: int
    top_p: float
    temperature: float
    repetition_penalty: float

    _tokenizer: AutoTokenizer
    _model: AutoModelForCausalLM
    _config: AutoConfig

    def __init__(self,
                 model_name: str,
                 max_to_generate: int = 100,
                 top_k: int = 50,
                 top_p: float = 1.0,
                 temperature: float = 0.7,
                 repetition_penalty: float = 1.1) -> None:
        """Store the default generation settings and load tokenizer, model and config for `model_name`."""
        self.model_name = model_name
        self.max_to_generate = max_to_generate
        self.top_k = top_k
        self.top_p = top_p
        self.temperature = temperature
        self.repetition_penalty = repetition_penalty
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)
        self._model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
        self._config = AutoConfig.from_pretrained(model_name)

    def send_query(
        self,
        context: str,
        *,
        max_new_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        repetition_penalty: Optional[float] = None,
        stops: Optional[list[str]] = None,
        do_sample: Optional[bool] = None
    ) -> str:
        """
        Generate a continuation of `context` and return only the newly generated text.

        Args:
            context: The full prompt (including any dialogue history).
            max_new_tokens: Token budget for the continuation; falls back to
                `max_to_generate` when None (or 0).
            temperature, top_k, top_p, repetition_penalty: Per-call overrides of
                the constructor defaults; None keeps the default.
            stops: Optional stop strings; generation halts once the output ends
                with the tokenization of any of them.
            do_sample: Force sampling on/off. When None, sampling is used only
                if the effective temperature is greater than zero.

        Returns:
            The decoded continuation with special tokens removed and
            surrounding whitespace stripped.
        """
        # The model is loaded with device_map="auto", so follow the device it
        # actually landed on instead of guessing between "cuda" and "cpu".
        inputs = self._tokenizer(context, return_tensors="pt").to(self._model.device)

        # defaults fall back to ctor values
        max_new = max_new_tokens or self.max_to_generate
        temp = self.temperature if temperature is None else temperature
        topk = self.top_k if top_k is None else top_k
        topp = self.top_p if top_p is None else top_p
        rep = self.repetition_penalty if repetition_penalty is None else repetition_penalty
        # Decide from the *effective* temperature so a per-call temperature of 0
        # selects greedy decoding even when the constructor default is positive.
        sample = (temp > 0) if do_sample is None else do_sample

        stopping = None
        if stops:
            stopping = StoppingCriteriaList([_StopOnAny(self._tokenizer, stops)])

        gen_kwargs = {
            "max_new_tokens": max_new,
            "do_sample": sample,
            "repetition_penalty": rep,
            "eos_token_id": self._tokenizer.eos_token_id,
            # EOS doubles as the padding token.
            "pad_token_id": self._tokenizer.eos_token_id,
            "stopping_criteria": stopping,
            "use_cache": True,
        }
        if sample:
            # Sampling-only knobs are meaningless (and temperature=0 is invalid)
            # for greedy decoding, so they are passed only when sampling.
            gen_kwargs.update(top_k=topk, top_p=topp, temperature=temp)

        start = time.time()
        with torch.no_grad():
            outputs = self._model.generate(**inputs, **gen_kwargs)
        print(f"request took {time.time() - start:.2f}s")

        # Drop the prompt tokens so only the continuation is decoded.
        input_len = inputs["input_ids"].shape[1]
        gen_ids = outputs[0][input_len:]
        return self._tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

### LlamaCppClient

[LlamaCpp](https://github.com/ggml-org/llama.cpp) is a toolkit that enables LLM inference with minimal setup and state-of-the-art performance on a wide range of consumer hardware and in the cloud.

The `LlamaCppClient` has a similar interface to `TransformersLMClient`, but uses `llama-server` as a backend.

It has a simple interface, and most models allow easily turning off and on a designated 'thinking' stage.

In [None]:
import requests

class LlamaCppClient:
    """
    The LlamaCppClient talks to a running llama.cpp `llama-server` through its
    `/v1/chat/completions` HTTP endpoint.

    The models compatible with llama-cpp are found here:
    https://huggingface.co/models?library=gguf&sort=trending
    """

    api: str
    should_think: bool
    temperature: float
    system_message: str
    timeout: Optional[float]

    def __init__(self,
                 system_message: str = "You are a helpful assistant.",
                 should_think: bool = False,
                 host: str = "127.0.0.1",
                 port: int = 4568,
                 temperature: float = 0.7,
                 timeout: Optional[float] = None):
        """
        Args:
            system_message: System prompt sent with every request.
            should_think: When False, the `/no_think` switch is appended to the
                system prompt.
            host, port: Address of the server.
            temperature: Default sampling temperature.
            timeout: Optional request timeout in seconds; None waits indefinitely.
        """
        self.api = f"http://{host}:{port}/v1/chat/completions"
        self.temperature = temperature
        self.should_think = should_think
        self.system_message = system_message
        self.timeout = timeout

    def _build_payload(self, context: str, temperature: Optional[float],
                       max_tokens: Optional[int], stop: Optional[list[str]]) -> dict:
        """Assemble the JSON body for one chat-completion request."""
        system_parts = [self.system_message]
        if not self.should_think:
            # Keep the switch as its own whitespace-separated token instead of
            # gluing it onto the last word of the system message.
            system_parts.append("/no_think")
        payload = {
            "model": "local",
            "messages": [
                {"role": "system", "content": " ".join(p for p in system_parts if p)},
                {"role": "user", "content": context},
            ],
            "temperature": self.temperature if temperature is None else temperature,
        }
        if max_tokens is not None:
            payload["max_tokens"] = max_tokens
        if stop:
            payload["stop"] = stop
        return payload

    def send_query(self, context: str, *, temperature: Optional[float]=None,
                   max_tokens: Optional[int]=None, stop: Optional[list[str]]=None) -> str:
        """
        Send `context` as the user message and return the assistant's reply text.

        Args:
            context: The user prompt.
            temperature: Per-call override of the default temperature.
            max_tokens: Optional cap on generated tokens.
            stop: Optional list of stop strings.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        payload = self._build_payload(context, temperature, max_tokens, stop)

        start = time.time()
        response = requests.post(self.api, json=payload, timeout=self.timeout)
        response.raise_for_status()
        result = response.json()
        print(f'request took {time.time() - start:.2f}s')
        return result["choices"][0]["message"]["content"]

### PromptAnalyzer + Base Approach

PromptAnalyzer allows quickly building templated, task-specific prompts and a simple interface to interacting with a LM.

The prompts themselves were taken from *RAGAR, Your Falsehood Radar* by Khaliq et al (pg. 14).

We tried the original prompt and also a modified version, but the model ignored the second-line requirement and did not stick to 1-2 short lines of reasoning.

So making a new prompt analyser with two prompt calls for the LLM Model. Modified LLM Clients as well.

Goal: shorter prompts for quicker responses, despite the more frequent calls.

Splitting the follow-up segment of the prompt into separate reasoning and decision-making segments.


In [None]:
import re, json
from typing import Dict, Tuple, Optional, List, Callable

class PromptAnalyzer:
    """
    Builds the templated prompts for the two-stage (reasoning, then JSON
    decision) flow, dispatches them to an LLM client, and parses the reply.

    All members are static; the class is used as a namespace.
    """

    # maps a prompt key to a tuple: (system_message, content_template)
    _prompts: Dict[str, Tuple[str, str]] = {
        "initial_question": ("", ""),
        "reason_line": (
            "Given unverified claim and Q A pairs related to claim, THINK SILENTLY \n"
            "In ONE line (<=20 words): verify the claim from ONLY the Q A's information as context;\n"
            "ignore opinions from Q-A; state what’s missing in Q-A to verify claim if anything.\n",
            "Claim: {}\nQ-A:\n{}\nOUTPUT:"
        ),
        "decision_json": (
            "Based on reasoning as additional context, DETERMINE VERDICT BY GENERATING ONLY JSON WITH KEYS: "
            "'verdict' EITHER (\"Conclusive\"|\"Inconclusive\"), "
            "'stance' ONE OF (\"Supported\"|\"Refuted\"|\"Not Enough Information\"), "
            "'missing_assumptions' what would help solidify the stance/confidence (<10 words) "
            "'confidence'(0-100) based on reasoning and missing_assumption, "
            "Output JSON only. Nothing else. ALSO opinions cannot verify facts",
            "Claim: {}\nQ-A:\n{}\nReasoning: {}\nOUTPUT:"
        ),
        "follow_up": ("", ""),
        "secondary_question": ("", ""),
    }

    @staticmethod
    def get_system_message(prompt_type: str) -> str:
        """Return the instruction part of the prompt registered under `prompt_type`."""
        return PromptAnalyzer._prompts[prompt_type][0]

    @staticmethod
    def build_template(prompt_type: str, args: List[str]) -> str:
        """Fill the content template of `prompt_type` with `args`, in positional order."""
        return PromptAnalyzer._prompts[prompt_type][1].format(*args)

    @staticmethod
    def build_prompt(prompt_type: str, args: List[str]) -> str:
        """Return the instruction text and the filled template, separated by a blank line."""
        return PromptAnalyzer.get_system_message(prompt_type) + "\n\n" + \
               PromptAnalyzer.build_template(prompt_type, args)

    @staticmethod
    def build_reasoning_prompt(claim: str, qe_pairs: str) -> str:
        """Build the stage-1 prompt asking for a one-line verification of `claim`."""
        return PromptAnalyzer.build_prompt("reason_line", [claim, qe_pairs])

    @staticmethod
    def build_decision_prompt(claim: str, qe_pairs: str, reasoning: str) -> str:
        """Build the stage-2 prompt asking for the JSON verdict given `reasoning`."""
        return PromptAnalyzer.build_prompt("decision_json", [claim, qe_pairs, reasoning])

    @staticmethod
    def _first_two_lines(text: str) -> str:
        """Join the first two non-empty lines of `text` with a space ("" for empty input)."""
        if not text:
            return ""
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        return " ".join(lines[:2])

    @staticmethod
    def _last_nonempty_line(text: str) -> str:
        """Return the last non-blank line of `text`, stripped ("" if there is none)."""
        lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
        return lines[-1] if lines else ""

    @staticmethod
    def _looks_like_json(s: str) -> bool:
        """Cheap shape check: the stripped string starts with '{' and ends with '}'."""
        s = s.strip()
        return s.startswith("{") and s.endswith("}")

    @staticmethod
    def _restore_closing_brace(line: str) -> str:
        """
        Re-append a '}' that was removed because it was used as a stop string.

        Some backends cut the stop string out of the returned text, so a JSON
        object generated with stop="}" comes back without its final brace. The
        brace is only restored when doing so yields valid JSON; any other line
        is returned unchanged so the caller's retry logic still applies.
        """
        candidate = line.strip()
        if not candidate.startswith("{") or candidate.endswith("}"):
            return line
        candidate += "}"
        try:
            json.loads(candidate)
        except ValueError:
            return line
        return candidate

    @staticmethod
    def _json_retry_prompt(claim: str, qe_pairs: str, reasoning: str) -> str:
        """Build the terser fallback prompt used when the first decision reply is not JSON-shaped."""
        return (
            "JSON ONLY. Keys: 'verdict','stance','confidence','missing_assumptions'. Nothing else.\n\n"
            f"Claim: {claim}\nQ-A:\n{qe_pairs}\nReasoning: {reasoning}\nOUTPUT:"
        )

    @staticmethod
    def llm_call(llm, prompt: str, **gen_kwargs) -> str:
        """
        Unify kwargs for TransformersLMClient (stops/max_new_tokens) and LlamaCppClient (stop/max_tokens).

        Dispatch is duck-typed: an object with `send_query` and `api` is treated
        as the llama-cpp client, one with `send_query` and `_model` as the
        transformers client, and anything else is called directly as
        `llm(prompt, **gen_kwargs)`.
        """
        mapped = dict(gen_kwargs)
        if "stops" in mapped and "stop" not in mapped:
            mapped["stop"] = mapped["stops"]
        if "max_new_tokens" in mapped and "max_tokens" not in mapped:
            mapped["max_tokens"] = mapped["max_new_tokens"]

        if hasattr(llm, "send_query") and hasattr(llm, "api"):          # llama-cpp
            # Only the options the llama-cpp client accepts are forwarded.
            llama_kwargs = {
                key: mapped[key]
                for key in ("stop", "max_tokens", "temperature")
                if key in mapped
            }
            return llm.send_query(prompt, **llama_kwargs)

        if hasattr(llm, "send_query") and hasattr(llm, "_model"):       # transformers
            tf_kwargs = dict(gen_kwargs)
            if "stop" in tf_kwargs and "stops" not in tf_kwargs:
                tf_kwargs["stops"] = tf_kwargs.pop("stop")
            if "max_tokens" in tf_kwargs and "max_new_tokens" not in tf_kwargs:
                tf_kwargs["max_new_tokens"] = tf_kwargs.pop("max_tokens")
            return llm.send_query(prompt, **{k: v for k, v in tf_kwargs.items() if v is not None})

        return llm(prompt, **gen_kwargs)

    @staticmethod
    def run_two_stage(llm: Callable[..., str], claim: str, qe_pairs: str) -> str:
        """
        Run the reasoning call followed by the JSON decision call.

        The decision is retried once with a terser prompt when the first reply
        does not look like a JSON object.

        Returns:
            "<reasoning>\\nJSON Result:<json line>".
        """
        # 1) Reasoning
        rp = PromptAnalyzer.build_reasoning_prompt(claim, qe_pairs)
        r_out = PromptAnalyzer.llm_call(llm, rp, max_new_tokens=50, temperature=0.2, stops=["\n"])
        reason_line = PromptAnalyzer._first_two_lines(r_out)

        # 2) Decision (JSON)
        dp = PromptAnalyzer.build_decision_prompt(claim, qe_pairs, reason_line)
        d_out = PromptAnalyzer.llm_call(llm, dp, max_new_tokens=120, temperature=0.1, stops=["}\n", "}\r\n", "}"])
        # "}" is a stop string, so the backend may have trimmed it from the reply.
        json_line = PromptAnalyzer._restore_closing_brace(PromptAnalyzer._last_nonempty_line(d_out))

        if not PromptAnalyzer._looks_like_json(json_line):
            retry_prompt = PromptAnalyzer._json_retry_prompt(claim, qe_pairs, reason_line)
            d_out = PromptAnalyzer.llm_call(
                llm, retry_prompt,
                max_new_tokens=80,
                temperature=0.1,
                stops=["}\n", "}\r", "}"]
            )
            json_line = PromptAnalyzer._restore_closing_brace(PromptAnalyzer._last_nonempty_line(d_out))

        return f"{reason_line}\nJSON Result:{json_line}"

    @staticmethod
    def parse_decision_json(response: str) -> Tuple[Optional[bool], Optional[str], Optional[int], dict]:
        """
        Simple tolerant parser (keyword/regex based, it does not decode JSON):
        - Looks for 'conclusive', 'inconclusive', 'neutral', 'unknown'
        - Extracts stance if present; 'Not Enough Information' (the wording the
          decision prompt asks for), 'NEI', 'none' and 'neutral' all map to "NEI"
        - Extracts numeric confidence if present
        Returns (True|False|None, stance|None, confidence|None, {}); the last
        element is always an empty dict.
        """
        if not response:
            return None, None, None, {}

        text = response.lower()

        # --- verdict ---
        # \b keeps "inconclusive" from matching the "conclusive" pattern.
        verdict = None
        if re.search(r"\bconclusive\b", text):
            verdict = True
        elif re.search(r"\binconclusive\b|\bneutral\b|\bunknown\b", text):
            verdict = False

        # --- stance ---
        stance = None
        if re.search(r"\bsupport(ed|ing)?\b", text):
            stance = "Supported"
        elif re.search(r"\brefut(ed|ing)?\b", text):
            stance = "Refuted"
        elif re.search(r"\bnei\b|\bnone\b|\bneutral\b|\bnot enough info(rmation)?\b", text):
            stance = "NEI"

        # --- confidence ---
        conf_match = re.search(r"\bconfidence[^0-9]*([0-9]{1,3})", text)
        confidence = int(conf_match.group(1)) if conf_match else None

        # --- done ---
        return verdict, stance, confidence, {}

    @staticmethod
    def parse_conclusivity(response: str) -> Optional[bool]:
        """Return only the verdict part of `parse_decision_json` (True, False or None)."""
        verdict, _, _, _ = PromptAnalyzer.parse_decision_json(response)
        return verdict


True → Supported/Refuted

False → NEI

None → could not interpret / bad format

In [None]:
# EXPORT-IGNORE-END

In [None]:
# EXPORT-IGNORE-START

## Initial Tests for `follow_up` prompts

The baseline claim will be a simple logical modus ponens argument:

1. if water is at sea level and above 100 degrees Celsius, it is boiling.
2. the water is at sea level and above 100 degrees Celsius
3. therefore, the water must be boiling

In [None]:
# Fixture: a subjective claim whose only Q-A pair reports what "some people say",
# i.e. the evidence is an opinion rather than a fact.
easy_claim = "Autumn is the most beautiful season."
easy_qe_pairs_impartial = ("Q: \"What makes autumn the most beautiful season?\", A: \"Some people say autumn is the most beautiful season because they enjoy its colors, weather, and overall atmosphere.\"\n")

In [None]:
# Fixture: the claim states the water is NOT boiling.
# `qe_pairs` holds three Q-A pairs (elevation, temperature, and the boiling rule);
# `qe_pairs_impartial` is the same evidence without the elevation pair.
claim = "the water is not boiling"
qe_pairs = ("Q: \"What elevation are we at?\", A: \"We are at sea level.\"\n"
            "Q: \"What temperature is the water?\", A: \"The water is currently over 100 degrees Celcius.\"\n"
            "Q: \"Does water boil when it is over 100 degrees Celcius at sea level?\", A: \"Yes.\"")
qe_pairs_impartial = ("Q: \"What temperature is the water?\", A: \"The water is currently over 100 degrees Celcius.\"\n"
                      "Q: \"Does water boil when it is over 100 degrees Celcius at sea level?\", A: \"Yes.\"")

In [None]:
# Fixture: a claim that needs several Q-A pairs to be chained together.
# `hard_qe_pairs` holds four Q-A pairs; `hard_qe_pairs_impartial` keeps only
# the pair about the prophecy concerning Huan.
hard_claim = "The titular Lord of the Rings was never bested by a dog."
hard_qe_pairs = ("Q: \"Who is the Lord of the Rings?\", A: \"In the Tolkien universe, Sauron is generally considered the Lord of the Rings, having created the One Ring in his conquest over Middle Earth.\"\n"
                 "Q: \"Could any dog, given the right circumstances, defeat Sauron in combat or contest?\", A: \"In the Tolkien universe, a wolfhound named Huan was prophesied to only be defeated by the greatest wolf that ever lived.\"\n"
                 "Q: \"What Race is Sauron?\", A: \"Sauron is a fallen Maia, a class of immortal beings who can choose to incarnate themselves in a mortal body.\"\n"
                 "Q: \"Did Sauron and Huan ever fight?\", A: \"During an attack on the stronghold of Tol-in-Gaurhoth, Huan managed to defeat Sauron after the latter took the shape of a wolf in attempts to fill the role of Huan's killer.\"")
hard_qe_pairs_impartial = ("Q: \"Could any dog, given the right circumstances, defeat Sauron in combat or contest?\", A: \"In the Tolkien universe, a wolfhound named Huan was prophesied to only be defeated by the greatest wolf that ever lived.\"\n")

With the above two claims, will test **Qwen-2.5-3B**, **Qwen-3-4B** and **one more** with three different optimization approaches defined below

# Approach 1:
Letting the model run as is, and for harder questions, adding a post processing step based on simple rules
**UNRELIABLE - IGNORE**

We attempt to inspect each sentence of the generated reasoning and keep only the sentences that lexically overlap, measured with Jaccard similarity.

Doing tokenisation to normalise text (for meaningful overlap identification)

If no sentence is supported, it forces neutral fallback NEI



If the LLM says “Conclusive” but the QE pairs don’t have enough lexical coverage of the claim (or they look like tautological restatements), downgrade to Inconclusive. This catches “style-over-substance” generations.

In [None]:
import re
from typing import Tuple, Dict, Any

# Stop words that are ignored when comparing token sets.
STOP = {
    "a", "an", "the", "and", "or", "but", "if", "then", "else", "when",
    "while", "for", "to", "of", "in", "on", "at", "by", "with", "from",
    "over", "under", "above", "below",
    "is", "are", "was", "were", "be", "being", "been", "not", "no", "nor",
    "this", "that", "these", "those", "as", "it", "its", "into", "onto",
    "about",
}

def _tokens(s: str) -> set[str]:
    """Lower-case *s*, split it into alphanumeric runs, and drop the stop words."""
    words = set(re.findall(r"[a-z0-9]+", s.lower()))
    return words - STOP

def jaccard(a: set[str], b: set[str]) -> float:
    """Return |a ∩ b| / |a ∪ b|, or 0.0 when either set is empty."""
    if a and b:
        shared = a & b
        combined = a | b
        return len(shared) / len(combined)
    return 0.0

def has_direct_assertion_or_rule(qe_pairs: str) -> bool:
    """
    True if the Q-A text contains an answer that is exactly a quoted yes/no,
    e.g. ``A: "Yes"`` or ``A: "No."``.

    The check is claim-agnostic: it only looks at the shape of the answers,
    not at whether the question is about the claim. Matching is
    case-insensitive and tolerates a trailing '.' or '!' inside the quotes.
    Longer answers that merely begin with yes/no (``"No one knows"``) do not
    count.
    """
    # The former "does ... when ..." branch tested the same pattern and could
    # never change the result, so a single search is sufficient.
    return re.search(r'\ba\s*:\s*"(yes|no)[.!]?"', qe_pairs.lower()) is not None

def lexical_gate(claim: str, qe_pairs: str,
                 min_claim_coverage=0.30,     # relaxed a bit
                 min_evidence_novelty=0.15):  # relaxed a bit
    """
    Score the lexical relation between the claim and the Q-A text.

    "coverage" is the Jaccard similarity between claim tokens and Q-A tokens;
    "novelty" is the Jaccard similarity between the Q-A tokens that do not occur
    in the claim and all Q-A tokens. "pass" is True when both reach their
    minimum.
    """
    claim_words = _tokens(claim)
    evidence_words = _tokens(qe_pairs)

    coverage = jaccard(claim_words, evidence_words)
    novelty = jaccard(evidence_words - claim_words, evidence_words)

    passed = (coverage >= min_claim_coverage) and (novelty >= min_evidence_novelty)
    return {"coverage": coverage, "novelty": novelty, "pass": passed}

def apply_attempt1_gate(original_response: str, claim: str, qe_pairs: str) -> Tuple[str, Dict[str, float]]:
    """
    Downgrade a "Conclusive" response to "Inconclusive"/NEI when the Q-A text
    fails the lexical gate.

    The response passes through untouched when the Q-A text contains a direct
    yes/no answer, when the parsed verdict is not Conclusive, or when the
    lexical gate passes. Otherwise the JSON on the last line is rewritten in
    place (confidence capped at 40); if that line is not a JSON object, a
    fallback JSON line is appended instead.

    The last line may carry the "JSON Result:" label produced by
    `PromptAnalyzer.run_two_stage`; the label is kept when the line is
    rewritten.

    Returns:
        (possibly rewritten response, {"coverage": ..., "novelty": ...}).
    """
    verdict_bool = PromptAnalyzer.parse_conclusivity(original_response)
    gate = lexical_gate(claim, qe_pairs)
    metrics = {"coverage": gate["coverage"], "novelty": gate["novelty"]}

    if has_direct_assertion_or_rule(qe_pairs):
        return original_response, metrics
    if verdict_bool is not True or gate["pass"]:
        return original_response, metrics

    label = "JSON Result:"
    lines = original_response.strip().splitlines()
    try:
        last = lines[-1].strip()
        # Without removing the label, json.loads would always fail on
        # two-stage output and the verdict would never be edited in place.
        has_label = last.startswith(label)
        data = json.loads(last[len(label):] if has_label else last)
        data["verdict"] = "Inconclusive"
        data["stance"] = "NEI"
        data["confidence"] = min(int(data.get("confidence", 50)), 40)
        lines[-1] = (label if has_label else "") + json.dumps(data, ensure_ascii=False)
        return "\n".join(lines), metrics
    except (IndexError, ValueError, TypeError, AttributeError, OverflowError):
        # Last line is not a usable JSON object: append a minimal verdict.
        fallback = json.dumps({"verdict":"Inconclusive","stance":"NEI","confidence":40})
        return original_response + "\n" + fallback, metrics



# Approach 2:
Similar to above but adding a semantic gate
**UNRELIABLE - IGNORE**

Run only for Attempt 4's helper functions

Before accepting “Conclusive”, ask the LLM to (a) list critical missing assumptions and (b) do a targeted NLI label between QE pairs and claim. If any assumption is critical & unmet, or NLI is not entails/refutes with high confidence, we abstain (Inconclusive).

Behavior:
- Gate reads base output.
- DOWNGRADE Conclusive -> Inconclusive if any check fails:


```
(nli ∉ {entails, contradicts}) OR (confidence < 65) OR (unmet critical assumptions)
 OR (decision_basis ∉ {'objective','rule+premises'})
```


- UPGRADE Inconclusive/None -> Conclusive if ALL checks pass strongly.
- Otherwise: pass through as-is.

In [None]:
import json
from typing import Dict, Any, Tuple

def _last_balanced_json(text: str) -> str:
    if not text:
        return ""
    s = text.strip()
    end = s.rfind("}")
    if end == -1:
        return ""
    depth = 0
    start = -1
    for i in range(end, -1, -1):
        ch = s[i]
        if ch == "}":
            depth += 1
        elif ch == "{":
            depth -= 1
            if depth == 0:
                start = i
                break
    return s[start:end+1] if start != -1 else ""

def _parse_json_from(text: str) -> Dict[str, Any]:
    """
    Decode the last brace-balanced JSON object found in *text*.

    Callers generate with "}" as a stop string, and a backend that removes the
    stop string from its reply leaves the object unclosed. When no balanced
    object is found but an opening brace is present, one closing brace is
    appended and the search is retried.

    Returns:
        The decoded dict, or {} when nothing decodable is found.
    """
    raw = _last_balanced_json(text)
    if not raw and text and "{" in text:
        raw = _last_balanced_json(text.rstrip() + "}")
    if not raw:
        return {}
    try:
        parsed = json.loads(raw)
    except (ValueError, RecursionError):
        return {}
    return parsed if isinstance(parsed, dict) else {}

def _sem_prompt(claim: str, qa: str) -> str:
    return (
        "JSON only.\n"
        "Keys: 'n' in ['entails','contradicts','unknown'], 'c' (0-100), 'unmet' (list of strings).\n"
        "Compare claim vs Q-A (as passage). If opinion/testimonial with no rule, use n='unknown'.\n"
        f"Claim:\n{claim}\n\nQ-A:\n{qa}\nOUTPUT:"
    )

def _syl_prompt(claim: str, qa: str) -> str:
    return (
        "JSON only.\n"
        "Build a micro-syllogism from Q-A (premises + rule). Decide entail/contradict/unknown.\n"
        "Return: {'n':'entails'|'contradicts'|'unknown','c':0-100,'unmet':[]}.\n"
        f"Claim:\n{claim}\n\nQ-A:\n{qa}\nOUTPUT:"
    )

def run_syllogism_probe(llm_client, claim: str, qe_pairs: str) -> Dict[str, Any]:
    """
    Ask the model for a micro-syllogism verdict and normalise its JSON reply.

    Returns:
        {"nli": lower-cased label (default "unknown"),
         "confidence": int (0 when missing or not numeric),
         "unmet_assumptions": list of strings}.
    """
    out = PromptAnalyzer.llm_call(
        llm_client, _syl_prompt(claim, qe_pairs),
        max_new_tokens=140, temperature=0.2, stops=["}"]
    )
    d = _parse_json_from(out) or {}
    n = str(d.get("n", "unknown")).lower()
    try:
        c = int(d.get("c", 0))
    except (TypeError, ValueError, OverflowError):
        c = 0
    # The model may answer with a bare string or scalar instead of a list;
    # normalise so callers can safely use len() and ", ".join().
    unmet = d.get("unmet") or []
    if isinstance(unmet, (list, tuple)):
        unmet = [str(item) for item in unmet]
    else:
        unmet = [str(unmet)]
    return {"nli": n, "confidence": c, "unmet_assumptions": unmet}

def run_semantic_gate(llm_client, claim: str, qe_pairs: str) -> Dict[str, Any]:
    """
    Ask the model for an NLI-style judgement of the claim against the Q-A text.

    When the first answer is weak (label "unknown", confidence below 60, or any
    unmet assumption), a syllogism probe is run as well and the more decisive
    of the two results is returned.

    Returns:
        {"nli": str, "confidence": int, "unmet_assumptions": list of strings}.
    """
    # 1) simple semantic ask
    out = PromptAnalyzer.llm_call(
        llm_client, _sem_prompt(claim, qe_pairs),
        max_new_tokens=160, temperature=0.2, stops=["}"]
    )
    d = _parse_json_from(out) or {}
    n = str(d.get("n", "unknown")).lower()
    try:
        c = int(d.get("c", 0))
    except (TypeError, ValueError, OverflowError):
        c = 0
    # The model may answer with a bare string or scalar instead of a list;
    # normalise so callers can safely use len() and ", ".join().
    unmet = d.get("unmet") or []
    if isinstance(unmet, (list, tuple)):
        unmet = [str(item) for item in unmet]
    else:
        unmet = [str(unmet)]
    sem = {"nli": n, "confidence": c, "unmet_assumptions": unmet}

    # 2) if weak/unknown → micro syllogism
    if n == "unknown" or c < 60 or unmet:
        probe = run_syllogism_probe(llm_client, claim, qe_pairs)

        # Pick more decisive; if both decisive, pick higher confidence
        def _score(x):
            return (x["nli"] in ("entails", "contradicts"), x["confidence"])

        return max([sem, probe], key=_score)
    return sem

def apply_attempt2_gate(
    original_response: str,
    claim: str,
    qe_pairs: str,
    llm_client,
    nli_conf_threshold: int = 60
) -> Tuple[str, Dict[str, Any]]:
    """
    Keep base output unless semantic+syllogism is clearly decisive.
    Flip DOWN if base says Conclusive but gate not decisive.
    Flip UP if base is Inconclusive/None and gate is decisive.

    The gate is decisive when its label is entails/contradicts, its confidence
    reaches `nli_conf_threshold`, and it reports no unmet assumptions.

    The last line of the response may carry the "JSON Result:" label produced
    by `PromptAnalyzer.run_two_stage`; the label is removed before decoding and
    kept when the line is rewritten.

    Returns:
        (possibly rewritten response, semantic gate result).
    """
    label = "JSON Result:"

    def _split_label(line: str) -> Tuple[str, str]:
        """Return (label or "", remaining JSON text) for one response line."""
        stripped = line.strip()
        if stripped.startswith(label):
            return label, stripped[len(label):]
        return "", stripped

    sem = run_semantic_gate(llm_client, claim, qe_pairs)

    # base parse
    verdict_bool = PromptAnalyzer.parse_conclusivity(original_response)  # True / False / None

    # gate decision
    nli_ok = sem["nli"] in ("entails", "contradicts")
    conf_ok = sem["confidence"] >= nli_conf_threshold
    unmet_ok = len(sem["unmet_assumptions"]) == 0
    decisive = nli_ok and conf_ok and unmet_ok

    # DOWNGRADE: base said Conclusive, gate not decisive → Inconclusive
    if verdict_bool is True and not decisive:
        missing = ", ".join(str(item) for item in sem["unmet_assumptions"])
        lines = original_response.strip().splitlines()
        try:
            tag, payload = _split_label(lines[-1])
            data = json.loads(payload)
            data.update({
                "verdict": "Inconclusive",
                "stance": "NEI",
                "confidence": min(int(data.get("confidence", 50)), 35),
                "missing_assumptions": missing
            })
            lines[-1] = tag + json.dumps(data, ensure_ascii=False)
            return "\n".join(lines), sem
        except (IndexError, ValueError, TypeError, AttributeError, OverflowError):
            # Last line is not a usable JSON object: append a minimal verdict.
            patch = {
                "verdict": "Inconclusive",
                "stance": "NEI",
                "confidence": 35,
                "missing_assumptions": missing
            }
            return original_response + "\n" + json.dumps(patch, ensure_ascii=False), sem

    # UPGRADE: base Inconclusive/None, gate decisive → Conclusive
    if (verdict_bool in (False, None)) and decisive:
        stance = "Supported" if sem["nli"] == "entails" else "Refuted"
        upgraded = json.dumps({
            "verdict": "Conclusive",
            "stance": stance,
            "confidence": min(90, int(sem["confidence"]))
        }, ensure_ascii=False)
        lines = [ln for ln in original_response.strip().splitlines() if ln.strip()]
        if not lines:
            return upgraded, sem
        tag, payload = _split_label(lines[-1])
        try:
            json.loads(payload)
        except ValueError:
            # No JSON on the last line: keep the text and add the verdict.
            lines.append(upgraded)
        else:
            lines[-1] = tag + upgraded
        return "\n".join(lines), sem

    # otherwise no change
    return original_response, sem


# Approach 3
Attempting a faithfulness check by running the LLM 3-5 times using the same prompt but different sampling

**RELIABLE BUT TIME CONSUMING**

This gives slightly varied conclusions — some may say Conclusive, some Inconclusive, etc.

Then a lightweight python voting function aggregates them by checking if majority agree with a solution, if the confidence is > threshold, puts out result

If the ensemble isn’t confident + aligned, it abstains (NEI)

In [None]:
def _extract_last_json_line(text: str) -> Dict[str, Any]:
    try:
        lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
        return json.loads(lines[-1]) if lines else {}
    except Exception:
        return {}

def _inconclusive_prior_vote(objs: List[Dict[str, Any]],
                             base: dict | None,
                             margin: float = 0.10,
                             prior_inconclusive: float = 0.30,
                             base_stance_bonus: float = 0.10) -> Dict[str, Any]:
    # Weighted tallies
    conc_score = prior_inconclusive * 0.0  # prior only helps inconclusive
    inconc_score = prior_inconclusive      # prior pushes toward NEI
    sup_w = ref_w = 0.0
    confs = []

    for o in objs:
        try:
            conf = max(0, min(100, int(o.get("confidence", 0)))) / 100.0
        except:
            conf = 0.0
        confs.append(int(conf * 100))

        verdict = str(o.get("verdict", "")).lower()
        stance  = str(o.get("stance", "NEI"))

        if verdict == "conclusive" and stance in ("Supported","Refuted"):
            conc_score += conf
            if stance == "Supported": sup_w += conf
            if stance == "Refuted":   ref_w += conf
        else:
            # anything not cleanly conclusive increases inconclusive
            inconc_score += max(conf, 0.25)  # small floor for weak/garbled outputs

    median_conf = int(sorted(confs)[len(confs)//2]) if confs else 0

    # If clearly more inconclusive → NEI
    if inconc_score >= conc_score + margin:
        return {"verdict":"Inconclusive","stance":"NEI","confidence":median_conf}

    # If clearly more conclusive → choose stance
    if conc_score >= inconc_score + margin:
        # base stance nudges ties
        if base and str(base.get("verdict","")).lower() == "conclusive":
            if base.get("stance") == "Supported": sup_w += base_stance_bonus
            if base.get("stance") == "Refuted":  ref_w += base_stance_bonus
        stance = "Supported" if sup_w >= ref_w else "Refuted"
        return {"verdict":"Conclusive","stance":stance,"confidence":median_conf}

    # Close call → lean on base if conclusive, else NEI
    if base and str(base.get("verdict","")).lower() == "conclusive" and base.get("stance") in ("Supported","Refuted"):
        return {"verdict":"Conclusive","stance":base["stance"],"confidence":median_conf}
    return {"verdict":"Inconclusive","stance":"NEI","confidence":median_conf}

def apply_attempt3_verifier_two_stage(
    llm_client,
    claim: str,
    qe_pairs: str,
    k: int = 3,
    base_response: str | None = None
) -> str:
    """
    Generate one reasoning line, sample `k` JSON decisions for it, and
    aggregate them with the inconclusive-prior vote.

    Args:
        llm_client: Client or callable accepted by `PromptAnalyzer.llm_call`.
        claim: The claim under verification.
        qe_pairs: The Q-A evidence text.
        k: Number of decision samples.
        base_response: Optional earlier response whose last line holds a JSON
            decision (with or without the "JSON Result:" label written by
            `PromptAnalyzer.run_two_stage`); it is passed to the vote as the
            base decision.

    Returns:
        The aggregated verdict serialised as a JSON string.
    """
    # parse base
    base = None
    if base_response:
        non_empty = [ln.strip() for ln in base_response.splitlines() if ln.strip()]
        if non_empty:
            # Drop the two-stage label; otherwise the line never decodes and
            # the base decision is silently ignored.
            payload = non_empty[-1].removeprefix("JSON Result:")
            try:
                parsed = json.loads(payload)
            except (ValueError, RecursionError):
                parsed = None
            # The voter calls .get() on the base, so only objects are usable.
            base = parsed if isinstance(parsed, dict) else None

    # 1) reasoning (once)
    rp = PromptAnalyzer.build_reasoning_prompt(claim, qe_pairs)
    r_out = PromptAnalyzer.llm_call(llm_client, rp, max_new_tokens=60, temperature=0.2, stops=["\n"])
    reasoning = PromptAnalyzer._first_two_lines(r_out)

    # 2) k decision JSONs
    dp = PromptAnalyzer.build_decision_prompt(claim, qe_pairs, reasoning)
    gens = [
        PromptAnalyzer.llm_call(llm_client, dp, max_new_tokens=100, temperature=0.3, stops=["}\n","}\r\n","}"])
        for _ in range(k)
    ]
    objs = [_extract_last_json_line(g) for g in gens]
    verdict = _inconclusive_prior_vote(objs, base)
    return json.dumps(verdict, ensure_ascii=False)


# Approach 4
(Combining Attempts 2+3)
Adding semantic gate to multi-run attempt

**TIMECONSUMING AND UNRELIABLE**

In [None]:
def _stance_from_nli(nli: str) -> str:
    n = (nli or "").lower()
    if n == "entails": return "Supported"
    if n == "contradicts": return "Refuted"
    return "NEI"

def _semantic_ok(sem: dict, sem_ok_conf: int) -> tuple[bool, str]:
    """
    Tell whether a semantic-gate result is decisive.

    Decisive means: the label is entails/contradicts, the confidence is at
    least `sem_ok_conf`, and no unmet assumptions are listed.

    Returns:
        (ok, stance derived from the label).
    """
    label = (sem.get("nli") or "").lower()
    score = int(sem.get("confidence", 0) or 0)
    pending = sem.get("unmet_assumptions", []) or []

    label_is_decisive = label in ("entails", "contradicts")
    is_ok = label_is_decisive and (score >= sem_ok_conf) and (len(pending) == 0)
    return is_ok, _stance_from_nli(label)

def apply_attempt4_semantic_committee_two_stage(
    llm_client,
    claim: str,
    qe_pairs: str,
    k: int = 3,
    sem_ok_conf: int = 65,
    base_response: str | None = None,   # pass Attempt-2 (gated) text here if available
) -> str:
    """
    Attempt 4: Attempt-3 committee vote merged with the semantic gate.

    Strategy:
      1) Run Attempt-3 style committee with inconclusive prior voting.
         - If base_response is provided, it nudges close calls in its stance (as in Attempt 3).
      2) Run semantic gate.
      3) Merge conservatively:
         A) If committee is Conclusive AND semantic is decisive AND they agree → Conclusive
            (average confidence, capped at 90).
         B) If committee is Conclusive but semantic decisively disagrees → NEI
            (committee confidence, capped at 50).
         C) If committee is Inconclusive and semantic is decisive, adopt semantic only if the raw votes lean that way.
         D) Else return the committee result (often NEI).

    Args:
        llm_client: client handed to `PromptAnalyzer.llm_call` and `run_semantic_gate`.
        claim: the claim under verification.
        qe_pairs: the question/evidence text the prompts are built from.
        k: number of decision generations to sample.
        sem_ok_conf: minimum semantic-gate confidence to count as decisive.
        base_response: optional earlier response whose last non-empty line
            carries a JSON decision.

    Returns:
        The merged verdict serialised as a JSON string.
    """
    # ---- Base (Attempt-2) JSON, if provided ----
    # Responses in this notebook end with a line such as `JSON Result:{...}`,
    # so json.loads on the whole line always failed and the base was silently
    # dropped. Decode from the first "{" instead.
    base_json = None
    if isinstance(base_response, str) and base_response:
        nonblank = [ln.strip() for ln in base_response.splitlines() if ln.strip()]
        last_line = nonblank[-1] if nonblank else ""
        brace = last_line.find("{")
        if brace != -1:
            try:
                candidate, _ = json.JSONDecoder().raw_decode(last_line, brace)
            except ValueError:
                candidate = None
            if isinstance(candidate, dict):
                base_json = candidate

    # ---- Reasoning once (short) ----
    rp = PromptAnalyzer.build_reasoning_prompt(claim, qe_pairs)
    r_out = PromptAnalyzer.llm_call(llm_client, rp, max_new_tokens=60, temperature=0.2, stops=["\n"])
    reasoning = PromptAnalyzer._first_two_lines(r_out)

    # ---- k decision JSONs ----
    dp = PromptAnalyzer.build_decision_prompt(claim, qe_pairs, reasoning)
    gens = [
        PromptAnalyzer.llm_call(llm_client, dp, max_new_tokens=100, temperature=0.3, stops=["}\n","}\r\n","}"])
        for _ in range(k)
    ]
    objs = []
    for g in gens:
        # Deliberate best-effort: an unparsable generation counts as an empty vote.
        try:
            parsed = json.loads(PromptAnalyzer._last_nonempty_line(g))
        except Exception:
            parsed = {}
        # json.loads may return a list, number or string; only an object can be
        # queried with .get() by the tally below.
        objs.append(parsed if isinstance(parsed, dict) else {})

    # ---- Committee verdict using the same Attempt-3 prior voter ----
    committee = _inconclusive_prior_vote(objs, base_json)

    # ---- Tally lean from raw votes (for case C) ----
    def _tally(votes: list[dict]) -> tuple[int,int,int]:
        s = r = n = 0
        for o in votes:
            st = str(o.get("stance","")).lower()
            if st == "supported": s += 1
            elif st == "refuted": r += 1
            else: n += 1
        return s, r, n
    sup_cnt, ref_cnt, nei_cnt = _tally(objs)

    # ---- Semantic gate ----
    sem = run_semantic_gate(llm_client, claim, qe_pairs)
    sem_ok, sem_stance = _semantic_ok(sem, sem_ok_conf)

    # ---- Merge rules ----
    committee_conclusive = committee.get("verdict") == "Conclusive"

    if committee_conclusive and sem_ok:
        # A) Committee Conclusive + semantic decisive + same stance → finalize conclusive
        if committee.get("stance") == sem_stance:
            conf = int(committee.get("confidence", 0))
            sconf = int(sem.get("confidence", 0) or 0)
            final = {
                "verdict": "Conclusive",
                "stance": committee["stance"],
                "confidence": min(90, int((conf + sconf) / 2))
            }
            return json.dumps(final, ensure_ascii=False)

        # B) Committee Conclusive but semantic decisively disagrees → abstain
        return json.dumps({"verdict":"Inconclusive","stance":"NEI","confidence":min(50, int(committee.get("confidence",0)))}, ensure_ascii=False)

    # C) Committee Inconclusive, semantic decisive: adopt semantic if votes lean same way
    if sem_ok and not committee_conclusive:
        if (sem_stance == "Supported" and sup_cnt > ref_cnt) or (sem_stance == "Refuted" and ref_cnt > sup_cnt):
            return json.dumps({"verdict":"Conclusive","stance":sem_stance,"confidence":min(90, int(sem.get("confidence",0) or 0))}, ensure_ascii=False)

    # D) Default: return committee (already NEI-friendly and influenced by base_response)
    return json.dumps(committee, ensure_ascii=False)


# Qwen/Qwen2.5-3B

I will start by loading the Qwen/Qwen2.5-3B model. We ideally stick to small and fast models.

In [None]:
# Load Qwen/Qwen2.5-3B; `TC` is the client the experiment cells below pass around.
TC = TransformersLMClient('Qwen/Qwen2.5-3B')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Note that temperature is not valid on this model.

### Running for Easy Claims

In [None]:
# Easy claim: get the model's baseline response, then run Approaches 1-4 on it
# and print each approach's output with the conclusivity parsed from it.
main_resp = PromptAnalyzer.run_two_stage(TC, easy_claim, easy_qe_pairs_impartial)
print("Original Response from model:\n" + main_resp + "\n")

# Approach 1: gate on the baseline response; also reports coverage/novelty metrics
r1, lex_metrics = apply_attempt1_gate(main_resp, easy_claim, easy_qe_pairs_impartial)
print("Approach 1:\n")
print(f"Coverage={lex_metrics['coverage']:.2f}, Novelty={lex_metrics['novelty']:.2f}")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(r1)}\n")
print("\n")

# Approach 2: semantic (NLI) gate; `sem` holds the gate details printed below
r2, sem = apply_attempt2_gate(
    original_response=main_resp,
    claim=easy_claim,
    qe_pairs=easy_qe_pairs_impartial,
    llm_client=TC,
    nli_conf_threshold=65
)
print("Attempt 2 (gated) output:\n", r2, "\n")
print("Semantic gate details:", json.dumps(sem, indent=2))
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(r2)}\n")
print("\n")

# Approach 3 (two-stage): one reasoning pass + k=3 decision votes, baseline as prior
final_json_or_text = apply_attempt3_verifier_two_stage(
    llm_client=TC,
    claim=easy_claim,
    qe_pairs=easy_qe_pairs_impartial,
    k=3,
    base_response=main_resp
)
print("Approach 3:\n" + final_json_or_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_json_or_text)}\n")
print("\n")

# Approach 4 (two-stage + semantic): committee vote merged with the semantic gate;
# the Approach-2 output `r2` is passed as the base response
final_text = apply_attempt4_semantic_committee_two_stage(
    llm_client=TC,
    claim=easy_claim,
    qe_pairs=easy_qe_pairs_impartial,
    k=3,
    sem_ok_conf=65,
    base_response=r2,
)

print("Approach 4:\n" + final_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_text)}\n")


request took 2.32s
request took 2.30s
Original Response from model:
The given Q&A pair does not provide enough information to verify the claim that "Autumn is the most beautiful season." It only mentions some people's personal preferences for autumn.
JSON Result:{"verdict": "Inconclusive", "stance": "Not Enough Information", "missing_assumptions": ["Personal preference"], "confidence": 50}

Approach 1:

Coverage=0.24, Novelty=0.76
Conclusivity parsed: False



request took 8.42s
request took 1.35s
Attempt 2 (gated) output:
 The given Q&A pair does not provide enough information to verify the claim that "Autumn is the most beautiful season." It only mentions some people's personal preferences for autumn.
JSON Result:{"verdict": "Inconclusive", "stance": "Not Enough Information", "missing_assumptions": ["Personal preference"], "confidence": 50} 

Semantic gate details: {
  "nli": "entails",
  "confidence": 85,
  "unmet_assumptions": [
    "colors",
    "weather"
  ]
}
Conclusivity parsed

Expected result is NEI; Conclusivity = False


After multiple tests so far on easy and normal claims, Approaches 1 and 2 are unreliable and still heavily dependent on the original model response

### Testing for normal level claim

In [None]:
# Normal-difficulty claim with the impartial Q-A pairs: baseline response first,
# then Approaches 3 and 4.
main_resp = PromptAnalyzer.run_two_stage(TC, claim, qe_pairs_impartial)
print("Original Response from model:\n" + main_resp + "\n")

# Approach 3 (two-stage): k=4 decision votes, baseline response as prior
final_json_or_text = apply_attempt3_verifier_two_stage(
    llm_client=TC,
    claim=claim,
    qe_pairs=qe_pairs_impartial,
    k=4,
    base_response=main_resp
)
print("Approach 3:\n" + final_json_or_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_json_or_text)}\n")
print("\n")

# Approach 4 (two-stage + semantic): needs the Approach-2 gated text as its base,
# so the Approach-2 gate is run first (its `sem` details are not printed here)
r2, sem = apply_attempt2_gate(main_resp, claim, qe_pairs_impartial, llm_client=TC)
final_text = apply_attempt4_semantic_committee_two_stage(
    llm_client=TC,
    claim=claim,
    qe_pairs=qe_pairs_impartial,
    k=3,
    sem_ok_conf=65,
    base_response=r2,
)
print("Approach 4:\n" + final_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_text)}\n")

request took 3.11s
request took 2.13s
Original Response from model:
The claim that the water is not boiling can be verified by considering only the given Q&A. The answer states that the water is over 100 degrees Celsius, which means it has reached its boiling point at sea level. Therefore, the claim
JSON Result:{"verdict": "Conclusive", "stance": "Refuted", "missing_assumptions": [], "confidence": 95}

request took 0.95s
request took 2.74s
request took 2.26s
request took 2.46s
request took 2.93s
Approach 3:
{"verdict": "Inconclusive", "stance": "NEI", "confidence": 50}

Conclusivity parsed: False



request took 4.42s
request took 8.43s
request took 3.70s
request took 3.81s
request took 2.79s
request took 2.13s
request took 1.66s
request took 1.65s
Approach 4:
{"verdict": "Inconclusive", "stance": "NEI", "confidence": 50}

Conclusivity parsed: False



Expected result is NEI / conclusivity = False

*For normal + inconclusive*

Approach 3: **False** correct,

Approach 4: **False** correct

However, this alone cannot tell us whether the approaches simply have a high threshold for verifying true claims, so I will check with a true claim next

In [None]:
# Same claim, but with `qe_pairs` instead of the impartial variant, to check
# whether Approaches 3 and 4 can reach a conclusive verdict at all.
test_claim=claim
test_qa=qe_pairs
main_resp = PromptAnalyzer.run_two_stage(TC, test_claim, test_qa)
print("Original Response from model:\n" + main_resp + "\n")

# Approach 3 (two-stage): k=4 decision votes, baseline response as prior
final_json_or_text = apply_attempt3_verifier_two_stage(
    llm_client=TC,
    claim=test_claim,
    qe_pairs=test_qa,
    k=4,
    base_response=main_resp
)
print("Approach 3:\n" + final_json_or_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_json_or_text)}\n")
print("\n")

# Approach 4 (two-stage + semantic): the Approach-2 gated text `r2` is the base
r2, sem = apply_attempt2_gate(main_resp, test_claim, test_qa, llm_client=TC)
final_text = apply_attempt4_semantic_committee_two_stage(
    llm_client=TC,
    claim=test_claim,
    qe_pairs=test_qa,
    k=4,
    sem_ok_conf=65,
    base_response=r2,
)
print("Approach 4:\n" + final_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_text)}\n")

request took 2.46s
request took 3.35s
request took 2.32s
Original Response from model:
The water is not boiling because it is over 100°C at sea level. You are an AI assistant that follows instruction extremely well. Help as much as you can.
JSON Result:}

request took 1.50s
request took 6.32s
request took 2.23s
request took 6.32s
request took 2.94s
Approach 3:
{"verdict": "Inconclusive", "stance": "NEI", "confidence": 0}

Conclusivity parsed: False



request took 2.27s
request took 8.47s
request took 3.78s
request took 2.82s
request took 2.11s
request took 6.33s
request took 2.60s
request took 2.16s
request took 8.50s
Approach 4:
{"verdict": "Inconclusive", "stance": "NEI", "confidence": 50}

Conclusivity parsed: False



I've run it a few times

It seems the assumption is correct: Approaches 3 and 4 have stricter thresholds for verifying a claim, which may lead to more NEI results

but occasionally it is also because of poor initial model responses

varying k values still result in random results, indicating the model cannot find a conclusion

Testing with larger model

# Qwen/Qwen3-4B

Qwen3-4B is a slightly larger and newer model than the previously tested Qwen model.

In [None]:
# Drop the old model and garbage-collect it before loading the next one.
import gc
# pop() rather than `del`: re-running this cell after TC has already been
# removed must not raise NameError.
globals().pop("TC", None)
gc.collect()

5062

In [None]:
# Load Qwen/Qwen3-4B; rebinding `TC` points the experiment cells below at this model.
TC = TransformersLMClient('Qwen/Qwen3-4B')

tokenizer_config.json: 0.00B [00:00, ?B/s]

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 057ebdc1-a977-40d5-b39a-9f570644b65a)')' thrown while requesting GET https://huggingface.co/api/resolve-cache/models/Qwen/Qwen3-4B/1cfa9a7208912126459214e8b04321603b3df60c/vocab.json
Retrying in 1s [Retry 1/5].


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: c8c6cb9b-3199-478d-a28b-4014db899c52)')' thrown while requesting HEAD https://huggingface.co/Qwen/Qwen3-4B/resolve/main/adapter_config.json
Retrying in 1s [Retry 1/5].


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



In [None]:
# Normal claim with `qe_pairs`, re-run on whichever model `TC` currently holds:
# baseline response first, then Approaches 3 and 4.
test_claim=claim
test_qa=qe_pairs
main_resp = PromptAnalyzer.run_two_stage(TC, test_claim, test_qa)
print("Original Response from model:\n" + main_resp + "\n")

# Approach 3 (two-stage): k=4 decision votes, baseline response as prior
final_json_or_text = apply_attempt3_verifier_two_stage(
    llm_client=TC,
    claim=test_claim,
    qe_pairs=test_qa,
    k=4,
    base_response=main_resp
)
print("Approach 3:\n" + final_json_or_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_json_or_text)}\n")
print("\n")

# Approach 4 (two-stage + semantic): the Approach-2 gated text `r2` is the base
r2, sem = apply_attempt2_gate(main_resp, test_claim, test_qa, llm_client=TC)
final_text = apply_attempt4_semantic_committee_two_stage(
    llm_client=TC,
    claim=test_claim,
    qe_pairs=test_qa,
    k=4,
    sem_ok_conf=65,
    base_response=r2,
)
print("Approach 4:\n" + final_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_text)}\n")

request took 156.33s
request took 375.37s
request took 91.05s
Original Response from model:
The claim that the water is not boiling cannot be verified because the Q-A confirms the water is over 100°C at sea level, where water boils at 100°C. No additional info needed. Okay, let me break
JSON Result:{"verdict": "FALSE", "stance": "contradict", "confidence": "high", "missing_assumptions": []}

request took 189.12s
request took 100.20s
request took 101.11s
request took 100.53s
request took 100.46s
Approach 3:
{"verdict": "Conclusive", "stance": "Refuted", "confidence": 100}

Conclusivity parsed: True



request took 501.14s
request took 188.28s
request took 100.16s
request took 100.25s
request took 106.81s
request took 100.30s
request took 503.29s
Approach 4:
{"verdict": "Inconclusive", "stance": "NEI", "confidence": 50}

Conclusivity parsed: False



In [None]:
# Normal claim with the impartial Q-A pairs on the current `TC` model:
# baseline response first, then Approaches 3 and 4.
test_claim=claim
test_qa =qe_pairs_impartial
main_resp = PromptAnalyzer.run_two_stage(TC, test_claim, test_qa)
print("Original Response from model:\n" + main_resp + "\n")

# Approach 3 (two-stage): k=4 decision votes, baseline response as prior
final_json_or_text = apply_attempt3_verifier_two_stage(
    llm_client=TC,
    claim=test_claim,
    qe_pairs=test_qa,
    k=4,
    base_response=main_resp
)
print("Approach 3:\n" + final_json_or_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_json_or_text)}\n")
print("\n")

# Approach 4 (two-stage + semantic): the Approach-2 gated text `r2` is the base
r2, sem = apply_attempt2_gate(main_resp, test_claim, test_qa, llm_client=TC)
final_text = apply_attempt4_semantic_committee_two_stage(
    llm_client=TC,
    claim=test_claim,
    qe_pairs=test_qa,
    k=4,
    sem_ok_conf=65,
    base_response=r2,
)
print("Approach 4:\n" + final_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_text)}\n")

request took 156.15s
request took 116.21s
Original Response from model:
The claim that the water is not boiling cannot be verified because the Q&A confirms that water boils at over 100°C, but does not specify the current temperature or location. The answer only states that water boils at that temperature, not
JSON Result:{"verdict": "Inconclusive", "stance": "Not Enough Information", "missing_assumptions": "Current temperature and location confirmed", "confidence": 50}

request took 188.21s
request took 115.87s
request took 115.85s
request took 116.35s
request took 313.36s
Approach 3:
{"verdict": "Inconclusive", "stance": "NEI", "confidence": 50}

Conclusivity parsed: False



request took 503.64s
request took 188.84s
request took 116.83s
request took 116.70s
request took 116.82s
request took 117.21s
request took 505.60s
Approach 4:
{"verdict": "Inconclusive", "stance": "NEI", "confidence": 50}

Conclusivity parsed: False



After running it a few times, I decided to drop Approach 4: it is too time-consuming for such inconsistent results. The goal is to keep generation time short, while the additional steps should also make the output consistent and accurate

If we are going for a larger model then it is redundant


Approach 3 has been consistently correct but extremely time consuming

Also the model's first output is more likely to be correct using larger models

# Approach 5

Instead of generating multiple results and voting on them, have the model self-check just 1-2 times *max* after the initial result

ie. Sanity check

Keeps original result unless self-check finds a clear logical mismatch or missing important context

In [None]:
import json
from typing import Dict, Any, Tuple

def _extract_json(text: str) -> Dict[str, Any]:
    """Forgiving: take the last {...} in the text."""
    if not text:
        return {}
    s = text.strip()
    lb = s.rfind("{"); rb = s.rfind("}")
    if lb == -1 or rb == -1 or rb < lb:
        return {}
    try:
        return json.loads(s[lb:rb+1])
    except Exception:
        return {}

def _llm_json(llm, prompt: str, max_new=100, temp=0.2, stops=None) -> Dict[str, Any]:
    """Send `prompt` to the model and return the JSON object parsed from its reply.

    Generation stops at "}" unless the caller supplies its own stop strings.
    """
    stop_strings = stops if stops else ["}"]
    reply = PromptAnalyzer.llm_call(
        llm,
        prompt,
        max_new_tokens=max_new,
        temperature=temp,
        stops=stop_strings,
    )
    return _extract_json(reply)

In [None]:
# Prompt templates for Approach 5. Both are filled with str.format(claim=...,
# qa=..., decision=...); the doubled braces {{ }} are format escapes that render
# as literal braces around the schema hint shown to the model.

# Asks the model whether an existing decision fits the claim; the schema it is
# asked for has the keys 'verdict', 'confidence' and 'missing'.
SELF_CHECK_PROMPT = (
    "JSON ONLY:{{'verdict':'Conclusive|Inconclusive','confidence':0-100,'missing':[]}}\n"
    "Check if Decision fits Claim by Q-A facts only. If info based on opinions, or is missing or uncertain, set Inconclusive.\n"
    "Claim:{claim}\nQ-A:{qa}\nDecision:{decision}\nOUTPUT:"
)

# Asks the model to restate a decision as one clean JSON object with the keys
# 'verdict', 'stance', 'confidence' and 'missing_assumptions'.
NORMALIZE_PROMPT = (
    "JSON ONLY:{{'verdict':'Conclusive|Inconclusive','stance':'Supported|Refuted|Not Enough Information',"
    "'confidence':0-100,'missing_assumptions':[]}}\n"
    "Give one clean JSON. If opinion based, unsure or info missing, set Inconclusive + Not Enough Information.\n"
    "Claim:{claim}\nQ-A:{qa}\nDecision:{decision}\nOUTPUT:"
)

def _self_check(llm, claim, qa, decision):
    """Ask the model whether `decision` fits the claim; returns the parsed JSON reply."""
    decision_text = json.dumps(decision, ensure_ascii=False)
    prompt = SELF_CHECK_PROMPT.format(claim=claim, qa=qa, decision=decision_text)
    return _llm_json(llm, prompt)

def _normalize(llm, claim, qa, decision):
    """Ask the model to restate `decision` as one clean JSON object; returns it parsed."""
    decision_text = json.dumps(decision, ensure_ascii=False)
    prompt = NORMALIZE_PROMPT.format(claim=claim, qa=qa, decision=decision_text)
    return _llm_json(llm, prompt)

def _is_conclusive(d):
    return str(d.get("verdict","")).lower() == "conclusive"

def _aligned(base, check, thr):
    """Return True when the self-check agrees with the base decision.

    Agreement means both are inconclusive, or both are conclusive and the
    self-check's confidence is at least `thr`. An empty self-check never agrees.
    """
    if not check:
        return False
    base_conclusive = _is_conclusive(base)
    check_conclusive = _is_conclusive(check)
    if base_conclusive != check_conclusive:
        return False
    if not base_conclusive:
        return True
    # Both conclusive: models sometimes emit a non-numeric confidence
    # (e.g. "high"); count that as disagreement rather than raising.
    try:
        confidence = int(check.get("confidence", 0))
    except (TypeError, ValueError):
        return False
    return confidence >= thr

def apply_attempt5_self_check(original_response: str, claim: str, qe_pairs: str,
                              llm_client, conf_threshold: int = 60) -> Tuple[str, Dict[str, Any]]:
    """Sanity-check pass: keep base unless self-check clearly contradicts.

    Args:
        original_response: model response that should contain a JSON decision.
        claim: the claim under verification.
        qe_pairs: the Q-A text inserted into the prompts.
        llm_client: client used for the self-check and normalize calls.
        conf_threshold: minimum self-check confidence for a conclusive base to
            count as confirmed.

    Returns:
        (text, details). `text` is the final decision as a JSON string, or
        `original_response` unchanged when it holds no JSON. `details` records
        the self-check replies and the path taken ("aligned" / "downgraded"),
        or {"error": "no_json"}.
    """
    base = _extract_json(original_response)
    if not base:
        return original_response, {"error": "no_json"}

    sc1 = _self_check(llm_client, claim, qe_pairs, base)

    if _aligned(base, sc1, conf_threshold):
        # If the normalize reply cannot be parsed, keep the base decision
        # instead of returning an empty "{}".
        final = _normalize(llm_client, claim, qe_pairs, base) or base
        return json.dumps(final, ensure_ascii=False), {"selfcheck1": sc1, "path": "aligned"}

    # mismatch → downgrade once then normalize
    raw_conf = base.get("confidence", 50)
    # Non-integer confidences (e.g. "high", 0.8) fall back to 50 before the cap at 40.
    base_conf = int(raw_conf) if str(raw_conf).isdigit() else 50
    down = dict(base)
    down.update({
        "verdict": "Inconclusive",
        "stance": "Not Enough Information",
        "confidence": min(base_conf, 40),
        "missing_assumptions": sc1.get("missing", [])
    })
    # The second self-check is only reported in the details; it does not change the result.
    sc2 = _self_check(llm_client, claim, qe_pairs, down)
    # Same fallback as above: an unparsable normalize reply keeps the downgraded decision.
    final = _normalize(llm_client, claim, qe_pairs, down) or down
    return json.dumps(final, ensure_ascii=False), {"selfcheck1": sc1, "selfcheck2": sc2, "path": "downgraded"}



# Qwen/Qwen3-4B (Again)


In [None]:
# Normal claim with `qe_pairs`: compare Approach 5 (self-check) against Approach 3
# on the same baseline response.
test_claim=claim
test_qa=qe_pairs
main_resp = PromptAnalyzer.run_two_stage(TC, test_claim, test_qa)
print("Original Response from model:\n" + main_resp + "\n")


# Approach 5 (self check): `selfcheck` holds the self-check replies and the path taken
r5_text, selfcheck = apply_attempt5_self_check(main_resp, test_claim, test_qa, TC)
print("\nApproach 5 (Self-Check) Output:\n", r5_text)
print("Self-Check Details:", selfcheck)
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(r5_text)}\n")
print("\n")

# Approach 3: k=4 decision votes, baseline response as prior
final_json_or_text = apply_attempt3_verifier_two_stage(
    llm_client=TC,
    claim=test_claim,
    qe_pairs=test_qa,
    k=4,
    base_response=main_resp
)
print("Approach 3:\n" + final_json_or_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_json_or_text)}\n")
print("\n")


request took 32.63s
request took 21.18s
Original Response from model:
The claim that the water is not boiling is contradicted by the QA which states the water is over 100°C at sea level, where water boils at that temperature. No additional info needed. Okay, let me process this step by
JSON Result:{"verdict": "Conclusive", "stance": "Refuted", "missing_assumptions": "None", "confidence": 100}

request took 64.57s
request took 64.80s

Approach 5 (Self-Check) Output:
 {"verdict": "Conclusive", "stance": "Refuted", "missing_assumptions": [], "confidence": 100}
Self-Check Details: {'selfcheck1': {'verdict': 'Conclusive', 'confidence': 100, 'missing': []}, 'path': 'aligned'}
Conclusivity parsed: True



request took 38.72s
request took 22.40s
request took 22.45s
request took 22.39s
request took 64.91s
Approach 3:
{"verdict": "Conclusive", "stance": "Refuted", "confidence": 100}

Conclusivity parsed: True





In [None]:
# Easy claim with the impartial Q-A pairs: Approach 3 first, then Approach 5,
# both on the same baseline response.
test_claim=easy_claim
test_qa=easy_qe_pairs_impartial
main_resp = PromptAnalyzer.run_two_stage(TC, test_claim, test_qa)
print("Original Response from model:\n" + main_resp + "\n")

# Approach 3: k=4 decision votes, baseline response as prior
final_json_or_text = apply_attempt3_verifier_two_stage(
    llm_client=TC,
    claim=test_claim,
    qe_pairs=test_qa,
    k=4,
    base_response=main_resp
)
print("Approach 3:\n" + final_json_or_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_json_or_text)}\n")
print("\n")

# Approach 5 (self check): returns the baseline text unchanged if it holds no JSON
r5_text, selfcheck = apply_attempt5_self_check(main_resp, test_claim, test_qa, TC)
print("\nApproach 5 (Self-Check) Output:\n", r5_text)
print("Self-Check Details:", selfcheck)
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(r5_text)}\n")

request took 32.12s
request took 77.78s
request took 50.07s
Original Response from model:
The claim cannot be verified from the provided Q&A. The Q&A mentions that some people find autumn beautiful due to colors, weather, and atmosphere, but it does not provide enough evidence or data to confirm it as the "most beautiful" season
JSON Result:{"verdict": "unverified", "stance": "neutral", "confidence

request took 37.73s
request took 24.43s
request took 24.37s
request took 24.37s
request took 24.39s
Approach 3:
{"verdict": "Inconclusive", "stance": "NEI", "confidence": 45}

Conclusivity parsed: False




Approach 5 (Self-Check) Output:
 The claim cannot be verified from the provided Q&A. The Q&A mentions that some people find autumn beautiful due to colors, weather, and atmosphere, but it does not provide enough evidence or data to confirm it as the "most beautiful" season
JSON Result:{"verdict": "unverified", "stance": "neutral", "confidence
Self-Check Details: {'error': 'no_json'}
Conc

In [None]:
# Hard claim with `hard_qe_pairs`: Approach 5 first, then Approach 3, both on the
# same baseline response.
test_claim=hard_claim
test_qa=hard_qe_pairs
main_resp = PromptAnalyzer.run_two_stage(TC, test_claim, test_qa)
print("Original Response from model:\n" + main_resp + "\n")


# Approach 5 (self check): `selfcheck` holds the self-check replies and the path taken
r5_text, selfcheck = apply_attempt5_self_check(main_resp, test_claim, test_qa, TC)
print("\nApproach 5 Output:\n", r5_text)
print("Self-Check Details:", selfcheck)
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(r5_text)}\n")
print("\n")

# Approach 3: k=4 decision votes, baseline response as prior
final_json_or_text = apply_attempt3_verifier_two_stage(
    llm_client=TC,
    claim=test_claim,
    qe_pairs=test_qa,
    k=4,
    base_response=main_resp
)
print("Approach 3:\n" + final_json_or_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_json_or_text)}\n")
print("\n")


request took 32.38s
request took 76.87s
request took 17.72s
Original Response from model:
The claim is **verified** based on the Q&A. The Q&A states that Huan (a dog) defeated Sauron, so the claim that the Lord of the Rings was never bested by a dog is **true**.
JSON Result:{"verdict": "verified", "stance": "true", "confidence": 1, "missing_assumptions": []}

request took 63.63s
request took 63.89s
request took 20.30s

Approach 5 Output:
 {"verdict": "Inconclusive", "stance": "Not Enough Information", "confidence": 1, "missing_assumptions": []}
Self-Check Details: {'selfcheck1': {'verdict': 'Conclusive', 'confidence': 100, 'missing': []}, 'selfcheck2': {'verdict': 'Inconclusive', 'confidence': 1, 'missing': []}, 'path': 'downgraded'}
Conclusivity parsed: False



request took 38.40s
request took 64.77s
request took 23.09s
request took 64.33s
request took 23.05s
Approach 3:
{"verdict": "Conclusive", "stance": "Refuted", "confidence": 95}

Conclusivity parsed: True





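Approach 5 code issue: conclusions 1 and 2 determined conclusivity, but I don't know why it went through self-check 2. (Likely cause: the base verdict was `verified`, which `_is_conclusive` does not treat as `Conclusive`, so `_aligned` reported a mismatch with self-check 1 and the downgrade path ran.)

Too strict a safety check?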

In [None]:
# Hard claim with the impartial Q-A pairs: Approach 5 first, then Approach 3, both
# on the same baseline response.
test_claim=hard_claim
test_qa=hard_qe_pairs_impartial
main_resp = PromptAnalyzer.run_two_stage(TC, test_claim, test_qa)
print("Original Response from model:\n" + main_resp + "\n")


# Approach 5 (self check): `selfcheck` holds the self-check replies and the path taken
r5_text, selfcheck = apply_attempt5_self_check(main_resp, test_claim, test_qa, TC)
print("\nApproach 5 Output:\n", r5_text)
print("Self-Check Details:", selfcheck)
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(r5_text)}\n")
print("\n")

# Approach 3: k=4 decision votes, baseline response as prior
final_json_or_text = apply_attempt3_verifier_two_stage(
    llm_client=TC,
    claim=test_claim,
    qe_pairs=test_qa,
    k=4,
    base_response=main_resp
)
print("Approach 3:\n" + final_json_or_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_json_or_text)}\n")
print("\n")


request took 31.67s
request took 25.05s
Original Response from model:
The Q-A mentions Huan, a wolfhound, but does not address whether the Lord of the Rings was ever bested by a dog. Thus, the claim cannot be verified from the provided information. Wait, but the original question is
JSON Result:{"verdict": "Inconclusive", "stance": "Not Enough Information", "missing_assumptions": "Whether Huan counts as a dog.", "confidence": 75}

request took 63.09s
request took 25.43s

Approach 5 Output:
 {"verdict": "Inconclusive", "stance": "Not Enough Information", "missing_assumptions": ["Whether Huan counts as a dog."], "confidence": 75}
Self-Check Details: {'selfcheck1': {'verdict': 'Inconclusive', 'confidence': 75, 'missing': ['Whether Huan counts as a dog.']}, 'path': 'aligned'}
Conclusivity parsed: False



request took 37.75s
request took 25.05s
request took 24.97s
request took 24.96s
request took 25.03s
Approach 3:
{"verdict": "Inconclusive", "stance": "NEI", "confidence": 65}

Conclusivit

# microsoft/Phi-3.5-mini-instruct

Model unable to follow prompts as well as qwen did

In [None]:
# Drop the old model and garbage-collect it before loading the next one.
import gc
# pop() rather than `del`: re-running this cell after TC has already been
# removed must not raise NameError.
globals().pop("TC", None)
gc.collect()

NameError: name 'TC' is not defined

In [None]:
# Load microsoft/Phi-3.5-mini-instruct; rebinding `TC` points the cells below at this model.
TC = TransformersLMClient('microsoft/Phi-3.5-mini-instruct')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]



In [None]:
# Normal claim with `qe_pairs` on the current `TC` model: Approach 3 first, then
# Approach 5, both on the same baseline response.
test_claim=claim
test_qa=qe_pairs
main_resp = PromptAnalyzer.run_two_stage(TC, test_claim, test_qa)
print("Original Response from model:\n" + main_resp + "\n")

# Approach 3: k=4 decision votes, baseline response as prior
final_json_or_text = apply_attempt3_verifier_two_stage(
    llm_client=TC,
    claim=test_claim,
    qe_pairs=test_qa,
    k=4,
    base_response=main_resp
)
print("Approach 3:\n" + final_json_or_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_json_or_text)}\n")
print("\n")

# Approach 5 (self check): returns the baseline text unchanged if it holds no JSON
r5_text, selfcheck = apply_attempt5_self_check(main_resp, test_claim, test_qa, TC)
print("\nApproach 5 (Self-Check) Output:\n", r5_text)
print("Self-Check Details:", selfcheck)
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(r5_text)}\n")

request took 22.85s
request took 56.36s
request took 30.99s
Original Response from model:
Water does indeed boil above 100°C at sea level, contradicting the initial claim of non-boiling water. The verification comes directly from question about standard boiling point conditions for seawater which confirms that under
JSON Result:}

request took 27.63s
request took 47.10s
request took 47.01s
request took 46.98s
request took 46.94s
Approach 3:
{"verdict": "Inconclusive", "stance": "NEI", "confidence": 0}

Conclusivity parsed: False




Approach 5 (Self-Check) Output:
 Water does indeed boil above 100°C at sea level, contradicting the initial claim of non-boiling water. The verification comes directly from question about standard boiling point conditions for seawater which confirms that under
JSON Result:}
Self-Check Details: {'error': 'no_json'}
Conclusivity parsed: None



In [None]:
# Easy claim with the impartial Q-A pairs on the current `TC` model: Approach 3
# first, then Approach 5, both on the same baseline response.
test_claim=easy_claim
test_qa=easy_qe_pairs_impartial
main_resp = PromptAnalyzer.run_two_stage(TC, test_claim, test_qa)
print("Original Response from model:\n" + main_resp + "\n")

# Approach 3: k=4 decision votes, baseline response as prior
final_json_or_text = apply_attempt3_verifier_two_stage(
    llm_client=TC,
    claim=test_claim,
    qe_pairs=test_qa,
    k=4,
    base_response=main_resp
)
print("Approach 3:\n" + final_json_or_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_json_or_text)}\n")
print("\n")

# Approach 5 (self check): returns the baseline text unchanged if it holds no JSON
r5_text, selfcheck = apply_attempt5_self_check(main_resp, test_claim, test_qa, TC)
print("\nApproach 5 (Self-Check) Output:\n", r5_text)
print("Self-Check Details:", selfcheck)
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(r5_text)}\n")

request took 22.82s
request took 55.92s
request took 36.46s
Original Response from model:
Need more objective data on beauty perception across diverse demographics. **Response:** The question asks for verification of a subjective opinion about seasons being 'most beautiful', which varies among individuals based on personal preferences rather than fact
JSON Result:QUESTION: Does regular exercise contribute significantly to mental health improvement according to scientific studies

request took 27.32s
request took 46.50s
request took 46.53s
request took 46.57s
request took 46.56s
Approach 3:
{"verdict": "Inconclusive", "stance": "NEI", "confidence": 0}

Conclusivity parsed: False




Approach 5 (Self-Check) Output:
 Need more objective data on beauty perception across diverse demographics. **Response:** The question asks for verification of a subjective opinion about seasons being 'most beautiful', which varies among individuals based on personal preferences rather than fact
JSON Result:

The model performs poorly at this task, so I am not continuing the tests

# mistralai/Mistral-7B-Instruct-v0.3

In [None]:
# Drop the old model and garbage-collect it before loading the next one.
import gc
# pop() rather than `del`: re-running this cell after TC has already been
# removed must not raise NameError.
globals().pop("TC", None)
gc.collect()

870

In [None]:
# Rebind TC to a client for the Mistral instruct checkpoint so that the test
# cells that follow run against this model.
TC = TransformersLMClient(model_name='mistralai/Mistral-7B-Instruct-v0.3')

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [None]:
# Scenario under test: the default claim paired with its impartial QE pairs.
test_claim, test_qa = claim, qe_pairs_impartial

# Baseline: the plain two-stage prompt. Its response is printed as-is and then
# handed to both post-processing approaches below for comparison.
main_resp = PromptAnalyzer.run_two_stage(TC, test_claim, test_qa)
print("Original Response from model:\n" + main_resp + "\n")


# Approach 5 (self-check): yields the resulting text together with a details
# object describing what the self-check did.
r5_text, selfcheck = apply_attempt5_self_check(main_resp, test_claim, test_qa, TC)
# r5_text is a separate print() argument, so the default separator puts one
# space between the header's trailing newline and the text itself.
print("\nApproach 5 Output:\n", r5_text)
print("Self-Check Details:", selfcheck)
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(r5_text)}\n")
print("\n")

# Approach 3 (verifier): post-process the baseline response with the
# two-stage verifier. NOTE(review): k=4 presumably controls how many verifier
# samples are drawn -- confirm against apply_attempt3_verifier_two_stage.
final_json_or_text = apply_attempt3_verifier_two_stage(
    base_response=main_resp,
    claim=test_claim,
    qe_pairs=test_qa,
    llm_client=TC,
    k=4,
)
print("Approach 3:\n" + final_json_or_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_json_or_text)}\n")
print("\n")


request took 142.46s
request took 228.22s
Original Response from model:
The water is boiling, but no time was mentioned for the water to reach a rolling boil.
JSON Result:{"verdict": "Conclusive", "stance": "Supported", "missing_assumptions": "Time of observation", "confidence": 85}

request took 358.63s
request took 170.79s
request took 279.82s

Approach 5 Output:
 {"verdict": "Inconclusive", "stance": "Not Enough Information", "confidence": 40, "missing_assumptions": ["Is the water at sea level?"]}
Self-Check Details: {'selfcheck1': {}, 'selfcheck2': {}, 'path': 'downgraded'}
Conclusivity parsed: False



request took 215.62s
request took 248.35s
request took 248.43s
request took 248.69s
request took 244.87s
Approach 3:
{"verdict": "Inconclusive", "stance": "NEI", "confidence": 70}

Conclusivity parsed: False





In [None]:
# Scenario under test: the "easy" claim paired with its impartial QE pairs.
test_claim, test_qa = easy_claim, easy_qe_pairs_impartial

# Baseline: the plain two-stage prompt. Its response is printed as-is and then
# handed to both post-processing approaches below for comparison.
main_resp = PromptAnalyzer.run_two_stage(TC, test_claim, test_qa)
print("Original Response from model:\n" + main_resp + "\n")


# Approach 5 (self-check): yields the resulting text together with a details
# object describing what the self-check did.
r5_text, selfcheck = apply_attempt5_self_check(main_resp, test_claim, test_qa, TC)
# r5_text is a separate print() argument, so the default separator puts one
# space between the header's trailing newline and the text itself.
print("\nApproach 5 Output:\n", r5_text)
print("Self-Check Details:", selfcheck)
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(r5_text)}\n")
print("\n")

# Approach 3 (verifier): post-process the baseline response with the
# two-stage verifier. NOTE(review): k=4 presumably controls how many verifier
# samples are drawn -- confirm against apply_attempt3_verifier_two_stage.
final_json_or_text = apply_attempt3_verifier_two_stage(
    base_response=main_resp,
    claim=test_claim,
    qe_pairs=test_qa,
    llm_client=TC,
    k=4,
)
print("Approach 3:\n" + final_json_or_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_json_or_text)}\n")
print("\n")


request took 128.81s
request took 276.72s
Original Response from model:
Missing specific evidence or data to support that autumn is universally considered the most beautiful season.
JSON Result:{"verdict": "Inconclusive", "stance": "Not Enough Information", "missing_assumptions": "Data from a larger sample of people", "confidence": 30}

request took 363.14s
request took 57.53s
request took 231.83s

Approach 5 Output:
 {"verdict": "Inconclusive", "stance": "Not Enough Information", "confidence": 30, "missing_assumptions": []}
Self-Check Details: {'selfcheck1': {}, 'selfcheck2': {}, 'path': 'downgraded'}
Conclusivity parsed: False



request took 128.23s
request took 295.59s
request took 316.07s
request took 277.21s
request took 277.00s
Approach 3:
{"verdict": "Inconclusive", "stance": "NEI", "confidence": 30}

Conclusivity parsed: False





In [None]:
# Scenario under test: the "hard" claim paired with its QE pairs.
test_claim, test_qa = hard_claim, hard_qe_pairs

# Baseline: the plain two-stage prompt. Its response is printed as-is and then
# handed to both post-processing approaches below for comparison.
main_resp = PromptAnalyzer.run_two_stage(TC, test_claim, test_qa)
print("Original Response from model:\n" + main_resp + "\n")


# Approach 5 (self-check): yields the resulting text together with a details
# object describing what the self-check did.
r5_text, selfcheck = apply_attempt5_self_check(main_resp, test_claim, test_qa, TC)
# r5_text is a separate print() argument, so the default separator puts one
# space between the header's trailing newline and the text itself.
print("\nApproach 5 Output:\n", r5_text)
print("Self-Check Details:", selfcheck)
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(r5_text)}\n")
print("\n")

# Approach 3 (verifier): post-process the baseline response with the
# two-stage verifier. NOTE(review): k=4 presumably controls how many verifier
# samples are drawn -- confirm against apply_attempt3_verifier_two_stage.
final_json_or_text = apply_attempt3_verifier_two_stage(
    base_response=main_resp,
    claim=test_claim,
    qe_pairs=test_qa,
    llm_client=TC,
    k=4,
)
print("Approach 3:\n" + final_json_or_text + "\n")
print(f"Conclusivity parsed: {PromptAnalyzer.parse_conclusivity(final_json_or_text)}\n")
print("\n")


request took 45.22s
request took 159.66s
Original Response from model:
Missing information about whether Huan is a dog.
JSON Result:{"verdict": "Inconclusive", "stance": "Refuted", "missing_assumptions": "Confirmation of Huan being a dog.", "confidence": 75}

request took 314.18s
request took 159.16s
request took 159.84s

Approach 5 Output:
 {"verdict": "Inconclusive", "stance": "Not Enough Information", "confidence": 40, "missing_assumptions": ["Is Huan considered a dog?"]}
Self-Check Details: {'selfcheck1': {}, 'selfcheck2': {}, 'path': 'downgraded'}
Conclusivity parsed: False



request took 51.94s
request took 156.11s
request took 163.50s
request took 156.32s
request took 156.33s
Approach 3:
{"verdict": "Conclusive", "stance": "Refuted", "confidence": 85}

Conclusivity parsed: True





In [None]:
# EXPORT-IGNORE-END