In [1]:
# EXPORT-IGNORE-START

In [2]:
import sys
import os
# Append the absolute path of the parent directory to the module search path,
# so packages located there can be imported from this notebook
# (presumably the `src` package imported below lives there — confirm).
sys.path.append(os.path.abspath(".."))

In [3]:
!pip install datasets==3.6.0

Defaulting to user installation because normal site-packages is not writeable


In [4]:
# EXPORT-IGNORE-END

In [5]:
def in_jupyter_notebook():
    """Report whether the code is running under a ZMQ-based IPython shell.

    Returns:
        True when ``get_ipython()`` exists and the class name of the shell it
        returns is ``'ZMQInteractiveShell'``; False when ``get_ipython`` is not
        defined or the shell has any other class name.
    """
    try:
        shell_name = get_ipython().__class__.__name__
    except NameError:
        # get_ipython is not defined in this interpreter.
        return False
    return shell_name == 'ZMQInteractiveShell'


# Evaluated once at definition time; later cells branch on this flag.
IN_JUPYTER = in_jupyter_notebook()

In [6]:
# The location of LLMClient depends on how this code is run: inside a notebook
# kernel it is imported from the `src` package, otherwise it is imported as a
# top-level module.
if IN_JUPYTER:
    from src import LLMClient
else:
    import LLMClient

from typing import Dict
from datasets import load_dataset
from tqdm import tqdm
from collections import Counter
import time
import textwrap
import json
import numpy as np

Create the client objects:

In [7]:
# Temporary override of the "answer_question" prompt so the pipeline can be
# exercised without RAG: a (system message, user template) pair.
LLMClient.PromptAnalyzer._prompts["answer_question"] = (
    "Tersely answer this question to the best of your ability. Never ask for more context. Assume you know the answer.",
    "Question: {}\nAnswer: ",
)

# Prompt names, in the same order as the client variables unpacked below.
prompts = [
    "initial_question",
    "follow_up",
    "secondary_question",
    "final_verdict",
    "answer_question",
]

# One client per prompt, each constructed with that prompt's system message.
ILC, FLC, SLC, VLC, ALC = (
    LLMClient.LlamaCppClient(LLMClient.PromptAnalyzer.get_system_message(prompt_name))
    for prompt_name in prompts
)


## Verify Claim

In [8]:
def verify_claim(claim: str, max_iters: int = 3) -> Dict:
    """Verify ``claim`` through an iterative question/answer loop.

    An initial question is generated from the claim. Each round answers the
    current question, records the (question, answer) pair, and asks the
    follow-up client whether the evidence gathered so far is conclusive. If it
    is not, a secondary question is generated and the loop continues, for at
    most ``max_iters`` rounds. A final verdict is then requested using every
    collected pair.

    Args:
        claim: The claim text to verify.
        max_iters: Maximum number of question/answer rounds.

    Returns:
        A dict with the keys:
            "claim": the input claim.
            "qa_pairs": list of (question, answer) tuples, in the order asked.
            "verdict_raw": the stripped response of the final-verdict client.
            "verdict_bool": whatever ``parse_boolean_answer`` returns for
                ``verdict_raw``.
    """
    analyzer = LLMClient.PromptAnalyzer

    # Generate initial question
    initial_prompt = analyzer.build_template("initial_question", [claim])
    question = ILC.send_query(initial_prompt).strip()

    qa_pairs = []
    for _ in range(max_iters):
        # Answer question (real pipeline should use RAG here)
        answer_prompt = analyzer.build_template("answer_question", [question])
        answer = ALC.send_query(answer_prompt).strip()
        qa_pairs.append((question, answer))

        # Check for termination
        check_prompt = analyzer.build_template("follow_up", [claim, qa_pairs])
        check_response = FLC.send_query(check_prompt)
        done = analyzer.parse_conclusivity(check_response)
        # Stop on a truthy result and also on None (presumably None means the
        # response could not be interpreted — confirm against PromptAnalyzer).
        if done is None or done:
            break

        # Generate follow-up
        secondary_prompt = analyzer.build_template("secondary_question", [claim, qa_pairs])
        question = SLC.send_query(secondary_prompt).strip()

    # Final verification
    # NOTE(review): this is the only call that uses build_prompt; every other
    # prompt in this function is built with build_template — confirm that the
    # difference is intentional.
    verify_prompt = analyzer.build_prompt("final_verdict", [claim, qa_pairs])
    verdict_raw = VLC.send_query(verify_prompt).strip()
    verdict_bool = analyzer.parse_boolean_answer(verdict_raw)

    return {
        "claim": claim,
        "qa_pairs": qa_pairs,
        "verdict_raw": verdict_raw,
        "verdict_bool": verdict_bool,
    }


## Tests

In [9]:
# EXPORT-IGNORE-START

Tests completed using the `Qwen3-1.7B-Q8_0.gguf` model.

In [10]:
def is_correct(expected, actual):
    """Check whether a verdict matches the expected label.

    Args:
        expected: The expected label, compared against one of the strings
            "supports", "refutes" or "not enough info".
        actual: The verdict to score: None maps to "not enough info", a truthy
            value maps to "supports", and any other value maps to "refutes".

    Returns:
        True if the label derived from ``actual`` equals ``expected``.
    """
    if actual is None:
        pred = "not enough info"
    # Must be `elif`: with a plain `if`, a None verdict would fall through to
    # the `else` branch and be overwritten with "refutes".
    elif actual:
        pred = "supports"
    else:
        pred = "refutes"

    return pred == expected


In [11]:
# Load the "fever" dataset with the "v1.0" configuration.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to execute locally — confirm the source is trusted.
ds = load_dataset("fever", "v1.0", trust_remote_code=True)

In [12]:
# Evaluate the first `num_samples` rows of the "labelled_dev" split.
num_samples = 10
split = ds["labelled_dev"].select(range(num_samples))
results = []

for i in tqdm(range(len(split))):
    example = split[i]
    claim, label = example["claim"], example["label"].lower()

    # Run the pipeline, then record on the result whether it matched the label.
    result = verify_claim(claim)
    result["correct"] = is_correct(label, result["verdict_bool"])
    results.append(result)

100%|█████████████████████████████████████████████████████████| 10/10 [11:51<00:00, 71.19s/it]


In [13]:
# Pretty-print every per-claim result dict for manual inspection.
import pprint
pprint.pprint(results)

[{'claim': 'Colin Kaepernick became a starting quarterback during the 49ers '
           '63rd season in the National Football League.',
  'correct': False,
  'qa_pairs': [("Who became a starting quarterback during the 49ers' 63rd "
                'season in the National Football League?',
                "Step 1: Identify the 49ers' 63rd season.\n"
                "The 49ers' 63rd season was in 2016.\n"
                '\n'
                'Step 2: Determine the quarterback during the 63rd season.\n'
                'The starting quarterback for the 49ers in 2016 was Colin '
                'Kaepernick.\n'
                '\n'
                'Answer: Colin Kaepernick')],
  'verdict_bool': True,
  'verdict_raw': 'True'},
 {'claim': 'Tilda Swinton is a vegan.',
  'correct': False,
  'qa_pairs': [('Is Tilda Swinton a vegan?',
                'No, Tilda Swinton is not a vegan.')],
  'verdict_bool': False,
  'verdict_raw': 'False'},
 {'claim': 'Fox 2000 Pictures released the film Soul Fo

In [14]:
# Count the results flagged as correct and report the fraction of all results.
tot_correct = sum(1 for entry in results if entry["correct"])
print(f'Accuracy: {tot_correct / len(results)}')

Accuracy: 0.1


In [15]:
# EXPORT-IGNORE-END