# **C**hain of **A**gent with chain of **V**erific**A**tion (CAVA)


## Install dependencies

In [None]:
!pip install -qU langchain langchain-core langchain-text-splitters langchain-community langgraph langchain_chroma langchain-huggingface langsmith
!pip install -qU pypdf
!pip install -qU langchain-google-genai

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.8/101.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m473.8/473.8 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.3/157.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 kB[0m [31m28.6 MB/s[0m eta [36m0

In [None]:
# Read the GOOGLE_API_KEY secret from Colab's `userdata` store and export it as an
# environment variable for the rest of the notebook.
# NOTE(review): presumably consumed by the google_genai chat model created below — confirm.
from google.colab import userdata
import os
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

In [None]:
# Create LLM Model
from langchain.chat_models import init_chat_model

llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")

In [None]:
# Split pure text

def split_text(text, chunk_size=500):
    """Split ``text`` into consecutive, non-overlapping character chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk. Must be positive.

    Returns:
        A list of substrings that concatenate back to ``text``. Every chunk has
        exactly ``chunk_size`` characters except possibly the last one. An
        empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. Without this guard a
            non-positive size would never advance through the text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    # Slicing past the end of a string is safe, so the last chunk is simply shorter.
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

original_long_text = "testing split text" * 10
split_long_text = split_text(original_long_text, chunk_size=50)
split_long_text

['testing split texttesting split texttesting split ',
 'texttesting split texttesting split texttesting sp',
 'lit texttesting split texttesting split texttestin',
 'g split texttesting split text']

In [None]:
# --- Prompts ------------------------------------------------------------------
# For QA Tasks
def WORKER_PROMPT(i, query, chunk, prev):
    """Build the prompt sent to worker ``i`` of the chain.

    Args:
        i: Index of the worker / chunk, interpolated into the prompt text.
        query: The question the chain is trying to answer.
        chunk: The source text assigned to this worker.
        prev: The previous worker's message (or a placeholder for the first worker).

    Returns:
        The formatted prompt string.
    """
    return f"""
You are Worker {i} in a chain solving a long-context task.
ONLY use the provided chunk and previous message.
You need to read current source text and summary of previous source text (if any),
and generate a summary to include them both and that best helps answer the query.
Keep ≤ 300 tokens. If no new info, forward previous message unchanged.

Query: {query}
Current source text: CHUNK {i} (do NOT reference other chunks):\n{chunk}\n
Previous source text :\n{prev}
"""

def MANAGER_PROMPT(query, final_worker_json):
    """Build the prompt for the manager that produces the final answer.

    Args:
        query: The question being answered.
        final_worker_json: The last worker's summary text, interpolated verbatim.

    Returns:
        The formatted prompt string.
    """
    return f"""
You are the Manager. Synthesize the final answer.
Please keep the final answer as short as possible and do not respond with full sentences.
Just reply with the final answer.
The source is too long and has been summarized. You need to answer based on the summary.

Query: {query}
Final worker Summary: {final_worker_json}
"""

In [None]:
# Define agent graph
from typing import TypedDict, List

class CoAState(TypedDict):
    """State dict shared by the worker and manager steps of one CoA run."""
    # The question being answered.
    query: str
    # The context split into pieces; workers read ``chunks[i]``.
    chunks: List[str]
    # Index of the chunk the next worker call will process.
    i: int
    # Objects returned by ``llm.invoke`` in call order (their text is read via
    # ``.content``); the manager's reply is appended last. These are response
    # objects, not plain strings, hence the untyped list.
    worker_outputs: list
    # When true, the nodes print their prompts and outputs.
    verbose: bool


In [None]:
def worker_node(state: CoAState):
    """Run one worker step on the chunk at position ``state["i"]``.

    The prompt is built from the query, the current chunk and the previous
    worker's output (a fixed placeholder for the first chunk) and sent to the
    module-level ``llm``. The response object is appended to
    ``state["worker_outputs"]`` and ``state["i"]`` is incremented.

    The passed-in ``state`` is mutated in place and also returned.
    """
    step = state["i"]
    current_chunk = state["chunks"][step]
    # The first worker has nothing to build on; every later worker reads the
    # message produced by the worker immediately before it.
    previous_summary = (
        "No Previous summaries"
        if step == 0
        else state["worker_outputs"][step - 1].content
    )
    prompt = WORKER_PROMPT(step, state["query"], current_chunk, previous_summary)

    if state["verbose"]:
        print(f"Worker {step} with Prompt: \n######{prompt}\n#######\n")

    response = llm.invoke(prompt)
    # Record the new output and advance to the next chunk.
    state["worker_outputs"].append(response)
    state["i"] += 1

    if state["verbose"]:
        print(f"Outputs: {response.content}\n------------------\n\n")
    return state

def manager_node(state: CoAState):
    """Ask the LLM for the final answer based on the latest worker output.

    Reads ``.content`` of the last entry in ``state["worker_outputs"]``, builds
    the manager prompt for ``state["query"]`` and invokes the module-level
    ``llm``. The reply is appended to ``state["worker_outputs"]``, so callers
    find the final answer at index ``-1``.

    The passed-in ``state`` is mutated in place and also returned.
    """
    final_summary = state["worker_outputs"][-1].content
    manager_prompt = MANAGER_PROMPT(state["query"], final_summary)

    if state["verbose"]:
        print(f"Manager with Prompt: \n######{manager_prompt}\n#######\n")

    answer = llm.invoke(manager_prompt)
    # The manager's reply shares the list with the worker outputs.
    state["worker_outputs"].append(answer)

    if state["verbose"]:
        print(f"Manager Final Output: \n#############\n{answer.content}")
    return state


In [None]:
def run_coa(query, context, chunk_size=500, verbose=False):
    """Answer ``query`` over ``context`` with a chain of workers plus a manager.

    The context is split with ``split_text``; ``worker_node`` is then run once
    per chunk, each call building on the previous worker's output, and finally
    ``manager_node`` produces the answer.

    Args:
        query: The question to answer.
        context: The full source text.
        chunk_size: Forwarded to ``split_text``.
        verbose: When true, print the chunks, every prompt/output and the final answer.

    Returns:
        The ``.content`` of the manager's reply.

    Raises:
        IndexError: If ``context`` is empty. No chunk is produced, so no worker
            runs and ``manager_node`` has no worker output to read.
    """
    chunks = split_text(context, chunk_size=chunk_size)
    if verbose:
        print("Text Chunks: ", chunks)

    state = {
        "query": query,
        "chunks": chunks,
        "i": 0,
        "worker_outputs": [],
        "verbose": verbose,
    }
    # One worker call per chunk; worker_node tracks its own position via state["i"].
    for _ in chunks:
        state = worker_node(state)
    # After the loop every chunk must have been consumed exactly once.
    assert state["i"] == len(chunks), "Total states worked does not equal to number of text chunks"

    state = manager_node(state)
    final_ans = state["worker_outputs"][-1].content
    if verbose:
        # Single quotes inside the replacement field: reusing the outer double
        # quote here is a SyntaxError on Python versions before 3.12.
        print(f"Query: {state['query']}\nFinal Answer from Manager: {final_ans}")
    return final_ans

In [None]:
# Test run CoA
ans = run_coa("what is the meaning?", original_long_text, chunk_size=50)
ans

'Based on the summary, the text does not convey any inherent semantic meaning. Instead, it consistently appears to be a functional sample or placeholder, primarily serving testing purposes related to text splitting or similar text processing.'

In [None]:
ans

'The meaning is that the text does not contain substantive content. It is identified as system test data, fragmented input, or a placeholder, devoid of any real information.'

## Eval

In [None]:
import json
import re
import string
from collections import Counter
from typing import List, Dict, Callable

from datasets import load_dataset
from tqdm import tqdm


# -----------------------------------------------------------
# 1. HotpotQA Loader
# -----------------------------------------------------------

def load_hotpotqa(split="validation", max_samples=None):
    """Load HotpotQA (fullwiki) and reshape each example for the eval loop.

    [source] https://huggingface.co/datasets/hotpotqa/hotpot_qa

    An example in hotpotqa - fullwiki:
    {
        "id": str,
        "question": str,
        "answer": str,
        "type": str,
        "level": str,
        "supporting_facts":
        {
            "title": [str, str, ...], # may repeat
            "sent_id": [int32, int32, ...]
        },
        "context":
        {
            "title": [str, str, ...],
            "sentences": [[str, str, str, ...], [str, str, str, ...], ...]
        }
    }

    Args:
        split: Name of the dataset split to read.
        max_samples: Maximum number of examples to return. ``None`` means no
            limit; ``0`` returns an empty list.

    Returns:
        A list of dicts:
        {
            "context":
            [
                { "title": str, "sentences": [str, str, ...] }, # doc 0
                { "title": str, "sentences": [str, str, ...] }, # doc 1
                ...
            ],
            "question": str,
            "answer": str
        }

    Raises:
        ValueError: If ``max_samples`` is negative.
    """
    if max_samples is not None and max_samples < 0:
        raise ValueError(f"max_samples must be None or >= 0, got {max_samples!r}")

    raw = load_dataset("hotpot_qa", "fullwiki")[split]

    data = []
    for item in raw:
        # Check the cap before appending and compare against None explicitly,
        # so max_samples=0 means "no samples" rather than "no limit".
        if max_samples is not None and len(data) >= max_samples:
            break

        # Pair each document title with its list of sentences.
        context = [
            {"title": title, "sentences": sentences}
            for title, sentences in zip(item["context"]["title"], item["context"]["sentences"])
        ]
        data.append({
            "context": context,
            "question": item["question"],
            "answer": item["answer"],
        })

    return data


# -----------------------------------------------------------
# 2. Context Chunker
# -----------------------------------------------------------

def chunk_context_fullwiki(context, chunk_size=500):
    """Break every document into chunks of at most ``chunk_size`` words.

    Each document's sentences are joined with spaces and re-split on
    whitespace; the resulting words are grouped in order. Chunks never span two
    documents, and a document with no words contributes no chunk.

    Param:
    context (return from load_hotpotqa()):
    [
        { "title": str, "sentences": [str, str, ...] }, # doc 0
        { "title": str, "sentences": [str, str, ...] }, # doc 1
        ...
    ]

    Return:
    a flat list of text chunks (str), in document order.
    """
    def _doc_chunks(doc):
        # Words of one document, grouped chunk_size at a time.
        tokens = " ".join(doc["sentences"]).split()
        return [
            " ".join(tokens[start:start + chunk_size])
            for start in range(0, len(tokens), chunk_size)
        ]

    pieces = []
    for doc in context:
        pieces.extend(_doc_chunks(doc))
    return pieces


# -----------------------------------------------------------
# 3. Evaluation Metrics (EM + F1)
# -----------------------------------------------------------

def normalize_answer(s):
    """
    Normalize an answer string for comparison.

    Steps, in order: lowercase, delete every ``string.punctuation`` character,
    replace the standalone words a/an/the with a space, collapse whitespace.
    """
    lowered = s.lower()
    # Deletion-only translation table: drops each punctuation character.
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punct)
    return ' '.join(without_articles.split())


# TODO: does the paper compute this at the token level??
def f1_score(pred, gold):
    """Token-level F1 between a prediction and a reference answer.

    Both strings are passed through ``normalize_answer`` and split on
    whitespace. Overlap is counted as a multiset intersection, so a repeated
    token only matches as often as it appears on both sides.

    Args:
        pred: Predicted answer string.
        gold: Reference answer string.

    Returns:
        A float in [0.0, 1.0]. If either side has no tokens after
        normalization, the score is 1.0 when both are empty and 0.0 otherwise.
    """
    pred_tokens = normalize_answer(pred).split()
    gold_tokens = normalize_answer(gold).split()

    if not pred_tokens or not gold_tokens:
        return float(pred_tokens == gold_tokens)

    # Counter & Counter keeps, per token, the smaller of the two counts.
    overlap = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0.0

    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match(pred, gold):
    """Return True when prediction and reference are equal after ``normalize_answer``."""
    normalized_pred, normalized_gold = normalize_answer(pred), normalize_answer(gold)
    return normalized_pred == normalized_gold


# -----------------------------------------------------------
# 4. Evaluation loop
# -----------------------------------------------------------

def evaluate(model_fn: Callable, dataset: List[Dict], chunk_size=500):
    """Run ``model_fn`` over every sample and score it with F1 and exact match.

    Args:
        model_fn: Callable ``model_fn(query, context_chunks) -> str``.
        dataset: Samples with "question", "context" and "answer" keys.
        chunk_size: Forwarded to ``chunk_context_fullwiki``.

    Returns:
        A dict with the per-sample lists "contexts", "questions", "predictions"
        and "references", plus the mean "f1" and "em" scores. For an empty
        dataset both means are 0.0 instead of dividing by zero.
    """
    qs, ctxs, preds, refs = [], [], [], []
    f1s, ems = [], []

    for sample in tqdm(dataset, desc="Evaluating"):
        question = sample["question"]
        gold = sample["answer"]
        chunks = chunk_context_fullwiki(sample["context"], chunk_size)

        pred = model_fn(question, chunks)

        qs.append(question)
        ctxs.append(chunks)
        preds.append(pred)
        refs.append(gold)
        f1s.append(f1_score(pred, gold))
        ems.append(int(exact_match(pred, gold)))

    num_samples = len(f1s)
    return {
        "contexts": ctxs,
        "questions": qs,
        "predictions": preds,
        "references": refs,
        "f1": sum(f1s) / num_samples if num_samples else 0.0,
        "em": sum(ems) / num_samples if num_samples else 0.0,
    }


# -----------------------------------------------------------
# 5. Placeholder CoA model
# -----------------------------------------------------------

def coa_placeholder(question: str, context_chunks: List[str], chunk_size: int = 2000) -> str:
    """Adapter that lets ``evaluate`` call ``run_coa``.

    The incoming chunks are joined back into one string with single spaces and
    handed to ``run_coa``, which receives ``chunk_size`` as its own chunk size.

    Args:
        question: The query to answer.
        context_chunks: Text chunks for this sample.
        chunk_size: Value passed to ``run_coa`` as ``chunk_size``.
            TODO: figure out which chunk size gives the best cost/performance.

    Returns:
        The answer string returned by ``run_coa``.
    """
    merged_context = " ".join(context_chunks)
    return run_coa(query=question, context=merged_context, chunk_size=chunk_size)


# -----------------------------------------------------------
# 6. Running the pipeline
# -----------------------------------------------------------

# Smoke-test the pipeline on 3 validation samples. `data` and `results` are
# left as notebook globals; the cells below index into `results`.
# NOTE(review): the recorded output of this cell shows Gemini free-tier
# "Quota exceeded ... limit: 10" messages, so larger sample counts may need
# throttling between LLM calls — confirm.
if __name__ == "__main__":
    data = load_hotpotqa(split="validation", max_samples=3)
    results = evaluate(coa_placeholder, data)


* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 10, model: gemini-2.5-flash
Please retry in 51.327240617s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, retry_delay {
  seconds: 51
}
].
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 10, model: gemini-2.5-flash
Please retry in 49.258330556s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.

In [None]:
results['f1']

0.4444444444444445

In [1]:
for i in range(3):
    print(":)")

:)
:)
:)


In [None]:
# Show up to the first three questions with the prediction next to the reference.
# Slicing instead of a fixed range(3) avoids an IndexError when fewer than
# three samples were evaluated.
for i, (question, prediction, reference) in enumerate(
    zip(results['questions'][:3], results['predictions'][:3], results['references'][:3])
):
    print(f"[{i}] Questions:{question}\nPrediction: {prediction}  ---> Reference: {reference}\n\n")

[0] Questions:Were Scott Derrickson and Ed Wood of the same nationality?
Prediction: Yes.  ---> Reference: yes


[1] Questions:What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?
Prediction: No information available.  ---> Reference: Chief of Protocol


[2] Questions:What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?
Prediction: Animorphs; companion books not mentioned.  ---> Reference: Animorphs




In [None]:
results['contexts'][0]

['Adam Collis is an American filmmaker and actor. He attended the Duke University from 1986 to 1990 and the University of California, Los Angeles from 2007 to 2010. He also studied cinema at the University of Southern California from 1991 to 1997. Collis first work was the assistant director for the Scott Derrickson\'s short "Love in the Ruins" (1995). In 1998, he played "Crankshaft" in Eric Koyanagi\'s "Hundred Percent".',
 "Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood. The film concerns the period in Wood's life when he made his best-known films as well as his relationship with actor Bela Lugosi, played by Martin Landau. Sarah Jessica Parker, Patricia Arquette, Jeffrey Jones, Lisa Marie, and Bill Murray are among the supporting cast.",
 'Tyler Bates (born June 5, 1965) is an American musician, music producer, and composer for films, television, and video games. Much of his work

In [None]:
results['questions'][0]

'Were Scott Derrickson and Ed Wood of the same nationality?'

In [None]:
results['references'][0]

'yes'

In [None]:
results['predictions'][0]

'Yes.'

In [None]:
# Testing committing from google colab