In [17]:
import time

In [18]:
import json

In [19]:
import anthropic

In [20]:
import random

In [21]:
from tqdm import tqdm

In [22]:
from pathlib import Path

In [23]:
from pydantic import BaseModel, Field

In [24]:
# Model identifiers passed as the `model` argument of the Anthropic API calls.
SONNET = 'claude-sonnet-4-5'  # default model for token counting and question generation
HAIKU = 'claude-haiku-4-5'  # NOTE(review): not referenced in the visible cells

In [25]:
# Single Anthropic client shared by every API call in this notebook.
# Constructed without arguments, so credentials are resolved by the SDK itself
# (presumably from the ANTHROPIC_API_KEY environment variable — confirm).
CLIENT = anthropic.Anthropic()

In [26]:
# Key names and role values used when building chat message dicts.
ROLE = 'role'  # message dict key holding the speaker role
CONTENT = 'content'  # message dict key holding the message text
SYSTEM = 'system'  # NOTE(review): not referenced in the visible cells
USER = 'user'  # role value used for every message sent in this notebook
ASSISTANT = 'assistant'  # NOTE(review): not referenced in the visible cells

In [27]:
INPUT_TOKENS = 'input_tokens'  # field read from the token-count response
TOKEN_LIMIT = 10_000  # files whose token count exceeds this are left out of the sample

In [28]:
# Key under which the sample JSON stores its list of file paths.
FILES = 'files'

In [29]:
# NOTE(review): not referenced in the visible cells — confirm it is still needed.
FILE_NAME = 'file_name'

In [58]:
# System prompt sent with every question-generation request.
SYS_PROMPT = """
You are helping to generate a dataset for RAG evaluation, using only the given documents.
"""
# User prompt template; `{document}` is the only placeholder and is filled in
# with the full text of one documentation file via `str.format`.
QS_PROMPT = """
You will be given access to information pertaining to a software package.

You are to generate a question only based on the given document.
Each question should be fully answerable using only that one document.
The question should be something that a staff software engineer would want to ask from the documentation.
The question can be slightly vague, but not too vague.
Avoid lifting technical words and snippets from the document directly.
The question should be something that a staff software engineer would ask, and the answer must be from the documentation.

document : {document}
"""

In [62]:
# Keys of the per-question record built by `gen_qs_detail`.
QS = 'question'  # the generated question text
REASON = 'reasoning'  # the model's stated reasoning for the question
FILE_PATH = 'file_path'  # path of the document the question was generated from

In [66]:
# Top-level key of the JSON file that `generate_questions` writes its results to.
EVAL_QS = 'eval_questions'

In [40]:
# Beta flag passed in `betas=[...]` on the structured-output `parse` request.
BETAS_STRUCTURED_OUTPUT = 'structured-outputs-2025-11-13'

In [31]:
# Schema for the structured output requested from the model: it is passed as
# `output_format` to the `parse` call in `generate_question`.
class QuestionGen(BaseModel):
    # Declared before `question`; presumably so the model writes its rationale
    # before committing to a question — TODO confirm this ordering is intentional.
    reasoning: str = Field(description="Thought process and reasoning behind the question")
    # The generated evaluation question itself.
    question: str = Field(description="Question")

In [32]:
def get_all_files_in_dir(root_dir: str, file_ext:str = '.md') -> list:
    """Return the files directly inside ``root_dir`` whose names end with ``file_ext``.

    The search is not recursive: only direct children of ``root_dir`` are matched
    against the pattern ``'*' + file_ext``.

    Args:
        root_dir: Directory to search.
        file_ext: Suffix to match, including the leading dot (default ``'.md'``).

    Returns:
        A sorted list of ``Path`` objects. Sorting makes the result independent
        of filesystem enumeration order, so anything sampled from it can be
        reproduced with a fixed random seed.
    """
    matches = Path(root_dir).glob('*' + file_ext)
    return sorted(matches)

In [33]:
def choose_random_files(file_paths: list, k=200) -> list:
    """Pick up to ``k`` distinct random files whose token count is within ``TOKEN_LIMIT``.

    Each candidate is visited at most once, in random order. This guarantees
    termination even when fewer than ``k`` files fit under the limit, and avoids
    paying for a token count on the same file more than once.

    Args:
        file_paths: Candidate file paths; duplicates are ignored.
        k: Number of files wanted.

    Returns:
        A list of at most ``k`` paths. It is shorter than ``k`` only when not
        enough candidates are within the token limit; a notice is printed then.
    """
    # De-duplicate while keeping a list, then shuffle a copy so the caller's
    # list is left untouched.
    candidates = list(dict.fromkeys(file_paths))
    random.shuffle(candidates)

    chosen = []
    for file_path in candidates:
        if len(chosen) >= k:
            break
        num_tokens = get_num_tokens(read_file(file_path))
        time.sleep(1)  # pause between token-count requests
        if num_tokens <= TOKEN_LIMIT:
            chosen.append(file_path)
            print(f'Added file to sample. Num Files : {len(chosen)}')

    if len(chosen) < k:
        print(f'Only {len(chosen)} of the requested {k} files are within the token limit.')
    return chosen

In [34]:
def get_num_tokens(input_str: str) -> int:
    """Return the input-token count the API reports for ``input_str``.

    The text is sent as a single user message together with a fixed system
    prompt, so the returned count covers both.

    Args:
        input_str: Text to measure.

    Returns:
        The ``input_tokens`` value of the token-count response.
    """
    response = CLIENT.messages.count_tokens(
        model=SONNET,
        system='You are a helpful assistant',
        messages=[{ROLE: USER, CONTENT: input_str}],
    )
    # Read the field directly instead of serialising the response to JSON and
    # parsing it back just to look up one value.
    return response.input_tokens

In [35]:
def read_file(file_path: str) -> str:
    """Return the full text content of ``file_path``.

    The file is decoded as UTF-8 explicitly rather than with the platform
    default encoding, matching how this notebook writes its own files and
    keeping non-ASCII characters in the documents intact on every platform.

    Args:
        file_path: Path of the file to read (a string or path-like object).

    Returns:
        The file content as a single string.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

In [36]:
def load_eval_samples(sample_path: str) -> dict:
    """Load the evaluation sample description from a JSON file.

    The file is decoded as UTF-8 explicitly, matching how the sample file is
    written elsewhere in this notebook (UTF-8 with ``ensure_ascii=False``).

    Args:
        sample_path: Path of the JSON file to load.

    Returns:
        The parsed JSON content. Callers in this notebook index it with the
        ``FILES`` key, i.e. they expect a dict holding a list of file paths.
    """
    with open(sample_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [45]:
def generate_question(content_str: str, prompt=QS_PROMPT, model=SONNET):
    """Ask the model for one question about a single document.

    Args:
        content_str: Full text of the document; substituted for the
            ``{document}`` placeholder of ``prompt``.
        prompt: User prompt template containing a ``{document}`` placeholder.
        model: Model identifier to send the request to.

    Returns:
        The ``parsed_output`` of the structured-output response, requested in
        the ``QuestionGen`` format.
    """
    # The whole request is a single user turn carrying the filled-in template.
    user_turn = {ROLE: USER, CONTENT: prompt.format(document=content_str)}
    reply = CLIENT.beta.messages.parse(
        model=model,
        max_tokens=1024,
        betas=[BETAS_STRUCTURED_OUTPUT],
        system=SYS_PROMPT,
        messages=[user_turn],
        output_format=QuestionGen,
    )
    return reply.parsed_output

In [63]:
def gen_qs_detail(question_obj: QuestionGen, file_path: str):
    """Build the record stored for one generated question.

    Args:
        question_obj: Generated question with its reasoning.
        file_path: Path of the document the question was generated from.

    Returns:
        A dict with the question text, the reasoning and the file path, stored
        under the ``QS``, ``REASON`` and ``FILE_PATH`` keys in that order.
    """
    detail = {}
    detail[QS] = question_obj.question
    detail[REASON] = question_obj.reasoning
    detail[FILE_PATH] = file_path
    return detail

In [74]:
def generate_questions(file_paths, prompt=QS_PROMPT, model=SONNET, save_path:str='../working_dir/eval/eval_questions.json'):
    """Generate one question per file and checkpoint the results to ``save_path``.

    Args:
        file_paths: Either the loaded sample dict (file paths listed under the
            ``FILES`` key) or a plain iterable of file paths.
        prompt: User prompt template forwarded to ``generate_question``.
        model: Model identifier forwarded to ``generate_question``.
        save_path: JSON file that receives ``{EVAL_QS: [...]}``; its parent
            directory is created if it does not exist.

    Returns:
        The list of question records, one per file, in input order.
    """
    # Accept the sample dict as before, and also a bare list of paths.
    paths = file_paths[FILES] if isinstance(file_paths, dict) else list(file_paths)
    # Create the output directory up front so a missing folder fails before,
    # not after, the first API call.
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)

    results = []
    for file_path in tqdm(paths):
        file_md = read_file(file_path)
        question = generate_question(content_str=file_md, prompt=prompt, model=model)
        # Store the path as a string so Path objects stay JSON-serialisable.
        results.append(gen_qs_detail(question, str(file_path)))
        # Rewrite the whole file after every question: this is the checkpoint
        # that preserves finished work if a later request fails.
        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump({
                EVAL_QS: results
            }, f)
    return results

In [75]:
data = load_eval_samples('../working_dir/eval/sample.json')

In [76]:
results = generate_questions(data)

Generating questions for 1/200
Generating questions for 2/200
Generating questions for 3/200
Generating questions for 4/200
Generating questions for 5/200
Generating questions for 6/200
Generating questions for 7/200
Generating questions for 8/200
Generating questions for 9/200
Generating questions for 10/200
Generating questions for 11/200
Generating questions for 12/200
Generating questions for 13/200
Generating questions for 14/200
Generating questions for 15/200
Generating questions for 16/200
Generating questions for 17/200
Generating questions for 18/200
Generating questions for 19/200
Generating questions for 20/200
Generating questions for 21/200
Generating questions for 22/200
Generating questions for 23/200
Generating questions for 24/200
Generating questions for 25/200
Generating questions for 26/200
Generating questions for 27/200
Generating questions for 28/200
Generating questions for 29/200
Generating questions for 30/200
Generating questions for 31/200
Generating questi

# Scratch Code

In [38]:
data = load_eval_samples('../working_dir/eval/sample.json')

In [39]:
len(data[FILES])

200

In [59]:
idx = 0
md_str = read_file(data[FILES][idx])
question = generate_question(md_str)

In [60]:
question.reasoning

"This document is about the FaithfulnessEvaluator, which is part of an evaluation SDK for testing agent responses. A staff software engineer would likely want to understand the practical implementation details and behavior of this evaluator. Looking at the scoring system section, I see it uses a 5-level categorical scale (0.0, 0.25, 0.5, 0.75, 1.0) and mentions that 'A response passes the evaluation if the score is >= 0.5'. This is a concrete threshold that would be important for engineers implementing tests. A good question would ask about this passing threshold, as it's a specific implementation detail that affects how the evaluator is used in practice."

In [61]:
question.question

'What is the minimum score threshold required for a response to pass the FaithfulnessEvaluator, and what does this score represent on the categorical scale?'

In [38]:
strands = get_all_files_in_dir('../working_dir/docs/strandsagents.com')

In [49]:
strands_sample = choose_random_files(strands)

Added file to sample. Num Files : 1
Added file to sample. Num Files : 2
Added file to sample. Num Files : 3
Added file to sample. Num Files : 4
Added file to sample. Num Files : 5
Added file to sample. Num Files : 6
Added file to sample. Num Files : 7
Added file to sample. Num Files : 8
Added file to sample. Num Files : 9
Added file to sample. Num Files : 10
Added file to sample. Num Files : 11
Added file to sample. Num Files : 12
Added file to sample. Num Files : 13
Added file to sample. Num Files : 14
Added file to sample. Num Files : 15
Added file to sample. Num Files : 16
Added file to sample. Num Files : 17
Added file to sample. Num Files : 18
Added file to sample. Num Files : 19
Added file to sample. Num Files : 20
Added file to sample. Num Files : 21
Added file to sample. Num Files : 22
Added file to sample. Num Files : 23
Added file to sample. Num Files : 24
Added file to sample. Num Files : 25
Added file to sample. Num Files : 26
Added file to sample. Num Files : 27
Added file

In [50]:
strands_sample[0]

PosixPath('../working_dir/docs/strandsagents.com/latest_documentation_docs_user-guide_evals-sdk_evaluators_faithfulness_evaluator.md')

In [51]:
strans_file_paths = [str(x) for x in strands_sample]

In [63]:
# Persist the sampled file paths under the FILES key, which is the layout that
# `load_eval_samples` / `generate_questions` read back. Dumping the builtin
# `str` type instead raises TypeError after the file has already been truncated.
with open('../working_dir/eval/sample.json', 'w', encoding='utf-8') as f:
    json.dump({FILES: strans_file_paths}, f, ensure_ascii=False)

In [27]:
md_file = read_file(strands_sample[0])

In [48]:
get_num_tokens(md_file)

4078

In [28]:
md_file

'---\nurl: https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/evaluators/trajectory_evaluator\ndepth: 1\n---\n\n[ Skip to content ](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/evaluators/trajectory_evaluator/#trajectory-evaluator)\n# Trajectory Evaluator[¶](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/evaluators/trajectory_evaluator/#trajectory-evaluator "Permanent link")\n## Overview[¶](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/evaluators/trajectory_evaluator/#overview "Permanent link")\nThe `TrajectoryEvaluator` is an LLM-based evaluator that assesses the sequence of actions or tool calls made by an agent during task execution. It evaluates whether the agent followed an appropriate path to reach its goal, making it ideal for evaluating multi-step reasoning and tool usage patterns. A complete example can be found [here](https://github.com/strands-agents/docs/blob/main/docs/exa

In [13]:
p = Path('../working_dir/strandsagents.com')

In [15]:
list(p.glob('*.md'))

[]

In [17]:
len(strands)

268

In [20]:
sample = choose_random_files(strands)

In [21]:
len(sample)

200

In [22]:
sample[0]

PosixPath('../working_dir/docs/strandsagents.com/latest_documentation_docs_user-guide_concepts_model-providers_llamacpp.md')