In [6]:
import sys
import json
sys.path.insert(0, '..')

In [3]:
# `docs` is a project-local helper module; presumably it is importable because
# the first cell puts '..' at the front of sys.path — confirm if the notebook moves.
import docs

# Load the raw data through the helper, then parse it into the records that the
# following cells iterate over.
# NOTE(review): the record schema is not visible here; later cells read the
# 'title', 'content' and 'filename' keys — confirm against docs.parse_data.
raw_documents = docs.read_github_data()
documents = docs.parse_data(raw_documents)
len(documents)

95

In [4]:
# Pick the documents that are worth generating questions for and count how
# many questions will be requested overall (one per full 1000 characters).
selected_docs = []
total_questions = 0

# A document is skipped when its lower-cased title contains any of these.
skipped_title_markers = ('unpublished', 'legacy', 'leftovers', 'updates')

for doc in documents:
    # Untitled documents are ignored.
    if 'title' not in doc:
        continue

    title = doc['title'].lower()
    content = doc.get('content', '')

    # Fewer than 1000 characters means zero questions: too short to be useful.
    num_questions = len(content) // 1000
    if num_questions == 0:
        continue

    if any(marker in title for marker in skipped_title_markers):
        continue

    total_questions += num_questions
    print(title, num_questions)
    selected_docs.append(doc)

data definition 11
descriptors 12
overview 3
metric generators 2
output formats 1
introduction 22
report 4
add tags and metadata 2
tests 9
alerts 1
add dashboard panels (api) 13
add dashboard panels (ui) 4
overview 2
overview 2
work with datasets 2
run evals via api 2
explore view 1
no code evals 4
overview 2
batch monitoring 2
overview 3
introduction 2
manage projects 4
overview 1
overview 1
set up tracing 10
evidently cloud 1
self-hosting 5
evidently and github actions 1
llm evaluations 2
llm as a judge 21
llm-as-a-jury 9
rag evals 13
llm regression testing 21
tutorials and guides 12
evidently cloud v2 1
migration guide 7
open-source vs. cloud 6
telemetry 10
why evidently? 4
what is evidently? 1
all descriptors 31
all metrics 54
overview 1
customize data drift 17
custom text descriptor 3
use huggingface models 10
configure llm judges 26
custom metric 4
classification metrics 8
data stats and quality 7
data drift 8
ranking and recsys metrics 10
regression metrics 9
classification 3
da

In [5]:
len(selected_docs)

68

In [7]:
from openai import OpenAI
# Module-level client shared by llm_structured() below. It is built with no
# arguments, so credentials/config presumably come from the environment
# (e.g. OPENAI_API_KEY) — TODO confirm for your setup.
openai_client = OpenAI()

def llm_structured(instructions, user_prompt, output_format, model="gpt-4o-mini"):
    """Run one structured-output request and return the parsed result with usage.

    Args:
        instructions: Text sent as the content of the "system" message.
        user_prompt: Text sent as the content of the "user" message.
        output_format: Forwarded to ``responses.parse`` as ``text_format``.
        model: Model name forwarded to the API call.

    Returns:
        A ``(output_parsed, usage)`` tuple read from the response object.
    """
    system_message = {"role": "system", "content": instructions}
    user_message = {"role": "user", "content": user_prompt}

    request = {
        "model": model,
        "input": [system_message, user_message],
        "text_format": output_format,
    }
    reply = openai_client.responses.parse(**request)

    parsed = reply.output_parsed
    usage = reply.usage
    return parsed, usage

In [8]:
# System prompt for question generation. process_document() passes it as the
# `instructions` argument of llm_structured(), which sends it as the "system"
# message. The per-query fields listed at the end mirror the Question model's
# fields (question, summary_answer, difficulty, intent); keep the two in sync.
# The text is stripped of leading/trailing whitespace before use.
instructions = """
You are given a technical article. Your task is to imagine what a person might type into a search engine 
before finding and reading this article.

Generate realistic, human-like search queries — not formal questions. 
They should sound like what people actually type into Google or Stack Overflow 
when trying to solve a problem, learn a concept, or find code examples.

Guidelines:
- Avoid full-sentence questions with punctuation like "What is..." or "How do I...".
- Use short, natural search phrases instead, such as:
  - "evidently data definition example"
  - "map target and prediction columns evidently"
  - "difference between timestamp and datetime evidently"
- Make queries varied and spontaneous, not repetitive or over-polished.
- Assume users of different knowledge levels:
  - beginner: broad or basic understanding
  - intermediate: knows basic terms but seeks clarification or examples
  - advanced: familiar with the tool, looking for details, edge cases, or integration options

Distribution rules:
- 60% of the queries should target beginner-level users
- 30% should target intermediate-level users
- 10% should target advanced-level users
- 75% of queries should have an intent of "code" (looking for examples or implementation)
- 25% should have an intent of "text" (looking for conceptual or theoretical explanations)

For each generated query, include:
- question: the natural, human-style search phrase
- summary_answer: a short 1–2 sentence summary of how the article addresses it
- difficulty: one of ["beginner", "intermediate", "advanced"]
- intent: one of ["text", "code"]

Also include a description summarizing what kind of article the questions are about.
""".strip()

In [9]:
from pydantic import BaseModel, Field
from typing import List, Literal

# Schema for a single generated query; it is the element type of
# GeneratedQuestions.questions. All four fields are required (`...` default).
# NOTE(review): pydantic normally exposes the class docstring and the Field
# descriptions in the generated JSON schema, which is presumably what the model
# receives via `text_format` — treat wording changes here as prompt changes.
class Question(BaseModel):
    """
    Represents a realistic search-engine-style query a user might type before finding the article.
    Each question captures the likely search phrase, a short summary answer,
    the user's assumed skill level, and their intent (conceptual or code-focused).
    """
    question: str = Field(
        ...,
        description="A natural, short search query — not a full-sentence question — phrased like something typed into Google."
    )
    summary_answer: str = Field(
        ...,
        description="A concise 1–2 sentence summary of how the article addresses the query."
    )
    # Restricted to the three levels named in the system prompt.
    difficulty: Literal["beginner", "intermediate", "advanced"] = Field(
        ...,
        description="The assumed knowledge level of the user making the query."
    )
    # Restricted to the two intents named in the system prompt.
    intent: Literal["text", "code"] = Field(
        ...,
        description="Specifies if the user's intent is to get a theoretical explanation ('text') or an implementation example ('code')."
    )
# Top-level structured-output schema: process_document() passes this class as
# `output_format`, and later cells read `.questions` from the parsed result.
# NOTE(review): the distribution percentages in the docstring are not enforced
# by the model definition; they presumably reach the LLM only as schema/prompt
# text — confirm before relying on them.
class GeneratedQuestions(BaseModel):
    """
    A structured collection of human-like search queries derived from a given article.
    Includes a brief description of the article topic and a list of generated queries.
    Difficulty distribution: 60% beginner, 30% intermediate, 10% advanced.
    Intent distribution: 75% code-focused, 25% concept-focused.
    """
    description: str = Field(
        ...,
        description="A summary of the article or topic these search-style questions were generated for."
    )
    questions: List[Question] = Field(
        ...,
        description="A list of realistic search queries with short summaries, difficulty levels, and user intent."
    )

In [17]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def map_progress(pool, seq, f):
    """Map ``f`` over ``seq`` on an executor while showing a tqdm progress bar.

    Args:
        pool: Executor exposing ``submit(fn, arg)`` (e.g. a ThreadPoolExecutor).
        seq: Any iterable of inputs. It is materialized into a list first, so
            generators and other iterables without ``len()`` are accepted.
        f: Callable applied to each element of ``seq``.

    Returns:
        A list with ``f(el)`` for every element, in submission order.

    Raises:
        Whatever ``f`` raised for the first failing element (in submission
        order); it is re-raised once all submitted tasks have finished.
    """
    # Local import: the notebook's import cell only brings in ThreadPoolExecutor.
    from concurrent.futures import as_completed

    # Materialize once so the bar gets a total even for unsized iterables.
    items = list(seq)

    with tqdm(total=len(items)) as progress:
        futures = [pool.submit(f, el) for el in items]

        # Tick the bar from the calling thread as tasks finish. Updating from a
        # done-callback runs on worker threads and can fire after the bar has
        # already been closed, dropping the final tick.
        for _ in as_completed(futures):
            progress.update()

    # Collect in submission order, not completion order.
    return [future.result() for future in futures]

In [18]:
def process_document(doc):
    """Ask the LLM to generate search-style questions for one document.

    The number of questions requested is one per full 1000 characters of
    ``doc['content']``. The whole document is embedded in the user prompt as JSON.

    Args:
        doc: Mapping with at least a 'content' key; must be JSON-serializable.

    Returns:
        A dict with keys 'doc' (the input), 'questions' (the parsed LLM output)
        and 'usage' (the usage object returned by llm_structured).
    """
    num_questions = len(doc['content']) // 1000

    # The surrounding whitespace of the template is removed by .strip().
    user_prompt = f"""
    generate {num_questions} questions for this document:
{json.dumps(doc)}
    """.strip()

    parsed, token_usage = llm_structured(
        output_format=GeneratedQuestions,
        user_prompt=user_prompt,
        instructions=instructions,
    )

    record = {'doc': doc}
    record['questions'] = parsed
    record['usage'] = token_usage
    return record

In [19]:
# Run process_document over every selected document on 6 worker threads.
# `results` holds one dict per document ('doc', 'questions', 'usage'), in the
# same order as `selected_docs`. Each call hits the LLM API, so re-running
# this cell repeats the paid requests.
with ThreadPoolExecutor(max_workers=6) as pool:
    results = map_progress(pool, selected_docs, process_document)

100%|█████████████████████████████████████████████████████████████████| 68/68 [01:34<00:00,  1.39s/it]


In [20]:
len(results)

68

In [24]:
# Peek at the first result only: the loop breaks after one iteration, leaving
# `doc` and `questions` bound to the first entry of `results` for inspection.
# With an empty `results` nothing is assigned.
for r in results:
    doc = r['doc']
    questions = r['questions']
    break

In [25]:
questions

GeneratedQuestions(description='This article explains how to map input data using the `DataDefinition` object in Evidently. It details the different column types, roles, and mapping methods to ensure accurate data evaluations.', questions=[Question(question='mapping input data in Evidently', summary_answer='The article details how to use the `DataDefinition` object to map input data correctly for data evaluations in Evidently.', difficulty='beginner', intent='text'), Question(question='define column types in DataDefinition', summary_answer='It outlines the various column types that can be defined in a `DataDefinition`, including categorical, numerical, text, and datetime columns.', difficulty='beginner', intent='text'), Question(question='create Dataset object in Evidently', summary_answer='To create a `Dataset` object, you can use `Dataset.from_pandas` with a specified `DataDefinition` to ensure correct data processing.', difficulty='beginner', intent='code'), Question(question='DataD

In [26]:
# Flatten the per-document results into one plain dict per question, tagging
# each with the filename of the document it was generated from.
final_questions = []

for r in results:
    doc = r['doc']
    generated = r['questions']

    final_questions.extend(
        {**q.model_dump(), 'filename': doc['filename']}
        for q in generated.questions
    )

In [28]:
final_questions

[{'question': 'mapping input data in Evidently',
  'summary_answer': 'The article details how to use the `DataDefinition` object to map input data correctly for data evaluations in Evidently.',
  'difficulty': 'beginner',
  'intent': 'text',
  'filename': 'docs/library/data_definition.mdx'},
 {'question': 'define column types in DataDefinition',
  'summary_answer': 'It outlines the various column types that can be defined in a `DataDefinition`, including categorical, numerical, text, and datetime columns.',
  'difficulty': 'beginner',
  'intent': 'text',
  'filename': 'docs/library/data_definition.mdx'},
 {'question': 'create Dataset object in Evidently',
  'summary_answer': 'To create a `Dataset` object, you can use `Dataset.from_pandas` with a specified `DataDefinition` to ensure correct data processing.',
  'difficulty': 'beginner',
  'intent': 'code',
  'filename': 'docs/library/data_definition.mdx'},
 {'question': 'DataDefinition manual mapping examples',
  'summary_answer': 'The 

In [27]:
len(final_questions)

436

In [21]:
from toyaikit.pricing import PricingConfig

# Pricing helper; the next cell calls pricing.calculate_cost(model, input_tokens,
# output_tokens) to turn the summed token usage into a cost estimate.
pricing = PricingConfig()

In [22]:
# Sum token usage over every LLM call, then price it. The model name matches
# the default model used by llm_structured().
usages = [r['usage'] for r in results]
input_tokens = sum(u.input_tokens for u in usages)
output_tokens = sum(u.output_tokens for u in usages)

pricing.calculate_cost('gpt-4o-mini', input_tokens, output_tokens)

CostInfo(input_cost=0.026059199999999998, output_cost=0.013647000000000001, total_cost=0.0397062)

In [29]:
import pandas as pd

df_questions = pd.DataFrame(final_questions)

In [30]:
df_questions.to_csv('ground_truth_evidently.csv', index=False)

In [31]:
!head ground_truth_evidently.csv

question,summary_answer,difficulty,intent,filename
mapping input data in Evidently,The article details how to use the `DataDefinition` object to map input data correctly for data evaluations in Evidently.,beginner,text,docs/library/data_definition.mdx
define column types in DataDefinition,"It outlines the various column types that can be defined in a `DataDefinition`, including categorical, numerical, text, and datetime columns.",beginner,text,docs/library/data_definition.mdx
create Dataset object in Evidently,"To create a `Dataset` object, you can use `Dataset.from_pandas` with a specified `DataDefinition` to ensure correct data processing.",beginner,code,docs/library/data_definition.mdx
DataDefinition manual mapping examples,"The article provides examples of how to manually define a `DataDefinition`, including specific columns for text, numerical, and categorical data.",intermediate,code,docs/library/data_definition.mdx
default column mappings in Evidently,"It describes how automatic