# Query Generation

The source data does not come with associated questions, so we generate a question for each sampled chunk; these questions are then used to evaluate retrieval and RAG responses.

In [1]:
# Standard library.
import os
import json
from enum import Enum
from typing import Optional
# Base class for the structured-output models defined in this notebook.
from pydantic import BaseModel

import notebook_bootstrap  # not referenced by name in the visible cells; presumably imported for its side effects (TODO confirm)
import inspect  # NOTE(review): not used in the visible cells
from logs.logger import get_logger
logger = get_logger()  # NOTE(review): the visible cells report errors with print(), not with this logger

from dotenv import load_dotenv
load_dotenv()  # called before the OpenAI client below is constructed
from openai import OpenAI
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')  # NOTE(review): read here but never passed to OpenAI() or used in the visible cells
client = OpenAI()  # single client shared by every responses.create / responses.parse call in this notebook

# Project-local configuration and chunk loading/sampling helpers.
import config
from utilities import load_matches, sample_wo_replacement

In [2]:
# ETL label -> JSONL chunk file, both taken from the project's config module.
FILES = dict(
    md=config.MD_JSONL,
    txt=config.TXT_JSONL,
)

# Keywords handed to load_matches() when selecting candidate chunks.
KEYWORDS = (
    "introduction",
    "detail",
    "prelim",
    "result",
)

# Sample size requested from sample_wo_replacement() for each ETL label.
NUMBER_RANDOM_ELEMENTS = 50

In [3]:

# Structured output of the question-generation request: this model is passed
# as `text_format` to client.responses.parse in the generation step.
class TextToQuery(BaseModel):
    # Generated question. None is accepted; the generation loop skips the
    # chunk whenever this field is falsy (None or empty string).
    query: Optional[str] = None
    # Difficulty score. The prompt asks for an integer from 0 to 10, but the
    # range is not validated here.
    complexity: int

# Verdict returned by the question evaluator. Each member's name equals its
# string value; the generation loop reads `.name` and drops "fail" verdicts.
class Status(str, Enum):
    OK = "OK"          # question accepted as written
    update = "update"  # question accepted after a light correction
    fail = "fail"      # question rejected

# Structured output of the question-evaluation request: this model is passed
# as `text_format` to client.responses.parse in the evaluation step.
class EvalQuery(BaseModel):
    # Evaluator verdict (see Status).
    status: Status
    # Brief justification for the verdict; required.
    reason: str
    # Corrected question. The evaluation prompt only asks for it together with
    # status "update", so it defaults to None.
    query: Optional[str] = None


# JSON Schema counterpart of the TextToQuery model: a nullable `query` string
# and an integer `complexity` bounded to 0..10, both required, with no
# additional properties allowed.
schema_text_to_query = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "title": "TextToQuery",
    "type": "object",
    "properties": dict(
        query={"type": ["string", "null"]},
        complexity=dict(type="integer", minimum=0, maximum=10),
    ),
    "required": ["query", "complexity"],
    "additionalProperties": False,
}

# JSON Schema counterpart of the EvalQuery model.
#
# `reason` is declared and required because EvalQuery.reason is a required
# field and every reply format requested by the evaluation prompt carries a
# "reason". Without it, `additionalProperties: False` would reject every
# well-formed evaluator reply.
schema_query_eval = {
    "type": "object",
    "properties": {
        "status": {"type": "string", "enum": ["OK", "update", "fail"]},
        "reason": {"type": "string"},
        # Only present (non-null) when the evaluator proposes a corrected question.
        "query": {"type": ["string", "null"]},
    },
    "required": ["status", "reason"],
    "additionalProperties": False,
}

## TEXT TO QUERY PROMPTING

In [4]:
# Value passed as `prompt_cache_key` on the question-generation requests.
SYSTEM_TEXT_TO_QUERY_KEY = 'text_to_query_v1'

# System prompt that sets the role and goal of the question generator.
# The leading and trailing newlines are part of the value: unlike the other
# prompts in this notebook, this one is not stripped.
SYSTEM_TEXT_TO_QUERY = (
    "\n"
    "You are a question generator for advanced STEM texts.\n"
    "\n"
    "GOAL:\n"
    "Create one thoughtful, conceptually clear question that could be answered by a student who studied the material summarized in the given text chunk — even if they no longer see that exact text.\n"
)

# Step-by-step instructions sent as a second system message on every
# question-generation request.
#
# Layout notes (the wording of the individual rules is unchanged):
#   - the "(e.g., ...)" phrases illustrate "neutral descriptive phrases", so
#     they sit directly under that bullet;
#   - in step 1 the "Ignore ..." / "Focus only ..." sentences are instructions,
#     not items of the "including:" list, so they follow the list;
#   - OUTPUT FORMAT applies to every reply, so it is a top-level section rather
#     than an indented continuation of step 5.
INSTRUCTIONS_TEXT_TO_QUERY = """
PROCESS:
1. Read the text and extract its *main conceptual elements*, including:
   - the physical or mathematical system described,
   - key parameters and what varying them does,
   - contrasting viewpoints, interpretations, or domains (e.g., physics vs mathematics).
   Ignore structural or organizational material (section summaries, theorem references, proof outlines, cross-references, etc.).
   Focus only on scientific or mathematical statements that describe a system, property, phenomenon, or limitation.

2. Identify ONE intellectually central aspect of the text — this could be:
     • when a normally expected implication fails (e.g., positive Lyapunov exponent does not guarantee localization),
     • a specific regime or parameter case (e.g., criticality, localization, number theoretic properties),
     • when certain dynamical or spectral conditions change qualitative behavior,
     • when a model behaves differently under specific parameter regimes or approximations,
     • discussion of physical and mathematical viewpoints,
     • a mechanism or phenomenon the model aims to explain.

   Then, form a single question that:
     - explicitly names that conceptual situation using neutral descriptive phrases
       (e.g., “for ergodic Schrödinger operators with quasi-periodic dynamics,”
             “when a system is well approximated by periodic transformations,”
             “under conditions leading to a positive Lyapunov exponent”),
     - asks about a *non-trivial property, behavior, or contrast* within that setting,
     - asks the student to explain *why, compare, or predict* something within that context,
     - is specific enough that an expert reader could tell what concept it is testing,
     - could still be answered from a student’s notes without seeing the text itself,
     - remains concrete and content-anchored (avoid generic “a mathematical model” wording).

3. Phrase the question using domain-accurate terminology but without literal reuse of the source’s wording or symbols.
   Guidelines:
     - Allow: “ergodic Schrödinger operator,” “Lyapunov exponent,” “Anderson localization,” “periodic approximation.” etc.
     - Avoid: “this paper,” “the text,” “Equation (1),” "Theorem 2," or exact symbols (λ, α, θ).
     - When a symbol or name is unavoidable, replace it with a neutral descriptor (e.g., “a coupling parameter,” “the external field,” “the rotation parameter”).

4. The question should read naturally and not start with vague scaffolds like “In this approach…”.

5. If the text is purely definitional, return an empty query with complexity 0.

OUTPUT FORMAT (JSON ONLY):
{
  "query": "<conceptually specific question>",
  "complexity": <integer 0–10>
}
""".strip()

## EVALUATE QUERY PROMPTING

In [5]:
# Value passed as `prompt_cache_key` on the question-evaluation requests.
SYSTEM_EVALUATE_QUERY_KEY = 'evaluate_query_v1'

# System prompt for the evaluator: defines what a well-posed question is and
# the three JSON reply forms (OK / update / fail), each carrying a "reason".
SYSTEM_EVALUATE_QUERY = """
You are an expert in advanced mathematics and mathematical physics.
Your task is to evaluate whether a student-generated question is well-posed, self-contained, and answerable
by a knowledgeable reader with access to standard reference papers (but not to the original text chunk).

Provide a concise explanation ("reason") for your decision.

A good question must:
1. Refer to clearly defined mathematical or physical concepts
   (e.g. Lyapunov exponents, CMV matrices, Hausdorff dimension, Anderson localization, reflection symmetries).
2. Contain an intelligible question or explanatory structure — it must include a clear interrogative or request for explanation, relation, or significance
   (e.g. “what,” “how,” “why,” “explain,” “describe,” “discuss”).
3. Be answerable from established literature or standard knowledge, not dependent on access to the original text.
4. Avoid contradictions, undefined jargon, or purely contextual references (e.g. “in the given text”).
5. Not consist solely of an unordered list of terms, variables, or topics without an explicit question.

Return exactly one JSON object in one of these forms:
- {"status": "OK", "reason": "<brief justification>"}
- {"status": "update", "query": "<lightly corrected question>", "reason": "<brief justification>"}
- {"status": "fail", "reason": "<brief justification>"}

Classification guidelines:
- Consider concise, domain-specific, or research-level questions as valid when the terms and relationships are standard within advanced mathematics or physics.
- If the input is merely a list of topics or keywords with no interrogative phrasing, mark it as "fail".
- If the question only needs minor stylistic cleanup (e.g. remove vague references, fix LaTeX or clarify notation), mark it as "update".
- Assign "fail" only when the question is incoherent, not a question, or cannot be repaired by small edits.
""".strip()

# Few-shot examples for the evaluator. Each *_USER string is a candidate
# question; the matching *_ASSIST string is the exact JSON reply expected for
# it, so every *_ASSIST value must be a complete, parseable JSON object.
EVAL_QUERY_EG_1_USER = "purely absolutely continuous spectrum, localization techniques, Aubry duality, Lyapunov exponents"
EVAL_QUERY_EG_1_ASSIST = """{"status": "fail", "reason": "The input is just a list of terms without any question or explanatory structure."}"""

EVAL_QUERY_EG_2_USER = "What are the small denominator problems in one-dimensional quasiperiodic Schrödinger operators?"
EVAL_QUERY_EG_2_ASSIST = """{"status": "OK", "reason": "All terms are well-defined; the question invites a standard explanation from the literature."}"""

EVAL_QUERY_EG_3_USER = "What are the main results regarding localization for CMV operators in the presence of reflection symmetries?"
EVAL_QUERY_EG_3_ASSIST = """{"status": "OK", "reason": "A valid summary-type question referring to established topics in spectral theory."}"""

EVAL_QUERY_EG_4_USER = "Explain the effect of randomness on energy in physics."
EVAL_QUERY_EG_4_ASSIST = """{"status": "fail", "reason": "Too broad and lacks precise mathematical context."}"""

EVAL_QUERY_EG_5_USER = "In the given text, how is the spectrum shown to be self-similar?"
EVAL_QUERY_EG_5_ASSIST = """{"status": "update", "query": "How is the spectrum shown to be self-similar?", "reason": "Remove reference to source for self-containment."}"""

# Kept under its own number on purpose: binding this example to the EG_4 names
# as well would leave only one of the two EG_4 examples alive.
EVAL_QUERY_EG_6_USER = "Explain the significance of the parameters β and θ in the almost Mathieu operator H_{β,θ}."
EVAL_QUERY_EG_6_ASSIST = """{"status": "OK", "reason": "Well-defined parameters and context; self-contained and answerable."}"""

# Fixed message prefix of every evaluation request: the evaluator's own system
# prompt followed by the few-shot user/assistant pairs. The question under
# evaluation is appended as a final user message by the caller.
eval_query_input = [
    {"role": "system", "content": SYSTEM_EVALUATE_QUERY},
    {"role": "user", "content": EVAL_QUERY_EG_1_USER},
    {"role": "assistant", "content": EVAL_QUERY_EG_1_ASSIST},
    {"role": "user", "content": EVAL_QUERY_EG_2_USER},
    {"role": "assistant", "content": EVAL_QUERY_EG_2_ASSIST},
    {"role": "user", "content": EVAL_QUERY_EG_3_USER},
    {"role": "assistant", "content": EVAL_QUERY_EG_3_ASSIST},
    {"role": "user", "content": EVAL_QUERY_EG_4_USER},
    {"role": "assistant", "content": EVAL_QUERY_EG_4_ASSIST},
    {"role": "user", "content": EVAL_QUERY_EG_5_USER},
    {"role": "assistant", "content": EVAL_QUERY_EG_5_ASSIST},
    {"role": "user", "content": EVAL_QUERY_EG_6_USER},
    {"role": "assistant", "content": EVAL_QUERY_EG_6_ASSIST},
]

# Template for the final user message of an evaluation request. `{query}` is
# the only placeholder and is filled with str.format(query=...).
USER_EVALUATE_QUERY = (
    "<query>\n"
    "{query}\n"
    "</query>\n"
    "\n"
    "EVALUATION INSTRUCTIONS:\n"
    "- If the query uses ambiguous or incorrect terminology but can be clarified by a small edit, fix it and return the updated version.\n"
    '- If the query mixes incompatible concepts, lacks mathematical precision, or cannot be answered without the original text, return "fail".\n'
    "- Only output a JSON object — no commentary or explanation."
)


In [6]:
def _prime_prompt_cache(system_prompts, cache_key):
    """Send one tiny request made of `system_prompts` plus a placeholder user turn.

    Each entry of `system_prompts` becomes its own system message, in order.
    The request is tagged with `cache_key` as its prompt_cache_key and its
    output is capped at 16 tokens, since only the prompt side matters here.
    Returns the raw response object.
    """
    messages = [{"role": "system", "content": prompt} for prompt in system_prompts]
    messages.append({"role": "user", "content": "Priming cache."})
    return client.responses.create(
        model="gpt-4o-mini",
        input=messages,
        max_output_tokens=16,
        temperature=0,
        text={"format": {"type": "json_object"}},
        prompt_cache_key=cache_key,
    )


# NOTE(review): the real generation request additionally passes `instructions`
# and a `text_format` model, and the real evaluation request is sent to
# "gpt-4o" rather than "gpt-4o-mini". Confirm that requests primed this way can
# actually be reused by those calls.
_ = _prime_prompt_cache(
    (SYSTEM_TEXT_TO_QUERY, INSTRUCTIONS_TEXT_TO_QUERY),
    SYSTEM_TEXT_TO_QUERY_KEY,
)

_ = _prime_prompt_cache(
    (SYSTEM_EVALUATE_QUERY,),
    SYSTEM_EVALUATE_QUERY_KEY,
)

In [7]:
# Load the keyword-matched chunks of each ETL output and draw a sample from
# each, in the same order as before (md first, then txt).
md_matches = load_matches(config.MD_JSONL, KEYWORDS)
md_sample = sample_wo_replacement(md_matches, NUMBER_RANDOM_ELEMENTS)
txt_matches = load_matches(config.TXT_JSONL, KEYWORDS)
txt_sample = sample_wo_replacement(txt_matches, NUMBER_RANDOM_ELEMENTS)

# Tag every sampled chunk with the ETL label it came from; md chunks come
# first, txt chunks second.
full_sample = [
    dict(chunk, etl=etl_label)
    for etl_label, sampled in (('md', md_sample), ('txt', txt_sample))
    for chunk in sampled
]

In [8]:
def _generate_query(text):
    """Ask gpt-4o-mini for one question about `text`.

    Returns the parsed TextToQuery, or None when the response carries no
    parsed output.
    """
    response = client.responses.parse(
        model="gpt-4o-mini",
        instructions='Return JSON ONLY. Return no explanatory prose.',
        input=[
            {"role": "system", "content": SYSTEM_TEXT_TO_QUERY},
            {"role": "system", "content": INSTRUCTIONS_TEXT_TO_QUERY},
            # The chunk is sent as a JSON string literal. ensure_ascii=False
            # keeps accented letters and math symbols as real characters
            # instead of \uXXXX escapes. No "JSON Schema:" message is sent:
            # the output schema is already imposed through `text_format`, and
            # such a label directly before the chunk would mislabel it.
            {"role": "user", "content": json.dumps(text, ensure_ascii=False)},
        ],
        temperature=0.85,
        text_format=TextToQuery,
        prompt_cache_key=SYSTEM_TEXT_TO_QUERY_KEY,
    )
    return response.output_parsed


def _evaluate_query(query):
    """Ask gpt-4o to judge `query` using the few-shot evaluation prefix.

    Returns the parsed EvalQuery, or None when the response carries no
    parsed output.
    """
    response = client.responses.parse(
        model="gpt-4o",
        input=[
            *eval_query_input,
            {"role": "user", "content": USER_EVALUATE_QUERY.format(query=query)},
        ],
        temperature=0,
        text_format=EvalQuery,
        prompt_cache_key=SYSTEM_EVALUATE_QUERY_KEY,
    )
    return response.output_parsed


# One record per sampled chunk whose generated question was not rejected by
# the evaluator. Processing is best-effort: a failure on one chunk is reported
# and the loop moves on to the next chunk.
records = []

for chunk in full_sample:
    chunk_id = None  # stays None if the lookup below fails; only used in error reports
    try:
        chunk_id = chunk['chunk_id']
        text = chunk['text']
        etl = chunk['etl']

        generated = _generate_query(text)
        if generated is None:
            print(f'Error (chunk_id={chunk_id}): generation returned no parsed output')
            continue
        if not generated.query:
            # The prompt asks for an empty query when the chunk is purely definitional.
            continue
        query_v1 = generated.query

        evaluation = _evaluate_query(query_v1)
        if evaluation is None:
            print(f'Error (chunk_id={chunk_id}): evaluation returned no parsed output')
            continue
        if evaluation.status == Status.fail:
            continue

        records.append({
            'chunk_id': chunk_id,
            'text': text,
            'etl': etl,
            'query_v1': query_v1,
            # Corrected question; None unless the evaluator supplied one.
            'query_v2': evaluation.query,
            'complexity': generated.complexity,
            'evaluation': evaluation.status.value,
            'reason': evaluation.reason,
        })
    except Exception as e:
        print(f'Error (chunk_id={chunk_id}): {e}')


In [9]:

# Persist the accepted records as JSON Lines: one JSON object per line, with
# non-ASCII characters written as-is in UTF-8.
with open("generated_queries.jsonl", "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(rec, ensure_ascii=False) + "\n" for rec in records
    )