### Generate MCQs using ReQUESTA (GPT-5 + Quality first)

In [None]:
import pandas as pd
from pathlib import Path
from src.workflow import question_generation_workflow

In [2]:
# CSV locations, relative to the notebook's working directory.
INPUT_CSV, OUTPUT_CSV = (
    "./some_recorded_output/source_texts.csv",   # read with pd.read_csv as the input table
    "./some_recorded_output/requesta_mcqs.csv",  # generated question rows are appended here
)

In [4]:
# --- load & validate input ---
texts = pd.read_csv(INPUT_CSV)

# The generation loop reads row["textID"] and row["text"]; fail fast with a
# clear message here instead of a bare KeyError on the first processed row.
_missing_cols = [col for col in ("textID", "text") if col not in texts.columns]
if _missing_cols:
    raise ValueError(
        f"{INPUT_CSV} is missing required column(s): {_missing_cols}; "
        f"found columns: {list(texts.columns)}"
    )

In [None]:

# --- generation knobs ---
# Each value below is forwarded as a keyword argument to question_generation_workflow.
FACT, INFERENCE, MAIN_IDEA = 2, 2, 1  # passed as fact=, inference=, main_idea=
MODEL = "gpt-5"                       # passed as model=
QUALITY_FIRST = True                  # passed as quality_first=
CONCURRENCY_LIMIT = 3                 # passed as concurrency=

# --- resume controls ---
# Rows at 0-based positions START_AT (inclusive) up to END_AT (exclusive) are visited;
# END_AT = None means "process to the end".
START_AT, END_AT = 0, None
# When True and OUTPUT_CSV already exists, rows whose textID is already written are skipped.
SKIP_ALREADY_WRITTEN = True


# --- validate / clamp the resume window ---
n = len(texts)

# A negative START_AT would make range() yield negative positions, and .iloc
# resolves those from the END of the frame (position -1 is the last row), so
# the run would silently start on the wrong rows. Reject it up front.
if START_AT < 0:
    raise ValueError(f"START_AT must be >= 0 (got {START_AT})")

# None means "process to the end"; an END_AT past the last row is clamped to n.
if END_AT is None or END_AT > n:
    END_AT = n

# Make an empty window visible instead of finishing without explanation.
if START_AT >= END_AT:
    print(f"Nothing to process: START_AT={START_AT}, END_AT={END_AT}, rows={n}")

# --- prepare output / dedup on resume ---
out_path = Path(OUTPUT_CSV)
out_path.parent.mkdir(parents=True, exist_ok=True)

# The header row must be written whenever the file has no content yet: either
# it does not exist, or it exists but is zero bytes (e.g. left behind by an
# aborted run). Checking existence alone would append headerless rows to an
# empty file.
header_needed = not out_path.exists() or out_path.stat().st_size == 0

# textIDs already present in the output file; the processing loop skips these
# when SKIP_ALREADY_WRITTEN is set.
processed_ids = set()
if SKIP_ALREADY_WRITTEN and not header_needed:
    try:
        processed_ids = set(pd.read_csv(out_path, usecols=["textID"])["textID"].tolist())
    except pd.errors.EmptyDataError:
        # Non-zero size but nothing parseable (e.g. only blank lines): the
        # file has no header yet, so the first append must write one.
        header_needed = True
    except Exception as exc:
        # Best effort: an unreadable or malformed existing file must not block
        # the run, but say so — without the ID set nothing will be skipped.
        print(
            f"Warning: could not read textIDs from {out_path} ({exc}); "
            "no rows will be skipped and new rows will be appended as-is."
        )

# --- process rows with iloc so we know exact positions ---
for i in range(START_AT, END_AT):
    row = texts.iloc[i]
    tid = row["textID"]
    txt = row["text"]

    if SKIP_ALREADY_WRITTEN and tid in processed_ids:
        print(f"Skipping {i+1}/{n} (textID={tid}) — already in output.")
        continue

    if not isinstance(txt, str) or not txt.strip():
        print(f"Skipping empty text at row {i+1} (textID={tid})")
        continue

    print(f"Processing {i+1}/{n} (textID={tid})")
    try:
        questions = await question_generation_workflow(
            text=txt,
            fact=FACT,
            inference=INFERENCE,
            main_idea=MAIN_IDEA,
            model=MODEL,
            quality_first=QUALITY_FIRST,
            concurrency=CONCURRENCY_LIMIT, 
        )
        # build rows (one per question)
        records = []
        for q in questions or []:
            records.append({
                "textID": tid,
                "text": txt,
                "requesta_question_type": q.get("question_type"),
                "requesta_question": q.get("mcq"),
                "requesta_answer": q.get("mcq_answer"),
            })
        # append to CSV immediately (good for resume-after-crash)
        if records:
            pd.DataFrame(records, columns=[
                "textID","text","requesta_question_type","requesta_question","requesta_answer"
            ]).to_csv(out_path, mode="a", index=False, header=header_needed)
            header_needed = False
            processed_ids.add(tid)
        else:
            print(f"  No questions returned for textID={tid}")
    except Exception as e:
        print(f"Error at row {i+1} (textID={tid}): {e}")
        break  # stop here so you can resume later with START_AT=i

print("Done (or stopped on error). Output:", str(out_path))
