In [None]:
import os
import requests
import re
import time
import json
from datetime import datetime

In [None]:
# API configuration
# The key is read from the OPENROUTER_API_KEY environment variable so the
# secret does not have to be written into the source. The placeholder stays as
# the fallback value so a missing key can still be detected before any request.
API_KEY = os.environ.get("OPENROUTER_API_KEY", "YOUR_API_KEY")
API_URL = "https://openrouter.ai/api/v1/chat/completions"
MODEL_NAME = "qwen/qwen-2.5-72b-instruct"

# Set to True to run on the sample text.
# Set to False to run on the full 'scraped_content' directory.
TEST_MODE = False

# Input/output locations and request settings
INPUT_DIR = "scraped_content"
OUTPUT_FILE = "QA_pairs.json"
REQUEST_TIMEOUT = 600  # Seconds, passed as the `timeout` of each API request

# Generates structured JSON text containing Q&A pairs and metadata.
def generate_structured_qa_pair(text_content):
    """Ask the model for Q&A pairs and metadata based on ``text_content``.

    Builds a system prompt and a user prompt (which embeds ``text_content``),
    posts them to ``API_URL`` using ``MODEL_NAME`` with a JSON-object response
    format requested, and returns the content of the first choice.

    Args:
        text_content: Text the generated questions and answers must rely on.

    Returns:
        The message content of the first choice in the API response (not
        parsed or validated here), or ``None`` if the request fails or the
        response does not have the expected ``choices[0].message.content``
        structure.
    """
    print("Generating structured Q&A pairs...")

    system_prompt = (
        "You are an AI assistant designed to simulate the perspective of an average citizen interacting with government services. Your goal is to:\n"
        "1. Generate realistic, practical, and clear questions (prompts) that ordinary people would ask after reading the provided government website text.\n"
        "2. Provide concise, helpful answers (responses) using ONLY the information in the provided text.\n"
        "3. Assign metadata based on the taxonomy of government services.\n\n"
        "Rules:\n"
        "- Avoid speculation, assumptions, or information not present in the input text.\n"
        "- Avoid jargon; use simple, clear language.\n"
        "- Ensure compliance with responsible AI: no explicit, discriminatory, or sensitive content.\n"
        "- Return output strictly in JSON format as instructed."
    )

    # NOTE: every fragment must end with a newline; adjacent literals are
    # concatenated as-is, so a missing "\n" glues two prompt lines together.
    user_prompt = (
        "### TASK\n"
        "Follow this 3-step process and repeat for 4 iterations per input text:\n"
        "STEP 1: Draft Q&A\n"
        "1.1. Generate realistic user question (“prompt”).\n"
        "1.2. Provide a simple, concise answer (“response”) using only the INPUT TEXT.\n"
        "STEP 2: Metadata Tagging\n"
        "Infer the following attributes:\n"
        "# User Demographics: Who is asking the question?\n"
        "- targetAgeGroup: (Select one: under18, 18-25, 26-45, 46-65, 65+)\n"
        "- genderIdentity: (Select one: female, male, non-binary, unspecified)\n"
        "- educationBackground: (e.g., none, primary, secondary, graduate, postgraduate)\n"
        "- targetProfession: (e.g., student, teacher, healthcare worker, farmer, pensioner, unemployed, business owner, etc.)\n"
        "- digitalLiteracy: (Select one: low, medium, high)\n"
        "- geoRegion: (Select one: England, Scotland, Wales, Northern Ireland, other)\n"
        "- householdIncomeStatus: (Select one: under poverty limit, moderate, above moderate)\n"
        "- targetRole: (e.g., individual citizen, parent, widow, caregiver, immigrant)\n"
        "# User Intent: Intent of user or context of the prompt\n"
        "- promptIntentType: (Select one: informational, navigational, transactional, procedural, comparative, legal interpretation, personalized guidance, grievance / appeals)\n"
        "# Prompt Complexity\n"
        "- reasoningComplexity: (Select one: Factual Lookup, Procedural Explanation, Multi-step Reasoning, Legal/Policy Reasoning)\n"
        "# Geographic Context: Captures regional relevance\n"
        "- geographicContext: (e.g., 'UK-wide', 'England', 'Scotland', 'Wales', 'Northern Ireland', 'N/A')\n"
        "# Risk Assessment: Assessing risk/bias\n"
        "- sensitiveInformationPresent: (boolean)\n"
        "- vulnerableGroupTargeted: (boolean)\n"
        "- confidenceScore: (A float between 0.0 and 1.0)\n"
        "STEP 3: Output\n"
        "Return ONLY the final result as a JSON object.\n\n"
        f"### INPUT TEXT\n{text_content}\n\n"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    data = {
        "model": MODEL_NAME,
        "messages": messages,
        "response_format": {"type": "json_object"},
    }
    headers = {"Authorization": f"Bearer {API_KEY}"}

    try:
        response = requests.post(API_URL, headers=headers, json=data, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()['choices'][0]['message']['content']
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as e:
        # Covers transport/HTTP failures, a non-JSON body, and a JSON body
        # that lacks the expected choices[0].message.content path.
        print(f"  [ERROR] Failed to generate structured data: {e}")
        return None

def _natural_key(name):
    """Sort key that compares digit runs numerically (e.g. 'a2' before 'a10')."""
    return [int(part) if part.isdigit() else part.lower() for part in re.split(r'(\d+)', name)]


def _parse_scraped_file(filepath, filename):
    """Read one input file and split it into domain, topic and content.

    The first line is expected to look like 'category: subcategory'. When it
    does, it is removed from the content; otherwise both domains are set to
    'Unknown' and the whole file is used as content. The topic is the third
    line of the file when present.

    Returns:
        A tuple ``(service_domain, sub_service_domain, topic, content)``, or
        ``None`` when the file is empty or has no usable content.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    if not lines:
        print("  - File is empty, skipping.")
        return None

    domain_parts = lines[0].strip().split(':', 1)
    if len(domain_parts) == 2:
        service_domain = domain_parts[0].strip()
        sub_service_domain = domain_parts[1].strip()
        body_lines = lines[1:]
    else:
        print(f"  [WARNING] First line of {filename} not in 'category: subcategory' format. Setting domains to 'Unknown'.")
        service_domain = "Unknown"
        sub_service_domain = "Unknown"
        body_lines = lines

    topic = lines[2].strip() if len(lines) > 2 else "Unknown"
    content = "".join(body_lines).strip()
    if not content:
        print("  - File has no content after the first line, skipping.")
        return None

    return service_domain, sub_service_domain, topic, content


def _extract_qa_list(qa_data):
    """Normalise the decoded model output to a list of candidate Q&A objects.

    A list is returned as-is; a dict that has a list among its values yields
    the first such list; anything else is wrapped in a one-element list.
    """
    if isinstance(qa_data, list):
        return qa_data
    if isinstance(qa_data, dict):
        for value in qa_data.values():
            if isinstance(value, list):
                return value
    return [qa_data]


def _attach_source_metadata(qa_object, service_domain, sub_service_domain, topic, source_url):
    """Add the source-related metadata fields to ``qa_object`` in place."""
    metadata = qa_object.get('metadata')
    if not isinstance(metadata, dict):
        # Missing or malformed metadata from the model: start from an empty dict.
        metadata = {}
        qa_object['metadata'] = metadata
    metadata.update({
        'serviceDomain': service_domain,
        'subServiceDomain': sub_service_domain,
        'topic': topic,
        'sourceURL': source_url,
        'sourceDomain': 'www.gov.uk',
        'sourceLicense': 'Open Government Licence (OGL) v3.0',
        'documentType': 'webpage',
        'dateCreated': datetime.now().strftime('%Y-%m-%d'),
        'language': 'en',
    })


def _qa_pairs_for_file(filepath, filename):
    """Generate the Q&A objects for one input file.

    Returns:
        ``None`` when the file was skipped before any API request was made,
        otherwise a (possibly empty) list of Q&A dicts with source metadata
        attached.
    """
    parsed = _parse_scraped_file(filepath, filename)
    if parsed is None:
        return None
    service_domain, sub_service_domain, topic, content = parsed

    url_category = service_domain.lower().replace(' ', '-').replace('&', 'and')
    source_url = f"https://www.gov.uk/browse/{url_category}"

    structured_json_str = generate_structured_qa_pair(content)
    if not structured_json_str:
        return []

    try:
        qa_data = json.loads(structured_json_str)
    except json.JSONDecodeError as e:
        print(f"  [ERROR] Failed to parse JSON response for file {filename}: {e}")
        print(f"  Raw response: {structured_json_str}")
        return []

    pairs = []
    for qa_object in _extract_qa_list(qa_data):
        if not isinstance(qa_object, dict):
            # Metadata can only be attached to JSON objects; skip anything
            # else rather than aborting the remaining items of this file.
            print(f"  [WARNING] Skipping non-object item in model output for {filename}.")
            continue
        _attach_source_metadata(qa_object, service_domain, sub_service_domain, topic, source_url)
        pairs.append(qa_object)
    return pairs


# Main function to generate Q&A pairs and save to JSON
def main():
    """Generate Q&A objects from the input files and write them to ``OUTPUT_FILE``.

    Exits early (without writing) when ``API_KEY`` is unset or still the
    placeholder, when ``INPUT_DIR`` does not exist, or when full mode finds no
    ``.txt`` files. With ``TEST_MODE`` on, only ``target997.txt`` is processed
    and only its first Q&A object is kept; otherwise every ``.txt`` file in
    ``INPUT_DIR`` is processed in natural sort order and all returned Q&A
    objects are kept. Per-file failures are printed and do not stop the run.
    """
    print("--- Structured Q&A Pair Generation Script ---")
    if not API_KEY or API_KEY == "YOUR_API_KEY":
        print("FATAL ERROR: API_KEY is not set. Please edit the script and add your OpenRouter API key.")
        return

    all_qa_pairs = []

    if TEST_MODE:
        print("--- RUNNING IN TEST MODE ---")
        if not os.path.isdir(INPUT_DIR):
            print(f"FATAL ERROR: Input directory '{INPUT_DIR}' not found for Test Mode.")
            return

        test_filename = "target997.txt"
        print(f"Processing test sample from file: '{test_filename}'...")
        try:
            pairs = _qa_pairs_for_file(os.path.join(INPUT_DIR, test_filename), test_filename)
        except Exception as e:
            print(f"  [FATAL TEST ERROR] Could not process test file: {e}")
        else:
            # Test mode keeps a single Q&A object even if the model returns several.
            if pairs:
                all_qa_pairs.append(pairs[0])

    else:
        print("--- RUNNING IN FULL DIRECTORY MODE ---")
        if not os.path.isdir(INPUT_DIR):
            print(f"FATAL ERROR: Input directory '{INPUT_DIR}' not found.")
            return

        files = sorted((f for f in os.listdir(INPUT_DIR) if f.endswith('.txt')), key=_natural_key)
        if not files:
            print(f"FATAL ERROR: No .txt files found in '{INPUT_DIR}'.")
            return

        print(f"Found {len(files)} unique files. Will collect every Q&A pair returned per file.")

        for position, filename in enumerate(files, start=1):
            print(f"\n[{position}/{len(files)}] Processing file: {filename}...")
            filepath = os.path.join(INPUT_DIR, filename)

            try:
                pairs = _qa_pairs_for_file(filepath, filename)
            except Exception as e:
                print(f"  [FATAL FILE ERROR] Could not process file {filename}: {e}")
                continue

            if pairs is None:
                # Skipped before any request was made, so no pause is needed.
                continue

            all_qa_pairs.extend(pairs)
            time.sleep(1)  # Pause between API requests

    # Save all collected Q&A pairs to a single JSON file
    print("\n-----------------------------------------")
    print(f"Saving {len(all_qa_pairs)} structured Q&A objects to '{OUTPUT_FILE}'...")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(all_qa_pairs, f, indent=4)

    print("Structured Q&A pair generation process complete.")
    print("-----------------------------------------")

# Run main() only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

SyntaxError: unterminated string literal (detected at line 26) (1803087406.py, line 26)