In [2]:
"""
Exam Parser Script
===================

Converts a Markdown exam output file into a list of Question objects.

IMPORTANT: This script uses Python's dataclass instead of Pydantic BaseModel for 
portability. To use Pydantic BaseModel, replace the Question class definition with:

    from pydantic import BaseModel
    
    class Question(BaseModel):
        id: int
        question: str
        lead_in: str = ""
        option_A: str = "A"
        option_B: str = "B"
        option_C: str = "C"
        option_D: str = "D"
        option_E: str = "E"
        option_A_url: str = ""
        option_B_url: str = ""
        option_C_url: str = ""
        option_D_url: str = ""
        option_E_url: str = ""
        section_id: str = ""

Then install pydantic: pip install pydantic
"""

import re
from typing import List, Optional, Tuple
from dataclasses import dataclass, field, asdict


@dataclass
class Question:
    """One multiple-choice exam question with five answer slots (A-E).

    Only ``id`` and ``question`` are required. Every other field has a
    default: option texts default to their own letter, while URLs,
    ``lead_in`` and ``section_id`` default to the empty string.
    """

    # Required fields.
    id: int
    question: str

    # Optional prompt shown in addition to the main question text.
    lead_in: str = ""

    # Option texts; each defaults to its own letter as a placeholder.
    option_A: str = "A"
    option_B: str = "B"
    option_C: str = "C"
    option_D: str = "D"
    option_E: str = "E"

    # Optional image URL per option; empty when there is none.
    option_A_url: str = ""
    option_B_url: str = ""
    option_C_url: str = ""
    option_D_url: str = ""
    option_E_url: str = ""

    # Identifier of the exam section the question belongs to.
    section_id: str = ""

    def dict(self):
        """Return all fields as a plain ``dict`` suitable for JSON dumping."""
        field_values = asdict(self)
        return field_values


# Robust question splitting regex from question_split.py
#
# Matches a question-number marker. It is used with ``.split()`` so that each
# captured number can be paired with the text that follows it.
#
# The pattern has exactly three capture groups, one per alternative, so for
# any single match at most one of them is non-None:
#   1. digits alone inside <p>/<b> tags;
#   2. digits alone inside markdown bold (``**12**``);
#   3. digits at the start of a line (optionally after '#' heading marks), not
#      directly followed by '.', and then either the end of the line,
#      whitespace followed by something that is neither a digit nor one of the
#      listed units (cm, mm, m, kg, g, %), or newlines.
#
# Flags: VERBOSE allows the layout and comments inside the pattern, MULTILINE
# makes '^' / '$' work per line, IGNORECASE makes the tag and unit letters
# case-insensitive.
QUESTION_SPLIT = re.compile(
    r'''
    (?:
        # ── HTML-wrapped number: must be ONLY digits
        <(?:p|b)>\s*(\d+)\s*</(?:p|b)>

      | # ── markdown bold number: must be ONLY digits
        \*\*\s*(\d+)\s*\*\*

      | # ── plain / heading number (line-based)
        ^\s*(?:\#+\s*)?
        (\d+)
        (?!\.)
        (?:
            \s*$                # number-only line
          | \s+(?!\d)            # not a sequence
             (?!cm\b|mm\b|m\b|kg\b|g\b|%)
          | \n+
        )
    )
    ''',
    re.VERBOSE | re.MULTILINE | re.IGNORECASE
)


def extract_questions_with_numbers(text: str) -> List[Tuple[str, str]]:
    """Split ``text`` into ``(question_number, question_body)`` pairs.

    The text is split on ``QUESTION_SPLIT``. Anything before the first
    number marker is discarded. Both elements of each pair are stripped
    of surrounding whitespace.
    """
    # Splitting on a pattern with three capture groups yields:
    #   [prefix, g1, g2, g3, body, g1, g2, g3, body, ...]
    pieces = QUESTION_SPLIT.split(text)
    numbered = []

    for start in range(1, len(pieces), 4):
        # Whichever alternative matched, its group holds the number.
        captured = [g for g in pieces[start:start + 3] if g is not None]
        number = captured[0] if captured else None
        body = pieces[start + 3] if start + 3 < len(pieces) else ""

        if number:
            numbered.append((number.strip(), body.strip()))

    return numbered


def extract_image_urls(text: str) -> List[str]:
    """Return the http(s) URL of each markdown image ``![alt](url)`` in order.

    Images whose target is not an ``http://`` or ``https://`` URL are ignored.
    """
    image_link = re.compile(r'!\[.*?\]\((https?://[^\s\)]+)\)')
    return [found.group(1) for found in image_link.finditer(text)]


def remove_image_markdown(text: str) -> str:
    """Delete every markdown image (``![alt](target)``) and strip the result.

    The image targets are not returned; collect them beforehand if needed.
    """
    image_markup = re.compile(r'!\[.*?\]\([^\)]+\)')
    without_images = image_markup.sub('', text)
    return without_images.strip()


def embed_images_as_html(urls: List[str]) -> str:
    """Render each URL as ``<p><img src="..."></p>``, one tag per line.

    Returns the empty string when ``urls`` is empty.
    """
    return '\n'.join(f'<p><img src="{link}"></p>' for link in urls) if urls else ""


def parse_options(text: str) -> dict:
    """Read ``- A text`` / ``- A) text`` / ``- A. text`` lines into an options dict.

    Returns a dict with keys ``option_A`` .. ``option_E``. A letter with no
    matching line keeps its own letter as the value. If a letter appears
    more than once, the last occurrence wins.
    """
    option_line = re.compile(r'^\s*-\s*([A-E])[\)\.]?\s+(.+?)$')

    # Start from placeholders so every key is always present.
    parsed = {f'option_{letter}': letter for letter in 'ABCDE'}

    for raw_line in text.split('\n'):
        hit = option_line.match(raw_line.strip())
        if hit is None:
            continue
        letter, value = hit.groups()
        parsed[f'option_{letter}'] = value.strip()

    return parsed


def parse_table_options(text: str) -> dict:
    """Parse options from markdown tables (for NVR-style vertical tables).

    A "vertical" table is one whose last row is exactly ``A | B | C | D | E``.
    The row directly above it supplies the option texts, column by column.
    Any other layout leaves the placeholder values untouched.

    Table delimiter rows (``|---|---|``, ``| --- | --- |``, ``|:---:|---:|``)
    are ignored, so a delimiter row sitting between the option row and the
    label row is never mistaken for the options themselves.

    Args:
        text: Markdown text that may contain a pipe table.

    Returns:
        Dict with keys ``option_A`` .. ``option_E``.
    """
    labels = ['A', 'B', 'C', 'D', 'E']
    options = {f'option_{letter}': letter for letter in labels}

    # A delimiter cell is hyphens with an optional leading/trailing colon.
    delimiter_cell = re.compile(r':?-+:?')

    table_data = []
    for line in text.split('\n'):
        if '|' not in line or line.strip().startswith('|---'):
            continue
        cells = [cell.strip() for cell in line.split('|')]
        cells = [c for c in cells if c]  # Remove empty cells
        if not cells:
            continue
        # Skip delimiter rows however they are spaced or aligned.
        if all(delimiter_cell.fullmatch(c) for c in cells):
            continue
        table_data.append(cells)

    # Vertical format: the label row is last and the option row is just above it.
    if len(table_data) >= 2 and table_data[-1] == labels:
        for letter, value in zip(labels, table_data[-2]):
            options[f'option_{letter}'] = value

    return options


def extract_section(text: str, section_name: str, next_section: Optional[str] = None) -> str:
    """Return the text that follows the ``section_name`` heading.

    Headings are matched case-insensitively on their own line, at level
    ``#`` first and then ``##``. If ``next_section`` is given and its
    heading occurs after the start, the result stops just before it.
    Otherwise the result runs to the end of ``text``. Returns ``""`` when
    the ``section_name`` heading is absent.
    """
    search_flags = re.IGNORECASE | re.MULTILINE

    def find_heading(title: str, haystack: str):
        # Level-1 headings take precedence over level-2 headings.
        for hashes in ('#', '##'):
            found = re.search(rf'^{hashes}\s+{re.escape(title)}\s*$', haystack, search_flags)
            if found:
                return found
        return None

    heading = find_heading(section_name, text)
    if heading is None:
        return ""

    remainder = text[heading.end():]

    if next_section:
        following = find_heading(next_section, remainder)
        if following is not None:
            return remainder[:following.start()]

    return remainder


def parse_comprehension(text: str, start_id: int) -> List[Question]:
    """Build one Question per comprehension item found in ``text``.

    Everything before the phrase "Please answer these questions" is treated
    as the passage and stored in each question's ``question`` field. The
    individual prompt goes into ``lead_in``. Returns an empty list when
    that phrase is absent. Ids are assigned consecutively from ``start_id``.
    """
    marker = re.search(r'Please answer these questions', text, re.IGNORECASE)
    if not marker:
        return []

    # --- Passage: text before the marker, minus instruction and image lines.
    raw_passage = text[:marker.start()].strip()
    raw_passage = re.sub(
        r'^.*?Read this passage carefully.*?\n', '', raw_passage, flags=re.IGNORECASE
    )

    image_caption = re.compile(r'^[A-Z].*\.(jpg|png|gif)$')

    def is_image_line(line: str) -> bool:
        stripped = line.strip()
        if stripped.startswith('![') or 'cloudinary' in line:
            return True
        return bool(image_caption.match(stripped))

    passage = '\n'.join(
        line for line in raw_passage.split('\n') if not is_image_line(line)
    ).strip()

    # --- Questions: everything after the marker.
    option_start = re.compile(r'^\s*-\s*[A-E][\)\.]?\s+')
    parsed = []
    next_id = start_id

    for _number, block in extract_questions_with_numbers(text[marker.end():]):
        block_lines = block.split('\n')
        # The first option line, and every line after it, is option text.
        split_at = next(
            (
                idx
                for idx, line in enumerate(block_lines)
                if line.strip().startswith('- ') and option_start.match(line)
            ),
            len(block_lines),
        )
        lead_in = '\n'.join(block_lines[:split_at]).strip()
        options = parse_options('\n'.join(block_lines[split_at:]))

        parsed.append(
            Question(
                id=next_id,
                question=passage,
                lead_in=lead_in,
                section_id="ENGLISH_COMPREHENSION",
                **options
            )
        )
        next_id += 1

    return parsed


def parse_spelling(text: str, start_id: int) -> List[Question]:
    """Build one Question per numbered spelling sentence in ``text``.

    The first non-blank line of each numbered block becomes the question.
    Options are fixed: A-D are their own letters (they refer to parts of
    the sentence) and E is always ``"N"`` for "no mistake". Ids are
    assigned consecutively from ``start_id``.
    """
    fixed_options = {
        'option_A': 'A',
        'option_B': 'B',
        'option_C': 'C',
        'option_D': 'D',
        'option_E': 'N',  # "no mistake"
    }

    parsed = []
    for offset, (_number, block) in enumerate(extract_questions_with_numbers(text)):
        non_blank = [piece for piece in (row.strip() for row in block.split('\n')) if piece]
        sentence = non_blank[0] if non_blank else ""
        parsed.append(
            Question(
                id=start_id + offset,
                question=sentence,
                section_id="ENGLISH_SPELLING",
                **fixed_options
            )
        )

    return parsed


def parse_punctuation(text: str, start_id: int) -> List[Question]:
    """Build one Question per numbered punctuation sentence in ``text``.

    Works like the spelling section: the first non-blank line of each
    numbered block is the question, A-D are placeholder letters pointing at
    parts of the sentence, and E is always ``"N"`` ("no mistake"). Ids are
    assigned consecutively from ``start_id``.
    """
    parsed = []
    question_id = start_id

    for _number, block in extract_questions_with_numbers(text):
        sentence = ""
        for row in block.split('\n'):
            candidate = row.strip()
            if candidate:
                sentence = candidate
                break

        parsed.append(
            Question(
                id=question_id,
                question=sentence,
                section_id="ENGLISH_PUNCTUATION",
                option_A='A',
                option_B='B',
                option_C='C',
                option_D='D',
                option_E='N',  # "no mistake"
            )
        )
        question_id += 1

    return parsed


def parse_cloze(text: str, start_id: int) -> List[Question]:
    """Build one Question per numbered gap of a cloze passage.

    Lines up to the first bare number (``12`` or ``**12**``) form the
    passage, stored in every question's ``question`` field. The rest is
    split into numbered blocks. A block containing ``|`` is read as an
    options table. Otherwise the options are the placeholder letters and
    the lines before the first single-letter line become the ``lead_in``.
    Ids are assigned consecutively from ``start_id``.
    """
    all_lines = text.split('\n')

    def starts_numbering(line: str) -> bool:
        stripped = line.strip()
        return bool(
            re.match(r'^\s*\d+\s*$', stripped)
            or re.match(r'^\*\*\s*\d+\s*\*\*', stripped)
        )

    # Index of the first question-number line (or end of text if none).
    first_question = next(
        (idx for idx, line in enumerate(all_lines) if starts_numbering(line)),
        len(all_lines),
    )

    passage = '\n'.join(all_lines[:first_question]).strip()
    # Drop the instruction lines that precede the passage proper.
    passage = re.sub(
        r'^.*?In this passage you have to choose.*?\n', '', passage, flags=re.IGNORECASE
    )
    passage = re.sub(
        r'^.*?Choose the.*?answer sheet\.\s*\n', '', passage,
        flags=re.IGNORECASE | re.MULTILINE
    )

    option_letters = ('A', 'B', 'C', 'D', 'E')
    parsed = []
    next_id = start_id

    for _number, block in extract_questions_with_numbers('\n'.join(all_lines[first_question:])):
        if '|' in block:
            # Table format: options come from the table, context precedes it.
            options = parse_table_options(block)
            lead_in = block[:block.find('|')].strip()
        else:
            # Plain format: bare letters only, so no option text is available.
            context = []
            for entry in (row.strip() for row in block.split('\n')):
                if not entry:
                    continue
                if entry in option_letters:
                    # Nothing after the first option letter counts as context.
                    break
                context.append(entry)
            lead_in = '\n'.join(context).strip()
            options = {f'option_{letter}': letter for letter in option_letters}

        parsed.append(
            Question(
                id=next_id,
                question=passage,
                lead_in=lead_in,
                section_id="ENGLISH_CLOZE",
                **options
            )
        )
        next_id += 1

    return parsed


def parse_english_section(text: str, start_id: int) -> List[Question]:
    """Parse the four ENGLISH sub-sections and return their questions in order.

    Sub-sections are located by their instruction phrases. Comprehension,
    spelling and punctuation are each parsed only when both their own
    marker and the next sub-section's marker are found. Cloze runs from its
    marker to the end of ``text``. Ids continue consecutively from
    ``start_id`` across sub-sections.
    """
    def locate(phrase_pattern: str):
        return re.search(phrase_pattern, text, re.IGNORECASE)

    comprehension_at = locate(r'Read this passage carefully')
    spelling_at = locate(r'Spelling Exercises')
    punctuation_at = locate(r'In these sentences there are some \*\*punctuation\*\* mistakes')
    cloze_at = locate(r'In this passage you have to choose the \*\*best\*\* word')

    # (start marker, end marker, parser) for the bounded sub-sections.
    bounded_stages = [
        (comprehension_at, spelling_at, parse_comprehension),
        (spelling_at, punctuation_at, parse_spelling),
        (punctuation_at, cloze_at, parse_punctuation),
    ]

    collected = []
    next_id = start_id

    for begin, finish, parser in bounded_stages:
        if begin and finish:
            batch = parser(text[begin.start():finish.start()], next_id)
            collected.extend(batch)
            next_id += len(batch)

    if cloze_at:
        # Cloze is the last sub-section, so it extends to the end of the text.
        collected.extend(parse_cloze(text[cloze_at.start():], next_id))

    return collected


def parse_mathematics_section(text: str, start_id: int) -> List[Question]:
    """Build one Question per numbered item of the MATHEMATICS section.

    Image markdown is removed from each block. The image URLs are re-added
    as HTML ``<img>`` tags in front of the question text. Lines from the
    first ``- A ...`` style option line onwards are parsed as options. Ids
    are assigned consecutively from ``start_id``.
    """
    option_start = re.compile(r'^\s*-\s*[A-E][\)\.]?\s+')
    parsed = []

    for offset, (_number, block) in enumerate(extract_questions_with_numbers(text)):
        # Collect URLs before the markdown that carries them is stripped.
        urls = extract_image_urls(block)
        body_lines = remove_image_markdown(block).split('\n')

        cut = next(
            (
                idx
                for idx, line in enumerate(body_lines)
                if line.strip().startswith('- ') and option_start.match(line)
            ),
            len(body_lines),
        )

        stem = '\n'.join(body_lines[:cut]).strip()
        if urls:
            stem = embed_images_as_html(urls) + '\n' + stem

        options = parse_options('\n'.join(body_lines[cut:]))

        parsed.append(
            Question(
                id=start_id + offset,
                question=stem,
                section_id="MATHEMATICS",
                **options
            )
        )

    return parsed


def parse_verbal_reasoning_section(text: str, start_id: int) -> List[Question]:
    """Build one Question per numbered item of the VERBAL REASONING section.

    Each block has its image markdown removed, with the image URLs placed
    as HTML ``<img>`` tags ahead of the question text. Everything from the
    first ``- A ...`` style line onwards is treated as the option list. Ids
    are assigned consecutively from ``start_id``.
    """
    results = []
    question_id = start_id

    for _label, raw_block in extract_questions_with_numbers(text):
        picture_urls = extract_image_urls(raw_block)

        stem_lines = []
        option_lines = []
        bucket = stem_lines  # switches to option_lines at the first option line
        for line in remove_image_markdown(raw_block).split('\n'):
            if (
                bucket is stem_lines
                and line.strip().startswith('- ')
                and re.match(r'^\s*-\s*[A-E][\)\.]?\s+', line)
            ):
                bucket = option_lines
            bucket.append(line)

        question_text = '\n'.join(stem_lines).strip()
        if picture_urls:
            # Images go in front of the question text.
            question_text = embed_images_as_html(picture_urls) + '\n' + question_text

        results.append(
            Question(
                id=question_id,
                question=question_text,
                section_id="VERBAL_REASONING",
                **parse_options('\n'.join(option_lines))
            )
        )
        question_id += 1

    return results


def parse_exam(file_path: str) -> List[Question]:
    """Parse an exam markdown file into a flat list of questions.

    The ENGLISH, VERBAL REASONING and MATHEMATICS sections are extracted in
    that order. A missing section is skipped. Question ids start at 1 and
    run consecutively across all sections.

    Args:
        file_path: Path of the UTF-8 markdown file to read.

    Returns:
        All parsed questions, in section order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # (section heading, heading that ends it, parser for the section body).
    # extract_section() already runs to the end of the text when the closing
    # heading is absent, so MATHEMATICS needs no separate retry for a file
    # without a NON-VERBAL REASONING section.
    section_plan = [
        ("ENGLISH", "VERBAL REASONING", parse_english_section),
        ("VERBAL REASONING", "MATHEMATICS", parse_verbal_reasoning_section),
        ("MATHEMATICS", "NON-VERBAL REASONING", parse_mathematics_section),
    ]

    all_questions = []
    for heading, closing_heading, parse_section in section_plan:
        section_text = extract_section(content, heading, closing_heading)
        if not section_text:
            continue
        # Ids are 1-based and continue from the questions collected so far.
        all_questions.extend(parse_section(section_text, len(all_questions) + 1))

    return all_questions


# Example usage: parse output.md, preview three questions, export to JSON.
if __name__ == "__main__":
    import json

    questions = parse_exam("output.md")
    print(f"Total questions extracted: {len(questions)}")

    # Show the first few questions so the parse can be checked by eye.
    for i, q in enumerate(questions[:3]):
        lead_in_preview = f"Lead-in: {q.lead_in[:50]}..." if q.lead_in else "Lead-in: (empty)"
        print(f"\n--- Question {i+1} ---")
        print(f"ID: {q.id}")
        print(f"Section: {q.section_id}")
        print(f"Question: {q.question[:100]}...")
        print(lead_in_preview)
        print(f"Options: A={q.option_A[:30]}, B={q.option_B[:30]}, C={q.option_C[:30]}, D={q.option_D[:30]}, E={q.option_E[:30]}")

    # Export to JSON (optional); ensure_ascii=False keeps non-ASCII text readable.
    serialised = json.dumps([item.dict() for item in questions], indent=2, ensure_ascii=False)
    with open("questions.json", "w", encoding="utf-8") as f:
        f.write(serialised)
    print("\n✓ Questions exported to questions.json")

Total questions extracted: 150

--- Question 1 ---
ID: 1
Section: ENGLISH_COMPREHENSION
Question: ## The Strange Case of Dr Jekyll and Mr Hyde

by Robert Louis Stevenson

*Dr Jekyll is a well-respec...
Lead-in: In which city is the novel set?...
Options: A=London, B=Manchester, C=Birmingham, D=Oxford, E=Cambridge

--- Question 2 ---
ID: 2
Section: ENGLISH_COMPREHENSION
Question: ## The Strange Case of Dr Jekyll and Mr Hyde

by Robert Louis Stevenson

*Dr Jekyll is a well-respec...
Lead-in: What type of word is "ferocity" (line 2)?...
Options: A=noun, B=preposition, C=verb, D=adverb, E=adjective

--- Question 3 ---
ID: 3
Section: ENGLISH_COMPREHENSION
Question: ## The Strange Case of Dr Jekyll and Mr Hyde

by Robert Louis Stevenson

*Dr Jekyll is a well-respec...
Lead-in: The victim of the crime is described as being of “...
Options: A=that he was on a high platform, B=that he was a man of the upper, C=that he always held his head u, D=that he was rich, E=that he was a man of the lower


In [1]:
import pandas as pd

# Parser field name -> column header of the exported CSV.
# The order of this mapping is also the column order of the file.
EXPORT_COLUMNS = {
    "question": "Question",
    "lead_in": "Lead In",
    "option_A": "Option A",
    "option_B": "Option B",
    "option_C": "Option C",
    "option_D": "Option D",
    "option_E": "Option E",
    "option_A_url": "Option A Image URL",
    "option_B_url": "Option B Image URL",
    "option_C_url": "Option C Image URL",
    "option_D_url": "Option D Image URL",
    "option_E_url": "Option E Image URL",
    "section_id": "Section External ID"
}

# dtype=False keeps every value as stored in the JSON. pandas' default dtype
# inference may coerce a text column whose values all look numeric (for
# example option texts such as "12" or "3.50") into numbers, which would
# alter the exported text.
questions_raw = pd.read_json("questions.json", dtype=False)

# Select the export columns (dropping the rest, such as "id") and rename them.
df = questions_raw[list(EXPORT_COLUMNS)].rename(columns=EXPORT_COLUMNS)

df.to_csv("output_final.csv", index=False)
