In [179]:
import re
import csv
from pydantic import BaseModel
from typing import List, Optional, Tuple

In [180]:
# Path of the markdown file that is read and parsed for questions.
input_file = "output.md"
# Destination CSV path. NOTE(review): no visible cell writes to it yet --
# presumably the cleaned question table ends up here; confirm further down.
output_file = "cleaned_paper.csv"

### Question Extraction

In [181]:
class Question(BaseModel):
    """One multiple-choice question with five answer options (A-E).

    Only ``id`` and ``question`` are required; every other field has a default.
    NOTE(review): presumably one instance becomes one row of the output CSV --
    no visible cell constructs or serialises it, so confirm against the writer.
    """
    id: int
    question: str
    lead_in: str = ""
    # Option texts; each one defaults to its own letter.
    option_A: str = "A"
    option_B: str = "B"
    option_C: str = "C"
    option_D: str = "D"
    option_E: str = "E"
    # Per-option URLs, empty by default.
    # Presumably links to option images -- TODO confirm.
    option_A_url: str = ""
    option_B_url: str = ""
    option_C_url: str = ""
    option_D_url: str = ""
    option_E_url: str = ""
    section_id: str="nvr"
    # Flags below are stored as the strings "False"/"True", not as booleans.
    multiple_correct: str = "False"
    correct_option: str = "A"
    side_by_side: str = "True"

In [182]:
# Accumulator for Question records; re-running this cell resets it to empty.
# NOTE(review): none of the cells visible here append to it -- presumably it is
# filled further down the notebook.
questions_data: List[Question] = []

In [188]:
# Splits a section's text at question numbers. The pattern has three capture
# groups -- one per accepted number format -- and at most one of them is
# non-None per match, so ``QUESTION_SPLIT.split(text)`` yields
# ``[preamble, g1, g2, g3, body, g1, g2, g3, body, ...]``.
QUESTION_SPLIT = re.compile(
    r'''
    (?:
        # ── HTML-wrapped number: must be ONLY digits
        <(?:p|b)>\s*(\d+)\s*</(?:p|b)>

      | # ── markdown bold number: must be ONLY digits
        \*\*\s*(\d+)\s*\*\*

      | # ── plain / heading number (line-based)
        ^\s*(?:\#+\s*)?
        (\d+)
        (?!\.)                  # "3." / "3.5" is not a question number
        (?:
            \s*$                # number-only line
            # Otherwise the number must be followed by whitespace and then by
            # something that is neither a digit (a sequence such as "3 4 5")
            # nor a unit (a measurement such as "12 cm"). The \s inside the
            # lookahead forces \s+ to swallow the WHOLE whitespace run;
            # without it the regex could give one space back and the
            # lookaheads would then inspect that space and wrongly pass
            # (e.g. "12  cm" with two spaces).
          | \s+(?![\s\d])
             (?!cm\b|mm\b|m\b|kg\b|g\b|%)
        )
    )
    ''',
    re.VERBOSE | re.MULTILINE | re.IGNORECASE
)




# Splits the document at markdown headings of level 1-3 whose text contains at
# least one letter; a heading made only of digits (e.g. "## 12") is not a split
# point and stays inside the surrounding text. The single capture group makes
# ``split`` return ``[preamble, heading, body, heading, body, ...]`` and every
# heading element starts with "\n#".
# The newline that ends the heading line is deliberately NOT consumed: each
# match needs a leading "\n", so swallowing the trailing one would hide a
# heading that directly follows another heading, and a heading on the last
# line of a text without a final newline would never match.
# A heading on the very first line (no preceding newline) is not matched.
SPLIT_HEADING = re.compile(
    r'(\n#{1,3}\s+(?=.*[A-Za-z])[^\n]+)'
)


In [189]:
def extract_questions(text: str) -> List[Tuple[str, str]]:
    """Split ``text`` into ``(question_number, question_body)`` pairs.

    ``QUESTION_SPLIT.split`` returns the text before the first number followed
    by repeating runs of one entry per capture group (only the group that
    matched is non-None) and then the text up to the next number.

    Args:
        text: Text of one section.

    Returns:
        A list of ``(number, body)`` tuples in document order, both stripped
        of surrounding whitespace. Text before the first question number is
        discarded. Empty when no question number is found.
    """
    pieces = QUESTION_SPLIT.split(text)

    # Read the group count from the compiled pattern instead of hard-coding it,
    # so the stride stays correct if a number format is added or removed.
    n_groups = QUESTION_SPLIT.groups
    stride = n_groups + 1

    results = []
    # pieces[0] is the preamble before the first question number; skip it.
    for start in range(1, len(pieces), stride):
        captured = pieces[start:start + n_groups]
        q_num = next((g for g in captured if g is not None), None)
        if not q_num:
            continue

        body_index = start + n_groups
        block = pieces[body_index] if body_index < len(pieces) else ""
        results.append((q_num.strip(), block.strip()))

    return results


In [190]:
# Read the whole input file into memory as UTF-8 text.
with open(input_file, 'r', encoding='utf-8') as f:
    content = f.read()

In [191]:
# SPLIT_HEADING has exactly one capture group, so the split result alternates:
# [preamble, heading, body, heading, body, ...] -- headings sit at odd indices.
parts = SPLIT_HEADING.split(content)

# Reattach heading + content.
# Pairing is done by position rather than by sniffing for a "\n#" prefix, so a
# preamble that happens to start with "\n#" cannot shift the pairing. The
# section text is bound to its own name (`body`) so the full document held in
# `content` is never overwritten; otherwise re-running this cell would split
# only the last section's text.
sections = [
    (heading.strip(), body.strip())
    for heading, body in zip(parts[1::2], parts[2::2])
]

for h, c in sections:
    print("HEADING", h)
    print(*extract_questions(c), sep='\n')

HEADING # ENGLISH

HEADING ## The Strange Case of Dr Jekyll and Mr Hyde
('1', 'In which city is the novel set?\n\n- A London\n- B Manchester\n- C Birmingham\n- D Oxford\n- E Cambridge')
('2', 'What type of word is "ferocity" (line 2)?\n\n- A noun\n- B preposition\n- C verb\n- D adverb\n- E adjective')
('3', 'The victim of the crime is described as being of “high position” (line 2). What does this mean?\n\n- A that he was on a high platform when the crime occurred\n- B that he was a man of the upper classes\n- C that he always held his head up high\n- D that he was rich\n- E that he was a man of the lower classes')
('4', 'What time did the maid go to bed?\n\n- A 9pm\n- B 10pm\n- C 11pm\n- D midnight\n- E 1am')
('5', 'What literary technique is being used in the line “a fog rolled over the city” (line 4)?\n\n- A simile\n- B metaphor\n- C rhetorical question\n- D personification\n- E oxymoron')
('6', 'What types of word are “drawing” (line 10), “advancing” (line 11), and “bowed” (line 13)