In [None]:
import fitz  
import os
import json
import re

# --- Run configuration -------------------------------------------------
pdf_path = "2.pdf"              # PDF that is opened and processed below
output_dir = "output/images"    # directory that receives the extracted images
os.makedirs(name=output_dir, exist_ok=True)  # no error if it already exists

# Open the document once; the later steps read its pages and images.
doc = fitz.open(pdf_path)

# --- Shared accumulators -----------------------------------------------
questions = []      # NOTE(review): not referenced anywhere else in the visible code
image_counter = 0   # running number embedded in each saved image's file name
all_images, image_map = [], {}  # saved image paths in order / xref -> saved path

def clean_question(text):
    """Strip answer markers and extraction noise from a question string.

    The following are removed, in this order:

    1. single-character bracketed tags such as ``[A]`` or ``[1]``;
    2. the standalone word ``Ans`` (any letter case) together with an
       optional trailing period;
    3. runs of two or more whitespace characters (collapsed to one space);
    4. a trailing run of four or more whitespace-separated integers.

    Args:
        text: Raw question text.

    Returns:
        The cleaned text with surrounding whitespace removed.
    """
    text = re.sub(r"\[\w\]", "", text)
    # The word boundary must come *before* the optional period: there is no
    # boundary between "." and a following space, so a pattern ending in
    # r"\.?\b" would match only "Ans" and leave a stray "." behind.
    text = re.sub(r"\bAns\b\.?", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\s{2,}", " ", text).strip()
    # Trailing sequence of at least four numbers, e.g. "... 1 2 3 4".
    text = re.sub(r"(\d+\s+){3,}\d+\s*$", "", text).strip()
    return text

# Walk the pages in order and save every image that has not been saved yet.
# An image is identified by its xref (first field of each get_images entry),
# so an xref already present in image_map is written to disk only once.
for page_number, page in enumerate(doc, start=1):
    for img in page.get_images(full=True):
        xref = img[0]
        if xref not in image_map:
            extracted = doc.extract_image(xref)
            ext = extracted["ext"]
            img_bytes = extracted["image"]

            # File name records the 1-based page of first appearance and the
            # global running image number.
            img_name = f"page{page_number}_img{image_counter}.{ext}"
            with open(os.path.join(output_dir, img_name), "wb") as f:
                f.write(img_bytes)

            # The recorded path always uses backslash separators.
            rel_path = os.path.join("output", "images", img_name).replace("/", "\\")
            all_images.append(rel_path)
            image_map[xref] = rel_path
            image_counter += 1

# Plain text of the whole document: every page's text followed by a newline.
text = "".join(page.get_text("text") + "\n" for page in doc)

# Trim each line and discard the ones that end up empty.
lines = [trimmed for trimmed in (raw.strip() for raw in text.split("\n")) if trimmed]

# A question starts on a line that begins with "<digits>." and whitespace.
question_start_pattern = re.compile(r"^\d+\.\s+")
question_blocks = []
current_q = None

for line in lines:
    if not question_start_pattern.match(line):
        # Continuation line: glue it onto the question being collected.
        # Lines seen before the first question start are dropped.
        if current_q:
            current_q += " " + line
        continue

    # A new question begins: flush the previous one (if any) first.
    if current_q:
        question_blocks.append(clean_question(current_q))
    current_q = line

# Flush the last question, which has no following start line to trigger it.
if current_q:
    question_blocks.append(clean_question(current_q))

# Hard-coded image layout for the first five questions, keyed by the
# 0-based question index:
#     (indices of question images, indices of answer-option images,
#      value of image_pointer after the question)
# NOTE(review): question index 1 skips image 10 (uses 7, 8, 9, 11) --
# presumably tuned by hand for the PDF being processed; confirm.
_FIXED_IMAGE_LAYOUT = {
    0: ((0, 1), (2, 3, 4, 5), 6),
    1: ((6,), (7, 8, 9, 11), 12),
    2: ((12,), (), 13),
    3: ((13, 14), (), 15),
    4: ((15,), (), 16),
}


def _pick_images(indices, question_number):
    """Return the entries of ``all_images`` at *indices*.

    Indices beyond the end of ``all_images`` are skipped and reported with a
    warning instead of raising IndexError, so a document with fewer images
    than the fixed layout expects still produces output (the same tolerance
    the slice-based default layout has).
    """
    picked = []
    for idx in indices:
        if idx < len(all_images):
            picked.append(all_images[idx])
        else:
            print(
                f"Warning: question {question_number} expects image index {idx}, "
                f"but only {len(all_images)} images were extracted"
            )
    return picked


image_pointer = 0
final_questions = []

for i, q_text in enumerate(question_blocks):
    if i in _FIXED_IMAGE_LAYOUT:
        question_idx, option_idx, image_pointer = _FIXED_IMAGE_LAYOUT[i]
        question_images = _pick_images(question_idx, i + 1)
        option_images = _pick_images(option_idx, i + 1)
    else:
        # Default layout: the next two images belong to the question and the
        # four after them to its answer options. Slicing simply yields fewer
        # (or no) items once the images run out.
        question_images = all_images[image_pointer:image_pointer + 2]
        option_images = all_images[image_pointer + 2:image_pointer + 6]
        image_pointer += 6

    question = {
        "question": q_text,
        "questions_images": question_images,
        "answer_option_images": option_images,
    }
    final_questions.append(question)

# Write the assembled questions as indented JSON. ensure_ascii=False keeps
# non-ASCII characters as-is, and the file is written as UTF-8.
with open("output/questions_cleaned.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(final_questions, indent=2, ensure_ascii=False))

print("✅ All fixed! Clean questions and images saved in output/questions_cleaned.json")

✅ All fixed! Clean questions and images saved in output/questions_cleaned.json
