# In [42]:
import fitz  # PyMuPDF
from PIL import Image
import pandas as pd
import os

def _render_clip(page, clip, image_path, min_width, min_height):
    """Render ``clip`` of ``page`` to ``image_path``; return True if saved.

    Returns False (and writes nothing) when the clip is inverted/empty, when
    the rendered pixmap is narrower than ``min_width`` or shorter than
    ``min_height``, or when saving the image fails.
    """
    # An inverted rectangle (e.g. the next marker lies above this one) cannot
    # be rendered meaningfully, so skip it instead of handing it to PyMuPDF.
    if clip.x1 <= clip.x0 or clip.y1 <= clip.y0:
        return False

    pix = page.get_pixmap(clip=clip)
    if pix.width < min_width or pix.height < min_height:
        return False

    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    try:
        img.save(image_path)
    except (OSError, ValueError) as e:
        print(f"Error saving image {image_path}: {e}")
        return False
    return True


def extract_question_images(
    pdf_path,
    output_dir,
    *,
    min_width_margin=150,
    min_height=10,
    last_clip_bottom_margin=500,
):
    """Save one PNG per detected question of a PDF and list what was saved.

    A question start is any text line whose stripped form begins with
    ``"<n>."`` for ``n`` in 1..99. Each start is located on the page with
    ``page.search_for`` (first hit only) and given the next value of a
    counter that runs across the whole document, so the number used in the
    file name and in the returned tuple is the detection order, not the
    number printed in the PDF. Processing stops at the first page whose text
    contains "answer key" in any letter case.

    Args:
        pdf_path: Path of the PDF to read.
        output_dir: Directory for the PNG files; created if missing.
        min_width_margin: A rendered clip is discarded when its width is less
            than the page width minus this value.
        min_height: A rendered clip is discarded when its height is below
            this value.
        last_clip_bottom_margin: The clip of the last question on a page ends
            this far above the bottom edge of the page.

    Returns:
        A list of ``(question_number, image_path)`` tuples, one per image
        written, where ``image_path`` is
        ``<output_dir>/question_<question_number>.png``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # "1." .. "99." -- str.startswith accepts a tuple, which replaces a
    # 99-iteration Python loop per text line.
    question_prefixes = tuple(f"{number}." for number in range(1, 100))

    question_images = []
    question_number = 1

    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            page_text = page.get_text("text")
            if "answer key" in page_text.casefold():
                break

            # (question number, rectangle of the question's first line)
            question_start_rects = []
            for line in page_text.split("\n"):
                stripped = line.strip()
                if not stripped.startswith(question_prefixes):
                    continue
                hits = page.search_for(stripped)
                if hits:
                    question_start_rects.append((question_number, hits[0]))
                    question_number += 1

            if not question_start_rects:
                continue

            page_rect = page.rect
            min_width = page_rect.width - min_width_margin

            # Every question except the last is clipped from its own marker
            # down to just below the top of the next marker.
            clips = [
                (
                    q_num,
                    fitz.Rect(
                        start_rect.x0,
                        start_rect.y0,
                        start_rect.x0 + page_rect.width,
                        end_rect.y0 + 2,
                    ),
                )
                for (q_num, start_rect), (_, end_rect) in zip(
                    question_start_rects, question_start_rects[1:]
                )
            ]

            # The last question has no following marker; its clip runs from
            # slightly above its marker to a fixed distance from the bottom.
            last_q_num, last_start_rect = question_start_rects[-1]
            clips.append(
                (
                    last_q_num,
                    fitz.Rect(
                        last_start_rect.x0,
                        last_start_rect.y0 - 10,
                        page_rect.width,
                        page_rect.height - last_clip_bottom_margin,
                    ),
                )
            )

            for q_num, clip in clips:
                image_path = os.path.join(output_dir, f"question_{q_num}.png")
                if _render_clip(page, clip, image_path, min_width, min_height):
                    # Record the same number that is in the file name.
                    question_images.append((q_num, image_path))

    return question_images

def create_csv(question_images, csv_path):
    """Write the collected question records to a CSV file.

    Args:
        question_images: Iterable of 4-item rows in the order
            (question number, subject, chapter, image path). May be empty, in
            which case only the header row is written.
        csv_path: Destination of the CSV. When it is a filesystem path, any
            missing parent directories are created first.
    """
    # pandas does not create missing directories, so do it here; anything that
    # is not a path (e.g. an open buffer) is passed to pandas untouched.
    if isinstance(csv_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(csv_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    columns = ["Question Number", "Subject", "Chapter", "Image Path"]
    df = pd.DataFrame(question_images, columns=columns)
    df.to_csv(csv_path, index=False)


# In [43]:
def all_pdf_to_csv(pdf_root='../pdf', screenshots_root='../Screenshots', csv_path=None):
    """Export question images for every PDF under a folder and index them.

    Walks ``pdf_root`` (skipping any sub-folder named ``csv``), treats the
    name of the folder containing each PDF as the subject and the PDF's file
    name without extension as the chapter, runs ``extract_question_images``
    on each PDF, and writes all resulting rows to one CSV via ``create_csv``.

    Args:
        pdf_root: Folder that is searched recursively for PDF files.
        screenshots_root: Folder under which images are written, as
            ``<screenshots_root>/<subject>/<chapter>/``.
        csv_path: Destination of the CSV index. Defaults to
            ``<screenshots_root>/questions.csv``.
    """
    if csv_path is None:
        csv_path = os.path.join(screenshots_root, "questions.csv")

    # One (pdf path, subject, chapter) entry per PDF found.
    pdf_jobs = []
    for dirpath, dirnames, filenames in os.walk(pdf_root):
        # Removing the entry in place stops os.walk from descending into it.
        if 'csv' in dirnames:
            dirnames.remove('csv')
        # os.walk yields entries in arbitrary order; sort for stable output.
        dirnames.sort()

        subject = os.path.basename(os.path.normpath(dirpath))
        for file in sorted(filenames):
            # Only PDF files are processed (extension compared ignoring case).
            if file.lower().endswith('.pdf'):
                chapter = os.path.splitext(file)[0].strip()
                pdf_jobs.append((os.path.join(dirpath, file), subject, chapter))

    all_question_images = []
    for pdf_path, subject, chapter in pdf_jobs:
        output_dir = os.path.join(screenshots_root, subject, chapter)
        # Attach subject and chapter to every extracted question image.
        for q_num, image_path in extract_question_images(pdf_path, output_dir):
            all_question_images.append((q_num, subject, chapter, image_path))

    # The output folder does not exist yet when no PDF was found.
    csv_dir = os.path.dirname(csv_path)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)
    create_csv(all_question_images, csv_path)

# Entry point of this cell: runs immediately when the cell/module executes,
# processing all PDFs and creating the CSV.
all_pdf_to_csv()