In [1]:
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
import re
import os
from PIL import Image
import io
import matplotlib.pyplot as plt

In [None]:
# Path to the source PDF.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not run on
# another machine without editing it. Consider deriving it from a configurable base directory.
pdf_file = '/Users/rodrigocarrillo/Library/CloudStorage/OneDrive-EmoryUniversity/Natural Language Processing Projects/Examen Residentado Peru/CONAREME/Year2024/ESPECIALIADAD B.pdf'

In [None]:
def extract_questions(pdf_path):
    """Extract question stems from every page of a PDF.

    A question is a run of digits, a period and one whitespace character,
    followed by the shortest text that ends right before the next "A." (the
    first option). The captured question number is discarded; only the stem is
    kept, with surrounding whitespace stripped.

    Matching is done independently on each page's text. The "A." lookahead is
    not anchored to the start of a line, so a literal "A." inside a stem ends
    the capture early.

    Args:
        pdf_path: Path of the PDF to open with ``fitz.open``.

    Returns:
        pandas.DataFrame with a single ``question`` column, one row per match,
        in page order.
    """
    # Handles stems with or without a trailing '?'; DOTALL lets a stem span lines.
    question_pattern = re.compile(r"(\d+)\.\s(.*?\??)\s*(?=A\.)", re.DOTALL)

    questions_data = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text("text")  # Full plain text of the page
            questions_data.extend(
                question.strip() for _, question in question_pattern.findall(text)
            )

    return pd.DataFrame(questions_data, columns=["question"])

In [None]:
def extract_answers(pdf_path):
    """Extract the four answer options of each question from a PDF.

    Every text line that starts with "A.", "B.", "C." or "D." followed by one
    whitespace character and some text counts as an option. Options are
    grouped purely by count: each run of four consecutive option lines on a
    page becomes one row. The letters themselves are not checked, and an
    incomplete group left at the end of a page is discarded because grouping
    restarts on every page.

    Args:
        pdf_path: Path of the PDF to open with ``fitz.open``.

    Returns:
        pandas.DataFrame with columns ``option_A``, ``option_B``, ``option_C``
        and ``option_D``, one row per group of four options.
    """
    # Regular expression for extracting answers
    answer_pattern = re.compile(r"^(A|B|C|D)\.\s(.+)", re.MULTILINE)

    answers_data = []  # One four-item list per question

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            lines = page.get_text("text").split("\n")

            current_answers = []  # Options collected for the question in progress
            for line in lines:
                answer_match = answer_pattern.match(line)
                if answer_match is None:
                    continue

                current_answers.append(answer_match.group(2).strip())

                # Four options make a complete question: store and start over.
                if len(current_answers) == 4:
                    answers_data.append(current_answers)
                    current_answers = []

    return pd.DataFrame(answers_data, columns=["option_A", "option_B", "option_C", "option_D"])

In [None]:
def get_largest_rectangle(rects):
    """Return the rectangle with the largest area from ``rects``.

    Area is the absolute value of width times height, computed from each
    rectangle's ``tl`` (top-left) and ``br`` (bottom-right) points. On ties
    the earliest rectangle wins.

    Args:
        rects: Sequence of objects exposing ``tl.x``, ``tl.y``, ``br.x`` and
            ``br.y``.

    Returns:
        The largest rectangle, or None when ``rects`` is empty or None.
    """
    if not rects:
        return None

    def _area(rect):
        return abs((rect.br.x - rect.tl.x) * (rect.br.y - rect.tl.y))

    # A single pass is enough; max() keeps the first of equally large candidates.
    return max(rects, key=_area)

def process_answer_search(page, answer_letter, answer_text):
    """Locate an answer option on a page and return its largest hit rectangle.

    Several spellings of "<letter>. <text>" are tried with ``page.search_for``
    until one returns hits. All hits are printed with their geometry, and the
    one picked by ``get_largest_rectangle`` is returned.

    Args:
        page: Object exposing ``search_for(text)`` that returns a list of
            rectangles with ``tl`` and ``br`` points.
        answer_letter: Option letter, e.g. "A".
        answer_text: Text of the option without the letter prefix.

    Returns:
        The largest rectangle among the hits of the first successful search,
        or None when no spelling is found on the page.
    """
    # Spellings to try, in priority order. For an uppercase letter and already
    # stripped text several of them coincide, so duplicates are dropped
    # (order preserved) to avoid repeating the same failed search.
    search_texts = dict.fromkeys([
        answer_letter + '. ' + answer_text,
        answer_letter + '. ' + answer_text.strip(),
        answer_letter + '.' + answer_text,
        answer_letter.upper() + '. ' + answer_text,
    ])

    rects = None
    used_search_text = ""

    # Stop at the first spelling that yields at least one hit.
    for search_text in search_texts:
        rects = page.search_for(search_text)
        if rects:
            used_search_text = search_text
            break

    if not rects:
        print(f"\nNo rectangles found for '{answer_letter}. {answer_text}'")
        return None

    largest_rect = get_largest_rectangle(rects)

    # Print detailed information about every hit and the one selected.
    print(f"\nSearched for: '{used_search_text}'")
    print(f"Found {len(rects)} rectangles:")
    for i, rect in enumerate(rects):
        area = abs((rect.br.x - rect.tl.x) * (rect.br.y - rect.tl.y))
        print(f"Rectangle {i+1}: x={rect.tl.x:.2f}, y={rect.tl.y:.2f}, "
              f"w={rect.br.x-rect.tl.x:.2f}, h={rect.br.y-rect.tl.y:.2f}, "
              f"area={area:.2f}")
    print("\nSelected largest rectangle:")
    print(f"x={largest_rect.tl.x:.2f}, y={largest_rect.tl.y:.2f}, "
          f"w={largest_rect.br.x-largest_rect.tl.x:.2f}, "
          f"h={largest_rect.br.y-largest_rect.tl.y:.2f}")

    return largest_rect

In [None]:
# Exploratory pass: locate the bounding box of every answer option, page by page.
correct_answers = []

answer_pattern = re.compile(r"^(A|B|C|D)\.\s(.+)", re.MULTILINE)

# Open the document at notebook level so this cell runs on a fresh kernel; the
# functions defined above only open it inside their own scope.
doc = fitz.open(pdf_file)

# Main processing loop
# NOTE: answer_positions is rebuilt for each page, so after the loop it (and
# page_num) describe the last page only.
for page_num, page in enumerate(doc, start=1):
    text = page.get_text("text")
    lines = text.split("\n")
    answer_positions = {key: [] for key in ["A", "B", "C", "D"]}

    for line in lines:
        match = answer_pattern.match(line)
        if match:
            answer_letter, answer_text = match.groups()
            rect = process_answer_search(page, answer_letter, answer_text)
            if rect:
                answer_positions[answer_letter].append(rect)

In [None]:
# Print the detected box of every option left in `answer_positions` by the
# scanning cell above (together with `page_num`).
# The number of questions is taken as the longest per-letter list, as the
# extraction functions in this notebook do.
num_questions = max(len(rects) for rects in answer_positions.values())

for idx in range(num_questions):
    for answer_letter in ["A", "B", "C", "D"]:
        rects = answer_positions[answer_letter]
        if idx >= len(rects):
            # This letter has fewer boxes than the longest list.
            has_all_options = False
            continue
        rect = rects[idx]

        print(f"Page {page_num}, Q{idx+1} - {answer_letter}: Detected box: "
                      f"x0={rect.x0:.1f}, y0={rect.y0:.1f}, x1={rect.x1:.1f}, y1={rect.y1:.1f}")


In [None]:
def extract_correct_answers(pdf_path):
    """Infer the highlighted option of each question from yellow pixels.

    On every page, each text line that looks like an option ("A. ...",
    "B. ...", "C. ...", "D. ...") is located with ``page.search_for`` and the
    first hit is kept. The i-th box of each letter on a page is treated as
    question i of that page. Each box is widened by 50 units to the right,
    rendered, and its yellow pixels (R > 200, G > 200, B < 150) are counted.
    The option with the most yellow pixels is selected.

    Prints per-option diagnostics and shows one matplotlib figure per option.

    Args:
        pdf_path: Path of the PDF to open with ``fitz.open``.

    Returns:
        pandas.DataFrame with a single ``correct_answer`` column, one entry
        per detected question: the selected letter, or None when no option
        box contained any yellow pixel.
    """
    correct_answers = []
    missing_questions = []  # Questions for which at least one option box was not found

    # An option line must be a letter, a period, one whitespace character and text.
    answer_pattern = re.compile(r"^(A|B|C|D)\.\s(.+)", re.MULTILINE)

    # The context manager closes the document even if processing raises.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text("text")  # Extract full page text
            lines = text.split("\n")  # Split into lines

            # Bounding boxes of the option lines, per letter, in text order
            answer_positions = {key: [] for key in ["A", "B", "C", "D"]}

            for line in lines:
                match = answer_pattern.match(line)
                if match:
                    answer_letter, _ = match.groups()

                    # Find bounding box of the extracted answer text
                    rects = page.search_for(match.group(0))  # Search for full matched text
                    if rects:
                        answer_positions[answer_letter].append(rects[0])  # Store first match

            # Use the longest per-letter list so a question missing some
            # options is still examined.
            num_questions = max(len(answer_positions["A"]), len(answer_positions["B"]),
                                len(answer_positions["C"]), len(answer_positions["D"]))

            # Debugging: Show how many answers were found
            print(f"\nPage {page_num}: Found {num_questions} questions")

            for idx in range(num_questions):
                best_answer = None
                best_yellow_pixels = 0  # Track the highest number of yellow pixels
                has_all_options = True  # Flag to check if we have all 4 options

                for answer_letter in ["A", "B", "C", "D"]:
                    rects = answer_positions[answer_letter]
                    if idx >= len(rects):
                        has_all_options = False  # Missing one or more options
                        continue

                    rect = rects[idx]

                    # Debugging: Print detected answer positions
                    print(f"Page {page_num}, Q{idx+1} - {answer_letter}: Detected box: "
                          f"x0={rect.x0:.1f}, y0={rect.y0:.1f}, x1={rect.x1:.1f}, y1={rect.y1:.1f}")

                    # Expand bounding box only to the right
                    expanded_rect = fitz.Rect(rect.x0, rect.y0, rect.x1 + 50, rect.y1)

                    # Extract pixels from the expanded region
                    pixmap = page.get_pixmap(clip=expanded_rect)
                    img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)

                    # Count yellow pixels (pixels with high red and green, low blue)
                    yellow_mask = (
                        (img[:, :, 0] > 200) &  # High red
                        (img[:, :, 1] > 200) &  # High green
                        (img[:, :, 2] < 150)    # Low blue
                    )
                    yellow_pixels = np.sum(yellow_mask)

                    # Count total non-white pixels
                    nonwhite_mask = ~np.all(img > 250, axis=2)
                    total_pixels = np.sum(nonwhite_mask)

                    # Debug information
                    print(f"Page {page_num}, Q{idx+1}, {answer_letter}: Yellow pixels: {yellow_pixels}")
                    print(f"Page {page_num}, Q{idx+1}, {answer_letter}: Total non-white pixels: {total_pixels}")
                    if total_pixels > 0:
                        yellow_percentage = (yellow_pixels / total_pixels) * 100
                        print(f"Page {page_num}, Q{idx+1}, {answer_letter}: Yellow percentage: {yellow_percentage:.1f}%")

                    # Display the box in the notebook
                    fig = plt.figure(figsize=(8, 4))
                    plt.imshow(img)
                    plt.gca().add_patch(plt.Rectangle((0, 0), expanded_rect.width, expanded_rect.height,
                                                      fill=False, color='red', linewidth=2))
                    plt.title(f'Page {page_num}, Q{idx+1}, Answer {answer_letter}\nYellow pixels: {yellow_pixels}')
                    plt.axis('off')
                    plt.show()
                    # One figure is created per option; release it so figures
                    # do not accumulate when the backend keeps them open.
                    plt.close(fig)

                    # Store the best answer based on the highest number of yellow pixels
                    if yellow_pixels > best_yellow_pixels:
                        best_yellow_pixels = yellow_pixels
                        best_answer = answer_letter

                # If missing options, log missing questions
                if not has_all_options:
                    missing_questions.append(f"Page {page_num}, Q{idx+1}")

                correct_answers.append(best_answer)
                print(f"\nPage {page_num}, Q{idx+1} - Selected Answer: {best_answer} (based on yellow pixel count)")

    # Debugging: Print missing questions
    if missing_questions:
        print("\n⚠️ The following questions were skipped due to missing options:")
        for q in missing_questions:
            print(q)

    return pd.DataFrame(correct_answers, columns=["correct_answer"])


In [None]:
# def extract_correct_answers(pdf_path, output_dir = "output_logs_B"):
#     os.makedirs(output_dir, exist_ok=True)  # Ensure output directory exists
#     log_path = os.path.join(output_dir, "output_log.txt")
    
#     with open(log_path, "w") as log_file:
#         doc = fitz.open(pdf_path)
#         correct_answers = []
#         missing_questions = []  # Track questions that were skipped

#         # Flexible regex for answer choices
#         answer_pattern = re.compile(r"^(A|B|C|D)\.\s(.+)", re.MULTILINE)

#         for page_num, page in enumerate(doc, start=1):
#             text = page.get_text("text")
#             lines = text.split("\n")
            
#             answer_positions = {key: [] for key in ["A", "B", "C", "D"]}
#             for line in lines:
#                 match = answer_pattern.match(line)
#                 if match:
#                     answer_letter, _ = match.groups()
#                     rects = page.search_for(match.group(0))
#                     if rects:
#                         answer_positions[answer_letter].append(rects[0])
            
#             num_questions = max(len(answer_positions["A"]), len(answer_positions["B"]),
#                                 len(answer_positions["C"]), len(answer_positions["D"]))
#             log_file.write(f"\nPage {page_num}: Found {num_questions} questions\n")
#             print(f"\nPage {page_num}: Found {num_questions} questions")
            
#             for idx in range(num_questions):
#                 best_answer = None
#                 best_yellow_pixels = 0
#                 has_all_options = True

#                 for answer_letter in ["A", "B", "C", "D"]:
#                     rects = answer_positions[answer_letter]
#                     if idx >= len(rects):
#                         has_all_options = False
#                         continue
                    
#                     rect = rects[idx]
#                     expanded_rect = fitz.Rect(rect.x0, rect.y0, rect.x1 + 25, rect.y1)
#                     pixmap = page.get_pixmap(clip=expanded_rect)
#                     img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
                    
#                     yellow_mask = (img[:, :, 0] > 200) & (img[:, :, 1] > 200) & (img[:, :, 2] < 150)
#                     yellow_pixels = np.sum(yellow_mask)
                    
#                     total_pixels = np.sum(~np.all(img > 250, axis=2))
#                     yellow_percentage = (yellow_pixels / total_pixels) * 100 if total_pixels > 0 else 0
                    
#                     log_entry = (f"Page {page_num}, Q{idx+1} - {answer_letter}: Yellow pixels: {yellow_pixels}, "
#                                  f"Total non-white pixels: {total_pixels}, Yellow %: {yellow_percentage:.1f}%\n")
#                     log_file.write(log_entry)
#                     print(log_entry, end="")
                    
#                     fig, ax = plt.subplots(figsize=(8, 4))
#                     ax.imshow(img)
#                     ax.add_patch(plt.Rectangle((0, 0), expanded_rect.width, expanded_rect.height,
#                                                fill=False, color='red', linewidth=2))
#                     ax.set_title(f'Page {page_num}, Q{idx+1}, Answer {answer_letter}\nYellow pixels: {yellow_pixels}')
#                     ax.axis('off')
#                     img_path = os.path.join(output_dir, f"page_{page_num}_Q{idx+1}_option_{answer_letter}.png")
#                     plt.savefig(img_path, bbox_inches='tight')
#                     plt.close()
                    
#                     if yellow_pixels > best_yellow_pixels:
#                         best_yellow_pixels = yellow_pixels
#                         best_answer = answer_letter
                
#                 if not has_all_options:
#                     missing_questions.append(f"Page {page_num}, Q{idx+1}")
                
#                 correct_answers.append(best_answer)
#                 log_file.write(f"Page {page_num}, Q{idx+1} - Selected Answer: {best_answer}\n")
#                 print(f"Page {page_num}, Q{idx+1} - Selected Answer: {best_answer}\n")
        
#         if missing_questions:
#             log_file.write("\n⚠️ The following questions were skipped due to missing options:\n")
#             for q in missing_questions:
#                 log_file.write(q + "\n")
#                 print(q)
    
#     return pd.DataFrame(correct_answers, columns=["correct_answer"])


In [None]:
# def extract_answers(pdf_path):
#     doc = fitz.open(pdf_path)
#     answers_data = []  # List to store answers for each question

#     # Regular expression for extracting answers
#     answer_pattern = re.compile(r"^(A|B|C|D)\.\s(.+)", re.MULTILINE)

#     for page_num, page in enumerate(doc, start=1):
#         text = page.get_text("text")  # Extract full page text
#         lines = text.split("\n")  # Split into lines for structured parsing

#         current_answers = []
#         question_number = 0  # Track question number

#         for line in lines:
#             answer_match = answer_pattern.match(line)
#             if answer_match:
#                 answer_letter, ans_text = answer_match.groups()
                
#                 # When we find an "A." option, assume it's a new question
#                 if answer_letter == "A":
#                     question_number += 1
                
#                 current_answers.append({
#                     "page": page_num,
#                     "question_number": question_number,
#                     "option": answer_letter,
#                     "text": ans_text.strip()
#                 })

#             # After four options are found, store them as one row in answers_data
#             if len(current_answers) == 4:
#                 answers_data.extend(current_answers)
#                 current_answers = []  # Reset for next question

#     # Convert extracted answers to DataFrame
#     return pd.DataFrame(answers_data)


In [None]:
# import fitz
# import pandas as pd
# import numpy as np
# import re

# def extract_correct_answers(pdf_path):
#     doc = fitz.open(pdf_path)
#     correct_answers = []  # Store extracted correct answers
#     question_numbers = []  # Track question numbers
#     missing_questions = []  # Track questions where no answer was found
    
#     # Regex pattern to detect answer choices (A, B, C, D)
#     answer_pattern = re.compile(r"^(A|B|C|D)\.\s(.+)", re.MULTILINE)
    
#     for page_num, page in enumerate(doc, start=1):
#         text = page.get_text("text")
#         lines = text.split("\n")
        
#         answers_found = []  # Track detected answers per page
#         for line in lines:
#             match = answer_pattern.match(line)
#             if match:
#                 answers_found.append(match.groups())
        
#         # Extract highlight annotations from the page
#         highlighted_answers = []
#         for annot in page.annots():
#             if annot.type[0] == 8:  # Highlight annotation
#                 highlight_rects = annot.rects
#                 for rect in highlight_rects:
#                     text_in_rect = page.get_text("text", clip=rect)
#                     highlighted_answers.append(text_in_rect.strip())
        
#         # Match highlighted text with answer choices
#         for answer_letter, answer_text in answers_found:
#             if answer_text in highlighted_answers:
#                 correct_answers.append(answer_letter)
#                 break  # Stop at first match per question
#         else:
#             correct_answers.append(None)
#             missing_questions.append(f"Page {page_num}")
    
#     # Debugging: Print missing questions
#     if missing_questions:
#         print("\n⚠️ Some questions have no detected correct answer:")
#         print("\n".join(missing_questions))
    
#     return pd.DataFrame(correct_answers, columns=["correct_answer"])


In [None]:
def get_largest_rectangle(rects):
    """Pick the rectangle covering the greatest area.

    NOTE: an identical helper is defined in an earlier cell of this notebook;
    running this cell rebinds the name.

    Args:
        rects: List of rectangles exposing ``tl`` and ``br`` corner points
            with ``x`` and ``y`` attributes.

    Returns:
        The rectangle whose absolute width-times-height is largest (the
        earliest one on ties), or None if ``rects`` is empty or None.
    """
    if not rects:
        return None

    best_rect = None
    best_area = None
    for candidate in rects:
        width = candidate.br.x - candidate.tl.x
        height = candidate.br.y - candidate.tl.y
        area = abs(width * height)
        # Strict comparison keeps the first of equally large rectangles.
        if best_area is None or area > best_area:
            best_rect, best_area = candidate, area

    return best_rect

def extract_correct_answers(pdf_path):
    """Infer the highlighted option of each question from yellow pixels.

    On every page, each text line that looks like an option ("A. ...",
    "B. ...", "C. ...", "D. ...") is located with ``page.search_for``, trying
    a few spellings of the line until one is found. The largest hit (per
    ``get_largest_rectangle``) is kept. The i-th box of each letter on a page
    is treated as question i of that page. Each box is widened by 50 units to
    the right, rendered, and its yellow pixels (R > 200, G > 200, B < 150) are
    counted. The option with the most yellow pixels is selected.

    Prints per-option diagnostics and shows one matplotlib figure per option.

    Args:
        pdf_path: Path of the PDF to open with ``fitz.open``.

    Returns:
        pandas.DataFrame with a single ``correct_answer`` column holding the
        selected letter of every question for which some option box contained
        yellow pixels. Questions without any yellow pixel produce no row.
    """
    correct_answers = []
    missing_questions = []  # Questions for which at least one option box was not found

    # An option line must be a letter, a period, one whitespace character and text.
    answer_pattern = re.compile(r"^(A|B|C|D)\.\s(.+)", re.MULTILINE)

    # The context manager closes the document even if processing raises.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text("text")  # Extract full page text
            lines = text.split("\n")  # Split into lines

            # Bounding boxes of the option lines, per letter, in text order
            answer_positions = {key: [] for key in ["A", "B", "C", "D"]}

            for line in lines:
                match = answer_pattern.match(line)
                if match:
                    answer_letter, answer_text = match.groups()

                    # Spellings to try, in priority order. Several usually
                    # coincide, so duplicates are dropped (order preserved) to
                    # avoid repeating the same failed search.
                    search_texts = dict.fromkeys([
                        match.group(0),  # Original matched text
                        match.group(0).strip(),  # Stripped version
                        answer_letter + '.' + answer_text,  # Without space after period
                        answer_letter.upper() + '. ' + answer_text,  # Different case
                    ])

                    rects = None

                    # Stop at the first spelling that yields at least one hit.
                    for search_text in search_texts:
                        rects = page.search_for(search_text)
                        if rects:
                            break

                    if rects:
                        # Select the largest rectangle
                        largest_rect = get_largest_rectangle(rects)
                        if largest_rect:
                            answer_positions[answer_letter].append(largest_rect)

            # Find the maximum number of questions on the page
            num_questions = max(len(answer_positions["A"]), len(answer_positions["B"]),
                              len(answer_positions["C"]), len(answer_positions["D"]))

            # Debugging: Show how many answers were found
            print(f"\nPage {page_num}: Found {num_questions} questions")

            for idx in range(num_questions):
                best_answer = None
                best_yellow_pixels = 0  # Track the highest number of yellow pixels
                has_all_options = True  # Flag to check if we have all 4 options

                for answer_letter in ["A", "B", "C", "D"]:
                    rects = answer_positions[answer_letter]
                    if idx >= len(rects):
                        has_all_options = False  # Missing one or more options
                        continue

                    rect = rects[idx]

                    # Debugging: Print detected answer positions
                    print(f"Page {page_num}, Q{idx+1} - {answer_letter}: Detected box: "
                          f"x0={rect.x0:.1f}, y0={rect.y0:.1f}, x1={rect.x1:.1f}, y1={rect.y1:.1f}")

                    # Expand bounding box only to the right
                    expanded_rect = fitz.Rect(rect.x0, rect.y0, rect.x1 + 50, rect.y1)

                    # Extract pixels from the expanded region
                    pixmap = page.get_pixmap(clip=expanded_rect)
                    img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)

                    # Count yellow pixels (pixels with high red and green, low blue)
                    yellow_mask = (
                        (img[:, :, 0] > 200) &  # High red
                        (img[:, :, 1] > 200) &  # High green
                        (img[:, :, 2] < 150)    # Low blue
                    )
                    yellow_pixels = np.sum(yellow_mask)

                    # Count total non-white pixels
                    nonwhite_mask = ~np.all(img > 250, axis=2)
                    total_pixels = np.sum(nonwhite_mask)

                    # Debug information
                    print(f"Page {page_num}, Q{idx+1}, {answer_letter}: Yellow pixels: {yellow_pixels}")
                    print(f"Page {page_num}, Q{idx+1}, {answer_letter}: Total non-white pixels: {total_pixels}")

                    if total_pixels > 0:
                        yellow_percentage = (yellow_pixels / total_pixels) * 100
                        print(f"Page {page_num}, Q{idx+1}, {answer_letter}: Yellow percentage: {yellow_percentage:.1f}%")

                    # Display the box in the notebook
                    fig = plt.figure(figsize=(8, 4))
                    plt.imshow(img)
                    plt.gca().add_patch(plt.Rectangle((0, 0), expanded_rect.width, expanded_rect.height,
                                                    fill=False, color='red', linewidth=2))
                    plt.title(f'Page {page_num}, Q{idx+1}, Answer {answer_letter}\nYellow pixels: {yellow_pixels}')
                    plt.axis('off')
                    plt.show()
                    # One figure is created per option; release it so figures
                    # do not accumulate when the backend keeps them open.
                    plt.close(fig)

                    # Store the best answer based on the highest number of yellow pixels
                    if yellow_pixels > best_yellow_pixels:
                        best_yellow_pixels = yellow_pixels
                        best_answer = answer_letter

                # If missing options, log missing questions
                if not has_all_options:
                    missing_questions.append(f"Page {page_num}, Q{idx+1}")

                # NOTE(review): questions with no yellow pixel are dropped here, so
                # the returned rows can fall out of step with the question order.
                # Confirm this is intended before joining with other frames by position.
                if best_answer:
                    correct_answers.append(best_answer)
                    print(f"\nPage {page_num}, Q{idx+1} - Selected Answer: {best_answer} (based on yellow pixel count)")

    # Debugging: Print missing questions
    if missing_questions:
        print("\n⚠️ The following questions were skipped due to missing options:")
        for q in missing_questions:
            print(q)

    return pd.DataFrame(correct_answers, columns=["correct_answer"])

In [None]:
# import fitz  # PyMuPDF
# import re
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt

# def get_largest_rectangle(rects):
#     """Return the largest rectangle from a list of rectangles."""
#     if not rects:
#         return None
#     return max(rects, key=lambda r: abs((r.br.x - r.tl.x) * (r.br.y - r.tl.y)))

# def extract_questions_and_positions(text):
#     """
#     Extracts question numbers (e.g., "39.") and their positions from the page text.
#     """
#     question_pattern = re.compile(r"(\d+)\.")  # Matches question numbers like "39."
#     questions = []
#     for match in question_pattern.finditer(text):
#         question_number = int(match.group(1))
#         questions.append((question_number, match.start()))
#     return questions

# def assign_bounding_boxes_sequentially(answer_positions, detected_answers):
#     """
#     Assign bounding boxes to answers sequentially based on how they appear in the text.
#     Ensures that repeated answer choices (e.g., "B. 4") are assigned to different bounding boxes.
#     """
#     assigned_rects = {key: [] for key in ["A", "B", "C", "D"]}
#     answer_counts = {key: 0 for key in ["A", "B", "C", "D"]}

#     for answer_letter, answer_text in detected_answers:  # Order from text extraction
#         if answer_positions[answer_letter]:
#             rect_index = answer_counts[answer_letter]
#             if rect_index < len(answer_positions[answer_letter]):
#                 assigned_rects[answer_letter].append(answer_positions[answer_letter][rect_index])
#                 answer_counts[answer_letter] += 1  # Move to next bounding box for next occurrence

#     return assigned_rects

# def extract_correct_answers(pdf_path):
#     doc = fitz.open(pdf_path)
#     correct_answers = []

#     answer_pattern = re.compile(r"^(A|B|C|D)\.\s(.+)", re.MULTILINE)

#     for page_num, page in enumerate(doc, start=1):
#         text = page.get_text("text")
#         lines = text.split("\n")

#         # Extract question numbers to track locations
#         question_positions = extract_questions_and_positions(text)

#         answer_positions = {key: [] for key in ["A", "B", "C", "D"]}
#         detected_answers = []  # Stores extracted answers in text order

#         for line in lines:
#             match = answer_pattern.match(line)
#             if match:
#                 answer_letter, answer_text = match.groups()
#                 rects = page.search_for(match.group(0))
#                 if rects:
#                     largest_rect = get_largest_rectangle(rects)
#                     if largest_rect:
#                         answer_positions[answer_letter].append(largest_rect)
#                         detected_answers.append((answer_letter, answer_text))

#         # Assign bounding boxes sequentially to prevent misalignment
#         assigned_rects = assign_bounding_boxes_sequentially(answer_positions, detected_answers)

#         # Process each question
#         for q_idx, (q_number, _) in enumerate(question_positions):
#             best_answer = None
#             best_yellow_pixels = 0

#             print(f"\n🔹 Processing Question {q_number} on Page {page_num}")

#             for answer_letter in ["A", "B", "C", "D"]:
#                 rects = assigned_rects[answer_letter]
#                 if q_idx >= len(rects):
#                     continue  # Skip if no bounding box found

#                 rect = rects[q_idx]

#                 # Debugging: Print detected answer positions
#                 print(f"  Q{q_number}, {answer_letter}: Box -> x0={rect.x0:.1f}, y0={rect.y0:.1f}, x1={rect.x1:.1f}, y1={rect.y1:.1f}")

#                 # Expand bounding box only to the right
#                 expanded_rect = fitz.Rect(rect.x0, rect.y0, rect.x1 + 50, rect.y1)

#                 # Extract pixels from the expanded region
#                 pixmap = page.get_pixmap(clip=expanded_rect)
#                 img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)

#                 # Count yellow pixels (high red & green, low blue)
#                 yellow_mask = (img[:, :, 0] > 200) & (img[:, :, 1] > 200) & (img[:, :, 2] < 150)
#                 yellow_pixels = np.sum(yellow_mask)

#                 # Count total non-white pixels
#                 nonwhite_mask = ~np.all(img > 250, axis=2)
#                 total_pixels = np.sum(nonwhite_mask)

#                 # Debug info
#                 print(f"  Q{q_number}, {answer_letter}: Yellow Pixels = {yellow_pixels}, Non-White Pixels = {total_pixels}")

#                 if total_pixels > 0:
#                     yellow_percentage = (yellow_pixels / total_pixels) * 100
#                     print(f"  Q{q_number}, {answer_letter}: Yellow % = {yellow_percentage:.1f}%")

#                 # Display image of extracted text
#                 plt.figure(figsize=(8, 4))
#                 plt.imshow(img)
#                 plt.gca().add_patch(plt.Rectangle((0, 0), expanded_rect.width, expanded_rect.height,
#                                                   fill=False, color='red', linewidth=2))
#                 plt.title(f'Q{q_number}, {answer_letter}\nYellow Pixels: {yellow_pixels}')
#                 plt.axis('off')
#                 plt.show()

#                 # Store the best answer based on the highest number of yellow pixels
#                 if yellow_pixels > best_yellow_pixels:
#                     best_yellow_pixels = yellow_pixels
#                     best_answer = answer_letter

#             if best_answer:
#                 correct_answers.append((q_number, best_answer))
#                 print(f"\n✅ Q{q_number} - Selected Answer: {best_answer} (Yellow Pixel Count: {best_yellow_pixels})")

#     # Convert to DataFrame
#     correct_answers_df = pd.DataFrame(correct_answers, columns=["question_number", "correct_answer"])
#     return correct_answers_df


In [None]:
# def get_largest_rectangle(rects):
#     """Return the largest rectangle from a list of rectangles."""
#     if not rects:
#         return None
#     return max(rects, key=lambda r: abs((r.br.x - r.tl.x) * (r.br.y - r.tl.y)))

# def extract_questions_and_positions(text):
#     """
#     Extracts question numbers (e.g., "39.") and their positions from the page text.
#     """
#     question_pattern = re.compile(r"(\d+)\.")  # Matches question numbers like "39."
#     questions = []
#     for match in question_pattern.finditer(text):
#         question_number = int(match.group(1))
#         questions.append((question_number, match.start()))
#     return questions

# def assign_bounding_boxes_sequentially(answer_positions, detected_answers):
#     """
#     Assign bounding boxes to answers sequentially based on how they appear in the text.
#     Ensures that repeated answer choices are assigned to different bounding boxes.
#     """
#     assigned_rects = {key: [] for key in ["A", "B", "C", "D"]}
#     answer_counts = {key: 0 for key in ["A", "B", "C", "D"]}
    
#     # Sort detected answers by their y-coordinate to ensure proper ordering
#     sorted_answers = sorted(detected_answers, 
#                           key=lambda x: answer_positions[x[0]][0].y0)
    
#     for answer_letter, answer_text in sorted_answers:
#         if answer_positions[answer_letter]:
#             rect_index = answer_counts[answer_letter]
#             if rect_index < len(answer_positions[answer_letter]):
#                 assigned_rects[answer_letter].append(answer_positions[answer_letter][rect_index])
#                 answer_counts[answer_letter] += 1
    
#     return assigned_rects

# def extract_correct_answers(pdf_path):
#     doc = fitz.open(pdf_path)
#     correct_answers = []
#     answer_pattern = re.compile(r"^(A|B|C|D)\.\s(.+)", re.MULTILINE)
    
#     for page_num, page in enumerate(doc, start=1):
#         text = page.get_text("text")
#         lines = text.split("\n")
        
#         # Extract question numbers to track locations
#         question_positions = extract_questions_and_positions(text)
#         answer_positions = {key: [] for key in ["A", "B", "C", "D"]}
#         detected_answers = []  # Stores extracted answers in text order
        
#         # First pass: Extract all answers and their positions
#         for line in lines:
#             match = answer_pattern.match(line)
#             if match:
#                 answer_letter, answer_text = match.groups()
#                 rects = page.search_for(match.group(0))
#                 if rects:
#                     largest_rect = get_largest_rectangle(rects)
#                     if largest_rect:
#                         answer_positions[answer_letter].append(largest_rect)
#                         detected_answers.append((answer_letter, answer_text))
        
#         # Assign bounding boxes sequentially to prevent misalignment
#         assigned_rects = assign_bounding_boxes_sequentially(answer_positions, detected_answers)
        
#         # Process each question
#         for q_idx, (q_number, _) in enumerate(question_positions):
#             print(f"\n🔹 Processing Question {q_number} on Page {page_num}")
            
#             # Calculate average y-position for this question's answers
#             y_positions = []
#             for letter in ["A", "B", "C", "D"]:
#                 if q_idx < len(assigned_rects[letter]):
#                     y_positions.append(assigned_rects[letter][q_idx].y0)
            
#             if not y_positions:
#                 continue
                
#             avg_y = sum(y_positions) / len(y_positions)
#             print(f"  Average y-position for Q{q_number}: {avg_y:.1f}")
            
#             best_answer = None
#             best_yellow_pixels = 0
            
#             for answer_letter in ["A", "B", "C", "D"]:
#                 rects = assigned_rects[answer_letter]
#                 if q_idx >= len(rects):
#                     continue
                
#                 rect = rects[q_idx]
#                 # Only process answers that are close to the average y-position
#                 if abs(rect.y0 - avg_y) > 50:  # 50 points tolerance
#                     continue
                
#                 print(f"  Q{q_number}, {answer_letter}: Box -> x0={rect.x0:.1f}, y0={rect.y0:.1f}, x1={rect.x1:.1f}, y1={rect.y1:.1f}")
                
#                 # Expand bounding box only to the right
#                 expanded_rect = fitz.Rect(rect.x0, rect.y0, rect.x1 + 50, rect.y1)
#                 pixmap = page.get_pixmap(clip=expanded_rect)
#                 img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
                
#                 # Count yellow pixels (high red & green, low blue)
#                 yellow_mask = (img[:, :, 0] > 200) & (img[:, :, 1] > 200) & (img[:, :, 2] < 150)
#                 yellow_pixels = np.sum(yellow_mask)
                
#                 # Count total non-white pixels
#                 nonwhite_mask = ~np.all(img > 250, axis=2)
#                 total_pixels = np.sum(nonwhite_mask)
                
#                 print(f"  Q{q_number}, {answer_letter}: Yellow Pixels = {yellow_pixels}, Non-White Pixels = {total_pixels}")
                
#                 if total_pixels > 0:
#                     yellow_percentage = (yellow_pixels / total_pixels) * 100
#                     print(f"  Q{q_number}, {answer_letter}: Yellow % = {yellow_percentage:.1f}%")
                
#                 # Display image of extracted text
#                 plt.figure(figsize=(8, 4))
#                 plt.imshow(img)
#                 plt.gca().add_patch(plt.Rectangle((0, 0), expanded_rect.width, expanded_rect.height,
#                                                 fill=False, color='red', linewidth=2))
#                 plt.title(f'Q{q_number}, {answer_letter}\nYellow Pixels: {yellow_pixels}')
#                 plt.axis('off')
#                 plt.show()
                
#                 # Store the best answer based on the highest number of yellow pixels
#                 if yellow_pixels > best_yellow_pixels:
#                     best_yellow_pixels = yellow_pixels
#                     best_answer = answer_letter
            
#             if best_answer:
#                 correct_answers.append((q_number, best_answer))
#                 print(f"\n✅ Q{q_number} - Selected Answer: {best_answer} (Yellow Pixel Count: {best_yellow_pixels})")
    
#     # Convert to DataFrame
#     correct_answers_df = pd.DataFrame(correct_answers, columns=["question_number", "correct_answer"])
#     return correct_answers_df

In [None]:
# Run the three extraction passes over the same PDF. Each pass opens the file
# independently and returns its own DataFrame.
questions_df = extract_questions(pdf_path=pdf_file)
answers_df = extract_answers(pdf_path=pdf_file)
correct_answers_df = extract_correct_answers(pdf_path=pdf_file)

In [None]:
# Inspect the question text returned by extract_questions.
questions_df

In [None]:
# Inspect the answer options returned by extract_answers.
answers_df

In [None]:
# Inspect the output of extract_correct_answers.
correct_answers_df

In [None]:
# Combine questions, answer options and correct answers side by side.
#
# pd.concat(axis=1) pairs rows by index label and fills whatever is missing with
# NaN, so if one extractor returns fewer rows than the others the combined table
# is still built, but questions, options and answers may no longer line up.
# Report unequal row counts so the mismatch is visible instead of silent.
row_counts = {
    "questions_df": len(questions_df),
    "answers_df": len(answers_df),
    "correct_answers_df": len(correct_answers_df),
}
if len(set(row_counts.values())) > 1:
    print(f"WARNING: extracted row counts differ; rows may be misaligned: {row_counts}")

# ignore_index=True relabels the concatenated columns 0..n-1, so the final names
# are assigned explicitly. The assignment requires exactly six columns in total.
total_df = pd.concat(
    [questions_df, answers_df, correct_answers_df],
    axis=1,
    ignore_index=True,
)
total_df.columns = ["questions", "option_A", "option_B", "option_C", "option_D", "correct_answer"]
total_df

In [None]:
# Write the combined table next to the source PDF, reusing the PDF's name with a
# .csv extension.
# Stripping the extension first and then splitting the remainder gives the
# containing directory and the bare file name in one step.
folder_path, file_name = os.path.split(os.path.splitext(pdf_file)[0])
csv_file = os.path.join(folder_path, f"{file_name}.csv")

# index=False omits the row index column. "utf-8-sig" writes a UTF-8 byte-order
# mark at the start of the file (presumably so spreadsheet tools detect the
# encoding of the accented text; confirm with the consumer of the CSV).
total_df.to_csv(csv_file, index=False, encoding="utf-8-sig")
print(f"CSV saved at: {csv_file}")
