In [1]:
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
import re
import os

In [2]:
# Location of the highlighted exam PDF. Set the CONAREME_PDF_FILE environment
# variable to process a different file without editing the notebook; when it is
# unset (or empty) the original local path below is used.
pdf_file = os.environ.get("CONAREME_PDF_FILE") or (
    '/Users/rodrigocarrillo/Library/CloudStorage/OneDrive-EmoryUniversity/Natural Language Processing Projects/Examen Residentado Peru/CONAREME/Year2023/PRUEBA B.pdf'
)

In [3]:
def extract_questions(pdf_path):
    """Extract the numbered question stems from an exam PDF.

    Each page's plain text is scanned for ``<number>. <stem>`` followed by a
    line that begins with the first option marker ``A. ``. The stem (without
    its number) is stored with leading/trailing whitespace stripped; a
    trailing ``?`` is kept when present.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF to open with PyMuPDF.

    Returns
    -------
    pandas.DataFrame
        One row per question found, in page order, in a single
        ``question`` column.

    Notes
    -----
    Pages are scanned independently, so a stem whose ``A.`` line falls on the
    following page is missed.
    """
    # The look-ahead is anchored to a line start (MULTILINE) so that an "A."
    # inside the stem itself (e.g. "hepatitis A.") does not end the question
    # early. extract_answers likewise only accepts options at line starts.
    question_pattern = re.compile(
        r"(\d+)\.\s(.*?\??)\s*(?=^A\.\s)", re.DOTALL | re.MULTILINE
    )

    questions_data = []  # Question stems, in reading order

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text("text")  # Full page text
            questions_data.extend(
                stem.strip() for _number, stem in question_pattern.findall(text)
            )

    return pd.DataFrame(questions_data, columns=["question"])

In [4]:
def extract_answers(pdf_path):
    """Extract the four answer options (A-D) of every question in a PDF.

    Every text line that starts with ``A. ``, ``B. ``, ``C. `` or ``D. `` is
    treated as an option. Options are grouped by their letter: an ``A`` line
    starts a new group and a row is emitted once all four letters have been
    seen. Groups are carried across page boundaries, so a question whose
    options are split between two pages still produces one correct row.
    Incomplete groups are dropped.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF to open with PyMuPDF.

    Returns
    -------
    pandas.DataFrame
        One row per complete option set, with columns ``option_A``,
        ``option_B``, ``option_C`` and ``option_D``.

    Notes
    -----
    Matching is done line by line, so only the first line of an option that
    wraps over several lines is captured.
    """
    option_pattern = re.compile(r"^(A|B|C|D)\.\s(.+)")
    option_letters = ("A", "B", "C", "D")

    answers_data = []     # Completed [A, B, C, D] rows
    current_options = {}  # Letter -> text for the question being assembled

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for line in page.get_text("text").split("\n"):
                option_match = option_pattern.match(line)
                if option_match is None:
                    continue

                letter, option_text = option_match.groups()

                # An "A" line always opens a new question; anything left from
                # an incomplete previous group must not leak into it.
                if letter == "A":
                    current_options = {}
                current_options[letter] = option_text.strip()

                if len(current_options) == len(option_letters):
                    answers_data.append(
                        [current_options[key] for key in option_letters]
                    )
                    current_options = {}

    return pd.DataFrame(
        answers_data, columns=["option_A", "option_B", "option_C", "option_D"]
    )

In [5]:
import fitz  # PyMuPDF
import pandas as pd
import numpy as np

def extract_correct_answers(pdf_path, yellow_threshold=300, margin=5, verbose=True):
    """Detect the highlighted (correct) option of each question from page colours.

    For every page the positions of the strings ``"A. "``, ``"B. "``,
    ``"C. "`` and ``"D. "`` are located with ``page.search_for``. The i-th hit
    of each letter is assumed to belong to the i-th question on that page.
    The area around each hit is rendered and its mean colour is scored:

    * ``yellow_score = R + G - B``; an option scoring at least
      ``yellow_threshold`` is a highlight candidate, and the highest-scoring
      candidate wins.
    * If no candidate has been found yet, the first option whose mean R, G
      and B are identical is used as a fallback.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF to open with PyMuPDF.
    yellow_threshold : float, optional
        Minimum yellow score for an option to count as highlighted
        (default 300, the previously hard-coded value).
    margin : float, optional
        Amount by which each hit rectangle is grown on every side before
        rendering (default 5, the previously hard-coded value).
    verbose : bool, optional
        When true (default) print the per-option scores and the selected
        answer. Question numbers in these messages restart on every page.

    Returns
    -------
    pandas.DataFrame
        One row per detected question with a single ``correct_answer``
        column holding ``"A"``-``"D"`` or ``None`` when nothing was selected.

    Notes
    -----
    NOTE(review): PyMuPDF documents its text search as case-insensitive, so
    hits may include e.g. "a. " at the end of a word - confirm the hit counts
    when running on a new PDF.
    """
    option_letters = ("A", "B", "C", "D")
    correct_answers = []

    # The context manager closes the document even if processing fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            # All hit rectangles per option letter on this page
            answer_positions = {
                letter: page.search_for(f"{letter}. ") for letter in option_letters
            }

            # Only indices available for all four letters can form a question
            num_questions = min(len(hits) for hits in answer_positions.values())

            for idx in range(num_questions):
                best_answer = None
                best_yellow_score = 0  # Highest yellow score seen so far

                for answer_letter in option_letters:
                    rect = answer_positions[answer_letter][idx]

                    # Expand bounding box to include the highlight area
                    expanded_rect = fitz.Rect(
                        rect.x0 - margin, rect.y0 - margin,
                        rect.x1 + margin, rect.y1 + margin,
                    )

                    # Render the region and view the samples as (h, w, channels)
                    pixmap = page.get_pixmap(clip=expanded_rect)
                    img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                        pixmap.h, pixmap.w, pixmap.n
                    )
                    mean_color = img.mean(axis=(0, 1))

                    # 0 when the three channel means are identical (grey region)
                    uniform_score = (np.abs(mean_color[0] - mean_color[1])
                                     + np.abs(mean_color[0] - mean_color[2]))
                    if uniform_score == 0:
                        uniform_score = 0  # plain int so the log shows "0"

                    # Yellow intensity: high R + G, low B
                    yellow_score = mean_color[0] + mean_color[1] - mean_color[2]

                    if verbose:
                        print(f"Page {page_num}, Q{idx+1}, {answer_letter}: Mean Color {mean_color}, "
                              f"Uniform Score: {uniform_score}, Yellow Score: {yellow_score}")

                    if yellow_score >= yellow_threshold:
                        # Highlight candidate: keep the strongest one
                        if yellow_score > best_yellow_score:
                            best_yellow_score = yellow_score
                            best_answer = answer_letter
                    elif uniform_score == 0 and best_answer is None:
                        # Fallback: first perfectly grey option, used only
                        # while no other answer has been selected
                        best_answer = answer_letter

                if verbose:
                    print(f"Selected Answer for Q{idx+1}: {best_answer}\n")
                correct_answers.append(best_answer)

    return pd.DataFrame(correct_answers, columns=["correct_answer"])


In [6]:
# Run the three extraction passes over the same PDF, in order: question stems,
# answer options, then the highlighted answer key.
questions_df, answers_df, correct_answers_df = (
    extractor(pdf_file)
    for extractor in (extract_questions, extract_answers, extract_correct_answers)
)

Page 1, Q1, A: Mean Color [224.87862319 219.51268116 135.24275362], Uniform Score: 95.00181159420293, Yellow Score: 309.14855072463774
Page 1, Q1, B: Mean Color [234.38752363 232.16635161 196.90359168], Uniform Score: 39.70510396975425, Yellow Score: 269.6502835538753
Page 1, Q1, C: Mean Color [220.41123188 220.41123188 220.41123188], Uniform Score: 0, Yellow Score: 220.41123188405797
Page 1, Q1, D: Mean Color [234.5973535 234.5973535 234.5973535], Uniform Score: 0, Yellow Score: 234.59735349716445
Selected Answer for Q1: A

Page 1, Q2, A: Mean Color [224.26449275 224.26449275 224.26449275], Uniform Score: 0, Yellow Score: 224.2644927536232
Page 1, Q2, B: Mean Color [231.56521739 231.56521739 231.56521739], Uniform Score: 0, Yellow Score: 231.56521739130434
Page 1, Q2, C: Mean Color [233.58333333 228.30072464 144.49637681], Uniform Score: 94.36956521739134, Yellow Score: 317.3876811594203
Page 1, Q2, D: Mean Color [233.48393195 231.16446125 194.24196597], Uniform Score: 41.561436672967

In [7]:
# Display the extracted question stems (one row per question).
questions_df

Unnamed: 0,question
0,¿Cómo se denominan las masas ováricas bilatera...
1,"Mujer de 37 años, hace 5 meses presenta nódulo..."
2,"Recién nacido de un día, nacido de parto eutóc..."
3,"Varón de 35 años, viaja a Huaraz, acude a emer..."
4,De los siguientes síntomas. ¿Cuál es caracterí...
...,...
95,"Mujer de 45 años, presenta oliguria posterior ..."
96,En la evaluación de un recién nacido. ¿Cuál de...
97,"Varón de 26 años, desde hace 5 días, presenta ..."
98,De acuerdo al algoritmo de valoración del paci...


In [8]:
# Display the extracted answer options (columns option_A .. option_D).
answers_df

Unnamed: 0,option_A,option_B,option_C,option_D
0,Tumor de Krukenberg,Tumor de células de Sertoli,Tumor de Brenner,Fibrotecoma
1,Medular,Folicular,Papilar,Anaplásico
2,Osteomielitis de hombro,Trauma obstétrico,Sífilis congénita,Artritis séptica
3,Diuréticos de asa,Oxigenoterapia con O2,Oxigenoterapia con Óxido Nítrico,Acetazolamida
4,Disuria,Nicturia frecuente,Pierde orina al toser,Perdida escasa de orina
...,...,...,...,...
95,Vasculitis,Glomérulonefritis,Necrosis tubular aguda,Nefritis intersticial
96,Diástasis de rectos,Saturación de O2 de 90%,Telangiectasia occipital,Puntos rubíes
97,Glucosa 60 mg/dl,Ácido láctico 6 mmol/l,Proteínas 40 mg/dl,Recuento celular 100 /µl
98,Exponer al paciente por completo y proteger de...,"Valorar vía aérea, provocar respuesta verbal y...","Valorar la respiración, exploración física y p...","Valorar circulación, examen físico y control d..."


In [9]:
# Display the answer letter selected for each question.
correct_answers_df

Unnamed: 0,correct_answer
0,A
1,C
2,C
3,B
4,B
...,...
95,C
96,B
97,B
98,B


In [10]:
# The three frames are paired purely by row position. Refuse to merge when they
# disagree on the number of rows: pd.concat(axis=1) would otherwise pad the
# shorter frames with NaN and silently pair stems with the wrong options or key.
row_counts = {
    "questions": len(questions_df),
    "answers": len(answers_df),
    "correct_answers": len(correct_answers_df),
}
if len(set(row_counts.values())) != 1:
    raise ValueError(f"Extraction row counts differ: {row_counts}")

# Place the frames side by side; ignore_index resets the column labels, so the
# final names are assigned explicitly afterwards.
total_df = pd.concat([questions_df, answers_df, correct_answers_df], ignore_index = True, axis = 1)
total_df.columns = ["questions", "option_A", "option_B", "option_C", "option_D", "correct_answer"]
total_df

Unnamed: 0,questions,option_A,option_B,option_C,option_D,correct_answer
0,¿Cómo se denominan las masas ováricas bilatera...,Tumor de Krukenberg,Tumor de células de Sertoli,Tumor de Brenner,Fibrotecoma,A
1,"Mujer de 37 años, hace 5 meses presenta nódulo...",Medular,Folicular,Papilar,Anaplásico,C
2,"Recién nacido de un día, nacido de parto eutóc...",Osteomielitis de hombro,Trauma obstétrico,Sífilis congénita,Artritis séptica,C
3,"Varón de 35 años, viaja a Huaraz, acude a emer...",Diuréticos de asa,Oxigenoterapia con O2,Oxigenoterapia con Óxido Nítrico,Acetazolamida,B
4,De los siguientes síntomas. ¿Cuál es caracterí...,Disuria,Nicturia frecuente,Pierde orina al toser,Perdida escasa de orina,B
...,...,...,...,...,...,...
95,"Mujer de 45 años, presenta oliguria posterior ...",Vasculitis,Glomérulonefritis,Necrosis tubular aguda,Nefritis intersticial,C
96,En la evaluación de un recién nacido. ¿Cuál de...,Diástasis de rectos,Saturación de O2 de 90%,Telangiectasia occipital,Puntos rubíes,B
97,"Varón de 26 años, desde hace 5 días, presenta ...",Glucosa 60 mg/dl,Ácido láctico 6 mmol/l,Proteínas 40 mg/dl,Recuento celular 100 /µl,B
98,De acuerdo al algoritmo de valoración del paci...,Exponer al paciente por completo y proteger de...,"Valorar vía aérea, provocar respuesta verbal y...","Valorar la respiración, exploración física y p...","Valorar circulación, examen físico y control d...",B


In [11]:
# Split the PDF path into its containing folder and its file name
folder_path, pdf_name = os.path.split(pdf_file)

# Base name of the PDF, i.e. the file name with the extension removed
file_name = os.path.splitext(pdf_name)[0]

# The CSV is written next to the PDF under the same base name
csv_file = os.path.join(folder_path, f"{file_name}.csv")

# Write the merged question/options/answer table without the index column
total_df.to_csv(csv_file, index=False, encoding="utf-8-sig")
print(f"CSV saved at: {csv_file}")


CSV saved at: /Users/rodrigocarrillo/Library/CloudStorage/OneDrive-EmoryUniversity/Natural Language Processing Projects/Examen Residentado Peru/CONAREME/Year2023/PRUEBA B.csv
