In [1]:
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
import re
import os

In [2]:
# Path of the exam PDF; it is passed to every extractor below and also
# determines where the resulting CSV is written.
# NOTE(review): absolute, machine-specific path -- point it at your local copy
# of the file before running this notebook on another machine.
pdf_file = '/Users/rodrigocarrillo/Library/CloudStorage/OneDrive-EmoryUniversity/Natural Language Processing Projects/Examen Residentado Peru/CONAREME/Year2022/Especialidad Prueba A.pdf'

In [3]:
def extract_questions(pdf_path):
    """Extract the question stems from every page of a PDF.

    A question is matched as a number followed by a dot and one whitespace
    character, then the shortest run of text (line breaks included) that ends
    right before the next ``"A."``. Matching is done independently on each
    page's text, so the number and the ``"A."`` that closes the stem must be
    on the same page for the question to be captured.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file; passed to ``fitz.open``.

    Returns
    -------
    pandas.DataFrame
        One row per matched question, in page order, with a single column
        ``"question"`` holding the stripped stem. Line breaks inside a stem
        are kept as extracted.
    """
    # Number, dot, whitespace, then a lazy body (optionally ending in '?'),
    # stopping just before -- not consuming -- the following "A.".
    question_pattern = re.compile(r"(\d+)\.\s(.*?\??)\s*(?=A\.)", re.DOTALL)

    questions_data = []

    # The context manager closes the document even if extraction raises,
    # so the file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text("text")  # full plain text of the page
            # findall yields (number, stem) tuples; only the stem is kept.
            questions_data.extend(
                question.strip()
                for _number, question in question_pattern.findall(text)
            )

    return pd.DataFrame(questions_data, columns=["question"])

In [4]:
def extract_answers(pdf_path):
    """Extract the four answer options (A-D) of every question in a PDF.

    Every text line that starts with ``A.``, ``B.``, ``C.`` or ``D.`` followed
    by whitespace and some text is treated as an option. Options are grouped
    by their letter, and a row is emitted as soon as all four letters of a
    group have been seen.

    The partially collected group is carried across page boundaries, so a
    question whose options are split by a page break still yields one correct
    row. A new ``A.`` line, or a letter that is already present in the current
    group, starts a fresh group and discards the incomplete one; this keeps a
    single missing or stray option from shifting every following row.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file; passed to ``fitz.open``.

    Returns
    -------
    pandas.DataFrame
        One row per complete group of options, with columns ``"option_A"``,
        ``"option_B"``, ``"option_C"`` and ``"option_D"`` holding the stripped
        option texts.
    """
    option_letters = ("A", "B", "C", "D")
    answer_pattern = re.compile(r"^(A|B|C|D)\.\s(.+)", re.MULTILINE)

    answers_data = []
    current_options = {}  # letter -> option text; deliberately not reset per page

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for line in page.get_text("text").split("\n"):
                answer_match = answer_pattern.match(line)
                if not answer_match:
                    continue

                letter, ans_text = answer_match.groups()

                # Resynchronise: "A" opens a new question, and a repeated
                # letter means the previous group never completed.
                if letter == "A" or letter in current_options:
                    current_options = {}
                current_options[letter] = ans_text.strip()

                if len(current_options) == len(option_letters):
                    # Emit by letter so each text lands in its own column
                    # regardless of the order the lines were extracted in.
                    answers_data.append([current_options[key] for key in option_letters])
                    current_options = {}

    return pd.DataFrame(
        answers_data,
        columns=["option_A", "option_B", "option_C", "option_D"],
    )

In [5]:
import fitz  # PyMuPDF
import pandas as pd
import numpy as np

def _mean_rgb_around(page, rect, padding):
    """Render the area around ``rect`` (grown by ``padding`` on every side) and return its per-channel mean colour."""
    expanded_rect = fitz.Rect(
        rect.x0 - padding, rect.y0 - padding, rect.x1 + padding, rect.y1 + padding
    )
    pixmap = page.get_pixmap(clip=expanded_rect)
    img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
    return img.mean(axis=(0, 1))


def extract_correct_answers(pdf_path, yellow_threshold=300, padding=5, verbose=True):
    """Guess the marked answer of each question from the colour around its option labels.

    On every page the strings ``"A. "``, ``"B. "``, ``"C. "`` and ``"D. "`` are
    located with ``page.search_for``. The number of questions on the page is
    the smallest hit count among the four letters, and the ``idx``-th hit of
    each letter is taken as the ``idx``-th question's option. The area around
    each hit is rendered and its mean colour is scored:

    * ``yellow_score = R + G - B``; among options scoring at least
      ``yellow_threshold`` the highest one wins.
    * If no option has reached the threshold yet, the first option whose mean
      R, G and B are exactly equal is used as a fallback; a later option that
      reaches the threshold still replaces it.
    * If neither rule fires, the answer is ``None``.

    NOTE(review): pairing hits purely by index assumes the search hits of the
    four letters line up question by question -- confirm on the target PDFs.
    NOTE(review): PyMuPDF documents ``search_for`` as case-insensitive, so an
    "a. " at the end of a sentence presumably also counts as a hit -- verify.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file; passed to ``fitz.open``.
    yellow_threshold : float, optional
        Minimum ``R + G - B`` of the mean colour for an option to count as
        highlighted. Defaults to 300.
    padding : float, optional
        Amount by which each hit rectangle is grown on every side before it is
        rendered. Defaults to 5.
    verbose : bool, optional
        When true (the default) the per-option scores and the selected answer
        of every question are printed.

    Returns
    -------
    pandas.DataFrame
        One row per detected question, in page order, with a single column
        ``"correct_answer"`` holding ``"A"``-``"D"`` or ``None``.
    """
    option_letters = ("A", "B", "C", "D")
    correct_answers = []

    # The context manager closes the document even if rendering raises.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            # All hit rectangles of each option label on this page.
            answer_positions = {
                letter: page.search_for(f"{letter}. ") for letter in option_letters
            }

            # Only indices that exist for all four letters can be compared.
            num_questions = min(len(hits) for hits in answer_positions.values())

            for idx in range(num_questions):
                best_answer = None
                best_yellow_score = 0

                for answer_letter in option_letters:
                    mean_color = _mean_rgb_around(
                        page, answer_positions[answer_letter][idx], padding
                    )

                    # Zero only when the three channel means are identical.
                    uniform_score = (
                        np.abs(mean_color[0] - mean_color[1])
                        + np.abs(mean_color[0] - mean_color[2])
                    )
                    is_gray = uniform_score == 0
                    if is_gray:
                        uniform_score = 0  # plain int so the log line shows a bare "0"

                    # High red + green with low blue indicates yellow.
                    yellow_score = mean_color[0] + mean_color[1] - mean_color[2]

                    if verbose:
                        print(f"Page {page_num}, Q{idx+1}, {answer_letter}: Mean Color {mean_color}, "
                              f"Uniform Score: {uniform_score}, Yellow Score: {yellow_score}")

                    if yellow_score >= yellow_threshold:
                        # Strongest yellow wins, overriding any gray fallback.
                        if yellow_score > best_yellow_score:
                            best_yellow_score = yellow_score
                            best_answer = answer_letter
                    elif is_gray and not best_answer:
                        # Fallback: first perfectly gray option while nothing is selected.
                        best_answer = answer_letter

                if verbose:
                    print(f"Selected Answer for Q{idx+1}: {best_answer}\n")
                correct_answers.append(best_answer)

    return pd.DataFrame(correct_answers, columns=["correct_answer"])


In [6]:
# Run the three extractors on the same PDF. Each call opens the file itself
# and returns its own DataFrame.
questions_df = extract_questions(pdf_file)
answers_df = extract_answers(pdf_file)
correct_answers_df = extract_correct_answers(pdf_file)

Page 1, Q1, A: Mean Color [214.78769841 214.78769841 214.78769841], Uniform Score: 0, Yellow Score: 214.7876984126984
Page 1, Q1, B: Mean Color [223.24456522 220.94384058 184.53623188], Uniform Score: 41.0090579710145, Yellow Score: 259.6521739130435
Page 1, Q1, C: Mean Color [234.84057971 229.54347826 145.84963768], Uniform Score: 94.2880434782609, Yellow Score: 318.5344202898551
Page 1, Q1, D: Mean Color [233.32514178 233.32514178 233.32514178], Uniform Score: 0, Yellow Score: 233.3251417769376
Selected Answer for Q1: C

Page 1, Q2, A: Mean Color [220.13232514 220.13232514 220.13232514], Uniform Score: 0, Yellow Score: 220.13232514177693
Page 1, Q2, B: Mean Color [224.21376812 221.73913043 183.29891304], Uniform Score: 43.389492753623216, Yellow Score: 262.6539855072464
Page 1, Q2, C: Mean Color [232.59546314 227.03402647 140.73534972], Uniform Score: 97.42155009451793, Yellow Score: 318.89413988657844
Page 1, Q2, D: Mean Color [218.29710145 218.29710145 218.29710145], Uniform Score:

In [7]:
# Display the extracted question stems (single "question" column).
questions_df

Unnamed: 0,question
0,"Lactante de 10 meses, es traído a emergencia p..."
1,"Mujer de 30 años, intervenida de hernioplastia..."
2,"Niña de 3 años, con prurito intenso en cuero c..."
3,¿Con qué agente etiológico está relacionada la...
4,En un paciente con diagnóstico de Síndrome de ...
...,...
95,"Niño de 7 años, sin antecedente previo, súbita..."
96,En relación a las cisternas subaracnoideas. ¿A...
97,"RN varón, presenta vómitos no biliosos “en pro..."
98,"Varón de 25 años, politraumatizado, se queja d..."


In [8]:
# Display the extracted options (columns option_A ... option_D).
answers_df

Unnamed: 0,option_A,option_B,option_C,option_D
0,Síndrome de West,Mioclonías,Convulsión febril compleja,Convulsión febril simple
1,Depresión respiratoria,Schok anafiláctico,Desgarro de duramadre,Reacción vagal
2,"Malation al 0,5%",Tacrolimus,Ivermectina,Lindano loción
3,Infecciones por estafilococo,Radiación,Histoplasmosis,Equinococosis
4,Membrana basal glomerular,Conductos papilares,Túbulos conectores,Conductos colectores corticales
...,...,...,...,...
95,Cuerpo extraño en laringe,Laringitis subglótica,Laringitis espasmódica,Espasmo bronquial agudo
96,Pontina,Interpendicular,Cuadrigémina,Cerebelobulbar posterior
97,Estenosis pilórica hipertrófica,Estenosis duodenal,Intususcepción,Atresia yeyunal
98,Herida extensa en piel,Falta de pulso poplíteo,Acortamiento y angulación,Gran edema del miembro afectado


In [9]:
# Display the answer letter selected for each question ("correct_answer" column).
correct_answers_df

Unnamed: 0,correct_answer
0,C
1,C
2,C
3,A
4,A
...,...
95,A
96,C
97,A
98,C


In [10]:
# pd.concat(axis=1) pairs rows by index and pads the shorter frames with NaN,
# so differing row counts would silently produce a misaligned table. Fail
# loudly instead of exporting questions paired with the wrong options/answers.
extracted_row_counts = {
    "questions": len(questions_df),
    "answers": len(answers_df),
    "correct_answers": len(correct_answers_df),
}
if len(set(extracted_row_counts.values())) != 1:
    raise ValueError(
        f"Extraction row counts differ, cannot align rows: {extracted_row_counts}"
    )

# Place the three frames side by side; ignore_index=True discards the original
# column labels, so they are assigned explicitly right after.
total_df = pd.concat([questions_df, answers_df, correct_answers_df], ignore_index=True, axis=1)
total_df.columns = ["questions", "option_A", "option_B", "option_C", "option_D", "correct_answer"]
total_df

Unnamed: 0,questions,option_A,option_B,option_C,option_D,correct_answer
0,"Lactante de 10 meses, es traído a emergencia p...",Síndrome de West,Mioclonías,Convulsión febril compleja,Convulsión febril simple,C
1,"Mujer de 30 años, intervenida de hernioplastia...",Depresión respiratoria,Schok anafiláctico,Desgarro de duramadre,Reacción vagal,C
2,"Niña de 3 años, con prurito intenso en cuero c...","Malation al 0,5%",Tacrolimus,Ivermectina,Lindano loción,C
3,¿Con qué agente etiológico está relacionada la...,Infecciones por estafilococo,Radiación,Histoplasmosis,Equinococosis,A
4,En un paciente con diagnóstico de Síndrome de ...,Membrana basal glomerular,Conductos papilares,Túbulos conectores,Conductos colectores corticales,A
...,...,...,...,...,...,...
95,"Niño de 7 años, sin antecedente previo, súbita...",Cuerpo extraño en laringe,Laringitis subglótica,Laringitis espasmódica,Espasmo bronquial agudo,A
96,En relación a las cisternas subaracnoideas. ¿A...,Pontina,Interpendicular,Cuadrigémina,Cerebelobulbar posterior,C
97,"RN varón, presenta vómitos no biliosos “en pro...",Estenosis pilórica hipertrófica,Estenosis duodenal,Intususcepción,Atresia yeyunal,A
98,"Varón de 25 años, politraumatizado, se queja d...",Herida extensa en piel,Falta de pulso poplíteo,Acortamiento y angulación,Gran edema del miembro afectado,C


In [11]:
# Derive the output location from the PDF path: same folder, same base name,
# with the extension swapped for .csv.
folder_path, pdf_basename = os.path.split(pdf_file)
file_name, pdf_extension = os.path.splitext(pdf_basename)
csv_file = os.path.join(folder_path, f"{file_name}.csv")

# Write the full combined table without the index column, encoded as UTF-8
# with a byte-order mark.
total_df.to_csv(csv_file, index=False, encoding="utf-8-sig")
print(f"CSV saved at: {csv_file}")


CSV saved at: /Users/rodrigocarrillo/Library/CloudStorage/OneDrive-EmoryUniversity/Natural Language Processing Projects/Examen Residentado Peru/CONAREME/Year2022/Especialidad Prueba A.csv
