In [1]:
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
import re
import os

In [None]:
# Path of the exam PDF to parse. The author's local copy stays the default so
# existing runs are unchanged; set the CONAREME_PDF environment variable to
# point the notebook at another copy without editing this cell.
pdf_file = os.environ.get(
    'CONAREME_PDF',
    '/Users/rodrigocarrillo/Library/CloudStorage/OneDrive-EmoryUniversity/Natural Language Processing Projects/Examen Residentado Peru/CONAREME/Year2022/Especialidad Prueba B.pdf',
)

In [3]:
def extract_questions(pdf_path):
    """Extract the question stems from an exam PDF.

    Each page's text is scanned on its own for a number followed by ". ";
    everything from there up to (but not including) the next "A." is taken
    as one question stem.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF, handed to ``fitz.open``.

    Returns
    -------
    pandas.DataFrame
        One row per match, in page order, with a single ``question`` column
        holding the whitespace-stripped stem text.
    """
    # A number, ". ", then lazily everything (newlines included, thanks to
    # DOTALL) up to the first "A."; the stem may or may not end with a '?'.
    question_pattern = re.compile(r"(\d+)\.\s(.*?\??)\s*(?=A\.)", re.DOTALL)

    questions_data = []

    # The context manager closes the document even if parsing raises, so the
    # file handle is no longer left open after the call.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text("text")
            # Matching is done per page, so a stem whose "A." option only
            # appears on the following page is not captured.
            questions_data.extend(
                question.strip()
                for _number, question in question_pattern.findall(text)
            )

    return pd.DataFrame(questions_data, columns=["question"])

In [4]:
def extract_answers(pdf_path):
    """Extract the four answer options (A-D) of every question in an exam PDF.

    Each page is split into lines, and a line counts as an option when it
    starts with ``A. ``, ``B. ``, ``C. `` or ``D. ``. Only the text on that
    line is kept. A row is emitted only when A, B, C and D are seen in that
    order on the same page: every "A" line starts a fresh group and an
    out-of-order letter discards the group in progress, so one missed or
    stray option line cannot shift the options of the questions after it.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF, handed to ``fitz.open``.

    Returns
    -------
    pandas.DataFrame
        One row per complete option group, in page order, with columns
        ``option_A``, ``option_B``, ``option_C`` and ``option_D``.
    """
    # Option letter at the start of the line, ". ", then the option text.
    answer_pattern = re.compile(r"^(A|B|C|D)\.\s(.+)", re.MULTILINE)
    option_letters = "ABCD"

    answers_data = []

    # The context manager closes the document even if parsing raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            lines = page.get_text("text").split("\n")

            # Groups never span pages: a partial group is dropped at page end.
            current_answers = []
            for line in lines:
                answer_match = answer_pattern.match(line)
                if not answer_match:
                    continue

                letter, ans_text = answer_match.groups()
                if letter == "A":
                    # "A" always opens a new question, discarding any
                    # incomplete group left over from the previous one.
                    current_answers = [ans_text.strip()]
                elif current_answers and letter == option_letters[len(current_answers)]:
                    current_answers.append(ans_text.strip())
                else:
                    # Out-of-order letter: this group cannot be trusted.
                    current_answers = []
                    continue

                if len(current_answers) == 4:
                    answers_data.append(current_answers)
                    current_answers = []

    return pd.DataFrame(
        answers_data, columns=["option_A", "option_B", "option_C", "option_D"]
    )

In [5]:
import fitz  # PyMuPDF
import pandas as pd
import numpy as np

def extract_correct_answers(pdf_path, yellow_threshold=300, padding=5, verbose=True):
    """Guess the marked answer of each question from the colour around its option labels.

    On every page the positions of the texts "A. ", "B. ", "C. " and "D. " are
    looked up with ``page.search_for``. The idx-th hit of each letter is
    treated as belonging to the idx-th question on that page, and the number
    of questions on a page is the smallest hit count among the four letters.
    For each label the padded area around it is rendered and scored, and the
    answer is chosen as follows:

    1. the label with the highest yellow score (mean R + G - B) that is at
       least ``yellow_threshold``;
    2. otherwise, the first label whose three mean colour channels are
       exactly equal;
    3. otherwise ``None``.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF, handed to ``fitz.open``.
    yellow_threshold : float, optional
        Minimum yellow score for a label to count as highlighted (default 300).
    padding : float, optional
        Amount added on every side of a label's rectangle before rendering,
        in the page coordinates used by ``search_for`` (default 5).
    verbose : bool, optional
        When true (the default), print the per-label scores and the selected
        answer of every question.

    Returns
    -------
    pandas.DataFrame
        One row per detected question, in page order, with a single
        ``correct_answer`` column holding "A", "B", "C", "D" or ``None``.
    """
    correct_answers = []

    # The context manager closes the document even if rendering raises.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            # NOTE(review): search_for looks for the text anywhere on the page,
            # so hits are not guaranteed to be option labels, and pairing the
            # idx-th hit of each letter assumes they line up per question.
            # Confirm against the PDFs.
            answer_positions = {
                letter: page.search_for(f"{letter}. ") for letter in "ABCD"
            }
            num_questions = min(len(rects) for rects in answer_positions.values())

            for idx in range(num_questions):
                best_answer = None
                best_yellow_score = 0

                for answer_letter, rects in answer_positions.items():
                    mean_color, uniform_score, yellow_score = _highlight_scores(
                        page, rects[idx], padding
                    )
                    is_uniform = uniform_score == 0

                    if verbose:
                        # A perfectly uniform score is logged as the integer 0.
                        shown_uniform = 0 if is_uniform else uniform_score
                        print(f"Page {page_num}, Q{idx+1}, {answer_letter}: Mean Color {mean_color}, "
                              f"Uniform Score: {shown_uniform}, Yellow Score: {yellow_score}")

                    if yellow_score >= yellow_threshold:
                        # A highlighted label always beats the uniform fallback.
                        if yellow_score > best_yellow_score:
                            best_yellow_score = yellow_score
                            best_answer = answer_letter
                    elif is_uniform and best_answer is None:
                        # NOTE(review): with no highlight found this picks the
                        # first label on a colour-neutral background. Confirm
                        # this fallback is intended rather than leaving None.
                        best_answer = answer_letter

                if verbose:
                    print(f"Selected Answer for Q{idx+1}: {best_answer}\n")
                correct_answers.append(best_answer)

    return pd.DataFrame(correct_answers, columns=["correct_answer"])


def _highlight_scores(page, rect, padding):
    """Return ``(mean_color, uniform_score, yellow_score)`` for the padded area around ``rect``."""
    # Grow the label's box so the surrounding highlight is sampled as well.
    expanded_rect = fitz.Rect(
        rect.x0 - padding, rect.y0 - padding, rect.x1 + padding, rect.y1 + padding
    )

    pixmap = page.get_pixmap(clip=expanded_rect)
    # assumes the first three channels of the rendered pixmap are R, G, B. TODO confirm
    img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
    mean_color = img.mean(axis=(0, 1))

    # 0 when the three mean channels are identical (a colour-neutral area).
    uniform_score = np.abs(mean_color[0] - mean_color[1]) + np.abs(mean_color[0] - mean_color[2])
    # High red and green with low blue reads as yellow.
    yellow_score = mean_color[0] + mean_color[1] - mean_color[2]
    return mean_color, uniform_score, yellow_score


In [6]:
# Run the three extraction passes over the same PDF; each one returns a DataFrame.
questions_df = extract_questions(pdf_file)
answers_df = extract_answers(pdf_file)
correct_answers_df = extract_correct_answers(pdf_file)

Page 1, Q1, A: Mean Color [222.88768116 222.88768116 222.88768116], Uniform Score: 0, Yellow Score: 222.88768115942028
Page 1, Q1, B: Mean Color [230.15879017 230.15879017 230.15879017], Uniform Score: 0, Yellow Score: 230.15879017013233
Page 1, Q1, C: Mean Color [225.09601449 219.95652174 138.13949275], Uniform Score: 92.09601449275362, Yellow Score: 306.9130434782609
Page 1, Q1, D: Mean Color [233.52930057 231.31758034 195.93383743], Uniform Score: 39.80718336483935, Yellow Score: 268.9130434782609
Selected Answer for Q1: C

Page 1, Q2, A: Mean Color [221.28166352 221.28166352 221.28166352], Uniform Score: 0, Yellow Score: 221.28166351606805
Page 1, Q2, B: Mean Color [228.8115942 228.8115942 228.8115942], Uniform Score: 0, Yellow Score: 228.81159420289856
Page 1, Q2, C: Mean Color [233.78638941 231.62570888 197.00378072], Uniform Score: 38.94328922495271, Yellow Score: 268.4083175803402
Page 1, Q2, D: Mean Color [230.07246377 224.86594203 142.56702899], Uniform Score: 92.711956521739

In [7]:
# Display the extracted question stems for a visual check.
questions_df

Unnamed: 0,question
0,"Mujer de 45 años, que labora como operaria de ..."
1,Mujer de 70 años con diagnóstico de pancreatit...
2,"Lactante de 10 meses, irritable, piel con lesi..."
3,En la RCP básica de alta calidad del lactante ...
4,"Neonato de 5 días, es traído por madre quien r..."
...,...
95,"Adolescente de 15 años, acude a consultorio po..."
96,Varón de 40 años acude por dolor en región lum...
97,Mujer de 36 años con antecedente de LES activo...
98,Se cuenta con un excelente programa de manejo ...


In [8]:
# Display the extracted answer options (one column per option letter) for a visual check.
answers_df

Unnamed: 0,option_A,option_B,option_C,option_D
0,Rótula,Meniscos,Bursa de la rodilla,Ligamento cruzado
1,Neoplasia pancreática,Acumulación de líquido pancreático,Acumulación necrótica aguda,Seudoquiste pancreático
2,Tacrolimus,Lindano,Permetrina al 5%,Mometasona al 1%
3,30/2,15/2,40/1,40/2
4,Incompatibilidad Rh,Fallo de la lactancia,Hepatitis del RN,Cefalohematoma
...,...,...,...,...
95,Antimicrobiano tópico,Corticoide tópico,Antibiótico oral,Antihistamínico oral
96,UroTEM sin contraste,Ecografía renal,Urografía excretoria,Radiografía simple de abdomen
97,Ciclofosfamida,Micofenolato,Dexametasona,Metotrexato
98,Incremento de incidencia,Disminución de prevalencia,Disminución de incidencia,Incremento de prevalencia


In [9]:
# Display the answer letter selected for each question for a visual check.
correct_answers_df

Unnamed: 0,correct_answer
0,C
1,D
2,C
3,A
4,B
...,...
95,C
96,A
97,A
98,D


In [10]:
# Stitch the three extractions together side by side. The frames are matched
# row by row on their index, and pd.concat would silently pad a shorter frame
# with NaN, so a row-count mismatch is reported instead of producing rows
# whose question, options and answer do not belong together.
extracted_frames = [questions_df, answers_df, correct_answers_df]
extracted_row_counts = [len(frame) for frame in extracted_frames]
if len(set(extracted_row_counts)) != 1:
    raise ValueError(
        "Extraction row counts differ (questions, answers, correct answers): "
        f"{extracted_row_counts}"
    )

total_df = pd.concat(extracted_frames, ignore_index = True, axis = 1)
# ignore_index=True replaces the column labels with 0..5, so name them again.
total_df.columns = ["questions", "option_A", "option_B", "option_C", "option_D", "correct_answer"]
total_df

Unnamed: 0,questions,option_A,option_B,option_C,option_D,correct_answer
0,"Mujer de 45 años, que labora como operaria de ...",Rótula,Meniscos,Bursa de la rodilla,Ligamento cruzado,C
1,Mujer de 70 años con diagnóstico de pancreatit...,Neoplasia pancreática,Acumulación de líquido pancreático,Acumulación necrótica aguda,Seudoquiste pancreático,D
2,"Lactante de 10 meses, irritable, piel con lesi...",Tacrolimus,Lindano,Permetrina al 5%,Mometasona al 1%,C
3,En la RCP básica de alta calidad del lactante ...,30/2,15/2,40/1,40/2,A
4,"Neonato de 5 días, es traído por madre quien r...",Incompatibilidad Rh,Fallo de la lactancia,Hepatitis del RN,Cefalohematoma,B
...,...,...,...,...,...,...
95,"Adolescente de 15 años, acude a consultorio po...",Antimicrobiano tópico,Corticoide tópico,Antibiótico oral,Antihistamínico oral,C
96,Varón de 40 años acude por dolor en región lum...,UroTEM sin contraste,Ecografía renal,Urografía excretoria,Radiografía simple de abdomen,A
97,Mujer de 36 años con antecedente de LES activo...,Ciclofosfamida,Micofenolato,Dexametasona,Metotrexato,A
98,Se cuenta con un excelente programa de manejo ...,Incremento de incidencia,Disminución de prevalencia,Disminución de incidencia,Incremento de prevalencia,D


In [11]:
# Build the output path: same folder and base name as the source PDF, with a
# .csv extension in place of the PDF's own extension.
folder_path = os.path.split(pdf_file)[0]
file_name = os.path.splitext(os.path.split(pdf_file)[1])[0]
csv_file = os.path.join(folder_path, file_name + ".csv")

# Write the combined table without the index column; the utf-8-sig codec puts
# a UTF-8 byte-order mark at the start of the file.
total_df.to_csv(csv_file, index=False, encoding="utf-8-sig")
print(f"CSV saved at: {csv_file}")


CSV saved at: /Users/rodrigocarrillo/Library/CloudStorage/OneDrive-EmoryUniversity/Natural Language Processing Projects/Examen Residentado Peru/CONAREME/Year2024/ESPECIALIADAD B.csv
