In [1]:
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
import re
import os

In [2]:
# Location of the exam PDF to parse. The default below is an absolute path on
# one specific machine; set the CONAREME_PDF environment variable to point the
# notebook at another copy of the file without editing this cell.
pdf_file = os.environ.get(
    "CONAREME_PDF",
    '/Users/rodrigocarrillo/Library/CloudStorage/OneDrive-EmoryUniversity/Natural Language Processing Projects/Examen Residentado Peru/CONAREME/Year2024/ESPECIALIADAD A.pdf',
)

In [3]:
def extract_questions(pdf_path):
    """Extract the numbered question stems from an exam PDF.

    Each page's plain text is scanned for a question number (``<digits>. ``)
    followed by the stem, which ends where a line starting with ``A. `` begins.
    Pages are scanned independently, so a stem whose option A is on the
    following page is not matched.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file, passed to ``fitz.open``.

    Returns
    -------
    pandas.DataFrame
        One row per matched stem, in page order, in a single ``question``
        column. The question number itself is not kept.
    """
    # The stem stops at an "A. " that begins a line. Anchoring to the line
    # start keeps an "A." inside a sentence (e.g. "hepatitis A.") from cutting
    # the stem short. DOTALL lets the stem span several lines.
    question_pattern = re.compile(
        r"(\d+)\.\s(.*?\??)\s*(?=^A\.\s)", re.DOTALL | re.MULTILINE
    )

    questions_data = []  # Stem strings, in the order they are found

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text("text")  # Full page text
            # findall yields (number, stem) tuples; only the stem is stored.
            questions_data.extend(
                stem.strip() for _number, stem in question_pattern.findall(text)
            )

    return pd.DataFrame(questions_data, columns=["question"])

In [4]:
def extract_answers(pdf_path):
    """Extract the four answer options (A-D) of every question in an exam PDF.

    Every text line that starts with ``A. ``, ``B. ``, ``C. `` or ``D. `` is
    treated as an option label followed by its text. Options are grouped in
    A, B, C, D order; a group is emitted once its ``D`` option is read.

    Only the text on the same line as the option letter is captured; wrapped
    continuation lines are not appended.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file, passed to ``fitz.open``.

    Returns
    -------
    pandas.DataFrame
        One row per complete option group, with columns ``option_A``,
        ``option_B``, ``option_C`` and ``option_D``.
    """
    answer_pattern = re.compile(r"^(A|B|C|D)\.\s(.+)", re.MULTILINE)
    option_letters = "ABCD"

    answers_data = []  # Completed [A, B, C, D] groups
    # Kept across pages so a group split by a page break is still completed.
    current_answers = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for line in page.get_text("text").split("\n"):
                answer_match = answer_pattern.match(line)
                if not answer_match:
                    continue
                letter, ans_text = answer_match.groups()

                if letter == option_letters[len(current_answers)]:
                    # The letter we were waiting for: extend the group.
                    current_answers.append(ans_text.strip())
                elif letter == "A":
                    # A new group starts before the previous one was complete:
                    # drop the partial group and start over from this line.
                    current_answers = [ans_text.strip()]
                else:
                    # Out-of-sequence letter (e.g. a sentence line that happens
                    # to start with "C. "): it is not part of the current group.
                    continue

                if len(current_answers) == len(option_letters):
                    answers_data.append(current_answers)
                    current_answers = []  # Reset for the next question

    return pd.DataFrame(
        answers_data, columns=["option_A", "option_B", "option_C", "option_D"]
    )

In [5]:
import fitz  # PyMuPDF
import pandas as pd
import numpy as np

def _mean_rgb_around(page, rect, padding):
    """Render the area around ``rect`` (grown by ``padding``) and return its mean color per channel."""
    # Grow the label's bounding box so the surrounding highlight is sampled too.
    clip = fitz.Rect(
        rect.x0 - padding, rect.y0 - padding, rect.x1 + padding, rect.y1 + padding
    )
    pixmap = page.get_pixmap(clip=clip)
    img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
        pixmap.h, pixmap.w, pixmap.n
    )
    return img.mean(axis=(0, 1))


def extract_correct_answers(pdf_path, yellow_threshold=300, padding=5, verbose=True):
    """Guess the marked answer of each question from the color around its option labels.

    On every page the strings ``"A. "``, ``"B. "``, ``"C. "`` and ``"D. "`` are
    located with ``page.search_for``; the i-th hit of each letter is taken as
    the i-th question of that page. The area around each hit is rendered and
    averaged per channel, and the answer is chosen as follows:

    1. Among options whose yellow score (channel 0 + channel 1 - channel 2)
       is at least ``yellow_threshold``, the highest score wins.
    2. If no option reaches the threshold, the first option whose three
       channel means are exactly equal is used.
    3. Otherwise the answer is ``None``.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file, passed to ``fitz.open``.
    yellow_threshold : float, optional
        Minimum yellow score for an option to count as highlighted.
    padding : float, optional
        Amount added on every side of a label's bounding box before rendering.
    verbose : bool, optional
        When true (default) the per-option scores and the selected answer are
        printed, as the function always did.

    Returns
    -------
    pandas.DataFrame
        One row per detected question, in page order, in a single
        ``correct_answer`` column holding ``"A"``-``"D"`` or ``None``.
    """
    # NOTE(review): assumes the rendered pixmap channels are ordered R, G, B
    # (no alpha) -- confirm against the PyMuPDF version in use.
    # NOTE(review): search_for returns every occurrence on the page; a hit that
    # is not an option label would shift the index pairing -- verify on the
    # target PDFs.
    letters = ("A", "B", "C", "D")
    correct_answers = []

    # The context manager closes the document even if rendering raises.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            answer_positions = {
                letter: page.search_for(f"{letter}. ") for letter in letters
            }
            # Only indices present for all four letters are treated as questions.
            num_questions = min(len(hits) for hits in answer_positions.values())

            for idx in range(num_questions):
                best_answer = None
                best_yellow_score = 0  # Highest yellow score seen so far

                for answer_letter in letters:
                    rect = answer_positions[answer_letter][idx]
                    mean_color = _mean_rgb_around(page, rect, padding)

                    # 0 means the three channel means are identical.
                    uniform_score = (
                        np.abs(mean_color[0] - mean_color[1])
                        + np.abs(mean_color[0] - mean_color[2])
                    )
                    if uniform_score == 0:
                        # Plain int so the debug line keeps printing "0".
                        uniform_score = 0

                    # High channel 0 + channel 1 and low channel 2.
                    yellow_score = mean_color[0] + mean_color[1] - mean_color[2]

                    if verbose:
                        print(f"Page {page_num}, Q{idx+1}, {answer_letter}: Mean Color {mean_color}, "
                              f"Uniform Score: {uniform_score}, Yellow Score: {yellow_score}")

                    if yellow_score >= yellow_threshold:
                        # A highlighted option always overrides the fallback.
                        if yellow_score > best_yellow_score:
                            best_yellow_score = yellow_score
                            best_answer = answer_letter
                    elif uniform_score == 0 and not best_answer:
                        # Fallback: first perfectly uniform option, only while
                        # nothing else has been selected.
                        best_answer = answer_letter

                if verbose:
                    print(f"Selected Answer for Q{idx+1}: {best_answer}\n")
                correct_answers.append(best_answer)

    return pd.DataFrame(correct_answers, columns=["correct_answer"])


In [6]:
# Run the three extraction passes over the same PDF. Each call opens the file
# on its own and returns a DataFrame.
questions_df = extract_questions(pdf_file)
answers_df = extract_answers(pdf_file)
correct_answers_df = extract_correct_answers(pdf_file)

Page 1, Q1, A: Mean Color [236.3531746 236.3531746 236.3531746], Uniform Score: 0, Yellow Score: 236.3531746031746
Page 1, Q1, B: Mean Color [232.56332703 232.56332703 232.56332703], Uniform Score: 0, Yellow Score: 232.5633270321361
Page 1, Q1, C: Mean Color [223.55253623 221.0942029  182.75543478], Uniform Score: 43.25543478260872, Yellow Score: 261.89130434782606
Page 1, Q1, D: Mean Color [229.98676749 224.51606805 138.55009452], Uniform Score: 96.90737240075615, Yellow Score: 315.95274102079395
Selected Answer for Q1: D

Page 1, Q2, A: Mean Color [224.60507246 224.60507246 224.60507246], Uniform Score: 0, Yellow Score: 224.60507246376812
Page 1, Q2, B: Mean Color [229.19746377 224.01449275 142.68297101], Uniform Score: 91.69746376811594, Yellow Score: 310.5289855072464
Page 1, Q2, C: Mean Color [232.76181474 232.76181474 232.76181474], Uniform Score: 0, Yellow Score: 232.7618147448015
Page 1, Q2, D: Mean Color [230.9692029 230.9692029 230.9692029], Uniform Score: 0, Yellow Score: 23

In [7]:
# Inspect the extracted question stems (one "question" column).
questions_df

Unnamed: 0,question
0,"Niño de 6 años, presenta hace 7 días rinorrea ..."
1,"Varón de 77 años, fumador, acude por presentar..."
2,"Mujer de 46 años, acude por dolor abdominal, n..."
3,"Primigesta de 38 semanas, acude por presentar ..."
4,En el ámbito sanitario de una red integrada de...
...,...
95,"Varón de 76 años, en reposo post operatorio de..."
96,"Niño de 2 años previamente sano, es traído a e..."
97,"Mujer de 51 años, cursa con recidiva de cáncer..."
98,Escolar de 12 años con derivación ventrículo p...


In [8]:
# Inspect the extracted option texts (columns option_A .. option_D).
answers_df

Unnamed: 0,option_A,option_B,option_C,option_D
0,Microbiología,Tomografía,Serología,Radiografía
1,Normal,Obstructivo,Restrictivo,Mixto
2,Severa,Moderada,Grave,Leve
3,Evolución espontánea de parto,Cesárea a las 40 semanas,Terminar la gestación,Control en una semana con doppler
4,Implementar nuevo servicio de cuidados intensivos,Incrementar el requerimiento de medicamentos e...,Solicitar contrato de personal especializado,Acción intersectorial y abordaje de los determ...
...,...,...,...,...
95,Dímero D,Eco doppler de miembros inferiores,Gammagrafía pulmonar,Angiotomografía pulmonar
96,Bronquiolitis,Cuerpo extraño en vía aérea,Crisis asmática,Laringotraqueítis aguda
97,Gloméruloesclerosis,Glomérulonefritis,Nefritis túbulointersticial aguda,Daño tubular agudo
98,Meningitis,Hemorragia intracraneal,Crisis de migraña,Disfunción valvular


In [9]:
# Inspect the answer letter selected for each question.
correct_answers_df

Unnamed: 0,correct_answer
0,D
1,B
2,B
3,C
4,D
...,...
95,D
96,B
97,C
98,D


In [10]:
# The three frames are paired purely by row position. A different row count
# means at least one extractor missed or over-detected an item, so the pairing
# can no longer be trusted; pd.concat(axis=1) would hide that by padding the
# shorter frames with NaN. Fail loudly instead of building a misaligned table.
row_counts = {
    "questions": len(questions_df),
    "answers": len(answers_df),
    "correct_answers": len(correct_answers_df),
}
if len(set(row_counts.values())) != 1:
    raise ValueError(f"Extracted tables have different row counts: {row_counts}")

# ignore_index=True on axis=1 discards the original column labels, so they are
# assigned explicitly afterwards.
total_df = pd.concat([questions_df, answers_df, correct_answers_df], ignore_index = True, axis = 1)
total_df.columns = ["questions", "option_A", "option_B", "option_C", "option_D", "correct_answer"]
total_df

Unnamed: 0,questions,option_A,option_B,option_C,option_D,correct_answer
0,"Niño de 6 años, presenta hace 7 días rinorrea ...",Microbiología,Tomografía,Serología,Radiografía,D
1,"Varón de 77 años, fumador, acude por presentar...",Normal,Obstructivo,Restrictivo,Mixto,B
2,"Mujer de 46 años, acude por dolor abdominal, n...",Severa,Moderada,Grave,Leve,B
3,"Primigesta de 38 semanas, acude por presentar ...",Evolución espontánea de parto,Cesárea a las 40 semanas,Terminar la gestación,Control en una semana con doppler,C
4,En el ámbito sanitario de una red integrada de...,Implementar nuevo servicio de cuidados intensivos,Incrementar el requerimiento de medicamentos e...,Solicitar contrato de personal especializado,Acción intersectorial y abordaje de los determ...,D
...,...,...,...,...,...,...
95,"Varón de 76 años, en reposo post operatorio de...",Dímero D,Eco doppler de miembros inferiores,Gammagrafía pulmonar,Angiotomografía pulmonar,D
96,"Niño de 2 años previamente sano, es traído a e...",Bronquiolitis,Cuerpo extraño en vía aérea,Crisis asmática,Laringotraqueítis aguda,B
97,"Mujer de 51 años, cursa con recidiva de cáncer...",Gloméruloesclerosis,Glomérulonefritis,Nefritis túbulointersticial aguda,Daño tubular agudo,C
98,Escolar de 12 años con derivación ventrículo p...,Meningitis,Hemorragia intracraneal,Crisis de migraña,Disfunción valvular,D


In [11]:
# The CSV is written next to the source PDF and reuses its base name.
folder_path = os.path.split(pdf_file)[0]  # Directory that holds the PDF
file_name = os.path.splitext(os.path.split(pdf_file)[1])[0]  # PDF name without ".pdf"
csv_file = os.path.join(folder_path, f"{file_name}.csv")

# Write the combined questions/options/answers table without the row index.
# "utf-8-sig" prefixes the file with a BOM -- presumably so spreadsheet tools
# detect the encoding of the accented text.
total_df.to_csv(csv_file, encoding="utf-8-sig", index=False)
print(f"CSV saved at: {csv_file}")


CSV saved at: /Users/rodrigocarrillo/Library/CloudStorage/OneDrive-EmoryUniversity/Natural Language Processing Projects/Examen Residentado Peru/CONAREME/Year2024/ESPECIALIADAD A.csv
