In [1]:
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
import re
import os

In [2]:
# Input exam PDF that every extractor below reads.
# NOTE(review): this is an absolute path on one user's machine; point it at
# your own copy of the file before running the notebook.
pdf_file = '/Users/rodrigocarrillo/Library/CloudStorage/OneDrive-EmoryUniversity/Natural Language Processing Projects/Examen Residentado Peru/CONAREME/Year2023/PRUEBA A.pdf'

In [3]:
def extract_questions(pdf_path):
    """Extract the question stems from an exam PDF.

    The plain text of each page is scanned for a question number followed by
    a period and whitespace (e.g. ``"12. "``); the text from there up to the
    next ``"A."`` marker is kept as the question stem.  Matching is done one
    page at a time, so a stem whose ``"A."`` marker falls on the following
    page is not matched.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF, passed to ``fitz.open``.

    Returns
    -------
    pandas.DataFrame
        One row per matched stem (whitespace-stripped), in page order, in a
        single column named ``"question"``.
    """
    # "<number>. <stem>" matched lazily up to (not including) the next "A.".
    # DOTALL lets a stem run over several lines; a trailing "?" is optional.
    question_pattern = re.compile(r"(\d+)\.\s(.*?\??)\s*(?=A\.)", re.DOTALL)

    questions_data = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text("text")
            # findall yields (number, stem) tuples; only the stem is kept.
            questions_data.extend(
                stem.strip() for _number, stem in question_pattern.findall(text)
            )

    return pd.DataFrame(questions_data, columns=["question"])

In [4]:
def extract_answers(pdf_path):
    """Extract the answer options of every question from an exam PDF.

    Each page's plain text is split into lines, and every line that starts
    with ``"A. "``, ``"B. "``, ``"C. "`` or ``"D. "`` contributes its
    remaining text as one option.  Options are grouped purely by count: every
    four consecutive matches on a page become one row, whatever their
    letters.  Fewer than four leftover options at the end of a page are
    discarded, because grouping restarts on each page.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF, passed to ``fitz.open``.

    Returns
    -------
    pandas.DataFrame
        One row per group of four options, with columns ``"option_A"``,
        ``"option_B"``, ``"option_C"`` and ``"option_D"``.
    """
    # An option line: a letter A-D, a period, whitespace, then the option text.
    answer_pattern = re.compile(r"^(A|B|C|D)\.\s(.+)", re.MULTILINE)

    answers_data = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            lines = page.get_text("text").split("\n")

            current_answers = []  # options collected for the question in progress
            for line in lines:
                answer_match = answer_pattern.match(line)
                if not answer_match:
                    continue

                current_answers.append(answer_match.group(2).strip())

                # Four options complete one question: emit the row, start over.
                if len(current_answers) == 4:
                    answers_data.append(current_answers)
                    current_answers = []

    return pd.DataFrame(
        answers_data, columns=["option_A", "option_B", "option_C", "option_D"]
    )

In [5]:
import fitz  # PyMuPDF
import pandas as pd
import numpy as np

def extract_correct_answers(pdf_path, yellow_threshold=300, verbose=True):
    """Infer the marked option of each question from the colour around it.

    On every page the positions of the strings ``"A. "``, ``"B. "``,
    ``"C. "`` and ``"D. "`` are looked up with ``page.search_for``.  The i-th
    hit of each letter is treated as belonging to the i-th question on that
    page, and the number of questions on a page is the smallest hit count
    among the four letters.  For each option, the area around its marker
    (expanded by 5 units on every side) is rendered and its per-channel mean
    colour is computed.

    Selection rule per question:

    * among options whose yellow score (``R + G - B`` of the mean colour)
      reaches ``yellow_threshold``, the highest score wins (first one on a
      tie);
    * otherwise the first option whose three channel means are exactly equal
      is chosen;
    * otherwise the answer is ``None``.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF, passed to ``fitz.open``.
    yellow_threshold : float, optional
        Minimum yellow score for an option to count as highlighted.
        Defaults to 300, the value previously hard-coded.
    verbose : bool, optional
        When true (the default), print the per-option scores and the selected
        answer for every question.

    Returns
    -------
    pandas.DataFrame
        One row per detected question, in page order, with a single column
        ``"correct_answer"`` holding ``"A"``-``"D"`` or ``None``.
    """
    option_letters = ("A", "B", "C", "D")
    correct_answers = []

    # The context manager closes the document even if rendering raises.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            # NOTE(review): PyMuPDF documents search_for as case-insensitive
            # and it is not anchored to line starts, so text such as "...a. "
            # inside a sentence may also be returned and shift the pairing of
            # hits to questions -- verify against the PDF.
            answer_positions = {
                letter: page.search_for(f"{letter}. ") for letter in option_letters
            }

            # Only indices present for all four letters can form a question.
            num_questions = min(len(rects) for rects in answer_positions.values())

            for idx in range(num_questions):
                best_answer = None
                best_yellow_score = 0  # highest yellow score seen so far

                for answer_letter in option_letters:
                    rect = answer_positions[answer_letter][idx]

                    # Grow the box so the surrounding highlight is sampled too.
                    expanded_rect = fitz.Rect(
                        rect.x0 - 5, rect.y0 - 5, rect.x1 + 5, rect.y1 + 5
                    )

                    pixmap = page.get_pixmap(clip=expanded_rect)
                    img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                        pixmap.h, pixmap.w, pixmap.n
                    )

                    # Per-channel mean; the first three channels are assumed
                    # to be R, G, B -- TODO confirm for non-default pixmaps.
                    mean_color = img.mean(axis=(0, 1))

                    # 0 when the three channel means are equal (grey/white).
                    uniform_score = (
                        np.abs(mean_color[0] - mean_color[1])
                        + np.abs(mean_color[0] - mean_color[2])
                    )
                    is_uniform = uniform_score == 0
                    if is_uniform:
                        # Plain int so the debug line keeps printing "0".
                        uniform_score = 0

                    # High red + green with low blue reads as yellow.
                    yellow_score = mean_color[0] + mean_color[1] - mean_color[2]

                    if verbose:
                        print(f"Page {page_num}, Q{idx+1}, {answer_letter}: Mean Color {mean_color}, "
                              f"Uniform Score: {uniform_score}, Yellow Score: {yellow_score}")

                    if yellow_score >= yellow_threshold:
                        # Highlighted option: keep the most yellow one.
                        if yellow_score > best_yellow_score:
                            best_yellow_score = yellow_score
                            best_answer = answer_letter
                    elif is_uniform and best_answer is None:
                        # Fallback until a highlighted option is found.
                        best_answer = answer_letter

                if verbose:
                    print(f"Selected Answer for Q{idx+1}: {best_answer}\n")
                correct_answers.append(best_answer)

    return pd.DataFrame(correct_answers, columns=["correct_answer"])


In [6]:
# Run the three extractors over the same PDF; each returns a DataFrame that
# is expected to hold one row per question.
questions_df = extract_questions(pdf_file)
answers_df = extract_answers(pdf_file)
correct_answers_df = extract_correct_answers(pdf_file)

Page 1, Q1, A: Mean Color [228.96230159 228.96230159 228.96230159], Uniform Score: 0, Yellow Score: 228.9623015873016
Page 1, Q1, B: Mean Color [229.27410208 229.27410208 229.27410208], Uniform Score: 0, Yellow Score: 229.27410207939508
Page 1, Q1, C: Mean Color [234.86956522 232.29891304 192.17028986], Uniform Score: 45.269927536231904, Yellow Score: 274.9981884057971
Page 1, Q1, D: Mean Color [234.63137996 228.95085066 139.71266541], Uniform Score: 100.59924385633272, Yellow Score: 323.8695652173913
Selected Answer for Q1: D

Page 1, Q2, A: Mean Color [226.63224638 226.63224638 226.63224638], Uniform Score: 0, Yellow Score: 226.6322463768116
Page 1, Q2, B: Mean Color [230.87334594 230.87334594 230.87334594], Uniform Score: 0, Yellow Score: 230.8733459357278
Page 1, Q2, C: Mean Color [226.94746377 221.48550725 134.80797101], Uniform Score: 97.60144927536231, Yellow Score: 313.625
Page 1, Q2, D: Mean Color [235.99243856 233.56521739 195.00567108], Uniform Score: 43.41398865784498, Yell

In [7]:
# Inspect the extracted question stems (one row per matched question).
questions_df

Unnamed: 0,question
0,"Varón de 26 años, sufre accidente automovilíst..."
1,Niño de 2 años con fiebre y otalgia izquierda ...
2,"Varón de 25 años, quien ingiere sustancia que ..."
3,"Lactante de 8 meses, presenta lesiones cutánea..."
4,Respecto a la evaluación del bienestar del fet...
...,...
95,En una episiotomía medio lateral. ¿Qué estruct...
96,Niña de 8 años traída a consulta por talla baj...
97,Adolescente de 14 años acude por presentar pru...
98,"Mujer de 35 años, ingresa por vómitos, dolor a..."


In [8]:
# Inspect the extracted options (columns option_A .. option_D).
answers_df

Unnamed: 0,option_A,option_B,option_C,option_D
0,Lesión primaria de SRAA,Disfunción cerebral difusa,Inconsciencia psicógena,Lesión hemisférica con herniación troncal
1,20,5,10,7
2,Etanol,Flumazenilo,Atropina,Adrenalina
3,Ictiosis vulgar,Dermatitis seborreica,Dermatitis atópica,Eccema numular
4,Test estresante,Líquido amniótico,Fllujometría Doppler,Edad gestacional
...,...,...,...,...
95,Músculo iliococcigeo,Músculo isquiocavernoso,Cuerpo perineal,Músculo bulboesponjoso
96,Turner XO,Turner XX,Noonan XX,Síndrome de X frágil
97,Clindamicina VO + crema antibiótica,Doxiciclina VO + crema antibiótica,Fluconazol dosis única + crema antimicótica,Hidrocortisona tópica al 1%
98,Insulinoterapia,Bicarbonato de sodio,Solución isotónica,Agua destilada


In [9]:
# Inspect the answer letter selected for each question.
correct_answers_df

Unnamed: 0,correct_answer
0,D
1,C
2,A
3,C
4,B
...,...
95,D
96,A
97,A
98,B


In [10]:
# The three frames are joined side by side purely by row position, so they
# must describe the same questions in the same order.  Check the row counts
# first: with different lengths pd.concat would silently pad with NaN and
# pair questions with the wrong options/answers.
frame_lengths = {
    "questions": len(questions_df),
    "answers": len(answers_df),
    "correct_answers": len(correct_answers_df),
}
if len(set(frame_lengths.values())) != 1:
    raise ValueError(
        f"Extractors returned different row counts, cannot align rows: {frame_lengths}"
    )

total_df = pd.concat([questions_df, answers_df, correct_answers_df], ignore_index = True, axis = 1)
# ignore_index=True with axis=1 replaces the column labels with 0..5, so
# restore descriptive names here.
total_df.columns = ["questions", "option_A", "option_B", "option_C", "option_D", "correct_answer"]
total_df

Unnamed: 0,questions,option_A,option_B,option_C,option_D,correct_answer
0,"Varón de 26 años, sufre accidente automovilíst...",Lesión primaria de SRAA,Disfunción cerebral difusa,Inconsciencia psicógena,Lesión hemisférica con herniación troncal,D
1,Niño de 2 años con fiebre y otalgia izquierda ...,20,5,10,7,C
2,"Varón de 25 años, quien ingiere sustancia que ...",Etanol,Flumazenilo,Atropina,Adrenalina,A
3,"Lactante de 8 meses, presenta lesiones cutánea...",Ictiosis vulgar,Dermatitis seborreica,Dermatitis atópica,Eccema numular,C
4,Respecto a la evaluación del bienestar del fet...,Test estresante,Líquido amniótico,Fllujometría Doppler,Edad gestacional,B
...,...,...,...,...,...,...
95,En una episiotomía medio lateral. ¿Qué estruct...,Músculo iliococcigeo,Músculo isquiocavernoso,Cuerpo perineal,Músculo bulboesponjoso,D
96,Niña de 8 años traída a consulta por talla baj...,Turner XO,Turner XX,Noonan XX,Síndrome de X frágil,A
97,Adolescente de 14 años acude por presentar pru...,Clindamicina VO + crema antibiótica,Doxiciclina VO + crema antibiótica,Fluconazol dosis única + crema antimicótica,Hidrocortisona tópica al 1%,A
98,"Mujer de 35 años, ingresa por vómitos, dolor a...",Insulinoterapia,Bicarbonato de sodio,Solución isotónica,Agua destilada,B


In [11]:
# Split the PDF path into its directory and its file name ("<name>.pdf") ...
folder_path, file_name = os.path.split(pdf_file)

# ... then drop the extension so only the base name remains.
file_name = os.path.splitext(file_name)[0]

# The CSV is written next to the PDF, under the same base name.
csv_file = os.path.join(folder_path, file_name + ".csv")

# Save the combined questions/options/answers table (UTF-8 with BOM).
total_df.to_csv(csv_file, index=False, encoding="utf-8-sig")
print(f"CSV saved at: {csv_file}")


CSV saved at: /Users/rodrigocarrillo/Library/CloudStorage/OneDrive-EmoryUniversity/Natural Language Processing Projects/Examen Residentado Peru/CONAREME/Year2023/PRUEBA A.csv
