In [2]:
!pip install PyMuPDF --quiet

In [3]:
# Import PyMuPDF library you've just installed (`fitz`) to work with PDF files
import fitz

In [4]:
# Path to the PDF that will be read and anonymized; change it to point at any PDF you have uploaded.
pdf_path = "../raw_data/document_to_anonymize.pdf"

In [5]:
# Define a function to extract all text from a PDF file. It reads every page and returns the combined text.

def extract_text_from_pdf(path: str) -> str:
    """
    Read a PDF and return the text of all of its pages as a single string.

    Parameters:
        path (str): Location of the PDF document to open.

    Returns:
        str: The text of every page, concatenated in the order the
        document yields its pages (empty string if there are none).
    """
    # The context manager closes the document as soon as the pages are read.
    with fitz.open(path) as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)

In [6]:
# Define a function that removes accents, punctuation, and other special characters from the text, producing a plain, normalized version of it

import unicodedata
import re

def remove_all_special_characters(text: str) -> str:
    """
    Normalizes and cleans a text string by removing accents, punctuation, and special characters.

    Steps:
        1. Applies NFKD normalization: accented letters are split into a base
           letter plus combining marks, and compatibility characters (for
           example the "fi" ligature U+FB01 or the non-breaking space U+00A0)
           are replaced by their plain equivalents.
        2. Drops every code point that is still not ASCII (combining marks,
           and letters with no decomposition).
        3. Removes all characters except ASCII letters, digits, and whitespace.
        4. Collapses each run of whitespace into a single space and trims
           both ends.

    Parameters:
        text (str): The input string to be cleaned.

    Returns:
        str: The cleaned and normalized string.
    """
    # NFKD rather than NFD: with NFD a non-breaking space or a ligature stays
    # non-ASCII and is deleted by the next step, silently gluing words together
    # or losing letters. Both are common in text extracted from PDFs.
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")

    alphanumeric = re.sub(r"[^a-zA-Z0-9\s]", "", ascii_only)
    single_spaced = re.sub(r"\s+", " ", alphanumeric)

    return single_spaced.strip()

In [7]:
# 🚀 Run this cell to extract the text from your chosen PDF!
# `raw_text` holds the text of every page and is later appended to the anonymization prompt.

raw_text = extract_text_from_pdf(pdf_path)

In [None]:
import os

import requests
 
# Read the API key from the GEMINI_API_KEY environment variable so that no
# credential is stored in (or shared with) the notebook. Set it before starting
# Jupyter, e.g. `export GEMINI_API_KEY=...`.
# NOTE: the key that was previously hardcoded here has been exposed with the
# notebook and should be revoked.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
if not API_KEY:
    # Warn instead of raising so the remaining definitions in this cell still load.
    print("WARNING: GEMINI_API_KEY is not set; requests to the API will be rejected.")
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemma-3-12b-it:generateContent?key={API_KEY}"
 
def generate_content(prompt_text: str, temperature: float, timeout: float = 120.0) -> dict:
    """Sends a prompt to the generateContent endpoint at ``API_URL`` and returns the decoded reply.

    Args:
        prompt_text (str): The text prompt to generate content from.
        temperature (float): The temperature parameter for controlling randomness;
            forwarded in ``generationConfig``.
        timeout (float): Value passed to ``requests.post`` as ``timeout`` (seconds).
            Without it the call could wait on an unresponsive server indefinitely.

    Returns:
        dict: The JSON body of the HTTP response as decoded by ``response.json()``.
        The HTTP status code is not checked here, so an error payload sent by
        the server is returned to the caller as-is.

    Raises:
        requests.exceptions.RequestException: If the request cannot be completed,
            including when the timeout expires.
    """
    headers = {
        "Content-Type": "application/json"
    }

    generation_config = {
        "temperature": temperature,
        "topK": 1,
    }
    body = {
        "contents": [
            {
                "parts": [
                    {"text": prompt_text}
                ]
            }
        ],
        "generationConfig": generation_config,
    }

    response = requests.post(API_URL, headers=headers, json=body, timeout=timeout)

    # Raises if the body is not valid JSON.
    return response.json()
 
def read_text_file(file_path: str) -> str:
    """Return the full contents of a UTF-8 encoded text file.

    Args:
        file_path (str): Location of the file to read.

    Returns:
        str: Everything in the file, decoded as UTF-8.
    """
    with open(file_path, encoding='utf-8') as handle:
        contents = handle.read()
    return contents
 
# Anonymization instructions sent to the model ahead of the document text.
# Each list entry is one line of the prompt; the lines are joined with newlines
# and the prompt ends with a trailing newline.
prompt = "\n".join([
    "You are a specialist in text and document anonymization. Your task is to analyze the document that will be submitted and remove any sensitive or personal information contained within it.",
    "Your primary responsibility is to ensure that no individual can be identified from the anonymized document.",
    "## Information that needs to be removed includes, but is not limited to:",
    "- Names of individuals (e.g., Ana Correira)",
    "- Phone numbers (e.g., 1231242312)",
    "- Addresses (e.g., Example Street, 123, City)",
    "- Government data that can identify an individual, alone or in combination",
    "- Specific employment details that may identify the individual (e.g., Company Name)",
    "- Any other information that could indirectly identify an individual.",
    # Rule for what must be kept, and the replacement token.
    "## Anything that is not considered sensitive MUST NOT be removed and should remain intact. All words or information removed should be replaced with an asterisk (*).",
    # Worked examples of the expected substitutions.
    "## Examples of anonymization:",
    "- Ana Correira -> * *",
    "- 1231242312 -> *",
    "- Example Street, 123, City -> * * * *",
    # Output formatting requirements.
    "Additionally, ensure that the formatting of the response remains identical to the original document, without any blank lines.",
    "Make sure that the resulting text does not contain any information that could identify individuals in any form or sensitive information about individuals.",
]) + "\n"
 
# Combine the anonymization instructions with the extracted document text and query the model.
complete_prompt = prompt + raw_text
output = generate_content(complete_prompt, 0.0)

# A failed request carries no candidates; report it instead of silently printing an empty string.
# NOTE(review): assumes failures come back as {"error": {...}} — confirm against the API reference.
if "error" in output:
    print(f"API request failed: {output['error']}")

# Get only the response text. `or [{}]` also covers a key that is present but holds an
# empty list, which would otherwise raise IndexError on `[0]`.
candidates = output.get('candidates') or [{}]
parts = candidates[0].get('content', {}).get('parts') or [{}]
# Join every part rather than only the first, so a multi-part reply is not truncated.
response_text = "".join(part.get('text', '') for part in parts)

print(response_text)

Relatório de Admissão - Centro Médico Lisboa 
Data: 15 de abril de 2025 
Referência: ADM-2025-04-15-089 
Informações do Paciente: 
Nome: * * * * 
Data de Nascimento: 12/03/1978 
Sexo: Feminino 
NIF: * * * * * 
Cartão de Cidadão: * * * * * 
Morada: * * * * * 
Telefone: +* 
Email: * * * * 
Número da Segurança Social: * * * * * * * 
Histórico Médico: 
A paciente * *, mulher caucasiana de 47 anos, compareceu à consulta relatando dores 
abdominais intensas. Tem histórico de hipertensão e diabetes tipo 2, diagnosticada há 5 anos. É 
HIV positivo desde 2018, atualmente com carga viral indetectável graças ao tratamento com 
antirretrovirais. 
A paciente relatou que sua família tem histórico de cancro da mama (mãe falecida aos 52 anos) 
e doença cardíaca (pai e avô paterno). Exames genéticos realizados em 2022 indicaram 
predisposição ao cancro de mama (mutação BRCA1 positiva). 
Informações Sociais e Comportamentais: 
Estado civil: Divorciada 
Filhos: 2 (*, 15 anos e *, 12 anos) 
Religião: Cató