In [None]:
import pandas as pd
from faker import Faker
from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
from transformers import pipeline

In [None]:
# Location of the pickled dataset, relative to this notebook's working directory.
file_path = "../../data/adhd-beliefs-pt/adhd-beliefs-pt-cleaned.pkl"
# NOTE: unpickling executes arbitrary code; only load pickles from a trusted source.
df = pd.read_pickle(filepath_or_buffer=file_path)
# Preview the first rows as the cell output.
df.head()

In [None]:
# Keep only these four columns of the frame; everything else is dropped.
text_columns = [
    "special_interest",
    "diary_entry",
    "selfdefining_memory",
    "empty_sheet",
]
df = df[text_columns]
df

In [None]:
# Faker instance localised to European Portuguese; used to generate substitute values.
faker = Faker(locale="pt_PT")

In [None]:
# HuggingFace token-classification pipeline; "simple" aggregation merges word
# pieces into whole entity spans exposed under the "entity_group" key.
ner_pipeline = pipeline(
    task="ner",
    model="pucpr/roberta-ner-portuguese",
    aggregation_strategy="simple",
)

In [None]:
# Custom recognizer using HuggingFace pipeline
class HFPortugueseNERRecognizer(EntityRecognizer):
    """Presidio recognizer that delegates detection to the notebook-level ``ner_pipeline``.

    The recognizer advertises the entity types ``PERSON``, ``ORG`` and ``LOC`` and
    converts each span returned by the HuggingFace pipeline into a
    ``RecognizerResult``.

    Args:
        supported_language: Language code the recognizer is registered for.
            Defaults to ``"pt"`` so the recognizer is actually selected when the
            analyzer is called with ``language="pt"`` (Presidio's own default
            for recognizers is ``"en"``).
    """

    # NOTE(review): the label set emitted by the model is not visible in this
    # notebook. If it uses the CoNLL-style "PER" tag, it would never match the
    # advertised "PERSON" entity, so that alias is normalised here. Labels that
    # already match pass through unchanged. Confirm against the model's id2label.
    LABEL_ALIASES = {"PER": "PERSON"}

    def __init__(self, supported_language="pt"):
        super().__init__(
            supported_entities=["PERSON", "ORG", "LOC"],
            name="HFPortugueseNERRecognizer",
            supported_language=supported_language,
        )

    def load(self):
        """No-op: the model is owned by the module-level ``ner_pipeline``."""

    def analyze(self, text, entities, nlp_artifacts=None):
        """Run the HuggingFace pipeline on ``text`` and keep the requested entities.

        Args:
            text: The text to scan.
            entities: Entity type names the caller is interested in.
            nlp_artifacts: Unused; accepted for compatibility with the
                recognizer interface.

        Returns:
            A list of ``RecognizerResult`` objects, one per pipeline span whose
            (normalised) label is in ``entities``. Empty when ``text`` or
            ``entities`` is empty.
        """
        if not text or not entities:
            return []

        results = []
        for item in ner_pipeline(text):
            label = item["entity_group"]
            entity = self.LABEL_ALIASES.get(label, label)
            if entity not in entities:
                continue
            results.append(
                RecognizerResult(
                    entity_type=entity,
                    start=item["start"],
                    end=item["end"],
                    # Cast to a plain float so results stay JSON-serialisable
                    # regardless of the numeric type the pipeline returns.
                    score=float(item["score"]),
                )
            )
        return results

In [None]:
def faker_replacement(entity_type):
    """Return a substitute value for an entity of the given type.

    Args:
        entity_type: Entity type name, e.g. ``"PERSON"`` or ``"EMAIL_ADDRESS"``.

    Returns:
        A value produced by the module-level ``faker`` instance for the known
        types (``PERSON``, ``ORG``, ``LOC``, ``EMAIL_ADDRESS``), or the literal
        ``"[REDACTED]"`` for any other type.
    """
    # (entity type, name of the Faker method that generates its replacement)
    generators = (
        ("PERSON", "name"),
        ("ORG", "company"),
        ("LOC", "city"),
        ("EMAIL_ADDRESS", "email"),
    )
    for label, provider_method in generators:
        if entity_type == label:
            # Look the generator up lazily so unknown types never touch `faker`.
            return getattr(faker, provider_method)()
    return "[REDACTED]"

In [None]:
# Synthetic sample (not real data) containing a person, an organisation,
# a location and an e-mail address, used to exercise the pipeline.
text = (
    "\n"
    "Meu nome é Carla Souza e trabalho na Fiocruz. Moro em Fortaleza.\n"
    "Meu e-mail é carla.souza@exemplo.com.\n"
)

In [None]:
# Step 1: Analyzer with custom NER
# A bare AnalyzerEngine() is set up for English only, so analyze(language="pt")
# cannot run against it. Configure a Portuguese spaCy engine explicitly.
# Requires the spaCy model: `python -m spacy download pt_core_news_sm`.
nlp_engine = NlpEngineProvider(
    nlp_configuration={
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "pt", "model_name": "pt_core_news_sm"}],
    }
).create_engine()
analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["pt"])

# Recognizers are selected by language; make sure the custom one is registered
# for Portuguese (Presidio's default recognizer language is "en").
hf_recognizer = HFPortugueseNERRecognizer()
hf_recognizer.supported_language = "pt"
analyzer.registry.add_recognizer(hf_recognizer)

results = analyzer.analyze(text=text, language="pt")

In [None]:
# Step 2: Build fake replacement instructions
# NOTE(review): AnonymizerResult is unused in the visible cells; confirm the
# installed presidio-anonymizer still exports it, otherwise this import fails.
from presidio_anonymizer.entities import AnonymizerResult, OperatorConfig

# Operators are keyed by entity type, so a static "replace" value would give
# every entity of that type the same fake value (two different people would
# collapse into one name). The "custom" operator takes a callable that is
# invoked per occurrence, so each detected span gets its own freshly generated
# replacement.
detected_types = sorted({res.entity_type for res in results})
operators = {
    entity_type: OperatorConfig(
        "custom",
        {
            # Bind entity_type as a default argument to avoid late binding.
            "lambda": lambda _original, entity_type=entity_type: faker_replacement(
                entity_type
            )
        },
    )
    for entity_type in detected_types
}

In [None]:
# Step 3: Anonymize
# Apply the per-entity-type operators to the spans found by the analyzer.
anonymizer = AnonymizerEngine()
anonymize_args = dict(text=text, analyzer_results=results, operators=operators)
anonymized = anonymizer.anonymize(**anonymize_args)

In [None]:
# Output
# Show the input and the anonymized result one after the other for comparison.
for heading, content in (
    ("Original:\n", text),
    ("\nAnonymized:\n", anonymized.text),
):
    print(heading, content)