In [None]:
import pandas as pd
from faker import Faker
from huggingface_hub import snapshot_download
from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NerModelConfiguration, TransformersNlpEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

## TO-DO

In [None]:
# Read the prepared dataset pickle and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
file_path = "../../data/adhd-beliefs-pt/adhd-beliefs-pt-prepared.pkl"
df = pd.read_pickle(filepath_or_buffer=file_path)
df.head(n=5)

In [None]:
# Keep only these four columns; .copy() makes the result an independent frame
# so later column assignments do not hit pandas' chained-assignment warning.
df = df[
    ["special_interest", "diary_entry", "selfdefining_memory", "empty_sheet"]
].copy()
# Preview the first rows instead of rendering the whole frame.
# NOTE(review): these look like raw participant texts — avoid saving them in
# notebook outputs before they are anonymized.
df.head()

In [None]:
# Faker instance using the European Portuguese locale.
faker = Faker(locale="pt_PT")

In [None]:
# Load the tokenizer and the token-classification weights from one checkpoint.
_ner_checkpoint = "FacebookAI/xlm-roberta-large-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(_ner_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(_ner_checkpoint)

In [None]:
# Token-level NER pipeline built from the model and tokenizer loaded above.
ner_pipeline = pipeline(
    task="ner",
    tokenizer=tokenizer,
    model=model,
)

In [None]:
# Transformer model config
# One entry per language: the spaCy pipeline handles tokenization and
# lemmatization, the transformers checkpoint handles NER.
model_config = [
    {
        "lang_code": "pt",
        "model_name": {
            # spaCy's Portuguese pipelines are published as "pt_core_news_*";
            # there is no "pt_core_web_*" package.
            "spacy": "pt_core_news_lg",  # for tokenization, lemmatization
            "transformers": "FacebookAI/xlm-roberta-large-finetuned-conll03-english",  # for NER
        },
    }
]

In [None]:
# Model label -> Presidio entity name.
# The email entity is spelled "EMAIL_ADDRESS" so it agrees with the
# "EMAIL_ADDRESS" branch in faker_replacement.
mapping = dict(
    PER="PERSON",
    LOC="LOCATION",
    ORG="ORGANIZATION",
    EMAIL="EMAIL_ADDRESS",
)
# Labels dropped before mapping ("O" = token outside any entity).
labels_to_ignore = ["O"]

In [None]:
# Rules for turning raw model labels into Presidio entities.
ner_model_configuration = NerModelConfiguration(
    labels_to_ignore=labels_to_ignore,
    model_to_presidio_entity_mapping=mapping,
    aggregation_strategy="max",  # one of: "simple", "first", "average", "max"
    alignment_mode="expand",  # one of: "strict", "contract", "expand"
)

In [None]:
# NLP engine built from the models listed in model_config.
transformers_nlp_engine = TransformersNlpEngine(
    ner_model_configuration=ner_model_configuration,
    models=model_config,
)

In [None]:
# Analyzer for Portuguese text, driven by the transformers NLP engine.
analyzer = AnalyzerEngine(
    supported_languages=["pt"],
    nlp_engine=transformers_nlp_engine,
)

In [None]:
# Quick look at the raw pipeline output for a sample sentence.
_sample_sentence = (
    "Meu nome é Carla Souza e trabalho na Fiocruz. "
    "Moro em Fortaleza. Tenho 30 anos e gosto de viajar."
)
ner_pipeline(_sample_sentence)

In [None]:
# Custom recognizer using HuggingFace pipeline
class HFPortugueseNERRecognizer(EntityRecognizer):
    """Presidio recognizer that wraps the notebook-level ``ner_pipeline``.

    Reports the raw model labels ``I-PER``, ``I-ORG`` and ``I-LOC`` as entity
    types for Portuguese (``pt``) text.
    """

    def __init__(self):
        # Pass the language to the base class rather than patching the
        # attribute afterwards, so the recognizer is never registered as "en".
        super().__init__(
            supported_entities=["I-PER", "I-ORG", "I-LOC"],
            name="HFPortugueseNERRecognizer",
            supported_language="pt",
        )

    def load(self):
        """Do nothing; the pipeline is created outside this class.

        ``EntityRecognizer`` declares ``load`` as abstract, so it must be
        defined for the class to be instantiable.
        """

    def analyze(self, text, entities, nlp_artifacts=None):
        """Run the NER pipeline on ``text`` and return one result per entity.

        Args:
            text: The string to analyze.
            entities: Entity labels requested by the caller; pipeline items
                whose label is not in this collection are skipped.
            nlp_artifacts: Accepted for interface compatibility; unused.

        Returns:
            A list of ``RecognizerResult``. Consecutive pipeline items that
            carry the same label and are adjacent, or separated only by
            spaces, are merged into a single span whose score is the mean of
            the merged item scores.
        """
        spans = []
        for item in ner_pipeline(text):
            label = item["entity"]
            start, end = item["start"], item["end"]
            # Skip unrequested labels and items without character offsets.
            if label not in entities or start is None or end is None:
                continue
            score = float(item["score"])
            previous = spans[-1] if spans else None
            if (
                previous is not None
                and previous["label"] == label
                and text[previous["end"]:start].strip(" ") == ""
            ):
                # The pipeline emits one item per sub-word token; glue the
                # pieces of one mention back together.
                previous["end"] = max(previous["end"], end)
                previous["scores"].append(score)
            else:
                spans.append(
                    {"label": label, "start": start, "end": end, "scores": [score]}
                )

        return [
            RecognizerResult(
                entity_type=span["label"],
                start=span["start"],
                end=span["end"],
                score=sum(span["scores"]) / len(span["scores"]),
            )
            for span in spans
        ]

In [None]:
def faker_replacement(entity_type):
    """Return a fake replacement string for the given entity type.

    Known types ("I-PER", "I-ORG", "I-LOC", "EMAIL_ADDRESS") are served by the
    notebook-level ``faker`` instance; any other type yields "[REDACTED]".
    """
    # Lambdas keep the faker lookups lazy: only the selected one is evaluated.
    generators = {
        "I-PER": lambda: faker.name(),
        "I-ORG": lambda: faker.company(),
        "I-LOC": lambda: faker.city(),
        "EMAIL_ADDRESS": lambda: faker.email(),
    }
    make_fake = generators.get(entity_type)
    if make_fake is None:
        return "[REDACTED]"
    return make_fake()

In [None]:
text = """
Meu nome é Carla Souza e trabalho na Fiocruz. Moro em Fortaleza.
Meu e-mail é carla.souza@exemplo.com.
"""

In [None]:
# Step 1: Analyzer with custom NER
# A bare AnalyzerEngine() uses Presidio's default English setup, so it cannot
# serve language="pt". Build the engine on the Portuguese-capable NLP engine
# instead, then register the custom recognizer on it.
analyzer = AnalyzerEngine(
    nlp_engine=transformers_nlp_engine, supported_languages=["pt"]
)
analyzer.registry.add_recognizer(HFPortugueseNERRecognizer())
# Ask only for the entity types faker_replacement knows how to fake, so
# results from other recognizers do not overlap the custom ones.
results = analyzer.analyze(
    text=text,
    language="pt",
    entities=["I-PER", "I-ORG", "I-LOC", "EMAIL_ADDRESS"],
)

In [None]:
# Step 2: Build fake replacement instructions
# Operators are keyed by entity type, so a single fake value is drawn per
# distinct type found in `results` (dict.fromkeys keeps first-seen order).
# OperatorConfig comes from the notebook's top import cell.
operators = {
    entity_type: OperatorConfig(
        "replace", {"new_value": faker_replacement(entity_type)}
    )
    for entity_type in dict.fromkeys(res.entity_type for res in results)
}

In [None]:
# Step 3: Anonymize
# Apply the per-entity-type operators to the spans found by the analyzer.
anonymizer = AnonymizerEngine()
anonymized = anonymizer.anonymize(
    analyzer_results=results,
    operators=operators,
    text=text,
)

In [None]:
# Output
# Print the original and the anonymized text, each under its own heading.
for heading, body in (("Original:\n", text), ("\nAnonymized:\n", anonymized.text)):
    print(heading, body)