## Analyzer Engine

In [67]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

text="My phone number is 212-555-5555, my mail id is example@example.com and my name is John Doe."

# Set up the engine, loads the NLP module (spaCy model by default) 
# and other PII recognizers
analyzer = AnalyzerEngine()

# Call analyzer to get results
results = analyzer.analyze(text=text,
                           entities=["PHONE_NUMBER","PERSON","EMAIL_ADDRESS"],
                           language='en')
print(results)

# Analyzer results are passed to the AnonymizerEngine for anonymization

anonymizer = AnonymizerEngine()

anonymized_text = anonymizer.anonymize(text=text,analyzer_results=results)

print(anonymized_text.text)

[type: EMAIL_ADDRESS, start: 47, end: 66, score: 1.0, type: PERSON, start: 82, end: 90, score: 0.85, type: PHONE_NUMBER, start: 19, end: 31, score: 0.75]
My phone number is <PHONE_NUMBER>, my mail id is <EMAIL_ADDRESS> and my name is <PERSON>.


## Printing the results 

In [69]:
for result in results:
   print(text[result.start:result.end], result.entity_type)

example@example.com EMAIL_ADDRESS
John Doe PERSON
212-555-5555 PHONE_NUMBER


# Custom Analyzer

In [54]:
from presidio_analyzer import Pattern, PatternRecognizer
id_pattern = Pattern(name="ID_PATTERN", regex=r"\d{6}", score=0.5)
id_recognizer = PatternRecognizer(supported_entity="ID_NUMBER", patterns=[id_pattern])
analyzer.registry.add_recognizer(id_recognizer)
text1="My ID number is 123456"
analyzer_results = analyzer.analyze(text=text1, entities=["ID_NUMBER"], language='en')

In [53]:
analyzer_results

[type: ID_NUMBER, start: 16, end: 22, score: 0.5]

In [55]:
for result in analyzer_results:
   print(text1[result.start:result.end], result.entity_type)

123456 ID_NUMBER


In [56]:
from presidio_analyzer import PatternRecognizer
titles_recognizer = PatternRecognizer(supported_entity="TITLE",
                                      deny_list=["Mr.","Mrs.","Miss"])
cartoon_recognizer = PatternRecognizer(supported_entity="CARTOON",
                                      deny_list=["Pogo","Mickey Mouse","Donald Duck"])

In [61]:
analyzer.registry.add_recognizer(titles_recognizer)
analyzer.registry.add_recognizer(cartoon_recognizer)

In [63]:
text2="Mr. Schmidt, my ph no is 9948488595, I love ti watch pogo"
results3=analyzer.analyze(text=text2, entities=["TITLE", "CARTOON"],language='en')

In [None]:
for result in results3:
   print(text2[result.start:result.end], result.entity_type)

Mr. TITLE
pogo CARTOON


In [70]:
class NumbersRecognizer(EntityRecognizer):

    expected_confidence_level = 0.7  # expected confidence level for this recognizer

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
    ) -> List[RecognizerResult]:
        """
        Analyzes test to find tokens which represent numbers (either 123 or One Two Three).
        """
        results = []

        # iterate over the spaCy tokens, and call `token.like_num`
        for token in nlp_artifacts.tokens:
            if token.like_num:
                result = RecognizerResult(
                    entity_type="NUMBER",
                    start=token.idx,
                    end=token.idx + len(token),
                    score=self.expected_confidence_level,
                )
                results.append(result)
        return results

NameError: name 'EntityRecognizer' is not defined

In [None]:
new_numbers_recognizer = NumbersRecognizer(supported_entities=["NUMBER"])

In [None]:
text3 = "Roberto lives in Five 10 Broad st."
analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(new_numbers_recognizer)

numbers_results2 = analyzer.analyze(text=text3, language="en")
print_analyzer_results(numbers_results2, text=text3)

## Azure AI Language Service for PII Detection

1. Install the package with the azure-ai-language extra:- pip install "presidio-analyzer[azure-ai-language]"

2. Define environment varibles AZURE_AI_KEY and AZURE_AI_ENDPOINT

3. Add the AzureAILanguageRecognizer to the recognizer registry:



In [None]:

from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.predefined_recognizers import AzureAILanguageRecognizer

azure_ai_language = AzureAILanguageRecognizer()

analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(azure_ai_language)

analyzer.analyze(text="My email is email@email.com", language="en")

In [72]:
from typing import List
import pprint

from presidio_analyzer import (
    AnalyzerEngine,
    PatternRecognizer,
    EntityRecognizer,
    Pattern,
    RecognizerResult,
)
from presidio_analyzer.recognizer_registry import RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngine, SpacyNlpEngine, NlpArtifacts
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer

In [73]:
class NumbersRecognizer(EntityRecognizer):

    expected_confidence_level = 0.7  # expected confidence level for this recognizer

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
    ) -> List[RecognizerResult]:
        """
        Analyzes test to find tokens which represent numbers (either 123 or One Two Three).
        """
        results = []

        # iterate over the spaCy tokens, and call `token.like_num`
        for token in nlp_artifacts.tokens:
            if token.like_num:
                result = RecognizerResult(
                    entity_type="NUMBER",
                    start=token.idx,
                    end=token.idx + len(token),
                    score=self.expected_confidence_level,
                )
                results.append(result)
        return results

In [74]:
new_numbers_recognizer = NumbersRecognizer(supported_entities=["NUMBER"])

In [76]:
text3 = "Roberto lives in Five 10 Broad st."
analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(new_numbers_recognizer)

numbers_results2 = analyzer.analyze(text=text3, language="en")
print(numbers_results2)

[type: PERSON, start: 0, end: 7, score: 0.85, type: DATE_TIME, start: 17, end: 24, score: 0.85, type: NUMBER, start: 17, end: 21, score: 0.7, type: NUMBER, start: 22, end: 24, score: 0.7]


## Anonymization

In [32]:
anonymizer = AnonymizerEngine()
anonymized_text = anonymizer.anonymize(text="My phone number is 9958588595",analyzer_results=analyzer_results,)
print(anonymized_text.text)  

My phone number <ID_NUMBER>8588595


## Custom Anonymizer

In [None]:
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

# Initialize anonymizer
anonymizer = AnonymizerEngine()

# Define operators correctly
operators = {
    "PHONE_NUMBER": OperatorConfig(
        operator_name="mask",
        params={
            "masking_char": "*",
            "chars_to_mask": 10,
            "from_end": True
        }
    ),
    "DEFAULT": OperatorConfig(
        operator_name="replace",
        params={
            "new_value": "<ANONYMIZED>"
        }
    )
}

# Run anonymization
custom_anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize,
    analyzer_results=analyzer_results,
    operators=operators
)

print(f"text: {custom_anonymized_results.text}")
