In [21]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

In [None]:
text_to_anonymize = "My name is John Doe and my email is roniend@telkom.net and my phone number is 123-456-7890."
"""
Entity: EMAIL_ADDRESS, Start: 36, End: 54, Score: 1.00
roniend@telkom.net EMAIL_ADDRESS
Entity: PERSON, Start: 11, End: 19, Score: 0.85
John Doe PERSON
Entity: PHONE_NUMBER, Start: 78, End: 90, Score: 0.75
123-456-7890 PHONE_NUMBER
"""

In [12]:
#text_to_anonymize = "im John Doe and my email is roniend@telkom.net and my phone number is 123-456-7890."
"""
Entity: EMAIL_ADDRESS, Start: 28, End: 46, Score: 1.00
roniend@telkom.net EMAIL_ADDRESS
Entity: PERSON, Start: 3, End: 11, Score: 0.85
John Doe PERSON
Entity: PHONE_NUMBER, Start: 70, End: 82, Score: 0.75
123-456-7890 PHONE_NUMBER
"""

'\nEntity: EMAIL_ADDRESS, Start: 28, End: 46, Score: 1.00\nroniend@telkom.net EMAIL_ADDRESS\nEntity: PERSON, Start: 3, End: 11, Score: 0.85\nJohn Doe PERSON\nEntity: PHONE_NUMBER, Start: 70, End: 82, Score: 0.75\n123-456-7890 PHONE_NUMBER\n'

In [13]:
# please check the supported entities in the Presidio documentation
# https://microsoft.github.io/presidio/analyzer/
# https://microsoft.github.io/presidio/anonymizer/
# https://microsoft.github.io/presidio/supported_entities/
analyzer = AnalyzerEngine()
analyzer_results = analyzer.analyze(text=text_to_anonymize, entities=["PHONE_NUMBER","PERSON","EMAIL_ADDRESS"],  language="en")
print(analyzer_results)

[type: EMAIL_ADDRESS, start: 36, end: 54, score: 1.0, type: PERSON, start: 11, end: 19, score: 0.85, type: PHONE_NUMBER, start: 78, end: 90, score: 0.75]


In [None]:
for result in analyzer_results:
    print(f"Entity: {result.entity_type}, {text_to_anonymize[result.start:result.end]} Start: {result.start}, End: {result.end}, Score: {result.score:.2f}")

Entity: EMAIL_ADDRESS, Start: 36, End: 54, Score: 1.00
roniend@telkom.net EMAIL_ADDRESS
Entity: PERSON, Start: 11, End: 19, Score: 0.85
John Doe PERSON
Entity: PHONE_NUMBER, Start: 78, End: 90, Score: 0.75
123-456-7890 PHONE_NUMBER


##2. Add custom entity

In [15]:
#an entity id to identify any token that contains 2 letters and 3 digits
from presidio_analyzer import Pattern, PatternRecognizer

# Define the regex pattern for the custom entity
id_pattern = Pattern(name="id_pattern", regex=r"\d{6}", score=0.5)

# Define the recognizer for the custom entity
id_recognizer = PatternRecognizer(supported_entity="ID", patterns=[id_pattern])

# Add the custom recognizer to the analyzer
analyzer.registry.add_recognizer(id_recognizer)

In [16]:
text_to_anonymize+= " and my ID is 123456."
analyzer_results = analyzer.analyze(text=text_to_anonymize, entities=["PHONE_NUMBER","PERSON","EMAIL_ADDRESS","ID"], language="en")
print(analyzer_results)

[type: EMAIL_ADDRESS, start: 36, end: 54, score: 1.0, type: PERSON, start: 11, end: 19, score: 0.85, type: PHONE_NUMBER, start: 78, end: 90, score: 0.75, type: ID, start: 105, end: 111, score: 0.5]


In [None]:
for result in analyzer_results:
    print(f"Entity: {result.entity_type}, {text_to_anonymize[result.start:result.end]} Start: {result.start}, End: {result.end}, Score: {result.score:.2f}")

Entity: EMAIL_ADDRESS, roniend@telkom.net Start: 36, End: 54, Score: 1.00
Entity: PERSON, John Doe Start: 11, End: 19, Score: 0.85
Entity: PHONE_NUMBER, 123-456-7890 Start: 78, End: 90, Score: 0.75
Entity: ID, 123456 Start: 105, End: 111, Score: 0.50


3. Anonymize sensitive data

In [22]:
anonymizer = AnonymizerEngine()

anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize,
    analyzer_results=analyzer_results,
)
print(f"text: {anonymized_results.text}")

text: My name is <PERSON> and my email is <EMAIL_ADDRESS> and my phone number is <PHONE_NUMBER>. and my ID is <ID>.


4. Custom anonymization

In [29]:
from presidio_anonymizer.entities import OperatorConfig

operators = {"PHONE_NUMBER": OperatorConfig(
    "mask",
        {"type":"mask", 
         "masking_char": "*", 
         "chars_to_mask":12,
         "from_end":True,
         "DEFAULT":OperatorConfig("replace", {"new_value":"<ANONYMIZED>"})
        },
    )}

custom_anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize,
    analyzer_results=analyzer_results,
    operators=operators,
)

print(f"text: {custom_anonymized_results.text}")

text: My name is <PERSON> and my email is <EMAIL_ADDRESS> and my phone number is ************. and my ID is <ID>.
