## Microsoft Presidio Demo

In [1]:
! pip install presidio_analyzer presidio_anonymizer

Collecting presidio_analyzer
  Downloading presidio_analyzer-2.2.358-py3-none-any.whl.metadata (3.2 kB)
Collecting presidio_anonymizer
  Downloading presidio_anonymizer-2.2.358-py3-none-any.whl.metadata (8.1 kB)
Collecting phonenumbers<9.0.0,>=8.12 (from presidio_analyzer)
  Downloading phonenumbers-8.13.55-py2.py3-none-any.whl.metadata (11 kB)
Collecting tldextract (from presidio_analyzer)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract->presidio_analyzer)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading presidio_analyzer-2.2.358-py3-none-any.whl (114 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading presidio_anonymizer-2.2.358-py3-none-any.whl (31 kB)
Downloading phonenumbers-8.13.55-py2.py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m3

In [2]:
! python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [10]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
import warnings
warnings.filterwarnings("ignore")

### 1. Detect sensitive data in text

In [11]:
# Sample sentence containing a name, a phone number, an email address and a
# six-digit ID. The wording is kept exactly as-is because the character
# offsets reported by the analyzer depend on it.
text_to_anonymize = (
    "I am Bun Rong and his phone nmber is 212-777-6666 "
    "his email address is bill22@gmail.com and his ID is 123456"
)

In [12]:
# Create an analyzer with no custom arguments.
analyzer = AnalyzerEngine()

# Scan the sample text, restricted to three entity types.
analyzer_results = analyzer.analyze(
    text=text_to_anonymize,
    entities=["PHONE_NUMBER", "PERSON", "EMAIL_ADDRESS"],
    language="en",
)

print(analyzer_results)

[type: EMAIL_ADDRESS, start: 71, end: 87, score: 1.0, type: PERSON, start: 5, end: 13, score: 0.85, type: PHONE_NUMBER, start: 37, end: 49, score: 0.75]


In [13]:
# Print the matched substring next to its entity type for every detection.
for detection in analyzer_results:
    matched_text = text_to_anonymize[detection.start:detection.end]
    print(matched_text, detection.entity_type)


bill22@gmail.com EMAIL_ADDRESS
Bun Rong PERSON
212-777-6666 PHONE_NUMBER


### 2. Add custom entity

In [14]:
# Custom "ID" entity: flags any run of six consecutive digits (e.g. 123456).

from presidio_analyzer import Pattern, PatternRecognizer

# Raw string so the regex backslash is not parsed as a Python string escape
# (a non-raw "\d" triggers an invalid-escape-sequence warning).
id_pattern = Pattern(name="id_pattern", regex=r"\d{6}", score=0.5)

id_recognizer = PatternRecognizer(
    supported_entity="ID", patterns=[id_pattern]
)

# Register the custom recognizer on the existing analyzer so that "ID" can be
# requested in later analyze() calls.
# NOTE(review): re-running this cell presumably registers the recognizer a
# second time - confirm against the registry implementation.
analyzer.registry.add_recognizer(id_recognizer)


In [15]:
# Re-run the analysis, this time also requesting the custom "ID" entity.
analyzer_results = analyzer.analyze(
    text=text_to_anonymize,
    entities=["PHONE_NUMBER", "PERSON", "EMAIL_ADDRESS", "ID"],
    language="en",
)

print(analyzer_results)

[type: EMAIL_ADDRESS, start: 71, end: 87, score: 1.0, type: PERSON, start: 5, end: 13, score: 0.85, type: PHONE_NUMBER, start: 37, end: 49, score: 0.75, type: ID, start: 102, end: 108, score: 0.5]


### 3. Anonymize sensitive data

In [16]:
# Anonymize the detected spans without passing any operators, so the
# anonymizer's default behaviour applies.
anonymizer = AnonymizerEngine()

anonymized_results = anonymizer.anonymize(text=text_to_anonymize, analyzer_results=analyzer_results)

print(f"text {anonymized_results.text}")

text I am <PERSON> and his phone nmber is <PHONE_NUMBER> his email address is <EMAIL_ADDRESS> and his ID is <ID>


### 4. Custom anonymization

In [17]:
from presidio_anonymizer.entities import OperatorConfig

In [9]:
from presidio_anonymizer.entities import OperatorConfig

# Phone numbers: mask 12 characters, counted from the end, with "*".
phone_mask = OperatorConfig(
    "mask",
    {"type": "mask", "masking_char": "*", "chars_to_mask": 12, "from_end": True},
)

# Fallback registered under the "DEFAULT" key: substitute a fixed placeholder.
fallback_replace = OperatorConfig("replace", {"new_value": "<ANONYMIZED>"})

operators = {"PHONE_NUMBER": phone_mask, "DEFAULT": fallback_replace}

# Anonymize again, this time with the per-entity operators defined above.
custom_anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize, analyzer_results=analyzer_results, operators=operators
)

print(f"text: {custom_anonymized_results.text}")


text: I am <ANONYMIZED> and his phone nmber is ************ his email address is <ANONYMIZED> and his ID is <ANONYMIZED>
