In [1]:
# download large spacy model
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m518.3 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [5]:
from presidio_analyzer import AnalyzerEngine

# Sample sentence mentioning a name, a phone number, an email and an address.
text = "His name is Mr. Jones and his phone number is 212-555-5555 and my email is stevejones@hotmail.com. My address is 12 Pine Road, KT6 8LP"

# Engine built with no arguments, i.e. with its default configuration.
analyzer = AnalyzerEngine()
analyzer_results = analyzer.analyze(text=text, language="en")

# Show every detection on its own line.
for detection in analyzer_results:
    print(detection)


type: EMAIL_ADDRESS, start: 75, end: 97, score: 1.0
type: PERSON, start: 16, end: 21, score: 0.85
type: PHONE_NUMBER, start: 46, end: 58, score: 0.75
type: URL, start: 86, end: 97, score: 0.5


### Batch data speed

In [10]:
# batch analyze:
import numpy as np
from datasets import load_dataset

# Use the first 10,000 source texts of the train split as the benchmark corpus.
data = load_dataset("ai4privacy/pii-masking-200k", split="train")
examples = [data[row]["source_text"] for row in range(10000)]

# Per-example sizes; words are counted by splitting on single spaces.
char_count = list(map(len, examples))
word_count = [len(sample.split(" ")) for sample in examples]

print("Average chars:", np.mean(char_count))
print("Average words:", np.mean(word_count))
print("Total Words:", np.sum(word_count))
print("sample_text:", examples[0])

Average chars: 172.3057
Average words: 24.9396
Total Words: 249396
sample_text: A student's assessment was found on device bearing IMEI: 06-184755-866851-3. The document falls under the various topics discussed in our Optimization curriculum. Can you please collect it?


In [56]:
from presidio_analyzer import BatchAnalyzerEngine
from presidio_anonymizer import BatchAnonymizerEngine
import time

# Time the analyse and redact stages separately.
# time.perf_counter() is the clock intended for measuring elapsed durations;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
t0 = time.perf_counter()
# Wraps the `analyzer` engine created in an earlier cell.
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
analyzer_results = batch_analyzer.analyze_dict(
    {'examples': examples},
    language="en",
    entities=['PHONE_NUMBER', 'EMAIL_ADDRESS', 'ID', 'CREDIT_CARD', 'LOCATION'],
)
# list() consumes the returned iterable inside the timed region
# (analyze_dict is presumably lazy -- confirm against the Presidio docs).
analyzer_results = list(analyzer_results)
print('Analyse time:', time.perf_counter() - t0)

t0 = time.perf_counter()
batch_anonymizer = BatchAnonymizerEngine()
anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results)
anonymizer_results = list(anonymizer_results)
print('Redact time:', time.perf_counter() - t0)

Analyse time: 30.002370357513428
Redact time: 0.0771169662475586


In [53]:
# Using all detectors. This sometimes raises an OverflowError, so the texts are
# analysed one by one in a list comprehension and the error is caught per text.
def analyse_text(text):
    """Analyse one text with the global `analyzer`, using every available detector.

    Returns whatever `analyzer.analyze` returns for English text, or the
    sentinel string 'OVERFLOW' when the call raises OverflowError.
    """
    try:
        detections = analyzer.analyze(text, language="en")
    except OverflowError:
        return 'OVERFLOW'
    return detections
# Run the guarded analysis over the first 2000 examples only.
results = [analyse_text(sample) for sample in examples[:2000]]


### Deny List Recognizer

In [69]:
from presidio_analyzer import PatternRecognizer
# Deny-list recognizer: look for titles, and titles only.
titles_list = [
    "Sir",
    "Ma'am",
    "Madam",
    "Mr.",
    "Mrs.",
    "Ms.",
    "Miss",
    "Dr.",
    "Professor",
]
titles_recognizer = PatternRecognizer(
    supported_entity="TITLE",
    deny_list=titles_list,
)

text1 = "I suspect Professor Plum, in the Dining Room, with the candlestick. His email is profplum@gmail.com"

# Call the recognizer directly, without going through an AnalyzerEngine.
results = titles_recognizer.analyze(text1, entities=["TITLE"])
for result in results:
    print(f"- {text1[result.start:result.end]} as {result.entity_type}")

- Professor as TITLE


In [73]:
# add it to the analyzer
from presidio_analyzer import AnalyzerEngine

# Fresh engine, with the custom TITLE recognizer added to its registry.
analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(titles_recognizer)

# Only ask for the two entity types of interest.
results = analyzer.analyze(
    text1,
    entities=["TITLE", "EMAIL_ADDRESS"],
    language="en",
)
for result in results:
    print(f"- {text1[result.start:result.end]} as {result.entity_type}")

- Professor as TITLE
- profplum@gmail.com as EMAIL_ADDRESS


### Regex Recognizer

In [76]:
from presidio_analyzer import Pattern, PatternRecognizer

# Define the regex pattern in a Presidio `Pattern` object.
# The regex is written as a raw string: in an ordinary string literal "\d" is
# an invalid escape sequence, which Python reports with a warning.
numbers_pattern = Pattern(name="numbers_pattern", regex=r"\d+", score=0.5)

# Define the recognizer with one or more patterns
number_recognizer = PatternRecognizer(
    supported_entity="NUMBER", patterns=[numbers_pattern])

text2 = "I live in 510 Broad st."

# Call the recognizer directly, without going through an AnalyzerEngine.
numbers_result = number_recognizer.analyze(text=text2, entities=["NUMBER"])

print("Result:")
print(numbers_result)

Result:
[type: NUMBER, start: 10, end: 13, score: 0.5]


In [80]:
# add it to the analyzer
from presidio_analyzer import AnalyzerEngine

# Fresh engine, with the regex-based NUMBER recognizer added to its registry.
analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(number_recognizer)

text = "I live in 510 Broad st. My postcode is KT7 8LP"

# Only NUMBER results are requested.
results = analyzer.analyze(
    text,
    entities=["NUMBER"],
    language="en",
)
for result in results:
    print(result)

type: NUMBER, start: 10, end: 13, score: 0.5
type: NUMBER, start: 41, end: 42, score: 0.5
type: NUMBER, start: 43, end: 44, score: 0.5


### Rule Based Recognizers

In [87]:
from typing import List
from presidio_analyzer import EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts


class NumbersRecognizer(EntityRecognizer):

    """
    Rule-based recognizer that reports number-like tokens as NUMBER entities.

    Both the `load` and `analyze` methods are required.
    spaCy's `token.like_num` attribute is used to decide whether a token is a number.
    """

    expected_confidence_level = 0.7  # score attached to every result this recognizer emits

    def load(self) -> None:
        """Nothing needs to be loaded for this recognizer."""

    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
    ) -> List[RecognizerResult]:
        """
        Find tokens that represent numbers (either 123 or One Two Three).

        :param text: the text being analyzed (not read here; offsets come from the tokens)
        :param entities: the requested entity types (not consulted by this recognizer)
        :param nlp_artifacts: NLP output whose `tokens` are scanned
        :return: one NUMBER result per token whose `like_num` is true
        """
        # A token's span runs from its character offset to offset + token length.
        return [
            RecognizerResult(
                entity_type="NUMBER",
                start=tok.idx,
                end=tok.idx + len(tok),
                score=self.expected_confidence_level,
            )
            for tok in nlp_artifacts.tokens
            if tok.like_num
        ]


from presidio_analyzer import AnalyzerEngine

# Create the custom recognizer, declaring NUMBER as the entity it supports.
new_numbers_recognizer = NumbersRecognizer(supported_entities=["NUMBER"])

# Register it on a fresh engine.
analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(new_numbers_recognizer)

text = "Roberto lives in Five 10 Broad st. KT7 8HG"
results = analyzer.analyze(text=text, language="en")

print("Results:")
for result in results:
    # print() applies str() to each argument: "<result> : <matched text>".
    print(result, text[result.start:result.end], sep=" : ")


Results:
type: PERSON, start: 0, end: 7, score: 0.85 : Roberto
type: NUMBER, start: 17, end: 21, score: 0.7 : Five
type: NUMBER, start: 22, end: 24, score: 0.7 : 10


### Leveraging Context

In [2]:
# combine a regex search with context words

from presidio_analyzer import Pattern, PatternRecognizer, RecognizerRegistry, AnalyzerEngine
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer


# UK postcode regex, e.g. "KT7 8LP" (the space is optional).
regex = r"[A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}"  # uk post code
postcode_pattern = Pattern(name="uk post code", regex=regex, score=0.5)

# Recognizer built on that pattern, together with its context words.
postcode_recognizer = PatternRecognizer(
    supported_entity="UK_POST_CODE",
    patterns=[postcode_pattern],
    context=['post', 'code', 'address'],
)

# With a base score of 0.5 and a factor of 0.45, the recorded result scores 0.95.
context_aware_enhancer = LemmaContextAwareEnhancer(
    # added to the score if context words are present
    context_similarity_factor=0.45,
    # NOTE(review): per the Presidio docs this looks like the minimum score
    # given to a context-supported match, not a filter threshold -- confirm
    min_score_with_context_similarity=0.4,
)

# Registry holding only the postcode recognizer.
registry = RecognizerRegistry()
registry.add_recognizer(postcode_recognizer)
analyzer = AnalyzerEngine(
    registry=registry,
    context_aware_enhancer=context_aware_enhancer,
)

# Test
text = "I live in 510 Broad st. My postcode is KT7 8LP. "
results = analyzer.analyze(text=text, language="en")

print(f"Result:\n {results}")

Result:
 [type: UK_POST_CODE, start: 39, end: 46, score: 0.95]


In [8]:
# context can be added at analyse time like so:
# e.g. we have a column name present:

from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, PatternRecognizer

# Recognizer using the postcode pattern defined earlier, plus its context words.
postcode_recognizer = PatternRecognizer(
    supported_entity="UK_POST_CODE", patterns=[postcode_pattern], context=["address", "post"]
)

# Registry with just this recognizer; no context enhancer is passed to the
# engine, so whatever the engine uses by default applies.
registry = RecognizerRegistry()
registry.add_recognizer(postcode_recognizer)
analyzer = AnalyzerEngine(registry=registry)

# Example record: the column name is not part of the text itself, but it can
# be injected as context at analyse time.
record = {"column_name": "address", "text": "HY8 9LH"}
column_context = [record["column_name"]]

result = analyzer.analyze(text=record["text"], language="en", context=column_context)

print("Result:")
print(result)

Result:
[type: UK_POST_CODE, start: 0, end: 7, score: 0.85]
