In [74]:
import json
with open('examples.json') as f:
    data = json.load(f)

In [75]:
examples = ['\n'.join(element['text']) for element in data]
ground_truth  = [element['entities'] for element in data]

In [76]:
from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()
entities = ['CREDIT_CARD', 'EMAIL_ADDRESS', "LOCATION", "NRP", "PERSON", "PHONE_NUMBER"]
analyzer_results = [analyzer.analyze(text=text, language="en", entities=entities) for text in examples]
print(examples[0], "\n")
for r in analyzer_results[0]:
    print(r, ":", examples[0][r.start:r.end])


Hello I'm having trouble logging into my Service Now account.
My name is Angelica Hubbard and my file id is 9294558.
My work email is angie.hubbard@lloydsbanking.com.
My personal email is anghub@gmail.com.
Please forward any personal emails to my home address: 12 Ditton Road, Surrey, TH9 7HK. 

type: EMAIL_ADDRESS, start: 134, end: 165, score: 1.0 : angie.hubbard@lloydsbanking.com
type: EMAIL_ADDRESS, start: 188, end: 204, score: 1.0 : anghub@gmail.com
type: PERSON, start: 73, end: 89, score: 0.85 : Angelica Hubbard
type: LOCATION, start: 277, end: 283, score: 0.85 : Surrey


In [77]:
# eval
def eval_results(examples, analyzer_results, ground_truth, score_thr = 0.75):
    out = {'total_found' : 0, 'total_ground_truth' : 0}
    for example, result, gt in zip(examples, analyzer_results, ground_truth):
        text_found = [example[r.start:r.end] for r in result if r.score > score_thr]
        out['total_found'] += len(text_found)
        out['total_ground_truth'] += len(gt)
        for pii in gt:
            pii_type, pii_text = pii['type'], pii['text']
            if pii_type not in out:
                out[pii_type] = {'num_total' : 0, 'num_found' : 0, "missed" : [], 'found' : []}
            out[pii_type]['num_total'] += 1
            if any(t in pii_text for t in text_found):
                out[pii_type]['num_found'] += 1
                out[pii_type]['found'] += [pii_text]
            else:
                out[pii_type]['missed'] += [pii_text]
    return out

results = eval_results(examples, analyzer_results, ground_truth)
results



{'total_found': 53,
 'total_ground_truth': 82,
 'NAME': {'num_total': 12,
  'num_found': 12,
  'missed': [],
  'found': ['Angelica Hubbard',
   'Oliver Thompson',
   'Robert Chen',
   'Emily Watson',
   'Sophia Lee',
   'Daniel Brown',
   'Mohammed Ahmed',
   'Charlotte Evans',
   'Olivia Clark',
   'William Turner',
   'David Johnson',
   'Sophia Martinez']},
 'FILE_ID': {'num_total': 18,
  'num_found': 0,
  'missed': ['9294558',
   '7823451',
   '6542198',
   '89234567',
   '7654321',
   '9876543',
   '5432167',
   '6789012',
   '3456789',
   '2345678',
   '8901234',
   '4567890',
   '5678901',
   '7890123',
   '9012345',
   '1234567',
   '2345678',
   '3456789'],
  'found': []},
 'WORK_EMAIL': {'num_total': 18,
  'num_found': 18,
  'missed': [],
  'found': ['angie.hubbard@lloydsbanking.com',
   'oliver.thompson@lloydsbanking.com',
   'sarah.jenkins@halifax.com',
   'robert.chen@clericalmedical.com',
   'emily.watson@lloydsbanking.com',
   'james.wilson@halifax.com',
   'sophia.lee@c

In [78]:
# file ID detector and personal personal email detector

from typing import List, Optional
from presidio_analyzer import EntityRecognizer, RecognizerResult, Pattern, PatternRecognizer
from presidio_analyzer.nlp_engine import NlpArtifacts


class PersonalEmailRecognizer(EntityRecognizer):

    """
    load an analyse methods are required
    we use spacy's token.like_number method to check if a token is a number or not
    """

    expected_confidence_level = 0.80  # expected confidence level for this recognizer

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
    ) -> List[RecognizerResult]:
        """
        Analyzes test to find tokens which represent numbers (either 123 or One Two Three).
        """
        results = []

        # iterate over the spaCy tokens, and call `token.like_num`
        for token in nlp_artifacts.tokens:
            if token.like_email:
                personal_domains = ['@gmail.com', '@yahoo.com', '@hotmail.com', 'aol.com']
                if any([domain in token.text for domain in personal_domains]):
                    result = RecognizerResult(
                        entity_type="PERSONAL_EMAIL",
                        start=token.idx,
                        end=token.idx + len(token),
                        score=self.expected_confidence_level)
                    results.append(result)
        return results
    
    def enhance_using_context(
        self, 
        text: str, 
        raw_recognizer_results: List[RecognizerResult], 
        other_raw_recognizer_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        context: Optional[List[str]] = None,
    ) -> List[RecognizerResult]:
        """
        can do anything here: increase value based on context, decrease, filter etc.
        """
        enhanced_results=[]
        context_words = ['personal', 'home', 'customer', 'his', 'her', 'their']
        has_context_words =   any(context_word in text for context_word in context_words)
        if has_context_words:
            for result in raw_recognizer_results:
                result.score += 0.15
                enhanced_results.append(result)
            return enhanced_results
        else:
            return raw_recognizer_results

personal_email_recognizer = PersonalEmailRecognizer(supported_entities=["PERSONAL_EMAIL"])
analyzer.registry.add_recognizer(personal_email_recognizer)


# POST CODE RecognizerResult

from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer


# Define the regex pattern
regex = r"[A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}"  # uk post code

postcode_pattern = Pattern(name="uk post code", regex=regex, score=0.5)

# Define the recognizer with the defined pattern
postcode_recognizer = PatternRecognizer(
    supported_entity="UK_POST_CODE", patterns=[postcode_pattern], context=['post', 'address', 'customer'], # context increases by 0.4
)

context_aware_enhancer = LemmaContextAwareEnhancer(
    context_similarity_factor=0.45, # added to score if context words present
    min_score_with_context_similarity=0.4 # only flag if above this threshold
)

analyzer.registry.add_recognizer(postcode_recognizer)

In [79]:
analyzer = AnalyzerEngine(context_aware_enhancer=context_aware_enhancer)
analyzer.registry.add_recognizer(personal_email_recognizer)
entities = ['CREDIT_CARD', 'PERSONAL_EMAIL', "LOCATION", "NRP", "PERSON", "PHONE_NUMBER", "AGE", "UK_NHS", "UK_POST_CODE"]
analyzer_results = [analyzer.analyze(text=text, language="en", entities=entities) for text in examples]
results = eval_results(examples, analyzer_results, ground_truth)
results


{'total_found': 40,
 'total_ground_truth': 82,
 'NAME': {'num_total': 12,
  'num_found': 12,
  'missed': [],
  'found': ['Angelica Hubbard',
   'Oliver Thompson',
   'Robert Chen',
   'Emily Watson',
   'Sophia Lee',
   'Daniel Brown',
   'Mohammed Ahmed',
   'Charlotte Evans',
   'Olivia Clark',
   'William Turner',
   'David Johnson',
   'Sophia Martinez']},
 'FILE_ID': {'num_total': 18,
  'num_found': 0,
  'missed': ['9294558',
   '7823451',
   '6542198',
   '89234567',
   '7654321',
   '9876543',
   '5432167',
   '6789012',
   '3456789',
   '2345678',
   '8901234',
   '4567890',
   '5678901',
   '7890123',
   '9012345',
   '1234567',
   '2345678',
   '3456789'],
  'found': []},
 'WORK_EMAIL': {'num_total': 18,
  'num_found': 1,
  'missed': ['angie.hubbard@lloydsbanking.com',
   'oliver.thompson@lloydsbanking.com',
   'sarah.jenkins@halifax.com',
   'robert.chen@clericalmedical.com',
   'emily.watson@lloydsbanking.com',
   'james.wilson@halifax.com',
   'daniel.brown@lloydsbanking.c

In [80]:
#presidio only gives 40% accuracy for this phone number.... not good
example = [e for e in examples if '020 7946 0234' in e][0]
print(example)
print(analyzer.analyze(example, language='en', entities=entities))

My laptop is not connecting to the company VPN.
I'm Robert Chen, file ID 89234567.
My work email is robert.chen@clericalmedical.com.
You can contact me at 020 7946 0234 if you need more information.
[type: PERSON, start: 52, end: 63, score: 0.85, type: PHONE_NUMBER, start: 155, end: 168, score: 0.4]


In [68]:
# speed check
# batch analyze:
from datasets import load_dataset
from presidio_analyzer import BatchAnalyzerEngine
from presidio_anonymizer import BatchAnonymizerEngine
import numpy as np

import time

data = load_dataset("ai4privacy/pii-masking-200k",split='train')
examples = [data[i]['source_text'] for i in range(10_000)]
char_count = [len(text) for text in examples]
word_count = [len(text.split(' ')) for text in examples]

print('Average chars:', np.mean(char_count))
print('Average words:', np.mean(word_count))
print('Total Words:', np.sum(word_count))
print('sample_text:', examples[0])

t0 = time.time()
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
analyzer_results = batch_analyzer.analyze_dict({'examples':examples}, language="en", entities = ['PHONE_NUMBER', 'EMAIL_ADDRESS', 'ID', 'CREDIT_CARD', 'LOCATION', "PERSONAL_EMAIL", "UK_POST_CODE"])
analyzer_results = list(analyzer_results)
print('Analyse time:', time.time()-t0)

t0 = time.time()
batch_anonymizer = BatchAnonymizerEngine()
anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results)
anonymizer_results = list(anonymizer_results)
print('Redact time:', time.time()-t0)

Average chars: 172.3057
Average words: 24.9396
Total Words: 249396
sample_text: A student's assessment was found on device bearing IMEI: 06-184755-866851-3. The document falls under the various topics discussed in our Optimization curriculum. Can you please collect it?
Analyse time: 32.44903373718262
Redact time: 0.07559061050415039


In [69]:
print(analyzer_results[0].value[0])
print(analyzer_results[0].recognizer_results[0])

A student's assessment was found on device bearing IMEI: 06-184755-866851-3. The document falls under the various topics discussed in our Optimization curriculum. Can you please collect it?
[type: PHONE_NUMBER, start: 57, end: 75, score: 0.4]


In [23]:
analyzer.get_recognizers()

[<presidio_analyzer.predefined_recognizers.in_pan_recognizer.InPanRecognizer at 0x7f4c92b0e920>,
 <presidio_analyzer.predefined_recognizers.in_aadhaar_recognizer.InAadhaarRecognizer at 0x7f4c92b0ea10>,
 <presidio_analyzer.predefined_recognizers.in_vehicle_registration_recognizer.InVehicleRegistrationRecognizer at 0x7f4c92b0ea40>,
 <presidio_analyzer.predefined_recognizers.credit_card_recognizer.CreditCardRecognizer at 0x7f4c92b0ea70>,
 <presidio_analyzer.predefined_recognizers.au_tfn_recognizer.AuTfnRecognizer at 0x7f4c92b0e290>,
 <presidio_analyzer.predefined_recognizers.crypto_recognizer.CryptoRecognizer at 0x7f4c92b0eaa0>,
 <presidio_analyzer.predefined_recognizers.date_recognizer.DateRecognizer at 0x7f4c92b0ead0>,
 <presidio_analyzer.predefined_recognizers.email_recognizer.EmailRecognizer at 0x7f4c92b0eb00>,
 <presidio_analyzer.predefined_recognizers.iban_recognizer.IbanRecognizer at 0x7f4c92b0eb30>,
 <presidio_analyzer.predefined_recognizers.ip_recognizer.IpRecognizer at 0x7f4c92b