## First, let's install the packages we will need. The following libraries will be used throughout the project:

- huggingface_hub
- presidio_analyzer
- presidio_anonymizer
- presidio_image_redactor
- sentence_transformers
- dotenv

In [1]:
!pip install -r requirements.txt -q
# below is a fix for HuggingFace + Tensorflow 2.13+
!pip install -U git+https://github.com/huggingface/transformers.git -q

### We are going to download and use the dslim/bert-base-NER to augment PII detection. 

_bert-base-NER is a fine-tuned BERT model that is ready to use for Named Entity Recognition and achieves state-of-the-art performance for the NER task. It has been trained to recognize four types of entities: location (LOC), organizations (ORG), person (PER) and Miscellaneous (MISC)._

In [2]:
from huggingface_hub import snapshot_download

repo_id = 'dslim/bert-base-NER'
model_id = repo_id.split('/')[-1]

snapshot_download(repo_id=repo_id, local_dir=model_id)

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

'/Users/spm1976/development/pii-analyzer-anonymizer/bert-base-NER'

### Next we will implement our Presidio anonymizer.

_First the base analyzer is created and initialized_
_Second we will create a class to extend the base analyzer instantiation_

In [3]:
import spacy

try:
  nlp_lg = spacy.load("en_core_web_lg")
except ModuleNotFoundError:
  download(model="en_core_web_lg")


In [27]:
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig, RecognizerResult
from presidio_analyzer import AnalyzerEngine
from typing import List  

from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts

from transformers import pipeline

# load spacy model -> workaround
#import os
#os.system("spacy download en_core_web_lg")

# list of entities: https://microsoft.github.io/presidio/supported_entities/#list-of-supported-entities
DEFAULT_ANOYNM_ENTITIES = [
    "CREDIT_CARD", 
    "CRYPTO",
    "DATE_TIME",
    "EMAIL_ADDRESS",
    "IBAN_CODE",
    "IP_ADDRESS",
    "NRP",
    "LOCATION",
    "PERSON",
    "PHONE_NUMBER",
    "MEDICAL_LICENSE",
    "URL",
    "ORGANIZATION",
    "US_SSN"
]


In [39]:
# implement EntityRecognizer class for HuggingFace NER model
class TransformerRecognizer(EntityRecognizer):
    '''
    '''
    def __init__(
        self,
        model_id_or_path=None,
        aggregation_strategy='simple',
        supported_language='en',
        ignore_labels=['0','O','MISC']
    ):
         # initialize transformers pipeline for given mode or path
        self.pipeline = pipeline(
            "token-classification",
            model=model_id_or_path, 
            aggregation_strategy=aggregation_strategy,
            ignore_labels=ignore_labels
        )
        
        # map labels to presidio labels
        self.label2presidio = {
            "PER": "PERSON",
            "LOC": "LOCATION",
            "ORG": "ORGANIZATION"
        }
        
        #pass entities from model to parent class
        super().__init__(
            supported_entities=list(self.label2presidio.values()), 
            supported_language=supported_language
        )
        
    '''
    '''
    def load(self):
        ''' no loading is required '''
        pass
    
    '''
    '''
    def analyze(
        self,
        text,
        entities=None,
        nlp_artifacts=None
    ):        
        predicted_entities = self.pipeline(text)
        
        results = [ 
            RecognizerResult(entity_type=self.label2presidio[e['entity_group']], 
                             start=e['start'], 
                             end=e['end'], 
                             score=e['score']) for e in predicted_entities
        ]
                
        return results

In [40]:
# not used. just an example
def model_fn(model_dir):
    xfmr_recognizer = TransformerRecognizer(model_dir)
    analyzer = AnalyzerEngine()
    analyzer.registry.add_recognizer(xfmr_recognizer)
    return analyzer

In [41]:
# not used. just an example
def predict_fn(data, analyzer):
    sentences = data.pop('inputs', data)
    if "parameters" in data:
        anonymization_entities = data["parameters"].get("entities", DEFAULT_ANOYNM_ENTITIES)
        anonymize_text = data["parameters"].get("anonymize", False)
    else:
        anonymization_entities = DEFAULT_ANOYNM_ENTITIES
        anonymize_text = False
        
    # identify entities
    results = analyzer.analyze(text=sentences, entities=anonymization_entities, language="en")
    
    # anonymize text
    if anonymize_text:
        result = engine.anonymize(text=sentences, analyzer_results=results)
        return {"anonymized": result.text}

    return {"found": [entity.to_dict() for entity in results]}

In [42]:
model_dir = 'bert-base-NER' # directory that we downloaded HuggingFace to above

xfmr_recognizer = TransformerRecognizer(model_dir)
analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(xfmr_recognizer)

Some weights of the model checkpoint at bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [43]:
text = "His name is Mr. Jones and his phone number is 212-555-5555"

analyzer_results = analyzer.analyze(text=text, language="en")

print(analyzer_results)

[type: PERSON, start: 16, end: 21, score: 0.944421648979187, type: PHONE_NUMBER, start: 46, end: 58, score: 0.75]


In [44]:
operators = {
    "DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"}),
    "PHONE_NUMBER": OperatorConfig(
        "mask",
        {
            "type": "mask",
            "masking_char": "*",
            "chars_to_mask": 12,
            "from_end": True,
        },
    ),
    "TITLE": OperatorConfig("redact", {}),
}

In [45]:
# initialize the anonymizer. this is not the extended EntityRecognizer
anonymizer_engine = AnonymizerEngine()

# create anonymized results
anonymized_results = anonymizer_engine.anonymize(
    text=text, analyzer_results=analyzer_results, operators=operators
)

print(anonymized_results)

text: His name is Mr. <ANONYMIZED> and his phone number is ************
items:
[
    {'start': 53, 'end': 65, 'entity_type': 'PHONE_NUMBER', 'text': '************', 'operator': 'mask'},
    {'start': 16, 'end': 28, 'entity_type': 'PERSON', 'text': '<ANONYMIZED>', 'operator': 'replace'}
]



In [46]:
text = '''
John Smith, born in 1987, lives in Seattle, Washington. 
He is a software engineer and has a Bachelor's degree in Computer Science from the University of Washington. 
He drives a blue Honda Accord and his driver's license number is A123456789. 
His social security number is 995-12-2716 and his phone number is (206) 555-1234. 
John enjoys playing basketball and hiking in his free time. 
He is married to Sarah Smith and they have two children, Emma and Jake.
He banks at JPMC and his account number is 99953153415
'''

analyzer_results =  analyzer.analyze(text=text, language="en")

anonymized_results = anonymizer_engine.anonymize(
    text=text, analyzer_results=analyzer_results, operators=operators
)

print(anonymized_results)

text: 
<ANONYMIZED>, born in <ANONYMIZED>, lives in <ANONYMIZED>, <ANONYMIZED>. 
He is a software engineer and has a Bachelor's degree in Computer Science from the <ANONYMIZED>. 
He drives a blue Honda Accord and his driver's license number is <ANONYMIZED>. 
His social security number is <ANONYMIZED> and his phone number is (2************. 
<ANONYMIZED> enjoys playing basketball and hiking in his free time. 
He is married to <ANONYMIZED> and they have two children, <ANONYMIZED> and <ANONYMIZED>.
He banks at <ANONYMIZED> and his account number is ***********

items:
[
    {'start': 546, 'end': 557, 'entity_type': 'PHONE_NUMBER', 'text': '***********', 'operator': 'mask'},
    {'start': 507, 'end': 519, 'entity_type': 'ORGANIZATION', 'text': '<ANONYMIZED>', 'operator': 'replace'},
    {'start': 481, 'end': 493, 'entity_type': 'PERSON', 'text': '<ANONYMIZED>', 'operator': 'replace'},
    {'start': 464, 'end': 476, 'entity_type': 'PERSON', 'text': '<ANONYMIZED>', 'operator': 'replace'},
  

In [47]:
!cd redpanda && ./start_container.bash

In [None]:
from dotenv import load_dotenv