## First, let's install the packages we will need. The following libraries will be used throughout the project:

- huggingface_hub
- presidio_analyzer
- presidio_anonymizer
- presidio_image_redactor
- sentence_transformers

In [1]:
!pip install -r requirements.txt -q

### We are going to download and use the dslim/bert-base-NER model to augment PII detection.

_bert-base-NER is a fine-tuned BERT model that is ready to use for Named Entity Recognition and achieves state-of-the-art performance for the NER task. It has been trained to recognize four types of entities: location (LOC), organizations (ORG), person (PER) and Miscellaneous (MISC)._

In [2]:
from huggingface_hub import snapshot_download

# Hugging Face Hub repository hosting the NER model
repo_id = 'dslim/bert-base-NER'
# local target folder: the repository name without its owner prefix
model_id = repo_id.rsplit('/', 1)[-1]

# last expression of the cell, so the returned value is displayed
snapshot_download(local_dir=model_id, repo_id=repo_id)

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

'/Users/spm1976/development/pii-analyzer-anonymizer/bert-base-NER'

### Next we will implement our Presidio anonymizer.

_First, the Presidio anonymizer engine is created and initialized._

_Second, we create a class that extends Presidio's `EntityRecognizer` so the analyzer can use the Hugging Face NER model._

In [3]:
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine
from typing import List  

from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts
from transformers import pipeline

# load spacy model -> workaround
# The download is skipped when the model package is already importable, so
# re-running the notebook does not fetch the large wheel again. The spaCy CLI
# is started through the kernel's own interpreter so the model lands in the
# environment this notebook actually runs in.
import importlib.util
import os
import subprocess
import sys
import warnings

SPACY_MODEL_NAME = "en_core_web_lg"

if importlib.util.find_spec(SPACY_MODEL_NAME) is None:
    spacy_download = subprocess.run(
        [sys.executable, "-m", "spacy", "download", SPACY_MODEL_NAME]
    )
    # stay best-effort like the original workaround, but do not hide a failure
    if spacy_download.returncode != 0:
        warnings.warn(
            "spaCy model download failed with exit code "
            + str(spacy_download.returncode)
        )

# Entity types requested from the analyzer when a caller does not name any.
# Reference list: https://microsoft.github.io/presidio/supported_entities/#list-of-supported-entities
DEFAULT_ANOYNM_ENTITIES = [
    "CREDIT_CARD", "CRYPTO", "DATE_TIME", "EMAIL_ADDRESS",
    "IBAN_CODE", "IP_ADDRESS", "NRP", "LOCATION",
    "PERSON", "PHONE_NUMBER", "MEDICAL_LICENSE", "URL",
    "ORGANIZATION",
]


# Initialize Presidio's anonymizer engine (module-level; predict_fn calls
# engine.anonymize on it). This is NOT the extended EntityRecognizer - that
# class is defined separately and registered on the analyzer in model_fn.
engine = AnonymizerEngine()

Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [4]:
# implement EntityRecognizer class for HuggingFace NER model
class TransformerRecognizer(EntityRecognizer):
    """Presidio recognizer backed by a Hugging Face token-classification pipeline.

    Labels predicted by the model are translated to Presidio entity types
    through ``self.label2presidio``; predictions carrying any other label are
    dropped.
    """

    def __init__(
        self,
        model_id_or_path=None,
        aggregation_strategy='simple',
        supported_language='en',
        ignore_labels=('O', 'MISC')
    ):
        """Create the pipeline and register the supported entities.

        Args:
            model_id_or_path: forwarded to ``transformers.pipeline`` as ``model``.
            aggregation_strategy: forwarded to the pipeline unchanged.
            supported_language: language code handed to ``EntityRecognizer``.
            ignore_labels: labels the pipeline should discard, or ``None`` to
                let the pipeline use its own default. The first default entry
                is the letter ``'O'`` (the "outside any entity" tag of BIO-style
                label sets); the former digit ``'0'`` matched no label, so
                untagged spans reached ``analyze`` without a Presidio mapping.
        """
        # initialize transformers pipeline for given model or path
        self.pipeline = pipeline(
            "token-classification",
            model=model_id_or_path,
            aggregation_strategy=aggregation_strategy,
            # hand over a fresh list so the shared default is never exposed
            ignore_labels=None if ignore_labels is None else list(ignore_labels)
        )

        # map labels to presidio labels
        self.label2presidio = {
            "PER": "PERSON",
            "LOC": "LOCATION",
            "ORG": "ORGANIZATION",
        }

        # pass entities from model to parent class
        super().__init__(
            supported_entities=list(self.label2presidio.values()),
            supported_language=supported_language
        )

    def load(self):
        """No loading is required; the pipeline is built in ``__init__``."""
        pass

    def analyze(
        self,
        text,
        entities=None,
        nlp_artifacts=None
    ):
        """Run the pipeline on ``text`` and convert predictions to Presidio results.

        Args:
            text: the text to analyze; an empty value yields an empty list.
            entities: optional collection of Presidio entity types to keep.
                When empty or ``None`` every mapped prediction is returned.
            nlp_artifacts: accepted for interface compatibility; not used here.

        Returns:
            A list of ``RecognizerResult`` objects, one per kept prediction.
        """
        if not text:
            return []

        results = []
        for prediction in self.pipeline(text):
            presidio_type = self.label2presidio.get(prediction['entity_group'])
            # skip labels without a Presidio mapping instead of raising KeyError
            if presidio_type is None:
                continue
            # honor the caller's requested entity types
            if entities and presidio_type not in entities:
                continue
            results.append(
                RecognizerResult(
                    entity_type=presidio_type,
                    start=prediction['start'],
                    end=prediction['end'],
                    # plain float keeps the result serializable even if the
                    # pipeline hands back a numpy scalar
                    score=float(prediction['score'])
                )
            )

        return results
        
    

In [6]:
def model_fn(model_dir):
    """Build an AnalyzerEngine with the transformer recognizer registered.

    ``model_dir`` is passed to ``TransformerRecognizer`` as its first
    argument. The recognizer is created before the engine, then added to the
    engine's registry; the engine is returned.
    """
    ner_recognizer = TransformerRecognizer(model_dir)
    pii_analyzer = AnalyzerEngine()
    recognizer_registry = pii_analyzer.registry
    recognizer_registry.add_recognizer(ner_recognizer)
    return pii_analyzer

In [7]:
def predict_fn(data, analyzer):
    """Detect PII in the request text and optionally anonymize it.

    Args:
        data: request dict. ``"inputs"`` holds the text to analyze (when the
            key is absent, ``data`` itself is passed on, as before). An
            optional ``"parameters"`` dict may carry ``"entities"`` (entity
            types to look for, default ``DEFAULT_ANOYNM_ENTITIES``) and
            ``"anonymize"`` (default ``False``). ``data`` is not modified.
        analyzer: object exposing ``analyze(text=, entities=, language=)``,
            e.g. the value returned by ``model_fn``.

    Returns:
        ``{"anonymized": <text>}`` when anonymization is requested, otherwise
        ``{"found": [<result dict>, ...]}``.
    """
    # read with get() rather than pop() so the caller's payload stays intact
    # and the same dict can be submitted again
    sentences = data.get('inputs', data)

    # a missing or null "parameters" entry means "use the defaults"
    parameters = data.get("parameters") or {}
    anonymization_entities = parameters.get("entities", DEFAULT_ANOYNM_ENTITIES)
    anonymize_text = parameters.get("anonymize", False)

    # identify entities
    results = analyzer.analyze(text=sentences, entities=anonymization_entities, language="en")

    # anonymize text
    if anonymize_text:
        result = engine.anonymize(text=sentences, analyzer_results=results)
        return {"anonymized": result.text}

    return {"found": [entity.to_dict() for entity in results]}