In [1]:
import transformers
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoModelForTokenClassification

transformers_model = 'obi/deid_roberta_i2b2' # e.g. "obi/deid_roberta_i2b2"

snapshot_download(repo_id=transformers_model)

# Instantiate to make sure it's downloaded during installation and not runtime
AutoTokenizer.from_pretrained(transformers_model)
AutoModelForTokenClassification.from_pretrained(transformers_model)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
    

In [2]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, EntityRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider
import copy

from presidio_analyzer import EntityRecognizer
from typing import Optional, List
from transformers import (
        AutoTokenizer,
        AutoModelForTokenClassification,
        pipeline,
        TokenClassificationPipeline,
    )

In [6]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, EntityRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider
import copy

from presidio_analyzer import EntityRecognizer
from typing import Optional, List
from transformers import (
        AutoTokenizer,
        AutoModelForTokenClassification,
        pipeline,
        TokenClassificationPipeline,
    )

class TransformersRecognizer(EntityRecognizer):
  
    def load(self) -> None:
        pass

    def __init__(
        self,
        model_path: Optional[str] = None,
        pipeline: Optional[TokenClassificationPipeline] = None,
        supported_entities: Optional[List[str]] = None,
    ):
        self.name = f"Transformers model {model_path}"
        self.model_path = model_path
        self.pipeline = pipeline
        self.is_loaded = False

        self.aggregation_mechanism = None
        self.ignore_labels = None
        self.model_to_presidio_mapping = None
        self.entity_mapping = None
        self.default_explanation = None
        self.text_overlap_length = None
        self.chunk_length = None
        self.id_entity_name = None
        self.id_score_reduction = None
        self.supported_language = ["en"]
        self.supported_entities = [
        "LOCATION",
        "PERSON",
        "ORGANIZATION",
        "AGE",
        "PHONE_NUMBER",
        "EMAIL",
        "DATE_TIME",
        "ZIP",
        "PROFESSION",
        "USERNAME",
        "ID"]
        self._id = 1
        
    def load_transformer(self, **kwargs) -> None:

        self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {})
        self.model_to_presidio_mapping = kwargs.get("MODEL_TO_PRESIDIO_MAPPING", {})
        self.ignore_labels = kwargs.get("LABELS_TO_IGNORE", ["O"])
        self.aggregation_mechanism = kwargs.get("SUB_WORD_AGGREGATION", "simple")
        self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", None)
        self.text_overlap_length = kwargs.get("CHUNK_OVERLAP_SIZE", 40)
        self.chunk_length = kwargs.get("CHUNK_SIZE", 600)
        self.id_entity_name = kwargs.get("ID_ENTITY_NAME", "ID")
        self.id_score_reduction = kwargs.get("ID_SCORE_REDUCTION", 0.5)
        
        if not self.pipeline:
            if not self.model_path:
                self.model_path = "obi/deid_roberta_i2b2"

        self._load_pipeline()
        return self.pipeline

    def _load_pipeline(self) -> None:
        """Initialize NER transformers_rec pipeline using the model_path provided"""

        print(f"Initializing NER pipeline using", self.model_path)
        device = -1
        self.pipeline = pipeline(
            "ner",
            model=AutoModelForTokenClassification.from_pretrained(self.model_path),
            tokenizer=AutoTokenizer.from_pretrained(self.model_path),
            # Will attempt to group sub-entities to word level
            aggregation_strategy=self.aggregation_mechanism,
            device=device,
            framework="pt",
            ignore_labels=self.ignore_labels,
        )
        
        self.is_loaded = True

In [7]:
model_path = "obi/deid_roberta_i2b2"
transformers_recognizer = TransformersRecognizer(model_path=model_path)
model_configuration = { "PRESIDIO_SUPPORTED_ENTITIES":[ "LOCATION", "PERSON", "ORGANIZATION", "AGE", "PHONE_NUMBER", "EMAIL", "DATE_TIME", "ZIP", "PROFESSION", "USERNAME", "ID" ], "DEFAULT_MODEL_PATH":"obi/deid_roberta_i2b2", "LABELS_TO_IGNORE":["O"], "DEFAULT_EXPLANATION":"Identifiedas{}bytheobi/deid_roberta_i2b2NERmodel", "SUB_WORD_AGGREGATION":"simple", "DATASET_TO_PRESIDIO_MAPPING":{ "DATE":"DATE_TIME", "DOCTOR":"PERSON", "PATIENT":"PERSON", "HOSPITAL":"ORGANIZATION", "MEDICALRECORD":"O", "IDNUM":"O", "ORGANIZATION":"ORGANIZATION", "ZIP":"O", "PHONE":"PHONE_NUMBER", "USERNAME":"", "STREET":"LOCATION", "PROFESSION":"PROFESSION", "COUNTRY":"LOCATION", "LOCATION-OTHER":"LOCATION", "FAX":"PHONE_NUMBER", "EMAIL":"EMAIL", "STATE":"LOCATION", "DEVICE":"O", "ORG":"ORGANIZATION", "AGE":"AGE", }, "MODEL_TO_PRESIDIO_MAPPING":{ "PER":"PERSON", "LOC":"LOCATION", "ORG":"ORGANIZATION", "AGE":"AGE", "ID":"ID", "EMAIL":"EMAIL", "PATIENT":"PERSON", "STAFF":"PERSON", "HOSP":"ORGANIZATION", "PATORG":"ORGANIZATION", "DATE":"DATE_TIME", "PHONE":"PHONE_NUMBER", }, "CHUNK_OVERLAP_SIZE":40, "CHUNK_SIZE":600, "ID_SCORE_MULTIPLIER":0.4, "ID_ENTITY_NAME":"ID" }
pipeline_ = transformers_recognizer.load_transformer(**model_configuration)

Initializing NER pipeline using obi/deid_roberta_i2b2


In [8]:
with open("demo_text.txt") as f:
    demo_text = f.readlines()
    demo_text = "".join(demo_text)
    
print(demo_text)

Here are a few example sentences we currently support:

Hello, my name is David Johnson and I live in Maine.
My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.

On September 18 I visited microsoft.com and sent an email to test@presidio.site,  from the IP 192.168.0.1.

My passport: 191280342 and my phone number: (212) 555-1234.

This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?

Kate's social security number is 078-05-1126.  Her driver license? it is 1234567A.


In [12]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, EntityRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider
import copy

# configuration = {
#     "nlp_engine_name": "transformers",
#     "models": [
#              {"lang_code": "en", "model_name": {"spacy": "en_core_web_lg", "transformers": "obi/deid_roberta_i2b2"} }]
# }

configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
    }

# Create NLP engine based on configuration
provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine_with_spanish = provider.create_engine()

registry = RecognizerRegistry()
registry.load_predefined_recognizers()
registry.add_recognizer(transformers_recognizer)
registry.remove_recognizer("SpacyRecognizer")

# Pass the created NLP engine and supported_languages to the AnalyzerEngine
analyzer = AnalyzerEngine(
    nlp_engine=nlp_engine_with_spanish,registry=registry, 
    supported_languages=["en"]
)


def split_text_to_word_chunks(input_length: int, chunk_length: int, overlap_length: int):

    if input_length < chunk_length:
        return [[0, input_length]]
    if chunk_length <= overlap_length:
        print("overlap_length should be shorter than chunk_length, setting overlap_length to by half of chunk_length")
        overlap_length = chunk_length // 2
    return [
        [i, min([i + chunk_length, input_length])]
        for i in range(
            0, input_length - overlap_length, chunk_length - overlap_length
        )]


model_max_length = 512
# calculate inputs based on the text
text_length = len(demo_text)
print("text_length:::", text_length)

# Split text into chunks
if text_length <= model_max_length:
    
    predictions = pipeline(demo_text)
    
else:
    print("splitting the text into chunks, length::", text_length, ">", model_max_length)
    predictions = list()
    chunk_length = 600
    text_overlap_length = 40
    
    chunk_indexes = split_text_to_word_chunks(
        text_length, chunk_length, text_overlap_length
        )
    
    print("chunk_indexes:::", chunk_indexes)
    
    # iterate over text chunks and run inference
    for chunk_start, chunk_end in chunk_indexes:
        chunk_text = demo_text[chunk_start:chunk_end]
        chunk_preds = pipeline_(chunk_text)
        
        # align indexes to match the original text - add to each position the value of chunk_start
        aligned_predictions = list()
        
        
        for prediction in chunk_preds:
            prediction_tmp = copy.deepcopy(prediction)
            prediction_tmp["start"] += chunk_start
            prediction_tmp["end"] += chunk_start
            aligned_predictions.append(prediction_tmp)

        predictions.extend(aligned_predictions)


print("\n\nprediction:::", predictions)

text_length::: 609
splitting the text into chunks, length:: 609 > 512
chunk_indexes::: [[0, 600], [560, 609]]


prediction::: [{'entity_group': 'PATIENT', 'score': 0.99971944, 'word': ' David', 'start': 74, 'end': 79}, {'entity_group': 'PATIENT', 'score': 0.9997254, 'word': ' Johnson', 'start': 80, 'end': 87}, {'entity_group': 'LOC', 'score': 0.99989116, 'word': ' Maine', 'start': 102, 'end': 107}, {'entity_group': 'ID', 'score': 0.9778367, 'word': ' 40', 'start': 134, 'end': 136}, {'entity_group': 'PHONE', 'score': 0.9174068, 'word': '95', 'start': 136, 'end': 138}, {'entity_group': 'PHONE', 'score': 0.9045687, 'word': '-2609-9393-4932', 'start': 138, 'end': 153}, {'entity_group': 'ID', 'score': 0.70982337, 'word': ' 16Yeky6GM', 'start': 181, 'end': 190}, {'entity_group': 'ID', 'score': 0.43062696, 'word': 'N', 'start': 196, 'end': 197}, {'entity_group': 'ID', 'score': 0.5441897, 'word': 'BY7', 'start': 198, 'end': 201}, {'entity_group': 'DATE', 'score': 0.998892, 'word': ' September'