## Address Extraction & Masking using NER finetuned BERT Variants

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline


model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)


ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, device=0)

: 

### We pass the following arguments for address extraction

B-LOC	Beginning of a location right after another location \
I-LOC	Location

See https://huggingface.co/dslim/bert-base-NER for more details

In [30]:
text = "Send the package to 123 Main St, Springfield, IL 62704. " \
       "Or alternatively, you can send it to 456 Elm St, Shelbyville, IL 62565."\
       "you cna also send it to 75 Rue de la Tunisie, Nanterre, France"

ner_results = ner_pipeline(text)

address_tokens = [res['word'] for res in ner_results if res['entity'] == 'B-LOC' or res['entity'] == 'I-LOC']
address = ' '.join(address_tokens)

print("Extracted Address:", address)

ner_results = ner_pipeline(text)
address_tokens = [res['word'] for res in ner_results if res['entity'] == 'B-LOC' or res['entity'] == 'I-LOC']
print ("Address Tokens are ", address_tokens)

Extracted Address: Main St Springfield IL Elm St Shelby ##ville IL Rue de la Tu ##nis ##ie Nan ##ter ##re France
Address Tokens are  ['Main', 'St', 'Springfield', 'IL', 'Elm', 'St', 'Shelby', '##ville', 'IL', 'Rue', 'de', 'la', 'Tu', '##nis', '##ie', 'Nan', '##ter', '##re', 'France']


### Token Masking

In [31]:
import re
def mask_words(text, words_to_mask):
  for word in words_to_mask:
    text = re.sub(r'\b{}\b'.format(re.escape(word)), '[MASK]', text)
  return text

masked_text = mask_words(text, address_tokens)
print(masked_text)

Send the package to 123 [MASK] [MASK], [MASK], [MASK] 62704. Or alternatively, you can send it to 456 [MASK] [MASK], Shelbyville, [MASK] 62565.you cna also send it to 75 [MASK] [MASK] [MASK] Tunisie, Nanterre, [MASK]


# \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

## You can use a different NER model

Here we use a Distilled version of BERT, fine tuned for NER \
Model card: https://huggingface.co/dslim/distilbert-NER

In [32]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline


model_name = "dslim/distilbert-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)


ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, device=0)

text = "Send the package to 123 Main St, Springfield, IL 62704. " \
       "Or alternatively, you can send it to 456 Elm St, Shelbyville, IL 62565."\
       "you cna also send it to 75 Rue de la Tunisie, Nanterre, France"

ner_results = ner_pipeline(text)
address_tokens = [res['word'] for res in ner_results if res['entity'] == 'B-LOC' or res['entity'] == 'I-LOC']
print ("Address Tokens are ", address_tokens)
masked_text = mask_words(text, address_tokens)
print(masked_text)

Address Tokens are  ['Main', 'St', 'Springfield', 'IL', 'Elm', 'St', 'Shelby', '##ville', 'IL', 'Rue', 'de', 'la', 'Tu', '##nis', '##ie', 'Nan', '##ter', '##re', 'France']
Send the package to 123 [MASK] [MASK], [MASK], [MASK] 62704. Or alternatively, you can send it to 456 [MASK] [MASK], Shelbyville, [MASK] 62565.you cna also send it to 75 [MASK] [MASK] [MASK] Tunisie, Nanterre, [MASK]
