### This is a minimal example for sanitizing sentences. 

In [1]:
import os

from preempt.ner import *
from preempt.sanitizer import *

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load NER model.
# We use a Universal NER model here: https://universal-ner.github.io/
# The checkpoint location and device are named constants so they can be tuned in one place.
NER_MODEL_PATH = "/path/to/UniNER-7B-all"
NER_DEVICE = "cuda:1"
ner_model = NER(NER_MODEL_PATH, device=NER_DEVICE)
# ner_model = NER("/path/to/Meta-Llama-3-8B-Instruct/", device="cuda:1")

# Load Sanitizer object.
# The key/tweak literals are demo values kept only as fallbacks so the notebook runs
# unchanged out of the box. For real data, export PREEMPT_KEY / PREEMPT_TWEAK in the
# environment instead of committing secrets to the notebook.
sanitizer = Sanitizer(
    ner_model,
    key=os.environ.get("PREEMPT_KEY", "EF4359D8D580AA4F7F036D6F04FC6A94"),
    tweak=os.environ.get("PREEMPT_TWEAK", "D8E7920AFA330A73"),
)

Loading checkpoint shards: 100%|██████████| 6/6 [00:02<00:00,  2.42it/s]


### Example for sanitizing names of people:

In [3]:
# Two short inputs containing person names for the NER model to pick out.
sentences = [
    "Ben Parker and John Doe went to the bank.",
    "Who was late today? Adam.",
]
extracted = ner_model.extract(sentences, entity_type="Name")
print("NER extraction:", extracted, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  2.76it/s]

NER extraction:
{'Name': [['Ben Parker', 'John Doe'], ['Adam']]}





In [4]:
# Sanitize the names. encrypt() returns a pair; only the first element
# (the sanitized sentences) is used in this walkthrough.
sanitized_sentences, _ = sanitizer.encrypt(
    sentences,
    entity="Name",
    epsilon=1,
)
print("Sanitized sentences:", sanitized_sentences, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  5.81it/s]

Sanitized sentences:
['Jay Francois and Lamine Franklin went to the bank.', 'Who was late today? Elie Vinod.']





In [5]:
# Run NER again, this time on the sanitized text, to see which names it now finds.
extracted = ner_model.extract(
    sanitized_sentences,
    entity_type="Name",
)
print("NER extraction:", extracted, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  4.64it/s]

NER extraction:
{'Name': [['Jay Francois', 'Lamine Franklin'], ['Elie Vinod']]}





In [6]:
# Reverse the Name sanitization and show the recovered sentences.
desanitized_sentences = sanitizer.decrypt(
    sanitized_sentences,
    entity="Name",
)
print("Desanitized sentences:", desanitized_sentences, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  4.44it/s]

Desanitized sentences:
['Ben Parker and John Doe went to the bank.', 'Who was late today? Adam.']





### Example for sanitizing currency values with format-preserving encryption (FPE):

In [7]:
# Inputs with comma-formatted dollar amounts; extract the 'Money' entities from them.
sentences = [
    "Ben withdrew $10,000 from the bank. Adam got $5,550 in a loan.",
    "I won $25 in the lottery.",
]
extracted = ner_model.extract(sentences, entity_type="Money")
print("NER extraction:", extracted, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  4.24it/s]

NER extraction:
{'Money': [['10,000', '5,550'], ['25']]}





In [8]:
# Sanitize the currency values with the sanitizer's default settings for 'Money'.
# The second element of the returned pair is not needed here.
sanitized_sentences, _ = sanitizer.encrypt(
    sentences,
    entity="Money",
    epsilon=1,
)
print("Sanitized sentences:", sanitized_sentences, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  4.17it/s]

Sanitized sentences:
['Ben withdrew $54829534,343 from the bank. Adam got $0400114,733 in a loan.', 'I won $70096546 in the lottery.']





In [9]:
# Check that the NER model still recognises the encrypted amounts as 'Money'.
extracted = ner_model.extract(
    sanitized_sentences,
    entity_type="Money",
)
print("NER extraction:", extracted, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  2.38it/s]

NER extraction:
{'Money': [['54829534,343', '0400114,733'], ['70096546']]}





In [10]:
# Decrypt the currency values and show the recovered sentences.
desanitized_sentences = sanitizer.decrypt(
    sanitized_sentences,
    entity="Money",
)
print("Desanitized sentences:", desanitized_sentences, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  2.31it/s]

Desanitized sentences:
['Ben withdrew $10,000 from the bank. Adam got $5,550 in a loan.', 'I won $25 in the lottery.']





### Example for sanitizing currency values with metric local differential privacy (m-LDP):

In [11]:
# Inputs with plain (no thousands separator) dollar amounts for the m-LDP example.
sentences = [
    "Ben withdrew $1000 from the bank. Adam got $555 in a loan.",
    "I won $25 in the lottery.",
]
extracted = ner_model.extract(sentences, entity_type="Money")
print("NER extraction:", extracted, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  5.42it/s]

NER extraction:
{'Money': [['1000', '555'], ['25']]}





In [12]:
# Sanitize the amounts with m-LDP instead of FPE: FPE is switched off explicitly
# and the m-LDP mechanism is switched on, with privacy parameter epsilon=1.
sanitized_sentences, _ = sanitizer.encrypt(
    sentences,
    entity="Money",
    epsilon=1,
    use_fpe=False,
    use_mdp=True,
)
print("Sanitized sentences:", sanitized_sentences, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  5.18it/s]

Sanitized sentences:
['Ben withdrew $1000 from the bank. Adam got $496 in a loan.', 'I won $21 in the lottery.']





In [13]:
# Inspect the mapping held by the sanitizer (presumably populated by the preceding
# m-LDP encrypt call — confirm against the library). As displayed, it is a list with
# one dict per sentence, keyed by sanitized value with the original value as the entry.
sanitizer.entity_mapping

[{'1000': '1000', '496': '555'}, {'21': '25'}]

In [14]:
# Re-run NER on the m-LDP-sanitized text to list the perturbed 'Money' values.
extracted = ner_model.extract(
    sanitized_sentences,
    entity_type="Money",
)
print("NER extraction:", extracted, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  5.03it/s]

NER extraction:
{'Money': [['1000', '496'], ['21']]}





In [15]:
# Undo the m-LDP sanitization; the same mechanism flags used for encryption
# (FPE off, m-LDP on) are passed to decrypt.
desanitized_sentences = sanitizer.decrypt(
    sanitized_sentences,
    entity="Money",
    use_fpe=False,
    use_mdp=True,
)
print("Desanitized sentences:", desanitized_sentences, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  5.17it/s]

Desanitized sentences:
['Ben withdrew $1000 from the bank. Adam got $555 in a loan.', 'I won $25 in the lottery.']





### Example for sanitizing age values with m-LDP:

In [27]:
# Inputs that mention ages; extract the 'Age' entities from them.
sentences = [
    "Ben turned 15 years old today.",
    "I am 25 years old.",
]
extracted = ner_model.extract(sentences, entity_type="Age")
print("NER extraction:", extracted, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  6.21it/s]

NER extraction:
{'Age': [['15'], ['25']]}





In [28]:
# Sanitize the ages with m-LDP (FPE disabled), using privacy parameter epsilon=1.
# Only the sanitized sentences from the returned pair are kept.
sanitized_sentences, _ = sanitizer.encrypt(
    sentences,
    entity="Age",
    epsilon=1,
    use_fpe=False,
    use_mdp=True,
)
print("Sanitized sentences:", sanitized_sentences, sep="\n")

100%|██████████| 2/2 [00:00<00:00,  6.61it/s]

Sanitized sentences:
['Ben turned 13 years old today.', 'I am 26 years old.']





In [29]:
# Reverse the Age sanitization. The ages were encrypted with use_mdp=True / use_fpe=False,
# so pass the same mechanism flags to decrypt explicitly rather than relying on its
# defaults — this mirrors the Money m-LDP decrypt call earlier in this notebook.
desanitized_sentences = sanitizer.decrypt(
    sanitized_sentences,
    entity="Age",
    use_fpe=False,
    use_mdp=True,
)
print("Desanitized sentences:")
print(desanitized_sentences)

100%|██████████| 2/2 [00:00<00:00,  6.62it/s]

Desanitized sentences:
['Ben turned 15 years old today.', 'I am 25 years old.']



