### This is a minimal example for sanitizing sentences. 

In [None]:
from preempt.utils import *

: 

In [3]:
# Load NER model.
# We use a Universal NER model here: https://universal-ner.github.io/
ner_model = NER("/nobackup3/divyam/models/uniner-7b-pii-v3", device="cuda:1")
# ner_model = NER("/path/to/Meta-Llama-3-8B-Instruct/", device="cuda:1")

# Load Sanitizer object
sanitizer = Sanitizer(ner_model, key = "EF4359D8D580AA4F7F036D6F04FC6A94", tweak = "D8E7920AFA330A73")

Loading checkpoint shards: 100%|██████████| 6/6 [00:02<00:00,  2.46it/s]


### Example for sanitizing names of people:

In [4]:
sentences = ["Ben Parker and John Doe went to the bank.", "Who was late today? Adam."]
extracted = ner_model.extract(sentences, entity_type='Name')
print("NER extraction:")
print(extracted)

100%|██████████| 2/2 [00:00<00:00,  2.74it/s]

NER extraction:
{'Name': [['Ben Parker', 'John Doe'], ['Adam']]}





In [5]:
sanitized_sentences, _ = sanitizer.encrypt(sentences, entity='Name', epsilon=1)
print("Sanitized sentences:")
print(sanitized_sentences)

100%|██████████| 2/2 [00:00<00:00,  6.20it/s]

Sanitized sentences:
['Jay Francois and Lamine Franklin went to the bank.', 'Who was late today? Elie Vinod.']





In [6]:
extracted = ner_model.extract(sanitized_sentences, entity_type='Name')
print("NER extraction:")
print(extracted)

100%|██████████| 2/2 [00:00<00:00,  4.40it/s]

NER extraction:
{'Name': [['Jay Francois', 'Lamine Franklin'], ['Elie Vinod']]}





In [8]:
sanitizer.entity_mapping

{'Jay Francois': 'Ben Parker',
 'Lamine Franklin': 'John Doe',
 'Teddy Phil': 'Ben',
 'Surinder Rodriguez': 'John',
 'Ever Gonzalez': 'Parker',
 'Maxime Christophe': 'Doe',
 'Elie Vinod': 'Adam'}

In [None]:
desanitized_sentences = sanitizer.decrypt(sanitized_sentences, entity='Name', use_cache=False)
print("Desanitized sentences:")
print(desanitized_sentences)

In [7]:
desanitized_sentences = sanitizer.decrypt(sanitized_sentences, entity='Name', use_cache=True)
print("Desanitized sentences:")
print(desanitized_sentences)

100%|██████████| 2/2 [00:00<00:00,  4.48it/s]


KeyError: 0

### Example for sanitizing currency values with FPE:

In [9]:
sentences = ["Ben withdrew $10,000 from the bank. Adam got $5,550 in a loan.", "I won $25 in the lottery."]
extracted = ner_model.extract(sentences, entity_type='Money')
print("NER extraction:")
print(extracted)

100%|██████████| 2/2 [00:00<00:00,  4.33it/s]

NER extraction:
{'Money': [['10,000', '5,550'], ['25']]}





In [10]:
sanitized_sentences, _ = sanitizer.encrypt(sentences, entity='Money', epsilon=1)
print("Sanitized sentences:")
print(sanitized_sentences)

100%|██████████| 2/2 [00:00<00:00,  4.36it/s]

Sanitized sentences:
['Ben withdrew $54829534,343 from the bank. Adam got $0400114,733 in a loan.', 'I won $70096546 in the lottery.']





In [11]:
extracted = ner_model.extract(sanitized_sentences, entity_type='Money')
print("NER extraction:")
print(extracted)

100%|██████████| 2/2 [00:00<00:00,  2.37it/s]

NER extraction:
{'Money': [['54829534,343', '0400114,733'], ['70096546']]}





In [12]:
desanitized_sentences = sanitizer.decrypt(sanitized_sentences, entity='Money')
print("Desanitized sentences:")
print(desanitized_sentences)

100%|██████████| 2/2 [00:00<00:00,  2.36it/s]

Desanitized sentences:
['Ben withdrew $10,000 from the bank. Adam got $5,550 in a loan.', 'I won $25 in the lottery.']





### Example for sanitizing currency values with m-LDP:

In [13]:
sentences = ["Ben withdrew $1000 from the bank. Adam got $555 in a loan.", "I won $25 in the lottery."]
extracted = ner_model.extract(sentences, entity_type='Money')
print("NER extraction:")
print(extracted)

100%|██████████| 2/2 [00:00<00:00,  5.36it/s]

NER extraction:
{'Money': [['1000', '555'], ['25']]}





In [14]:
sanitized_sentences, _ = sanitizer.encrypt(sentences, entity='Money', 
                                           epsilon=1, use_fpe=False, use_mdp=True)
print("Sanitized sentences:")
print(sanitized_sentences)

100%|██████████| 2/2 [00:00<00:00,  5.40it/s]

Sanitized sentences:
['Ben withdrew $1180 from the bank. Adam got $397 in a loan.', 'I won $11 in the lottery.']





In [15]:
sanitizer.entity_mapping

[{'1180': '1000', '397': '555'}, {'11': '25'}]

In [16]:
extracted = ner_model.extract(sanitized_sentences, entity_type='Money')
print("NER extraction:")
print(extracted)

100%|██████████| 2/2 [00:00<00:00,  5.44it/s]

NER extraction:
{'Money': [['1180', '397'], ['11']]}





In [17]:
desanitized_sentences = sanitizer.decrypt(sanitized_sentences, entity='Money', 
                                          use_fpe=False, use_mdp=True)
print("Desanitized sentences:")
print(desanitized_sentences)

100%|██████████| 2/2 [00:00<00:00,  5.41it/s]

Desanitized sentences:
['Ben withdrew $1000 from the bank. Adam got $555 in a loan.', 'I won $25 in the lottery.']





### Example for sanitizing age values with m-LDP:

In [18]:
sentences = ["Ben turned 15 years old today.", "I am 25 years old."]
extracted = ner_model.extract(sentences, entity_type='Age')
print("NER extraction:")
print(extracted)

100%|██████████| 2/2 [00:00<00:00,  6.50it/s]

NER extraction:
{'Age': [['15'], ['25']]}





In [19]:
sanitized_sentences, _ = sanitizer.encrypt(sentences, entity='Age', epsilon=1, use_mdp=True, use_fpe=False)
print("Sanitized sentences:")
print(sanitized_sentences)

100%|██████████| 2/2 [00:00<00:00,  6.46it/s]

Sanitized sentences:
['Ben turned 16 years old today.', 'I am 22 years old.']





In [20]:
desanitized_sentences = sanitizer.decrypt(sanitized_sentences, entity='Age')
print("Desanitized sentences:")
print(desanitized_sentences)

100%|██████████| 2/2 [00:00<00:00,  6.59it/s]

Desanitized sentences:
['Ben turned 15 years old today.', 'I am 25 years old.']



