A notebook demonstrating how to anonymise data prior to querying a third party LLM.

Sections:
    1. Anonymization using Presidio (pattern matching and Named Entity Extraction)
    2. Anonymization using Phi-3 (local model with structured generation)

# Presidio Anonymization
  - Direct matching
  - Context matching (spaCy)
  - ...

In [1]:
!pip install openai -q
!pip install "presidio_analyzer[transformers]"
!pip install presidio_anonymizer -q
!oython -m spacy download en_core_web_lg - q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting presidio_analyzer[transformers]
  Downloading presidio_analyzer-2.2.354-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/92.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting tldextract (from presidio_analyzer[transformers])
  Downloading tldextract-5.1.2-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.6/97.6 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting phonenumbers<9.0.0,>=8.12 (from pre

In [3]:
# Sample sentence containing a name, an email address and a date.
simple_text = (
    "Jane's email is jane.doe@example.com "
    "and her birthday is 1992-05-15."
)

In [4]:
from presidio_analyzer import AnalyzerEngine

# Notes:
# This will use a large spaCy model by default: en_core_web_lg

# Set up the engine; this loads the NLP module (spaCy model by default) and the other PII recognizers
analyzer = AnalyzerEngine()



[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [11]:
# Run the analyzer over the sample sentence. No `entities` filter is passed here;
# add e.g. entities=["EMAIL_ADDRESS"] to restrict detection to specific entity types.
simple_analyser_results = analyzer.analyze(text=simple_text, language="en")
print(f"\n\n{simple_analyser_results}")



[type: EMAIL_ADDRESS, start: 16, end: 36, score: 1.0, type: DATE_TIME, start: 57, end: 67, score: 0.95, type: PERSON, start: 0, end: 4, score: 0.85, type: URL, start: 16, end: 23, score: 0.5, type: URL, start: 25, end: 36, score: 0.5]


In [12]:
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import RecognizerResult

# Anonymizer engine; its anonymize() method is called later with the analyzer results.
engine = AnonymizerEngine()

In [13]:
!pip install faker -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.8 MB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m1.2/1.8 MB[0m [31m17.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [15]:
# Add an operator that generates fake data using the "faker" library
from faker import Faker
from presidio_anonymizer.entities import OperatorConfig

fake = Faker()


def fake_name(x):
    """Return a fake full name to stand in for a detected PERSON entity.

    The argument is required because the custom operator passes a value to
    the callable, but it is not used here. Faker is re-seeded with 42 on
    every call, so repeated calls are expected to produce the same name.
    """
    Faker.seed(42)
    replacement = fake.name()
    return replacement


# Route PERSON entities through the custom operator defined above
operators = dict(PERSON=OperatorConfig("custom", {"lambda": fake_name}))

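Notice below how the name is replaced. The fake name is generated without regard to the original person's gender, so it may be
the wrong gender; this could cause issues for the LLM, because pronouns in the text (e.g. "her") may no longer agree with the name.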

In [17]:
# Invoke the anonymize function with the text, the analyzer results and the
# operators to get the anonymization output:
simple_anon_result = engine.anonymize(
    text=simple_text,
    analyzer_results=simple_analyser_results,
    operators=operators,
)
# Show the de-identified result followed by the original text for comparison
print("De-identified text")
print(f"\n\n{simple_anon_result}")
print("\nOriginal text")
print(simple_text)

De-identified text


text: Allison Hill's email is <EMAIL_ADDRESS> and her birthday is <DATE_TIME>.
items:
[
    {'start': 60, 'end': 71, 'entity_type': 'DATE_TIME', 'text': '<DATE_TIME>', 'operator': 'replace'},
    {'start': 24, 'end': 39, 'entity_type': 'EMAIL_ADDRESS', 'text': '<EMAIL_ADDRESS>', 'operator': 'replace'},
    {'start': 0, 'end': 12, 'entity_type': 'PERSON', 'text': 'Allison Hill', 'operator': 'custom'}
]


Original text
Jane's email is jane.doe@example.com and her birthday is 1992-05-15.


## Reversible Anonymization
  - Presidio analyzer
  - Custom anonymizer (replacement)
  - Processing via third-party endpoint (OpenAI)
  - Custom reversal (replacement back to original)

In [19]:
from presidio_analyzer import AnalyzerEngine
from faker import Faker
from faker.providers import internet, person, date_time
import openai

In [36]:
# Faker instance for the en_US locale, with the internet, person and date_time providers registered
fake = Faker("en_US")
fake.add_provider(internet)
fake.add_provider(person)
fake.add_provider(date_time)

# Presidio analyzer used to detect the PII entities in the text
analyzer = AnalyzerEngine()

def anonymize_text(analyzer_results, text_to_anonymize):
    """Anonymize text using Faker and build a mapping for de-anonymization.

    Only EMAIL_ADDRESS, PERSON and DATE_TIME results are replaced; results of
    any other entity type are ignored and their text is left as-is.

    Replacements are applied at the character spans reported by the analyzer
    rather than by searching for the first textual match, so an earlier
    replacement can never be corrupted by a later one. The same original value
    of a given entity type always receives the same fake value.

    Args:
        analyzer_results: Iterable of results exposing ``entity_type``,
            ``start`` and ``end`` (offsets into ``text_to_anonymize``).
        text_to_anonymize: The original text; it is not modified.

    Returns:
        A ``(updated_text, entity_mapping)`` tuple, where ``entity_mapping``
        maps each fake value to the original value it replaced.

    Raises:
        RuntimeError: If a fake value not already present in the mapping
            could not be generated within a bounded number of attempts.
    """
    # Insertion order matters: fake values are generated in this entity-type order.
    replacement_funcs = {
        "EMAIL_ADDRESS": fake.safe_email,
        "PERSON": fake.name,
        "DATE_TIME": lambda: fake.date_time().strftime('%Y-%m-%d'),
    }
    max_attempts = 100

    results = list(analyzer_results)

    # Keep supported, non-empty, non-overlapping spans. Longer spans win so that
    # a detection nested inside a larger one does not split the larger entity.
    by_length = sorted(
        range(len(results)),
        key=lambda i: results[i].end - results[i].start,
        reverse=True,
    )
    kept = []
    for i in by_length:
        candidate = results[i]
        if candidate.entity_type not in replacement_funcs:
            continue
        if candidate.end <= candidate.start:
            continue
        if all(
            candidate.end <= results[k].start or candidate.start >= results[k].end
            for k in kept
        ):
            kept.append(i)
    kept.sort()  # back to the analyzer's original ordering

    entity_mapping = {}
    fake_for = {}  # (entity_type, original_value) -> fake_value
    spans = []     # (start, end, fake_value)

    for entity_type, replacement_func in replacement_funcs.items():
        for i in kept:
            result = results[i]
            if result.entity_type != entity_type:
                continue
            original_value = text_to_anonymize[result.start:result.end]
            key = (entity_type, original_value)
            if key not in fake_for:
                # A fake value that is already a mapping key would overwrite an
                # earlier entry and make the anonymization irreversible.
                for _ in range(max_attempts):
                    fake_value = replacement_func()
                    if fake_value not in entity_mapping:
                        break
                else:
                    raise RuntimeError(
                        f"Could not generate a unique fake value for {entity_type}"
                    )
                fake_for[key] = fake_value
                entity_mapping[fake_value] = original_value
            spans.append((result.start, result.end, fake_for[key]))

    # Apply from the end of the text backwards so earlier offsets stay valid.
    updated_text = text_to_anonymize
    for start, end, fake_value in sorted(spans, key=lambda s: s[0], reverse=True):
        updated_text = updated_text[:start] + fake_value + updated_text[end:]

    return updated_text, entity_mapping

# Define de-anonymization function using the mapping
def de_anonymize_text(anonymized_text, entity_mapping):
    """Restore original values in ``anonymized_text`` using ``entity_mapping``.

    All fake values are substituted in a single pass, longest first. This way
    a fake value that is a prefix of another fake value cannot pre-empt it,
    and a restored original value is never replaced a second time.

    Only exact occurrences of a fake value are restored; text that has been
    reformatted (e.g. a date written out in words) is left unchanged.

    Args:
        anonymized_text: Text that may contain fake values.
        entity_mapping: Dict mapping each fake value to its original value.

    Returns:
        The text with every exact occurrence of a fake value replaced by the
        corresponding original value.
    """
    import re  # local import so this function does not depend on another cell

    # Empty keys are skipped: an empty alternative would match everywhere.
    fake_values = sorted((f for f in entity_mapping if f), key=len, reverse=True)
    if not fake_values:
        return anonymized_text

    pattern = re.compile("|".join(re.escape(f) for f in fake_values))
    return pattern.sub(lambda match: entity_mapping[match.group(0)], anonymized_text)




In [37]:
# Sample text to anonymize
text = "Jane's email is jane.doe@example.com and her birthday is 1992-05-15."

# Analyze the text
# Note that the supported entities are listed here: https://microsoft.github.io/presidio/supported_entities/
# Entity detection can involve multiple techniques - including regex, spaCy model (for context), checksums (validating credit card numbers)
analyzer_results = analyzer.analyze(
    text=text,
    entities=["EMAIL_ADDRESS", "PERSON", "DATE_TIME"],  # comment out for autodetection (but that requires adjusting the de-anonymization step)
    language="en"
)

# Display the initial text and the analysis results
print(f"Original Text:\n{text}\n")
print(f"Analyzer result:\n{analyzer_results}\n")

# Anonymize the text and display the anonymized text and mapping
anonymized_text, entity_mapping = anonymize_text(analyzer_results, text)
print(f"Anonymized Text:\n{anonymized_text}\n")
print(f"Entity Mapping:\n{entity_mapping}\n")


Original Text:
Jane's email is jane.doe@example.com and her birthday is 1992-05-15.

Analyzer result:
[type: EMAIL_ADDRESS, start: 16, end: 36, score: 1.0, type: DATE_TIME, start: 57, end: 67, score: 0.95, type: PERSON, start: 0, end: 4, score: 0.85]

Anonymized Text:
Sharon James's email is stanleykendra@example.org and her birthday is 1994-09-04.

Entity Mapping:
{'stanleykendra@example.org': 'jane.doe@example.com', 'Sharon James': 'Jane', '1994-09-04': '1992-05-15'}



In [38]:
import getpass
import os

# Prompt for the API key (e.g. in Colab) only when it is not already set, so
# re-running the notebook neither asks again nor overwrites an existing key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Please enter your OpenAI API key: ")


Please enter your OpenAI API key: ··········


In [39]:
from openai import OpenAI

# Generate a rewrite with GPT-3.5 (using the OpenAI API).
# The prompt is built from the anonymized text only.
prompt = anonymized_text + " Re-write that information a little differently."

client = OpenAI()

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": prompt},
    ],
    temperature=0,
)

print(response)
# Extract the generated text from the response. `content` can be None,
# in which case an empty string is used instead of raising on .strip().
generated_text = (response.choices[0].message.content or "").strip()

print(f"\nAnonymized Text Sent to LLM:\n{prompt}\n")
print(f"LLM Response:\n{generated_text}\n")

ChatCompletion(id='chatcmpl-9W18YhdS3I2nyNJGBbTLnWxi7M8Fq', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Email: stanleykendra@example.org\nBirthday: September 4, 1994', role='assistant', function_call=None, tool_calls=None))], created=1717417894, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=18, prompt_tokens=39, total_tokens=57))

Anonymized Text Sent to LLM:
Sharon James's email is stanleykendra@example.org and her birthday is 1994-09-04. Re-write that information a little differently.

LLM Response:
Email: stanleykendra@example.org
Birthday: September 4, 1994



In [41]:
print(f"\nAnonymized Text Sent to LLM:\n{prompt}\n")
print(f"\nLLM Response:\n{generated_text}\n")

# De-anonymize the full LLM response using the entity mapping and display the final result
de_anonymized_text = de_anonymize_text(generated_text, entity_mapping)
print(f"\nDe-anonymized Response:\n{de_anonymized_text}\n")



Anonymized Text Sent to LLM:
Sharon James's email is stanleykendra@example.org and her birthday is 1994-09-04. Re-write that information a little differently.


LLM Response:
Email: stanleykendra@example.org
Birthday: September 4, 1994


De-anonymized Response:
Email: jane.doe@example.com
Birthday: September 4, 1994



In [42]:
# Recap of the whole pipeline: the initial text and the analysis results
print(f"\nOriginal Text:\n{text}\n")
print(f"\nAnalyzer result:\n{analyzer_results}\n")

# The anonymized text and mapping, the LLM exchange, and the de-anonymized response
print(f"\nAnonymized Text:\n{anonymized_text}\n")
print(f"\nEntity Mapping:\n{entity_mapping}\n")
print(f"\nAnonymized Text Sent to LLM:\n{prompt}\n")
print(f"\nLLM Response:\n{generated_text}\n")
print(f"\nDe-anonymized Response:\n{de_anonymized_text}\n")



Original Text:
Jane's email is jane.doe@example.com and her birthday is 1992-05-15.


Analyzer result:
[type: EMAIL_ADDRESS, start: 16, end: 36, score: 1.0, type: DATE_TIME, start: 57, end: 67, score: 0.95, type: PERSON, start: 0, end: 4, score: 0.85]


Anonymized Text:
Sharon James's email is stanleykendra@example.org and her birthday is 1994-09-04.


Entity Mapping:
{'stanleykendra@example.org': 'jane.doe@example.com', 'Sharon James': 'Jane', '1994-09-04': '1992-05-15'}


Anonymized Text Sent to LLM:
Sharon James's email is stanleykendra@example.org and her birthday is 1994-09-04. Re-write that information a little differently.


LLM Response:
Email: stanleykendra@example.org
Birthday: September 4, 1994


De-anonymized Response:
Email: jane.doe@example.com
Birthday: September 4, 1994



# Local LLM based Anonymization
- Phi-3 with structured generation (from Outlines).