In [1]:
# install presidio via pip if not yet installed

#!pip install presidio-analyzer
#!pip install presidio-evaluator

In [2]:
import datetime
import pprint
from collections import Counter
from pathlib import Path
from typing import Dict, List

import pandas as pd
import numpy as np

from presidio_evaluator import InputSample
from presidio_evaluator.data_generator import PresidioSentenceFaker



# Generate fake PII data using the Presidio Sentence Faker

The Presidio Sentence Faker enables you to generate a synthetic dataset from sentence templates.
Example templates:

> I live at {{address}}

> You can email me at {{email}}. Thanks, {{first_name}}

> What's your last name? It's {{last_name}}

> Every time I see you falling I get down on my knees and pray

### Simple example
This uses the default generator to create 25 samples based on the twelve templates defined in `sentence_templates` below

In [3]:
from presidio_evaluator.data_generator.faker_extensions.providers import *

# Each line below only evaluates a provider class name brought in by the star import
# above; nothing is instantiated or registered here. The list documents which
# extension providers are available.
IpAddressProvider  # Both Ipv4 and IPv6 IP addresses
NationalityProvider  # Read countries + nationalities from file
OrganizationProvider  # Read organization names from file
UsDriverLicenseProvider  # Read US driver license numbers from file
AgeProvider  # Age values (unavailable in Faker)
AddressProviderNew  # Extend the default address formats
PhoneNumberProviderNew  # Extend the default phone number formats
ReligionProvider  # Read religions from file
HospitalProvider  # Read hospital names from file
MedicalProvider  # Read medical entities from file
UsPassportProvider  # presumably US passport numbers (used by the {{US_PASSPORT}} templates) - TODO confirm

from presidio_evaluator.data_generator.presidio_sentence_faker import PresidioSentenceFaker

""" sentence_templates = [
    "Patient {{PERSON}} ({{AGE}} years old, {{GENDER}}) completed their registration on {{DATE_TIME}}. Contact: {{EMAIL_ADDRESS}}. SSN: {{US_SSN}}. Billing on file ends with {{CREDIT_CARD}}.",
    "Order confirmed for {{PERSON}} ({{AGE}}) on {{DATE_TIME}}. Receipt sent to {{EMAIL_ADDRESS}}. Shipping to {{LOCATION}}. Payment processed with card {{CREDIT_CARD}}. Order details at {{URL}}.",
    "New account opened for {{PERSON}} ({{AGE}}, {{GENDER}}) on {{DATE_TIME}}. SSN {{US_SSN}}. Primary contact: {{EMAIL_ADDRESS}}. Initial deposit from card {{CREDIT_CARD}}. Nearest branch: {{LOCATION}}."
    "{{PERSON}} ({{AGE}}, {{GENDER}}) had an appointment on {{DATE_TIME}}. Patient portal login sent to {{EMAIL_ADDRESS}}. Insurance verified with SSN {{US_SSN}}. Next visit scheduled at our {{LOCATION}} location. Secure message sent via {{URL}}.",
    "Booking confirmed for {{PERSON}} ({{AGE}}) on {{DATE_TIME}}. E-ticket sent to {{EMAIL_ADDRESS}}. Payment processed with card {{CREDIT_CARD}}. Pickup at {{LOCATION}}. View itinerary at {{URL}}.",
    "Application received from {{PERSON}} ({{AGE}}, {{GENDER}}) on {{DATE_TIME}}. Contact: {{EMAIL_ADDRESS}}. SSN provided for background check: {{US_SSN}}. Current location: {{LOCATION}}. Resume available at {{URL}}.",
    "New {{AGE}}-year-old {{GENDER}} member {{PERSON}} joined on {{DATE_TIME}}. Welcome email sent to {{EMAIL_ADDRESS}}. Membership card will be mailed to {{LOCATION}}. Payment method: {{CREDIT_CARD}}. Access member portal at {{URL}}.",
    "Claim submitted by {{PERSON}} ({{AGE}}, {{GENDER}}) on {{DATE_TIME}}. Policyholder SSN: {{US_SSN}}. Correspondence sent to {{EMAIL_ADDRESS}}. Incident location: {{LOCATION}}. Payment processed to card {{CREDIT_CARD}}.",
    "Subscription activated for {{PERSON}} ({{AGE}}) on {{DATE_TIME}}. Account email: {{EMAIL_ADDRESS}}. Billing: {{CREDIT_CARD}}. Location: {{LOCATION}}. Access your account at {{URL}}. For security, SSN required: {{US_SSN}}.",
    "Application for {{PERSON}} ({{AGE}}, {{GENDER}}) received on {{DATE_TIME}}. SSN verified: {{US_SSN}}. Notifications will be sent to {{EMAIL_ADDRESS}}. Service center: {{LOCATION}}. Check status at {{URL}}.",
    "Alert: Suspicious transaction on {{DATE_TIME}} for card {{CREDIT_CARD}}. Cardholder {{PERSON}} ({{AGE}}) was in {{LOCATION}}. If unauthorized, contact us at {{URL}} or verify at {{EMAIL_ADDRESS}}.",
    "{{PERSON}} ({{AGE}} years, {{GENDER}}), SSN {{US_SSN}}, seen on {{DATE_TIME}}. Records updated and notification sent to {{EMAIL_ADDRESS}}. Next appointment at our {{LOCATION}} office. Secure portal: {{URL}}.",
    "{{PERSON}} ({{AGE}}, {{GENDER}}) registered on {{DATE_TIME}}. Confirmation sent to {{EMAIL_ADDRESS}}. Event location: {{LOCATION}}. Payment method: {{CREDIT_CARD}}. Event details at {{URL}}. Emergency contact SSN: {{US_SSN}}.",
    "Ticket: {{PERSON}} ({{AGE}}) contacted support on {{DATE_TIME}}. Verified email: {{EMAIL_ADDRESS}}. Account verified with SSN: {{US_SSN}}. Billing address: {{LOCATION}}. Reference URL: {{URL}}.",
    "Background check initiated for {{PERSON}} ({{AGE}}, {{GENDER}}) on {{DATE_TIME}}. SSN: {{US_SSN}}. Results will be sent to {{EMAIL_ADDRESS}}. Current residence: {{LOCATION}}. Payment processed with {{CREDIT_CARD}}. Access report at {{URL}}.",
    "Reservation confirmed for {{PERSON}} ({{AGE}}) at {{LOCATION}} on {{DATE_TIME}}. Confirmation sent to {{EMAIL_ADDRESS}}. Guaranteed by card {{CREDIT_CARD}}. Check-in requires ID and SSN: {{US_SSN}}. View booking at {{URL}}.",
    "{{PERSON}} ({{AGE}}, {{GENDER}}) enrolled on {{DATE_TIME}}. Student ID created using SSN {{US_SSN}}. Campus email: {{EMAIL_ADDRESS}}. Campus location: {{LOCATION}}. Tuition payment method: {{CREDIT_CARD}}. Portal: {{URL}}.",
    "Rental agreement for {{PERSON}} ({{AGE}}) on {{DATE_TIME}}. Driver's license verified with SSN {{US_SSN}}. Pickup: {{LOCATION}}. Confirmation sent to {{EMAIL_ADDRESS}}. Secured with card {{CREDIT_CARD}}. Details at {{URL}}."
    "Offer extended to {{PERSON}} ({{AGE}}, {{GENDER}}) on {{DATE_TIME}}. Employment verification completed via SSN {{US_SSN}}. Offer letter sent to {{EMAIL_ADDRESS}}. Office location: {{LOCATION}}. Sign documents at {{URL}}. Direct deposit form requires bank details and last 4 of SSN.",
    "New payee {{PERSON}} ({{AGE}}) added on {{DATE_TIME}}. Verification code sent to {{EMAIL_ADDRESS}}. Account linked to {{CREDIT_CARD}}. Mailing address: {{LOCATION}}. For security, please verify SSN: {{US_SSN}}. Manage payees at {{URL}}.",
] """

# Twelve sentence templates passed to PresidioSentenceFaker below; every {{NAME}}
# placeholder marks where a generated entity value is inserted. The first ten are long
# single-sentence medical/billing records, the last two are the final two entries of the
# disabled list above.
# NOTE(review): the disabled list above is missing the trailing comma after its 3rd and
# 18th entries, so re-enabling it as-is would silently concatenate neighbouring templates.
sentence_templates = [
    "During the comprehensive health assessment conducted on {{DATE_TIME}}, {{PERSON}}, a {{AGE}}-year-old {{GENDER}} with a history of {{MEDICAL_CONDITION}}, reported experiencing persistent {{SYMPTOM}} that required further investigation through {{MEDICAL_PROCEDURE}}, while the attending physician reviewed the patient's contact information including phone {{PHONE_NUMBER}} and {{EMAIL_ADDRESS}} for follow-up communications.",
    "Following the diagnostic evaluation, the medical team prescribed {{DRUG}} at a dosage of {{DOSAGE}} to be taken {{DRUG_FREQUENCY}}, along with scheduling a follow-up {{MEDICAL_PROCEDURE}} at our {{LOCATION}} facility on {{DATE_TIME}} to monitor the patient's response to the treatment plan.",
    "The insurance claim for {{PERSON}} was processed on {{DATE_TIME}} using credit card ending in {{CREDIT_CARD}}, with the total amount being charged to the account linked to bank number {{BANK_NUMBER}}, while the international payment of $1,500.00 was successfully transferred to IBAN {{IBAN_CODE}} for the medical services rendered at our {{LOCATION}} clinic.",
    "During the registration process at {{ORGANIZATION}} on {{DATE_TIME}}, {{PERSON}} provided multiple forms of identification including US passport number {{US_PASSPORT}}, state driver's license {{US_DRIVER_LICENSE}}, and social security number {{US_SSN}}, all of which were verified through our secure authentication system before proceeding with the scheduled {{MEDICAL_PROCEDURE}}.",
    "The electronic health record for {{PERSON}}, a {{AGE}}-year-old {{GENDER}}, indicates a diagnosis of {{MEDICAL_CONDITION}} with recent {{MEDICAL_PROCEDURE}} results showing significant improvement since the initiation of {{DRUG}} therapy at {{DOSAGE}} administered {{DRUG_FREQUENCY}}, while the patient's contact details including phone {{PHONE_NUMBER}} and {{EMAIL_ADDRESS}} were confirmed during the last visit on {{DATE_TIME}} at our {{LOCATION}} medical center.",
    "The detailed invoice generated on {{DATE_TIME}} for services rendered to {{PERSON}} includes charges for the recent {{MEDICAL_PROCEDURE}} and {{MEDICAL_PROCEDURE}}, with payment authorized through credit card {{CREDIT_CARD}} and a portion of the amount being directly billed to the patient's bank account {{BANK_NUMBER}}, while the remaining balance was covered by an international wire transfer to IBAN {{IBAN_CODE}}.",
    "Upon admission to {{ORGANIZATION}} on {{DATE_TIME}}, {{PERSON}}, a {{AGE}}-year-old {{GENDER}} with a medical history significant for {{MEDICAL_CONDITION}}, underwent an initial evaluation that included {{MEDICAL_PROCEDURE}}, followed by the administration of {{DRUG}} at {{DOSAGE}} to be taken {{DRUG_FREQUENCY}}, while the patient's emergency contact information including phone {{PHONE_NUMBER}} was verified by the nursing staff during the admission process.",
    "The medical report dated {{DATE_TIME}} for {{PERSON}} outlines the successful completion of {{MEDICAL_PROCEDURE}} performed at our {{LOCATION}} facility, with prescribed medication including {{DRUG}} at {{DOSAGE}} to be taken {{DRUG_FREQUENCY}}, while the patient's identification documents including US passport {{US_PASSPORT}} and driver's license {{US_DRIVER_LICENSE}} were verified prior to the procedure to ensure accurate medical record-keeping and billing processes.",
    "During the follow-up consultation on {{DATE_TIME}}, {{PERSON}}, a {{AGE}}-year-old {{GENDER}} being treated for {{MEDICAL_CONDITION}}, reported improved {{SYMPTOM}} following the prescribed regimen of {{DRUG}} at {{DOSAGE}} taken {{DRUG_FREQUENCY}}, while the most recent {{MEDICAL_PROCEDURE}} results showed positive response to treatment, and the patient's contact information including phone {{PHONE_NUMBER}} was updated in our system for future communications.",
    "The multidisciplinary treatment plan developed for {{PERSON}} on {{DATE_TIME}} includes ongoing management of {{MEDICAL_CONDITION}} through regular {{MEDICAL_PROCEDURE}} monitoring and administration of {{DRUG}} at {{DOSAGE}} to be taken {{DRUG_FREQUENCY}}, with scheduled {{MEDICAL_PROCEDURE}} to be performed at our {{LOCATION}} facility, while all financial transactions are processed through the patient's credit card {{CREDIT_CARD}} and verified bank account {{BANK_NUMBER}} for seamless billing and record-keeping purposes.",
    "Offer extended to {{PERSON}} ({{AGE}}, {{GENDER}}) on {{DATE_TIME}}. Employment verification completed via SSN {{US_SSN}}. Offer letter sent to {{EMAIL_ADDRESS}}. Office location: {{LOCATION}}. Sign documents at {{URL}}. Direct deposit form requires bank details and last 4 of SSN.",
    "New payee {{PERSON}} ({{AGE}}) added on {{DATE_TIME}}. Verification code sent to {{EMAIL_ADDRESS}}. Account linked to {{CREDIT_CARD}}. Mailing address: {{LOCATION}}. For security, please verify SSN: {{US_SSN}}. Manage payees at {{URL}}.",
]

# Build a faker over the custom templates above and draw 25 sentences from it
sentence_faker = PresidioSentenceFaker(
    "en_US",
    lower_case_ratio=0.05,
    sentence_templates=sentence_templates,
)
fake_sentence_results = sentence_faker.generate_new_fake_sentences(25)

# Show the masked template text and the labelled spans of the first two samples
for sample_index in (0, 1):
    print(fake_sentence_results[sample_index].masked)
    print(fake_sentence_results[sample_index].spans)

Using default entity providers
Using default entity mapping between the entities in the templates and the ones in the output dataset
Using default provider aliases


Sampling: 100%|██████████| 25/25 [00:00<00:00, 839.20it/s]

Offer extended to {{PERSON}} ({{AGE}}, {{GENDER}}) on {{DATE_TIME}}. Employment verification completed via SSN {{US_SSN}}. Offer letter sent to {{EMAIL_ADDRESS}}. Office location: {{LOCATION}}. Sign documents at {{URL}}. Direct deposit form requires bank details and last 4 of SSN.
[Span(type: URL, value: https://www.zgqyuxoephf.org/download?id=8906, char_span: [220: 264]), Span(type: LOCATION, value: Simpsonmouth, IA, char_span: [184: 200]), Span(type: EMAIL_ADDRESS, value: amandacastro@example.org, char_span: [141: 165]), Span(type: US_SSN, value: 073-45-1795, char_span: [107: 118]), Span(type: DATE_TIME, value: 1985-06-24 18:27:40, char_span: [44: 63]), Span(type: GENDER, value: female, char_span: [33: 39]), Span(type: AGE, value: 9, char_span: [30: 31]), Span(type: PERSON, value: Holly Long, char_span: [18: 28])]
Upon admission to {{ORGANIZATION}} on {{DATE_TIME}}, {{PERSON}}, a {{AGE}}-year-old {{GENDER}} with a medical history significant for {{MEDICAL_CONDITION}}, underwent an in




## Generate a full dataset

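In this example we generate a dataset with multiple entity types and save it in JSON format (the CoNLL-2003 export cells further down are currently commented out).
This reuses the custom `sentence_templates` defined above rather than the default sentence templates included in this package.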

In [4]:
import datetime

# Generation settings
number_of_samples = 50
lower_case_ratio = 0.05
locale = "en"

# Date stamp in the form "August_17_2025", embedded in both output file names
cur_time = f"{datetime.date.today():%B_%d_%Y}"

# The two outputs share one stem and differ only in their extension
_output_stem = f"../data/generated_size_{number_of_samples}_date_{cur_time}"
output_file = _output_stem + ".json"
output_conll = _output_stem + ".tsv"

The `PresidioSentenceFaker` is based on the Faker library. It loads [FakeNameGenerator](https://www.fakenamegenerator.com/) data by default
to extend the set of fake values and creates a `SentenceFaker` 
which returns a fake person record (with multiple values) instead of one value,
allowing dependencies between values belonging to the same fake person
(e.g. name = Michael Smith with the email michael.smith@gmail.com).

`FakeNameGenerator.com_3000.csv` is included in this package and can be sourced from https://www.fakenamegenerator.com/order.php

In [5]:
# Re-create the faker used for the full dataset. The lower-case ratio now comes from the
# settings cell, so changing `lower_case_ratio` there actually takes effect here.
# The locale stays the literal "en_US": the settings cell's `locale` is "en", so
# substituting it would change which faker locale is used.
sentence_faker = PresidioSentenceFaker(
    "en_US",
    lower_case_ratio=lower_case_ratio,
    sentence_templates=sentence_templates,
)

Using default entity providers
Using default entity mapping between the entities in the templates and the ones in the output dataset
Using default provider aliases


In [6]:
# Preview the first rows of the person records held by the faker.
# NOTE(review): `_sentence_faker` is a private attribute, so this lookup may break
# across library versions.
pd.DataFrame(sentence_faker._sentence_faker.records).head()

Unnamed: 0,first_name,last_name,gender,name,person,first_name_male,first_name_female,last_name_male,last_name_female,prefix,prefix_male,prefix_female
0,Andrew,Lara,male,Andrew Lara,Andrew Lara,Andrew,,Lara,,Mr.,Mr.,
1,Donna,Barnes,female,Donna Barnes,Donna Barnes,,Donna,,Barnes,Ms.,,Mrs.
2,Theodore,Hernandez,male,Theodore Hernandez,Theodore Hernandez,Theodore,,Hernandez,,Mr.,Mr.,
3,Andrew,Salinas,male,Andrew Salinas,Andrew Salinas,Andrew,,Salinas,,Mr.,Mr.,
4,Marc,White,male,Marc White,Marc White,Marc,,White,,Mr.,Mr.,


`PresidioSentenceFaker` adds additional providers by default, which are not included in the Faker package.
These can be found in `presidio_evaluator.data_generator.faker_extensions.providers`

It is possible to create providers for additional entity types by extending Faker's `BaseProvider` class, 
and calling `add_provider` on the `PresidioSentenceFaker` instance.
For example:

In [7]:
import random
from faker.providers import BaseProvider


class MarsIdProvider(BaseProvider):
    """Example custom provider that produces a made-up ``mars_id`` value."""

    def mars_id(self):
        """Return a random number from 1 to 50 followed by one letter from A to K, e.g. "25A"."""
        number_part = random.randint(1, 50)
        letter_part = random.choice("ABCDEFGHIJK")
        return str(number_part) + letter_part


# Register the provider on the faker created earlier.
sentence_faker.add_provider(MarsIdProvider)
# Now a new `mars_id` entity can be generated if a template has `mars_id` in it.

In [8]:
from presidio_evaluator.data_generator.faker_extensions.providers import *

# Each line below only evaluates a provider class name brought in by the star import
# above; nothing is instantiated or registered here. The final name is the cell's
# last expression, so it is what the cell displays.
IpAddressProvider  # Both Ipv4 and IPv6 IP addresses
NationalityProvider  # Read countries + nationalities from file
OrganizationProvider  # Read organization names from file
UsDriverLicenseProvider  # Read US driver license numbers from file
AgeProvider  # Age values (unavailable in Faker)
AddressProviderNew  # Extend the default address formats
PhoneNumberProviderNew  # Extend the default phone number formats
ReligionProvider  # Read religions from file
HospitalProvider  # Read hospital names from file
MedicalProvider  # Read medical entities from file




presidio_evaluator.data_generator.faker_extensions.providers.MedicalProvider

`PresidioSentenceFaker.PROVIDER_ALIASES` can be extended to add additional provider aliases for when templates have
a different entity name than what the providers emit.

In [9]:
# Create entity aliases (e.g. if your provider supports "name" but templates contain "person").
# NOTE: this binds the class attribute itself, not a copy, so mutating `provider_aliases`
# in place would also change `PresidioSentenceFaker.PROVIDER_ALIASES`.
provider_aliases = PresidioSentenceFaker.PROVIDER_ALIASES
provider_aliases

# To customize, call `PresidioSentenceFaker(locale="en_US",...,provider_aliases=provider_aliases)`

[('name', 'person'),
 ('credit_card_number', 'credit_card'),
 ('date_of_birth', 'birthday')]

Generate data

In [10]:
import pprint
# Generate `number_of_samples` sentences (the count is set in the settings cell) and
# pretty-print the first one.
fake_records = sentence_faker.generate_new_fake_sentences(num_samples=number_of_samples)
pprint.pprint(fake_records[0])

Sampling: 100%|██████████| 50/50 [00:00<00:00, 1514.70it/s]

Full text: During the follow-up consultation on 1988-08-14 12:18:08, Heather Lyons, a 36-year-old female being treated for Tuberculosis, reported improved Vomiting following the prescribed regimen of Albuterol at 0.9% drops taken twice daily, while the most recent Physical Therapy results showed positive response to treatment, and the patient's contact information including phone +41 42 068 07 30 was updated in our system for future communications.
Spans: [Span(type: PHONE_NUMBER, value: +41 42 068 07 30, char_span: [372: 388]), Span(type: MEDICAL_PROCEDURE, value: Physical Therapy, char_span: [254: 270]), Span(type: DRUG_FREQUENCY, value: twice daily, char_span: [219: 230]), Span(type: DOSAGE, value: 0.9% drops, char_span: [202: 212]), Span(type: DRUG, value: Albuterol, char_span: [189: 198]), Span(type: SYMPTOM, value: Vomiting, char_span: [144: 152]), Span(type: MEDICAL_CONDITION, value: Tuberculosis, char_span: [112: 124]), Span(type: GENDER, value: female, char_span: [87: 93]), Sp




#### Verify randomness of dataset

In [11]:
from collections import Counter
import numpy as np

# Number of generated records that came from each template
count_per_template_id = Counter(sample.template_id for sample in fake_records)
records_per_template = list(count_per_template_id.values())

print(f"Total: {sum(records_per_template)}")
print(f"Avg # of records per template: {np.mean(records_per_template)}")
print(f"Median # of records per template: {np.median(records_per_template)}")
print(f"Std: {np.std(records_per_template)}")

Total: 50
Avg # of records per template: 4.166666666666667
Median # of records per template: 5.0
Std: 1.674979270186815


In [12]:
# Encrypt / Decrypt Functions

import os
import base64
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
from Crypto.Hash import SHA256

def generate_key(key_string):
    """Derive a 32-byte key as the SHA-256 digest of the encoded ``key_string``.

    NOTE(review): this is a single plain hash, not a password-based key derivation
    function.
    """
    return SHA256.new(key_string.encode()).digest()

def encrypt(plaintext, key):
    """Encrypt ``plaintext`` with AES in CBC mode and return base64 text.

    The encoded plaintext is padded to the AES block size before encryption. The IV
    exposed by the cipher object is placed in front of the ciphertext so that
    ``decrypt`` can recover it.
    """
    cipher = AES.new(key, AES.MODE_CBC)
    payload = cipher.iv + cipher.encrypt(pad(plaintext.encode(), AES.block_size))
    return base64.b64encode(payload).decode('utf-8')

def decrypt(ciphertext, key):
    """Reverse ``encrypt``: base64-decode, split off the IV, decrypt and unpad.

    The first ``AES.block_size`` bytes of the decoded input are used as the IV and the
    remainder as the ciphertext. Returns the plaintext decoded as UTF-8.
    """
    raw = base64.b64decode(ciphertext)
    iv, body = raw[:AES.block_size], raw[AES.block_size:]
    decryptor = AES.new(key, AES.MODE_CBC, iv)
    return unpad(decryptor.decrypt(body), AES.block_size).decode('utf-8')

# Example usage

# Read the passphrase from the PII_ENCRYPTION_KEY environment variable so that a real
# secret never has to be saved in the notebook. The literal fallback keeps the demo
# runnable, but it is visible to anyone who can read this file; set the variable for
# any data that must stay confidential.
key_string = os.environ.get("PII_ENCRYPTION_KEY", "marveluniverse")
eKey = generate_key(key_string)

# Example plaintext
plaintext = "This is my new test string for testing decryption"

# Encrypt
encrypted = encrypt(plaintext, eKey)
print("Encrypted:", encrypted)

# Decrypt
decrypted = decrypt(encrypted, eKey)
print("Decrypted:", decrypted)

# Fail loudly if the round trip does not return the original text
assert decrypted == plaintext, "decrypt() did not invert encrypt()"



Encrypted: 3FOxJYRX3sXHUbUZa1QBeg1scwU0CaVjkm2dXjbvgtlP9sKFChjnD6lRMSJ7dlf4jLYPjkeYvTVTaeOZhOX6N/px+M7xsi+UPYn5GP8QdYE=
Decrypted: This is my new test string for testing decryption


In [13]:

# Keep only the first record for each distinct plaintext sentence, encrypting the
# sentence and span values and stripping the fields that should not be exported.
seen = set()
deduped = []
count = 1
for record in fake_records:
    key = record.full_text  # plaintext, captured before it is overwritten just below
    record.full_text = encrypt(key, eKey)
    record.template_id = count  # running 1-based position replaces the template id
    count += 1
    if key in seen:
        continue
    seen.add(key)
    for record_attr in ("masked", "metadata"):
        delattr(record, record_attr)
    for span in record.spans:
        span.entity_value = encrypt(span.entity_value, eKey)
        for span_attr in (
            "normalized_tokens",
            "normalized_start_index",
            "normalized_end_index",
            "token_start",
            "token_end",
        ):
            delattr(span, span_attr)
    deduped.append(record)

print(f"Original records: {len(fake_records)}")
print(f"Deduplicated records: {len(deduped)}")

# From here on `fake_records` refers to the deduplicated, encrypted list
fake_records = deduped

Original records: 50
Deduplicated records: 50


#### Which entities did we generate?

In [14]:
# Tally entity types across every span of every record
count_per_entity = Counter(
    span.entity_type for record in fake_records for span in record.spans
)

count_per_entity

Counter({'MEDICAL_PROCEDURE': 53,
         'DATE_TIME': 50,
         'PERSON': 44,
         'LOCATION': 31,
         'DRUG_FREQUENCY': 28,
         'DOSAGE': 28,
         'DRUG': 28,
         'MEDICAL_CONDITION': 19,
         'AGE': 19,
         'GENDER': 17,
         'CREDIT_CARD': 17,
         'BANK_NUMBER': 15,
         'PHONE_NUMBER': 13,
         'EMAIL_ADDRESS': 13,
         'US_SSN': 11,
         'US_DRIVER_LICENSE': 10,
         'US_PASSPORT': 10,
         'IBAN_CODE': 9,
         'SYMPTOM': 7,
         'URL': 6,
         'ORGANIZATION': 6})

In [15]:
# Show the first ten records. By this point `full_text` and each span value hold the
# base64 ciphertext written by the dedup/encrypt cell, not readable text.
for record in fake_records[:10]:
    print(record)

Full text: 2V2RV00PzeAoijKMrqjJyqGtx7pa9pAaaEdNFkRX1KE3ZDZopPfKokmwdWeYuQ20wqkTGT46xjd8rxc30LuwveOBXUz+xUjmioYdkTLITTic5/ufGmDRK4DDlRF382scW731sgfCieu6sDMr9MekGxqLUcGSFT/Wwe88Rtsiaq1zvU9+0NZ8uEMUXYQhsbrTFCbI1G1xqzzX7xk2VKnfqNrOKZTPDzLS8bEAsCqGF+HOwOyUZBifFzm5M/4t7YM0n9hlxQXkzYhMKlGolQl1ULUlPouOW9zCYCNmVEce2fsYJebPQz5YJD2pkKtxth+gP3oCpO4aCMOTHEA05cJKOPQN6XguchHj+pXz5C0dRksY0aUu/CvzwpbH8fOBky/NlL3w3qRiJ8jNQ/lOehDW7i8iY69Y+8p2InT0kVEFAiEGfEZnU3EFrIQ4BXwy9qVjQh/esMwlNyTNNVpbdjlrzMHEiBt+9NWOLhOC98AhTlPqoXale9tzVLO1stxO/lQDsM7hYtxxRmOg7yF52MLAdmdT7ZtR2Te19ft8OP09U94B0UPH9i6ikj6kMWHeTQxWv0iGRsQC6Yk9Qo89+3Y3X7hZFeCg9+0N6ehf4tRjaek=
Spans: [Span(type: PHONE_NUMBER, value: daU46hXUvwBxMbhYCjnyuy9BqRaeLZDG9ZTa8jcP+BnNR8L/yQtGN73qhHfO1K0v, char_span: [372: 388]), Span(type: MEDICAL_PROCEDURE, value: 9K0zlpLKsN7C0+zXmWb1Ma1aG+PXTHWzZUJF4pLsM/iWbh15jYGzG+4Pl9mIIeCC, char_span: [254: 270]), Span(type: DRUG_FREQUENCY, value: MH74U8aOxBnTNkBdideuprMdweADBqqaMSpaT22q7Ak=, char_span: [219: 230]), Span(ty

#### Save as json

In [16]:
from presidio_evaluator import InputSample
# Write the deduplicated, encrypted records to `output_file` (path built in the settings cell)
InputSample.to_json(dataset=fake_records, output_file=output_file)

In [17]:
output_file

'../data/generated_size_50_date_August_17_2025.json'

#### Create a CONLL like data frame

In [18]:
#conll = InputSample.create_conll_dataset(dataset=fake_records)
#conll.head(10)

In [19]:
#conll.to_csv(output_conll, sep="\t")
#print(f"CoNLL2003 dataset structure output location: {output_conll}")

### Next steps

- Evaluate Presidio using fake data: [Sample](4_Evaluate_Presidio_Analyzer.ipynb)
- Split to train/test/validation while ensuring sentences originating from the same template are all on the same subset: [Sample](3_Split_by_pattern_#.ipynb)
- Conduct a small exploratory data analysis on the generated data: [Sample](2_PII_EDA.ipynb)

#### Copyright notice:


Data generated for evaluation was created using Fake Name Generator.

Fake Name Generator identities by the [Fake Name Generator](https://www.fakenamegenerator.com/) 
are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC.