In [None]:
# Generate fake PII data using the Presidio Sentence Faker
# Encrypt / Decrypt Functions

import os
import base64
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
from Crypto.Hash import SHA256

def generate_key(key_string):
    """Turn a passphrase into the key bytes used by ``encrypt``/``decrypt``.

    Args:
        key_string: Passphrase text; it is encoded with ``str.encode()``
            (UTF-8 by default) before hashing.

    Returns:
        The raw SHA-256 digest of the encoded passphrase (32 bytes).
    """
    return SHA256.new(key_string.encode()).digest()

def encrypt(plaintext, key):
    """Encrypt ``plaintext`` with AES in CBC mode.

    Args:
        plaintext: Text to protect; encoded with ``str.encode()`` before padding.
        key: AES key bytes (see ``generate_key``).

    Returns:
        A base64 text token whose decoded form is the cipher's IV followed by
        the ciphertext, so ``decrypt`` can recover the IV from the token itself.
    """
    cipher = AES.new(key, AES.MODE_CBC)
    # CBC works on whole blocks, so the encoded text is padded to the block size first.
    body = cipher.encrypt(pad(plaintext.encode(), AES.block_size))
    token = base64.b64encode(cipher.iv + body)
    return token.decode('utf-8')

def decrypt(ciphertext, key):
    """Reverse ``encrypt``: recover the original text from a base64 token.

    Args:
        ciphertext: Base64 token laid out as IV followed by the AES-CBC ciphertext.
        key: The same AES key bytes that were used for encryption.

    Returns:
        The decrypted text, with padding removed and decoded as UTF-8.
    """
    raw = base64.b64decode(ciphertext)
    # The first block of the decoded token is the IV; everything after it is ciphertext.
    iv, body = raw[:AES.block_size], raw[AES.block_size:]
    padded = AES.new(key, AES.MODE_CBC, iv).decrypt(body)
    return unpad(padded, AES.block_size).decode('utf-8')

# Example usage

# The passphrase can be supplied through the PII_DEMO_KEY environment variable so that a
# real secret never has to be committed with the notebook. The fallback is the demo
# passphrase the sample ciphertext below was produced with; do not reuse it for real data.
key_string = os.environ.get("PII_DEMO_KEY", "marveluniverse")
eKey = generate_key(key_string)

# Sample token (base64 of IV + ciphertext) to decrypt.
encrypted = "ehpEhRFFLp3kgaLtyHWIjRGqp4wfaM80rC/wjFQJ0lQmnUFZ6rXQU5lW5uCjK0jwutAV0sbPrX3e7EtoX/p+4Jnl/QhYd24XbUXAhDF+uWwwqjYTTLsbEWI7XbFDdCkYuARO2aYt3hNzvovSjNZb11KUpQazEjm1VpDmrhxPFN1uO8CPZIUDEBd+zw9V2SxiYcIkc19EuCgza6zOJUZnQA2GblwahnsL6flPBXDLVmhFTIDVSThnRzWiZKGR2Ru8G7NKI7PyZnddrunc78hp79o/LXzjGtBS3yq7m2zo8lbqM46ihqcVGYxBlH9X8tkTE8DI8SK2EuU9NrDm0KeDz4dlwcfRSTh1I63HD/KxQnTt1yuTf5Hees+0HPFgtQtRFIMEdfTukJXMi/TXCnnawzHBHTcmWZaZexi8Zavj6Ov9THc1MGJhWRq2LlfsokQn"

# To produce a fresh token instead of using the sample above:
#   encrypted = encrypt("This is my new test string for testing decryption", eKey)
#   print("Encrypted:", encrypted)

# Decrypt
decrypted = decrypt(encrypted, eKey)
print("Decrypted:", decrypted)


## Define Templates

In [None]:
# Paragraph-length templates handed to PresidioSentenceFaker; every `{{ENTITY}}`
# placeholder marks a spot where a generated value is substituted.
# Each list item is ONE template. Where a template is split over several source
# lines, adjacent string literals are used so Python concatenates them (a raw
# line break inside a "..." literal is a syntax error).
sentence_templates = [
    "During the comprehensive health assessment conducted on {{DATE_TIME}}, {{PERSON}}, a {{AGE}}-year-old {{GENDER}} with a history of {{MEDICAL_CONDITION}}, reported experiencing persistent {{SYMPTOM}} that required further investigation through {{MEDICAL_PROCEDURE}}, while the attending physician reviewed the patient's contact information including phone {{PHONE_NUMBER}} and {{EMAIL_ADDRESS}} for follow-up communications. \n Following the diagnostic evaluation, the medical team prescribed {{DRUG}} at a dosage of {{DOSAGE}} to be taken {{DRUG_FREQUENCY}}, along with scheduling a follow-up {{MEDICAL_PROCEDURE}} at our {{LOCATION}} facility on {{DATE_TIME}} to monitor the patient's response to the treatment plan. Patient {{PERSON}} ({{AGE}} years old, {{GENDER}}) completed their registration on {{DATE_TIME}}. Contact: {{EMAIL_ADDRESS}}. SSN: {{US_SSN}}. Billing on file ends with {{CREDIT_CARD}}. The detailed invoice generated on {{DATE_TIME}} for services rendered to {{PERSON}} includes charges for the recent {{MEDICAL_PROCEDURE}} and {{MEDICAL_PROCEDURE}}, with payment authorized through credit card {{CREDIT_CARD}} and a portion of the amount being directly billed to the patient's bank account {{BANK_NUMBER}}, while the remaining balance was covered by an international wire transfer to IBAN {{IBAN_CODE}}. The medical report dated {{DATE_TIME}} for {{PERSON}} outlines the successful completion of {{MEDICAL_PROCEDURE}} performed at our {{LOCATION}} facility, with prescribed medication including {{DRUG}} at {{DOSAGE}} to be taken {{DRUG_FREQUENCY}}, while the patient's identification documents including US passport {{US_PASSPORT}} and driver's license {{US_DRIVER_LICENSE}} were verified prior to the procedure to ensure accurate medical record-keeping and billing processes. "
    "\n During the follow-up consultation on {{DATE_TIME}}, {{PERSON}}, a {{AGE}}-year-old {{GENDER}} being treated for {{MEDICAL_CONDITION}}, reported improved {{SYMPTOM}} following the prescribed regimen of {{DRUG}} at {{DOSAGE}} taken {{DRUG_FREQUENCY}}, while the most recent {{MEDICAL_PROCEDURE}} results showed positive response to treatment, and the patient's contact information including phone {{PHONE_NUMBER}} was updated in our system for future communications.",

    "The multidisciplinary treatment plan developed for {{PERSON}} on {{DATE_TIME}} includes ongoing management of {{MEDICAL_CONDITION}} through regular {{MEDICAL_PROCEDURE}} monitoring and administration of {{DRUG}} at {{DOSAGE}} to be taken {{DRUG_FREQUENCY}}, with scheduled {{MEDICAL_PROCEDURE}} to be performed at our {{LOCATION}} facility, while all financial transactions are processed through the patient's credit card {{CREDIT_CARD}} and verified bank account {{BANK_NUMBER}} for seamless billing and record-keeping purposes. \n During the registration process at {{ORGANIZATION}} on {{DATE_TIME}}, {{PERSON}} provided multiple forms of identification including US passport number {{US_PASSPORT}}, state driver's license {{US_DRIVER_LICENSE}}, and social security number {{US_SSN}}, all of which were verified through our secure authentication system before proceeding with the scheduled {{MEDICAL_PROCEDURE}}. \n The electronic health record for {{PERSON}}, a {{AGE}}-year-old {{GENDER}}, indicates a diagnosis of {{MEDICAL_CONDITION}} with recent {{MEDICAL_PROCEDURE}} results showing significant improvement since the initiation of {{DRUG}} therapy at {{DOSAGE}} administered {{DRUG_FREQUENCY}}, while the patient's contact details including phone {{PHONE_NUMBER}} and {{EMAIL_ADDRESS}} were confirmed during the last visit on {{DATE_TIME}} at our {{LOCATION}} medical center. \n The dosage of {{DRUG}} for {{PERSON}} is set to {{DOSAGE}}, to be taken daily. He should visit the hospital next on {{DATE_TIME}}. \n The laboratory results for {{PERSON}}'s {{MEDICAL_PROCEDURE}} conducted on {{DATE_TIME}} at {{LOCATION}} indicate {{MEDICAL_CONDITION}}, and Dr. {{PERSON}} has prescribed {{DRUG}} {{DOSAGE}} to be taken {{DRUG_FREQUENCY}} with follow-up scheduled for {{DATE_TIME}}.",

    "The insurance claim for {{PERSON}} was processed on {{DATE_TIME}} using credit card ending in {{CREDIT_CARD}}, with the total amount being charged to the account linked to bank number {{BANK_NUMBER}}, while the international payment of $1,500.00 was successfully transferred to IBAN {{IBAN_CODE}} for the medical services rendered at our {{LOCATION}} clinic. \n New payee {{PERSON}} ({{AGE}}) added on {{DATE_TIME}}. Verification code sent to {{EMAIL_ADDRESS}}. Account linked to {{CREDIT_CARD}}. Mailing address: {{LOCATION}}. For security, please verify SSN: {{US_SSN}}. Manage payees at {{URL}}. \n Claim submitted by {{PERSON}} on {{DATE_TIME}} for {{MEDICAL_PROCEDURE}} performed at {{LOCATION}} has been processed, and payment of $1,250.00 has been issued to account {{BANK_NUMBER}}. \n Account update processed for {{PERSON}} on {{DATE_TIME}}. New billing address updated to {{LOCATION}}, and confirmation has been sent to {{EMAIL_ADDRESS}} and {{PHONE_NUMBER}}. \n Insurance policy issued to {{PERSON}} (DOB: {{DATE_TIME}}, {{GENDER}}) on {{DATE_TIME}} with monthly premium of $125.00 to be automatically debited from account {{BANK_NUMBER}}. Policy documents have been emailed to {{EMAIL_ADDRESS}}.",

    "Offer extended to {{PERSON}} ({{AGE}}, {{GENDER}}) on {{DATE_TIME}}. Employment verification completed via SSN {{US_SSN}}. Offer letter sent to {{EMAIL_ADDRESS}}. Office location: {{LOCATION}}. Sign documents at {{URL}}. Direct deposit form requires bank details and last 4 of SSN. \n Emergency contact for {{PERSON}} is {{PHONE_NUMBER}} and email is {{EMAIL_ADDRESS}}. \n New hire onboarding for {{PERSON}} (SSN: {{US_SSN}}, DOB: {{DATE_TIME}}, {{GENDER}}) was completed on {{DATE_TIME}}. Direct deposit has been set up with account {{BANK_NUMBER}}, and company email {{EMAIL_ADDRESS}} has been activated. \n Background check authorization form submitted by {{PERSON}} (SSN: {{US_SSN}}) on {{DATE_TIME}} for position at {{ORGANIZATION}}. Results will be sent to HR at {{EMAIL_ADDRESS}} with a copy to the candidate. \n Background check initiated for {{PERSON}} ({{AGE}}, {{GENDER}}) on {{DATE_TIME}}. SSN: {{US_SSN}}. Results will be sent to {{EMAIL_ADDRESS}}. Current residence: {{LOCATION}}. Payment processed with {{CREDIT_CARD}}. Access report at {{URL}}.",
]





In [None]:
import datetime
import pprint
from collections import Counter
from pathlib import Path
from typing import Dict, List

import pandas as pd
import numpy as np

from presidio_evaluator import InputSample
from presidio_evaluator.data_generator import PresidioSentenceFaker
from presidio_evaluator.data_generator.faker_extensions.providers import *

# Bare references to the provider classes this notebook relies on. Evaluating a name has
# no effect beyond raising NameError if it is missing; presumably they are listed to
# document what the wildcard import above is expected to supply — TODO confirm.
IpAddressProvider  # Both IPv4 and IPv6 IP addresses
NationalityProvider  # Read countries + nationalities from file
OrganizationProvider  # Read organization names from file
UsDriverLicenseProvider  # Read US driver license numbers from file
AgeProvider  # Age values (unavailable on Faker)
AddressProviderNew  # Extend the default address formats
PhoneNumberProviderNew  # Extend the default phone number formats
ReligionProvider  # Read religions from file
HospitalProvider  # Read hospital names from file
MedicalProvider  # Read medical entities from file
UsPassportProvider

from presidio_evaluator.data_generator.presidio_sentence_faker import PresidioSentenceFaker


# Generation settings
number_of_samples = 10
lower_case_ratio = 0.05
# NOTE(review): `locale` is not passed to the faker below, which is pinned to "en_US" —
# confirm which value is intended before wiring it in.
locale = "en"
cur_time = datetime.date.today().strftime("%B_%d_%Y")

output_file = f"../data/generated_TEXT_date_{cur_time}.json"

# Build the faker once, driven by the settings above rather than repeated literals.
sentence_faker = PresidioSentenceFaker(
    "en_US", lower_case_ratio=lower_case_ratio, sentence_templates=sentence_templates
)

# Create entity aliases (e.g. if your provider supports "name" but templates contain "person").
provider_aliases = PresidioSentenceFaker.PROVIDER_ALIASES
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
pprint.pprint(provider_aliases)

fake_records = sentence_faker.generate_new_fake_sentences(num_samples=number_of_samples)
# This name used to hold a separate, never-used batch; it stays bound in case another
# cell still refers to it, without paying for a second generation pass.
fake_sentence_results = fake_records
pprint.pprint(fake_records[0])

print(f"Generated records: {len(fake_records)}")

#### Which entities did we generate?
count_per_entity = Counter(
    span.entity_type for record in fake_records for span in record.spans
)
pprint.pprint(count_per_entity)

#### Save as json
# Make sure the target directory exists so the write does not depend on a manual mkdir.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
InputSample.to_json(dataset=fake_records, output_file=output_file)
output_file

In [None]:
# Lookup table from the per-template entity aliases (e.g. "PERSON1") to the
# normalized entity type recorded on each span.
ENTITY_MAPPING = dict(
    # People
    PERSON1="PERSON",
    PERSON2="PERSON",
    PERSON3="PERSON",
    MGR1="PERSON",
    MGR2="PERSON",

    # Dates
    DATE1="DATE_TIME",
    DATE2="DATE_TIME",
    DOB1="DATE_TIME",
    HIRE_DATE1="DATE_TIME",

    # Contact details
    EMAIL1="EMAIL_ADDRESS",
    PHONE1="PHONE_NUMBER",
    ADDRESS1="LOCATION",

    # Money and quantities
    AMOUNT1="AMOUNT",
    SALARY1="AMOUNT",
    BONUS1="PERCENTAGE",
    STOCK_OPTIONS1="NUMBER",

    # Document identifiers
    MRN1="MEDICAL_RECORD_NUMBER",
    TX_ID1="TRANSACTION_ID",
    EMP_ID1="EMPLOYEE_ID",
    REF1="REFERENCE_NUMBER",

    # Everything else
    AGE1="AGE",
    GENDER1="GENDER",
    WEEK1="DURATION",
    CURRENCY1="CURRENCY",
    IBAN1="IBAN_CODE",
    ACCOUNT1="BANK_NUMBER",
    SENDER_ACCOUNT1="BANK_NUMBER",
    STATUS1="STATUS",
    TITLE1="JOB_TITLE",
    DEPT1="DEPARTMENT",
    EMP_TYPE1="EMPLOYMENT_TYPE",
    TX_TYPE1="TRANSACTION_TYPE",
    DESC1="DESCRIPTION",
    SYMPTOM1="SYMPTOM",
    DRUG1="MEDICATION",
    DOSAGE1="DOSAGE",
    FREQUENCY1="DRUG_FREQUENCY",
    RESPONSE1="RESPONSE",
    SENDER_NAME1="ORGANIZATION",
    RECEIVER_NAME1="ORGANIZATION",
)

# Entity names that are rendered into the text but never emitted as spans.
IGNORE_ENTITIES = {
    # Template aliases treated as non-PII
    "CURRENCY1",  # common values, not typically PII
    "WEEK1",      # a duration
    "RESPONSE1",  # generic treatment response
    "STATUS1",    # status values
    "DESC1",      # free-text description
    "TX_TYPE1",   # transaction type
    "EMP_TYPE1",  # employment type
    "DEPT1",      # department name
    "TITLE1",     # job title

    # Amounts, counters and record identifiers, under both long and short names
    "REFERENCE_NUMBER",
    "AMOUNT", "AMOUNT1",
    "TRANSACTION_ID",
    "BONUS", "BONUS1",
    "STOCK_OPTIONS", "STOCK_OPTIONS1",
    "EMPLOYEE_ID", "EMPLOYEE_ID1",
    "MEDICAL_RECORD_NUMBER", "MEDICAL_RECORD_NUMBER1",
    "TRANSACTION_ID1",
    "REF", "REF1",
    "MRN", "MRN1",
    "TX_ID", "TX_ID1",
    "EMP_ID", "EMP_ID1",
    "SALARY", "SALARY1",
}

def normalize_entity(entity_name: str) -> str:
    """Resolve a template entity name to the label used on spans.

    Resolution order:
      1. an exact key of ``ENTITY_MAPPING`` returns its mapped value;
      2. otherwise trailing digits are removed and, if what remains is itself one
         of the mapping's *values* (e.g. ``PERSON7`` -> ``PERSON``), that is returned;
      3. otherwise the name is returned unchanged. Note this includes aliases such
         as ``PHONE2``, because ``PHONE`` is not a normalized label.
    """
    try:
        return ENTITY_MAPPING[entity_name]
    except KeyError:
        pass

    stripped = re.sub(r'\d+$', '', entity_name)
    return stripped if stripped in ENTITY_MAPPING.values() else entity_name

def _render_with_spans(layout: str, entities: Dict) -> Tuple[str, List[Dict]]:
    """Fill the ``{NAME}`` placeholders of ``layout`` and record where each value lands."""
    import string  # local import: only needed here, for placeholder parsing

    pieces = []
    spans = []
    cursor = 0  # number of characters rendered so far
    for literal, field_name, _spec, _conversion in string.Formatter().parse(layout):
        pieces.append(literal)
        cursor += len(literal)
        if field_name is None:
            continue  # trailing literal text with no placeholder after it

        value_str = str(entities[field_name])
        # Ignored entities are still rendered, just not annotated; an empty value
        # has no extent to annotate.
        if value_str and field_name not in IGNORE_ENTITIES:
            spans.append({
                'entity': normalize_entity(field_name),  # Use normalized entity name
                'value': value_str,
                'start': cursor,
                'end': cursor + len(value_str),
                'original_entity': field_name  # Keep original for reference
            })
        pieces.append(value_str)
        cursor += len(value_str)

    return ''.join(pieces), spans


def generate_template(domain: str) -> Tuple[str, List[Dict]]:
    """Generate one filled-in document section for ``domain`` together with its entity spans.

    Args:
        domain: One of ``'medical'``, ``'finance'`` or ``'hr'``.

    Returns:
        ``(text, spans)``. ``spans`` is ordered by start offset; each item is a dict
        with ``entity`` (normalized name), ``value``, ``start``, ``end`` (end-exclusive
        offsets into ``text``) and ``original_entity`` (the template alias). Aliases
        listed in ``IGNORE_ENTITIES`` appear in the text but get no span.

    Raises:
        ValueError: If ``domain`` is not one of the supported domains.

    Spans are recorded at the positions where values are substituted, not found by
    searching the finished text. A value is therefore tagged only where its own
    placeholder put it, never where the same characters happen to occur inside
    another value (an age inside a record number or date, a name inside "Dr. <name>").
    """
    if domain == 'medical':
        entities = {
            'PERSON1': generate_entity("PERSON"),
            'MRN1': f"MRN{random.randint(100000, 999999)}",
            'DOB1': (datetime.now() - timedelta(days=random.randint(18*365, 90*365))).strftime('%Y-%m-%d'),
            'PERSON2': f"Dr. {generate_entity('PERSON')}",
            'DATE1': datetime.now().strftime('%Y-%m-%d'),
            'AGE1': str(generate_entity("AGE")),
            'GENDER1': generate_entity("GENDER"),
            'SYMPTOM1': random.choice(['headache', 'chest pain', 'abdominal pain']),
            'WEEK1': str(random.randint(1, 12)),
            'DRUG1': generate_entity("DRUG"),
            'DOSAGE1': generate_entity("DOSAGE"),
            'FREQUENCY1': generate_entity("DRUG_FREQUENCY"),
            'RESPONSE1': random.choice(['good', 'partial', 'no'])
        }
        layout_lines = [
            "MEDICAL REPORT",
            "Patient: {PERSON1} (MRN: {MRN1})",
            "Date of Birth: {DOB1}",
            "Provider: {PERSON2}",
            "Date: {DATE1}",
            "",
            "CHIEF COMPLAINT:",
            "{PERSON1} is a {AGE1}-year-old {GENDER1} who presents with {SYMPTOM1}.",
            "",
            "HISTORY OF PRESENT ILLNESS:",
            "The patient reports a {WEEK1}-week history of {SYMPTOM1}. ",
            "Previous treatment included {DRUG1} {DOSAGE1} {FREQUENCY1} with {RESPONSE1} response.",
        ]

    elif domain == 'finance':
        entities = {
            'TX_ID1': f"TX{random.randint(100000, 999999)}",
            'DATE1': datetime.now().strftime('%Y-%m-%d'),
            'AMOUNT1': f"{random.uniform(100, 100000):.2f}",
            'CURRENCY1': random.choice(CURRENCIES),
            'SENDER_NAME1': faker.company(),
            'SENDER_ACCOUNT1': f"AC{random.randint(10000000, 99999999)}",
            'RECEIVER_NAME1': faker.company(),
            'IBAN1': faker.iban(),
            'TX_TYPE1': random.choice(['Wire Transfer', 'ACH Payment', 'International Transfer']),
            'DESC1': f"Payment for {faker.bs()} services",
            'REF1': f"INV-{random.randint(1000, 9999)}",
            'STATUS1': random.choice(['Completed', 'Pending', 'Failed'])
        }
        layout_lines = [
            "FINANCIAL TRANSACTION RECORD",
            "Transaction ID: {TX_ID1}",
            "Date: {DATE1}",
            "Amount: {AMOUNT1} {CURRENCY1}",
            "",
            "PARTIES:",
            "Sender: {SENDER_NAME1} (Account: {SENDER_ACCOUNT1})",
            "Receiver: {RECEIVER_NAME1} (IBAN: {IBAN1})",
            "",
            "DETAILS:",
            "Transaction type: {TX_TYPE1}",
            "Description: {DESC1}",
            "Reference: {REF1}",
            "Status: {STATUS1}",
        ]

    elif domain == 'hr':
        entities = {
            'EMP_ID1': f"EMP{random.randint(10000, 99999)}",
            'PERSON1': generate_entity("PERSON"),
            'TITLE1': random.choice(JOB_TITLES),
            'DEPT1': random.choice(DEPARTMENTS),
            'MGR1': generate_entity("PERSON"),
            'EMAIL1': faker.email(),
            'PHONE1': generate_entity("PHONE_NUMBER"),
            'ADDRESS1': generate_entity("LOCATION"),
            'HIRE_DATE1': (datetime.now() - timedelta(days=random.randint(30, 3650))).strftime('%Y-%m-%d'),
            'STATUS1': random.choice(STATUSES),
            'EMP_TYPE1': random.choice(['Full-time', 'Part-time', 'Contractor']),
            'SALARY1': f"${random.randint(50000, 250000):,}",
            'CURRENCY1': 'USD',
            'BONUS1': random.randint(0, 30),
            'STOCK_OPTIONS1': random.randint(0, 10000)
        }
        layout_lines = [
            "EMPLOYEE RECORD",
            "Employee ID: {EMP_ID1}",
            "Name: {PERSON1}",
            "Position: {TITLE1}",
            "Department: {DEPT1}",
            "Manager: {MGR1}",
            "",
            "CONTACT INFORMATION:",
            "Email: {EMAIL1}",
            "Phone: {PHONE1}",
            "Address: {ADDRESS1}",
            "",
            "EMPLOYMENT DETAILS:",
            "Hire Date: {HIRE_DATE1}",
            "Status: {STATUS1}",
            "Employment Type: {EMP_TYPE1}",
            "",
            "COMPENSATION:",
            "Base Salary: {SALARY1} {CURRENCY1}",
            "Bonus: {BONUS1}%",
            "Stock Options: {STOCK_OPTIONS1}",
        ]

    else:
        # Previously an unknown domain fell through and failed later with an
        # UnboundLocalError; report the actual problem instead.
        raise ValueError(
            f"Unknown domain {domain!r}; expected 'medical', 'finance' or 'hr'"
        )

    # Every line after the first keeps the eight-space indent (and "blank" lines are
    # eight spaces) so the rendered text matches what this function has always produced.
    layout = ("\n" + " " * 8).join(layout_lines)

    # Spans come back in rendering order, which is already ascending by start offset.
    return _render_with_spans(layout, entities)

def generate_document(domain: str,
                      min_length: int = 3000,
                      ignore_entities=None,
                      entity_mapping: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
    """
    Generate a document with entity spans for the specified domain.

    Sections produced by ``generate_template`` are appended, separated by a dashed
    rule, until the text is at least ``min_length`` characters long; the spans of
    each appended section are shifted so their offsets refer to the full text.

    Args:
        domain: The domain to generate data for ('medical', 'finance', or 'hr')
        min_length: Minimum length of the generated text in characters
        ignore_entities: Entity names to exclude from span generation; replaces the
            module-level ``IGNORE_ENTITIES`` for the duration of this call
        entity_mapping: Extra or overriding entries layered on top of the
            module-level ``ENTITY_MAPPING`` for the duration of this call

    Returns:
        Dictionary with ``domain``, ``text``, ``spans``, ``generated_at`` and a
        ``metadata`` entry holding the ignore list and entity mapping that were in
        effect.
    """
    global IGNORE_ENTITIES, ENTITY_MAPPING

    # Remember the module-level objects themselves (not copies) so they can be put
    # back untouched; the overrides below are installed by rebinding only, never by
    # mutating the shared defaults.
    default_ignore = IGNORE_ENTITIES
    default_mapping = ENTITY_MAPPING

    try:
        if ignore_entities is not None:
            IGNORE_ENTITIES = ignore_entities
        if entity_mapping is not None:
            ENTITY_MAPPING = {**default_mapping, **entity_mapping}

        # Generate base template
        text, spans = generate_template(domain)

        section_break = "\n\n" + "-" * 50 + "\n\n"
        # If text is too short, append more sections of the same domain
        while len(text) < min_length:
            more_text, more_spans = generate_template(domain)

            # New spans are relative to their own section; shift them past the
            # existing text and the section break.
            offset = len(text) + len(section_break)
            for span in more_spans:
                span['start'] += offset
                span['end'] += offset

            text += section_break + more_text
            spans.extend(more_spans)

        return {
            'domain': domain,
            'text': text,
            'spans': spans,
            'generated_at': datetime.now().isoformat(),
            'metadata': {
                # Sorted so saved output is stable; snapshot copies so the returned
                # document does not alias module state.
                'ignored_entities': sorted(IGNORE_ENTITIES, key=str),
                'entity_mapping': dict(ENTITY_MAPPING)
            }
        }
    finally:
        # Restore the module-level defaults, even if generation failed
        IGNORE_ENTITIES = default_ignore
        ENTITY_MAPPING = default_mapping

# Example usage
if __name__ == "__main__":
    # Sample overrides showing the shape of the optional arguments. They are defined
    # for reference only: the call below leaves both arguments at their defaults.
    custom_ignore = {
        'CURRENCY1',  # Don't tag currency values
        'AMOUNT1',    # Don't tag amounts
        'STATUS1'     # Don't tag status values
    }

    custom_mapping = {
        'PERSON1': 'PATIENT',  # In medical context
        'PERSON2': 'DOCTOR',   # In medical context
        'MGR1': 'MANAGER'      # In HR context
    }

    # One document per domain: generate, summarize, save, preview.
    for domain in DOMAINS:
        print(f"\nGenerating {domain.upper()} document...")

        doc = generate_document(domain, min_length=1000)

        # Summary
        print(f"Document length: {len(doc['text'])} characters")
        print(f"Number of entity spans: {len(doc['spans'])}")

        entity_types = {span['entity'] for span in doc['spans']}
        print(f"Entity types: {', '.join(sorted(entity_types))}")

        # Save to a timestamped file under ./data
        filename = f"data/{domain}_document_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        Path("data").mkdir(exist_ok=True)
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(doc, f, indent=2)
        print(f"Saved to {filename}")

        # Preview the first five spans
        print("\nSample entities:")
        for span in doc['spans'][:5]:
            print(f"  {span['entity']} (was {span['original_entity']}): {span['value']} (position: {span['start']}-{span['end']})")