In [None]:
# Generate fake PII data using the Presidio Sentence Faker
# Encrypt / Decrypt Functions

import os
import base64
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
from Crypto.Hash import SHA256

# Generate a 32-byte key from a string
def generate_key(key_string):
    """Derive a 32-byte AES key from a passphrase.

    The passphrase is encoded with ``str.encode()`` and hashed once with
    SHA-256; the raw digest is returned and used directly as the AES key.

    Args:
        key_string: Passphrase text.

    Returns:
        The SHA-256 digest of the encoded passphrase, as ``bytes``.
    """
    passphrase_bytes = key_string.encode()
    return SHA256.new(passphrase_bytes).digest()

# Encrypts the given plaintext using AES and returns the ciphertext
def encrypt(plaintext, key):
    """Encrypt text with AES in CBC mode and return it base64-encoded.

    A new cipher object is created per call without an explicit IV, and the
    IV it exposes is prepended to the ciphertext so that ``decrypt`` can
    recover it.

    Args:
        plaintext: Text to encrypt.
        key: AES key bytes (for example the output of ``generate_key``).

    Returns:
        Base64 text of ``IV || ciphertext``.
    """
    aes = AES.new(key, AES.MODE_CBC)
    init_vector = aes.iv
    # CBC works on whole blocks, so pad the encoded text to the block size.
    body = aes.encrypt(pad(plaintext.encode(), AES.block_size))
    token = base64.b64encode(init_vector + body)
    return token.decode('utf-8')

# Decrypts the given ciphertext using AES and returns the plaintext
def decrypt(ciphertext, key):
    """Decrypt a base64 token produced by ``encrypt`` and return the text.

    Args:
        ciphertext: Base64 text whose decoded form is ``IV || ciphertext``.
        key: AES key bytes that were used for encryption.

    Returns:
        The decrypted, unpadded payload decoded as UTF-8.
    """
    raw = base64.b64decode(ciphertext)
    # The first block is the IV; everything after it is the encrypted body.
    init_vector, body = raw[:AES.block_size], raw[AES.block_size:]
    aes = AES.new(key, AES.MODE_CBC, init_vector)
    recovered = unpad(aes.decrypt(body), AES.block_size)
    return recovered.decode('utf-8')

# Example usage: derive the module-level key, then decrypt a stored ciphertext
# as a round-trip check.

# NOTE(review): the passphrase is hard-coded in source, so anyone with this file
# can decrypt the generated data. Consider loading it from the environment.
key_string = "marveluniverse"  # Use a secure key
# eKey is read as a module-level global by the document generation code further down.
eKey = generate_key(key_string)

# Example plaintext
#plaintext = "This is my new test string for testing decryption"

# Stored sample token; it is passed to decrypt() below, so it is expected to be
# base64 of IV followed by the AES-CBC body, encrypted under eKey.
encrypted = "ehpEhRFFLp3kgaLtyHWIjRGqp4wfaM80rC/wjFQJ0lQmnUFZ6rXQU5lW5uCjK0jwutAV0sbPrX3e7EtoX/p+4Jnl/QhYd24XbUXAhDF+uWwwqjYTTLsbEWI7XbFDdCkYuARO2aYt3hNzvovSjNZb11KUpQazEjm1VpDmrhxPFN1uO8CPZIUDEBd+zw9V2SxiYcIkc19EuCgza6zOJUZnQA2GblwahnsL6flPBXDLVmhFTIDVSThnRzWiZKGR2Ru8G7NKI7PyZnddrunc78hp79o/LXzjGtBS3yq7m2zo8lbqM46ihqcVGYxBlH9X8tkTE8DI8SK2EuU9NrDm0KeDz4dlwcfRSTh1I63HD/KxQnTt1yuTf5Hees+0HPFgtQtRFIMEdfTukJXMi/TXCnnawzHBHTcmWZaZexi8Zavj6Ov9THc1MGJhWRq2LlfsokQn"
# Encrypt (disabled: re-enable together with the example plaintext above)
#encrypted = encrypt(plaintext, eKey)
#print("Encrypted:", encrypted)

# Decrypt
decrypted = decrypt(encrypted, eKey)
print("Decrypted:", decrypted)


## Define Templates

In [None]:
import json
import random
import re
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Any, Tuple, Optional
from faker import Faker
from presidio_evaluator.data_generator.faker_extensions import (
    MedicalProvider, UsPassportProvider, 
    UsDriverLicenseProvider, OrganizationProvider
)
from presidio_evaluator.data_generator.faker_extensions.providers import *

# The bare names below are no-op expression statements: they only list the
# provider classes expected to be in scope (presumably supplied by the star
# import above -- confirm). Each line raises NameError if its name is missing.
IpAddressProvider  # Both IPv4 and IPv6 IP addresses
NationalityProvider  # Read countries + nationalities from file
OrganizationProvider  # Read organization names from file
UsDriverLicenseProvider  # Read US driver license numbers from file
AgeProvider  # Age values (unavailable on Faker)
AddressProviderNew  # Extend the default address formats
PhoneNumberProviderNew  # Extend the default phone number formats
ReligionProvider  # Read religions from file
HospitalProvider  # Read hospital names from file
MedicalProvider  # Read medical entities from file
UsPassportProvider

# Initialize Faker with custom providers
# Only the four providers added here are registered on this Faker instance;
# the other classes listed above are not added to it in this cell.
faker = Faker('en_US')
faker.add_provider(MedicalProvider)
faker.add_provider(UsPassportProvider)
faker.add_provider(UsDriverLicenseProvider)
faker.add_provider(OrganizationProvider)

# Entity mapping: Maps template entity names to normalized entity types
# Entity mapping: maps template entity names (keys) to normalized entity types
# (values). normalize_entity() first looks a name up here by exact key; when the
# key is absent it strips trailing digits and checks the result against the
# *values* of this table.
ENTITY_MAPPING = {
    # Person-related
    'PERSON1': 'PERSON',
    'PERSON2': 'PERSON',
    'PERSON3': 'PERSON',
    'MGR1': 'PERSON',
    'MGR2': 'PERSON',
    'JUDGE1': 'PERSON',
    'PARTY1': 'PERSON',
    'PARTY2': 'PERSON',
    'JUDGE': 'PERSON',
    
    # Date-related
    'DATE1': 'DATE_TIME',
    'DATE2': 'DATE_TIME',
    'DOB1': 'DATE_TIME',
    'HIRE_DATE': 'DATE_TIME',
    
    # Contact information
    'EMAIL1': 'EMAIL_ADDRESS',
    'PHONE1': 'PHONE_NUMBER',
    'ADDRESS1': 'LOCATION',
    
    # Financial
    'AMOUNT1': 'AMOUNT',
    'SALARY1': 'AMOUNT',
    'BONUS1': 'PERCENTAGE',
    'STOCK_OPTIONS1': 'NUMBER',
    
    # Document IDs
    'MRN1': 'MEDICAL_RECORD_NUMBER',
    'TX_ID1': 'TRANSACTION_ID',
    'EMP_ID1': 'EMPLOYEE_ID',
    'REF1': 'REFERENCE_NUMBER',
    
    # Other
    'AGE1': 'AGE',
    'GENDER1': 'GENDER',
    'WEEK1': 'DURATION',
    'CURRENCY1': 'CURRENCY',
    'IBAN1': 'IBAN_CODE',
    'ACCOUNT1': 'BANK_NUMBER',
    'SENDER_ACCOUNT1': 'BANK_NUMBER',
    'STATUS1': 'STATUS',
    'TITLE1': 'JOB_TITLE',
    'DEPT1': 'DEPARTMENT',
    'EMP_TYPE1': 'EMPLOYMENT_TYPE',
    'TX_TYPE1': 'TRANSACTION_TYPE',
    'DESC1': 'DESCRIPTION',
    'SYMPTOM1': 'SYMPTOM',
    'DRUG1': 'MEDICATION',
    'DOSAGE1': 'DOSAGE',
    'FREQUENCY1': 'DRUG_FREQUENCY',
    'RESPONSE1': 'RESPONSE',
    'SENDER_NAME1': 'ORGANIZATION',
    'RECEIVER_NAME1': 'ORGANIZATION',
    'INSURER': 'ORGANIZATION',
}

# Entities to ignore when generating spans
IGNORE_ENTITIES = {
    # Descriptive fields that are common values rather than PII
    'CURRENCY', 'WEEK', 'RESPONSE', 'STATUS', 'DESC',
    'TX_TYPE', 'EMP_TYPE', 'DEPT', 'TITLE',
    'CASE_TYPE', 'POLICY_TYPE',

    # Monetary and numeric compensation figures
    'AMOUNT', 'AMOUNT1',
    'SALARY', 'SALARY1',
    'BONUS', 'BONUS1',
    'STOCK_OPTIONS', 'STOCK_OPTIONS1',

    # Record identifiers and reference numbers
    'REFERENCE_NUMBER', 'REF', 'REF1',
    'TRANSACTION_ID', 'TRANSACTION_ID1', 'TX_ID', 'TX_ID1',
    'EMPLOYEE_ID', 'EMPLOYEE_ID1', 'EMP_ID', 'EMP_ID1',
    'MEDICAL_RECORD_NUMBER', 'MEDICAL_RECORD_NUMBER1', 'MRN', 'MRN1',
    'RANDOM_ID', 'RANDOM_ID1',
    'CASE_NUMBER', 'CASE_NUMBER1',
    'POLICY_NUMBER', 'POLICY_NUMBER1', 'POLICY_NUMBER2',
}

# Constants
# Domain names accepted by generate_template(); the example driver at the end
# of the file picks one at random for each generated document.
DOMAINS = ['medical', 'finance', 'hr', 'legal', 'insurance']
# Value pools sampled with random.choice() when filling the document templates.
JOB_TITLES = ['Software Engineer', 'Data Scientist', 'HR Manager', 'Financial Analyst', 'Project Manager']
DEPARTMENTS = ['Engineering', 'Finance', 'HR', 'Marketing', 'Operations', 'Sales']
CURRENCIES = ['USD', 'EUR', 'GBP', 'JPY', 'CAD', 'AUD']
STATUSES = ['Active', 'On Leave', 'Terminated', 'Retired']

# Dispatch table: entity type -> zero-argument callable producing one value.
# The lambdas look up ``faker`` only when called, so building the table has no
# side effects.
_ENTITY_GENERATORS = {
    "PERSON": lambda: faker.name(),
    "AGE": lambda: random.randint(1, 100),
    "GENDER": lambda: random.choice(["Male", "Female", "Other"]),
    "PHONE_NUMBER": lambda: faker.phone_number(),
    "EMAIL_ADDRESS": lambda: faker.email(),
    "US_SSN": lambda: faker.ssn(),
    "US_PASSPORT": lambda: faker.passport_number(),
    "US_DRIVER_LICENSE": lambda: faker.us_driver_license(),
    "BANK_NUMBER": lambda: faker.iban(),
    "IBAN_CODE": lambda: faker.iban(),
    "ORGANIZATION": lambda: faker.organization(),
    "CREDIT_CARD": lambda: faker.credit_card_number(),
    # Addresses come back multi-line; flatten them onto a single line.
    "LOCATION": lambda: faker.address().replace('\n', ', '),
    "MEDICAL_CONDITION": lambda: faker.medical_condition(),
    "MEDICAL_PROCEDURE": lambda: faker.medical_procedure(),
    "DRUG": lambda: faker.drug(),
    "DOSAGE": lambda: faker.dosage(),
    "DRUG_FREQUENCY": lambda: faker.drug_frequency(),
}


def generate_entity(entity_type: str) -> Any:
    """Produce one random value for the given entity type.

    Args:
        entity_type: Normalized entity type name such as ``"PERSON"`` or
            ``"LOCATION"``.

    Returns:
        The generated value. ``"AGE"`` yields an ``int`` between 1 and 100;
        an unrecognized type yields the empty string.
    """
    producer = _ENTITY_GENERATORS.get(entity_type)
    if producer is None:
        return ""
    return producer()

def normalize_entity(entity_name: str) -> str:
    """Map a template entity name to its normalized entity type.

    Resolution order:
      1. An exact key in ``ENTITY_MAPPING`` returns the mapped type.
      2. Otherwise trailing digits are stripped, and if the remainder is
         itself one of the mapped types (a value of ``ENTITY_MAPPING``),
         that remainder is returned.
      3. Otherwise the name is returned unchanged.
    """
    try:
        return ENTITY_MAPPING[entity_name]
    except KeyError:
        pass

    # e.g. a numbered variant of an already-normalized type name
    stripped = re.sub(r'\d+$', '', entity_name)
    return stripped if stripped in ENTITY_MAPPING.values() else entity_name

# Matches a ``{KEY}`` placeholder inside a domain template.
_PLACEHOLDER_RE = re.compile(r"\{(\w+)\}")

# Raw document text per domain. ``{KEY}`` placeholders are filled from the
# value dictionaries built by the ``_*_values`` helpers below.
_MEDICAL_TEMPLATE = """
        MEDICAL REPORT
        Patient: {PERSON} (MRN: {RANDOM_ID})
        Date of Birth: {DATE_TIME}
        Provider: {PERSON1}
        Date: {DATE1}
        
        CHIEF COMPLAINT:
        {PERSON} is a {AGE}-year-old {GENDER} who presents with {SYMPTOM}.
        
        HISTORY OF PRESENT ILLNESS:
        The patient reports a {WEEK}-week history of {SYMPTOM}. 
        Previous treatment included {DRUG} {DOSAGE} {FREQUENCY1} with {RESPONSE} response.
        """

_FINANCE_TEMPLATE = """
        FINANCIAL TRANSACTION RECORD
        Transaction ID: {RANDOM_ID}
        Date: {DATE_TIME}
        Amount: {AMOUNT} {CURRENCY}
        
        PARTIES:
        Sender: {ORGANIZATION} (Account: {BANK_NUMBER})
        Receiver: {ORGANIZATION1} (IBAN: {IBAN_CODE})
        
        DETAILS:
        Transaction type: {TX_TYPE}
        Description: {DESC}
        Reference: {RANDOM_ID1}
        Status: {STATUS}

        DETAILED SUMMARY:
        The transaction was processed on {DATE_TIME} and the amount was {AMOUNT} {CURRENCY}. The transaction was made by {ORGANIZATION} and the reference number was {RANDOM_ID1}. The transaction was {STATUS}.
        """

_HR_TEMPLATE = """
        EMPLOYEE RECORD
        Employee ID: {RANDOM_ID}
        Name: {PERSON}
        Position: {TITLE}
        Department: {DEPT}
        Manager: {MGR1}
        
        CONTACT INFORMATION:
        Email: {EMAIL_ADDRESS}
        Phone: {PHONE_NUMBER}
        Address: {LOCATION}
        
        EMPLOYMENT DETAILS:
        Hire Date: {HIRE_DATE}
        Status: {STATUS}
        Employment Type: {EMP_TYPE}
        
        COMPENSATION:
        Base Salary: {SALARY} {CURRENCY}
        Bonus: {BONUS}%
        Stock Options: {STOCK_OPTIONS}

        REMARKS:
        The employee has been with the company for {TENURE_YEARS} years. The employee is a {EMP_TYPE} employee. The employee was promoted to {TITLE} in {PROMOTION_YEARS} years.
        """

_LEGAL_TEMPLATE = """
        LEGAL CASE RECORD
        Case ID: {RANDOM_ID}
        Date: {DATE_TIME}
        Case Type: {CASE_TYPE}
        Judge: {JUDGE}

        INVOLVED PARTIES:
        Party 1: {PARTY1}
        Party 2: {PARTY2}
        Location: {LOCATION}
        Status: {STATUS}
        Case Number: {CASE_NUMBER}
        Case Number 1: {CASE_NUMBER1}
        
        REMARKS:
        The case was filed on {DATE_TIME} and is currently {STATUS}. The case involves {CASE_TYPE} between {PARTY1} and {PARTY2}. The case was assigned to Judge {JUDGE} and is located in {LOCATION}. The case number is {CASE_NUMBER} and the case number 1 is {CASE_NUMBER1}.
        """

_INSURANCE_TEMPLATE = """
        INSURANCE POLICY RECORD
        Policy ID: {RANDOM_ID}
        Date: {DATE_TIME}
        Policy Type: {POLICY_TYPE}
        Insurer: {INSURER}
        Policy Number: {POLICY_NUMBER}
        Policy Number 1: {POLICY_NUMBER1}
        Policy Number 2: {POLICY_NUMBER2}
        
        REMARKS:
        The policy was filed on {DATE_TIME} and is currently {STATUS}. 
        """


def _medical_values() -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Build (entities, fillers) for the medical report template."""
    entities = {
        'PERSON': generate_entity("PERSON"),
        'RANDOM_ID': f"MRN{random.randint(100000, 999999)}",
        # Date of birth: between 18 and 90 years before today
        'DATE_TIME': (datetime.now() - timedelta(days=random.randint(18*365, 90*365))).strftime('%Y-%m-%d'),
        'PERSON1': f"Dr. {generate_entity('PERSON')}",
        'DATE1': datetime.now().strftime('%Y-%m-%d'),
        'AGE': str(generate_entity("AGE")),
        'GENDER': generate_entity("GENDER"),
        'SYMPTOM': random.choice(['headache', 'chest pain', 'abdominal pain']),
        'WEEK': str(random.randint(1, 12)),
        'DRUG': generate_entity("DRUG"),
        'DOSAGE': generate_entity("DOSAGE"),
        'FREQUENCY1': generate_entity("DRUG_FREQUENCY"),
        'RESPONSE': random.choice(['good', 'partial', 'no']),
    }
    return entities, {}


def _finance_values() -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Build (entities, fillers) for the financial transaction template."""
    entities = {
        'RANDOM_ID': f"TX{random.randint(100000, 999999)}",
        'DATE_TIME': datetime.now().strftime('%Y-%m-%d'),
        'AMOUNT': f"{random.uniform(100, 100000):.2f}",
        'CURRENCY': random.choice(CURRENCIES),
        'ORGANIZATION': faker.company(),   # sender
        'ORGANIZATION1': faker.company(),  # receiver, distinct from the sender
        'BANK_NUMBER': f"AC{random.randint(10000000, 99999999)}",
        # Go through generate_entity so the IBAN comes from faker.iban(),
        # the same call used for this type everywhere else in the file.
        'IBAN_CODE': generate_entity("IBAN_CODE"),
        'TX_TYPE': random.choice(['Wire Transfer', 'ACH Payment', 'International Transfer']),
        'DESC': f"Payment for {faker.bs()} services",
        'RANDOM_ID1': f"INV-{random.randint(1000, 9999)}",
        'STATUS': random.choice(['Completed', 'Pending', 'Failed']),
    }
    return entities, {}


def _hr_values() -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Build (entities, fillers) for the employee record template."""
    entities = {
        'RANDOM_ID': f"EMP{random.randint(10000, 99999)}",
        'PERSON': generate_entity("PERSON"),
        'MGR1': generate_entity("PERSON"),  # manager is a different person
        'TITLE': random.choice(JOB_TITLES),
        'DEPT': random.choice(DEPARTMENTS),
        'EMAIL_ADDRESS': faker.email(),
        'PHONE_NUMBER': generate_entity("PHONE_NUMBER"),
        'LOCATION': generate_entity("LOCATION"),
        'HIRE_DATE': (datetime.now() - timedelta(days=random.randint(30, 3650))).strftime('%Y-%m-%d'),
        'STATUS': random.choice(STATUSES),
        'EMP_TYPE': random.choice(['Full-time', 'Part-time', 'Contractor']),
        'SALARY': f"${random.randint(50000, 250000):,}",
        'CURRENCY': random.choice(['USD', 'EUR', 'GBP', 'JPY', 'CAD', 'AUD', 'INR', 'RMB']),
        'BONUS': random.randint(0, 30),
        'STOCK_OPTIONS': random.randint(0, 10000),
    }
    # Free-text numbers in the remarks; inserted into the text but never labelled.
    fillers = {
        'TENURE_YEARS': random.randint(1, 10),
        'PROMOTION_YEARS': random.randint(1, 10),
    }
    return entities, fillers


def _legal_values() -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Build (entities, fillers) for the legal case template."""
    entities = {
        'RANDOM_ID': f"CASE{random.randint(10000, 99999)}",
        'DATE_TIME': datetime.now().strftime('%Y-%m-%d'),
        'CASE_TYPE': random.choice(['Criminal', 'Civil', 'Family', 'Business', 'Intellectual Property']),
        'JUDGE': faker.name(),
        'PARTY1': faker.name(),
        'PARTY2': faker.name(),
        'LOCATION': generate_entity("LOCATION"),
        'STATUS': random.choice(['Pending', 'In Progress', 'Completed', 'Dismissed', 'Settled']),
        'CASE_NUMBER': f"CASE-{random.randint(1000, 9999)}",
        'CASE_NUMBER1': f"CASE-{random.randint(1000, 9999)}",
    }
    return entities, {}


def _insurance_values() -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Build (entities, fillers) for the insurance policy template."""
    entities = {
        'RANDOM_ID': f"POL{random.randint(10000, 99999)}",
        'DATE_TIME': datetime.now().strftime('%Y-%m-%d'),
        'POLICY_TYPE': random.choice(['Health', 'Auto', 'Life', 'Property', 'Travel']),
        'INSURER': faker.company(),
        'POLICY_NUMBER': f"POL-{random.randint(1000, 9999)}",
        'POLICY_NUMBER1': f"POL-{random.randint(1000, 9999)}",
        'POLICY_NUMBER2': f"POL-{random.randint(1000, 9999)}",
        'STATUS': random.choice(['Pending', 'In Progress', 'Completed', 'Rejected', 'Settled']),
    }
    return entities, {}


# domain name -> (value builder, raw template)
_DOMAIN_SPECS = {
    'medical': (_medical_values, _MEDICAL_TEMPLATE),
    'finance': (_finance_values, _FINANCE_TEMPLATE),
    'hr': (_hr_values, _HR_TEMPLATE),
    'legal': (_legal_values, _LEGAL_TEMPLATE),
    'insurance': (_insurance_values, _INSURANCE_TEMPLATE),
}


def _render_with_spans(template: str,
                       entities: Dict[str, Any],
                       fillers: Dict[str, Any]) -> Tuple[str, List[Dict]]:
    """Fill ``{KEY}`` placeholders and record a span for each labelled value."""
    pieces = []
    spans = []
    cursor = 0    # length of the rendered text so far
    last_end = 0  # end of the previous placeholder in the raw template
    for match in _PLACEHOLDER_RE.finditer(template):
        literal = template[last_end:match.start()]
        pieces.append(literal)
        cursor += len(literal)
        last_end = match.end()

        key = match.group(1)
        if key in fillers:
            value_str = str(fillers[key])
            labelled = False
        else:
            value_str = str(entities[key])
            # IGNORE_ENTITIES is read at call time because generate_document
            # may temporarily replace it.
            labelled = key not in IGNORE_ENTITIES

        # An empty value occupies no characters, so it cannot carry a span.
        if labelled and value_str:
            spans.append({
                'entity_type': normalize_entity(key),
                'entity_value': encrypt(value_str, eKey),
                'start_position': cursor,
                'end_position': cursor + len(value_str),
            })
        pieces.append(value_str)
        cursor += len(value_str)

    pieces.append(template[last_end:])
    return "".join(pieces), spans


def generate_template(domain: str) -> Tuple[str, List[Dict]]:
    """Generate one synthetic document for ``domain`` together with its entity spans.

    Spans are recorded while values are substituted into the template, so each
    span marks a position where an entity was actually inserted. Text that
    merely happens to contain the same characters (for example an age that
    also appears inside a date or record number) is not labelled.

    Args:
        domain: One of ``'medical'``, ``'finance'``, ``'hr'``, ``'legal'`` or
            ``'insurance'``.

    Returns:
        A ``(text, spans)`` tuple. ``text`` is the rendered document with
        leading and trailing whitespace stripped. ``spans`` is a list of dicts
        ordered by ``start_position``, each with the keys ``entity_type``
        (normalized via ``normalize_entity``), ``entity_value`` (the value
        encrypted with the module-level key), ``start_position`` and
        ``end_position`` (offsets into ``text``). Entities whose template name
        is in ``IGNORE_ENTITIES`` get no span.

    Raises:
        ValueError: If ``domain`` is not a supported domain.
    """
    if domain not in _DOMAIN_SPECS:
        supported = ", ".join(sorted(_DOMAIN_SPECS))
        raise ValueError(f"Unsupported domain {domain!r}; expected one of: {supported}")

    build_values, raw_template = _DOMAIN_SPECS[domain]
    entities, fillers = build_values()
    # Every template starts and ends with literal text, so stripping before
    # rendering gives the same result as stripping the rendered document.
    return _render_with_spans(raw_template.strip(), entities, fillers)

def generate_document(domain: str, 
                     min_length: int = 3000,
                     count: int = 1,
                     ignore_entities = None,
                     entity_mapping: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
    """
    Generate an encrypted document with entity spans for the specified domain.

    Templates of the same domain are generated and concatenated, separated by
    a dashed rule, until the plaintext is at least ``min_length`` characters.

    Args:
        domain: The domain to generate data for (any domain accepted by
            ``generate_template``: 'medical', 'finance', 'hr', 'legal' or
            'insurance')
        min_length: Minimum length of the generated plaintext in characters
        count: Identifier stored in the result as ``template_id``
        ignore_entities: Collection of entity names to exclude from span
            generation; replaces the module default for this call only
        entity_mapping: Custom mapping from template entity names to
            normalized names; layered over the module default for this call only

    Returns:
        Dictionary with ``full_text`` (the document encrypted with the
        module-level key), ``template_id`` and ``spans``. Span positions are
        offsets into the plaintext, not into the encrypted ``full_text``.
    """
    global IGNORE_ENTITIES, ENTITY_MAPPING

    # Remember the original objects (not copies) so the module state is put
    # back exactly as it was found.
    saved_ignore = IGNORE_ENTITIES
    saved_mapping = ENTITY_MAPPING

    if ignore_entities is not None:
        IGNORE_ENTITIES = ignore_entities
    if entity_mapping is not None:
        # Build a merged copy instead of updating in place, so the shared
        # default mapping is never mutated by per-call overrides.
        ENTITY_MAPPING = {**ENTITY_MAPPING, **entity_mapping}

    try:
        # Generate base template
        text, spans = generate_template(domain)

        # If text is too short, append more content
        while len(text) < min_length:
            # Add a section break
            text += "\n\n" + "-" * 50 + "\n\n"

            # Generate another template of the same domain
            more_text, more_spans = generate_template(domain)

            # New spans are relative to more_text; shift them to where that
            # text will start inside the combined document.
            offset = len(text)
            for span in more_spans:
                span['start_position'] += offset
                span['end_position'] += offset

            # Append the new content and spans
            text += more_text
            spans.extend(more_spans)

        return {
            'full_text': encrypt(text, eKey),
            'template_id': count,
            'spans': spans,
        }
    finally:
        # Restore the module-level defaults even if generation failed
        IGNORE_ENTITIES = saved_ignore
        ENTITY_MAPPING = saved_mapping

# Example usage
# Example usage: generate a batch of encrypted documents and save them as JSON
if __name__ == "__main__":
    # Example overrides that could be passed to generate_document() as
    # ignore_entities= / entity_mapping=. They are not passed below, so the
    # module defaults apply.
    custom_ignore = {
        'CURRENCY1',  # Don't tag currency values
        'AMOUNT1',    # Don't tag amounts
        'STATUS1'     # Don't tag status values
    }
    
    custom_mapping = {
        # Add or override entity mappings
        'PERSON1': 'PATIENT',  # In medical context
        'PERSON2': 'DOCTOR',   # In medical context
        'MGR1': 'MANAGER'      # In HR context
    }

    number_of_samples = 100

    docList = []
    # Generate the documents, each from a randomly chosen domain
    for i in range(number_of_samples):
        domain = random.choice(DOMAINS)
        print(f"\nGenerating {domain.upper()} document...")
        doc = generate_document(
            domain, 
            min_length=1000,
            count=i,
        )
        docList.append(doc)

    # Collect the distinct entity types that occur across all documents
    entity_types = {
        span['entity_type']
        for doc in docList
        for span in doc['spans']
    }
    
    print(f"Entity types: {', '.join(sorted(entity_types))}")
    
    # Save to file. Create the directory that is actually written to; the
    # previous code created ./data while writing into ../data.
    output_dir = Path("../data")
    output_dir.mkdir(parents=True, exist_ok=True)
    filename = output_dir / f"test_TEXT_document_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(docList, f, indent=2)
    print(f"Saved to {filename}")