In [None]:
# Generate fake PII data using the Presidio Sentence Faker
# Encrypt / Decrypt Functions

import os
import base64
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
from Crypto.Hash import SHA256

# Generate a 32-byte key from a string
def generate_key(key_string):
    """Derive an AES key from a passphrase.

    Args:
        key_string: Passphrase text; it is encoded with ``str.encode()``
            before hashing.

    Returns:
        The SHA-256 digest of the encoded passphrase (32 bytes).
    """
    return SHA256.new(key_string.encode()).digest()

# Encrypts the given plaintext using AES and returns the ciphertext
def encrypt(plaintext, key):
    """Encrypt text with AES in CBC mode and return it as a base64 string.

    Args:
        plaintext: Text to encrypt; it is encoded with ``str.encode()`` and
            padded to the AES block size before encryption.
        key: AES key bytes, e.g. the value returned by ``generate_key``.

    Returns:
        A base64 string whose decoded bytes are the IV followed by the
        ciphertext.
    """
    # No IV is passed in, so the cipher object provides one (read via .iv).
    aes = AES.new(key, AES.MODE_CBC)
    padded = pad(plaintext.encode(), AES.block_size)
    # The IV goes in front of the ciphertext so decrypt() can recover it.
    blob = aes.iv + aes.encrypt(padded)
    return base64.b64encode(blob).decode('utf-8')

# Decrypts the given ciphertext using AES and returns the plaintext
def decrypt(ciphertext, key):
    """Reverse ``encrypt``: decode, decrypt and unpad a base64 payload.

    Args:
        ciphertext: Base64 string whose decoded bytes start with one AES
            block holding the IV, followed by the encrypted data.
        key: AES key bytes used for the encryption.

    Returns:
        The decrypted text, decoded as UTF-8.
    """
    raw = base64.b64decode(ciphertext)
    # The first block is the IV; everything after it is the encrypted body.
    iv, body = raw[:AES.block_size], raw[AES.block_size:]
    aes = AES.new(key, AES.MODE_CBC, iv)
    return unpad(aes.decrypt(body), AES.block_size).decode('utf-8')

# Example usage: derive the key, then decrypt a stored sample payload and print it.

# NOTE(review): the passphrase is hard-coded in the notebook; that is only
# acceptable for a demo -- load it from a secret store for real data.
key_string = "marveluniverse"  # Use a secure key
eKey = generate_key(key_string)  # key bytes; also passed to encrypt() in the generation loop

# Example plaintext
#plaintext = "This is my new test string for testing decryption"

# Base64 payload handed to decrypt() below.
# NOTE(review): presumably produced earlier by encrypt() with the key above -- confirm.
encrypted = "ehpEhRFFLp3kgaLtyHWIjRGqp4wfaM80rC/wjFQJ0lQmnUFZ6rXQU5lW5uCjK0jwutAV0sbPrX3e7EtoX/p+4Jnl/QhYd24XbUXAhDF+uWwwqjYTTLsbEWI7XbFDdCkYuARO2aYt3hNzvovSjNZb11KUpQazEjm1VpDmrhxPFN1uO8CPZIUDEBd+zw9V2SxiYcIkc19EuCgza6zOJUZnQA2GblwahnsL6flPBXDLVmhFTIDVSThnRzWiZKGR2Ru8G7NKI7PyZnddrunc78hp79o/LXzjGtBS3yq7m2zo8lbqM46ihqcVGYxBlH9X8tkTE8DI8SK2EuU9NrDm0KeDz4dlwcfRSTh1I63HD/KxQnTt1yuTf5Hees+0HPFgtQtRFIMEdfTukJXMi/TXCnnawzHBHTcmWZaZexi8Zavj6Ov9THc1MGJhWRq2LlfsokQn"
# Encrypt (disabled; uncomment together with the plaintext above to produce a fresh payload)
#encrypted = encrypt(plaintext, eKey)
#print("Encrypted:", encrypted)

# Decrypt
decrypted = decrypt(encrypted, eKey)
print("Decrypted:", decrypted)


In [None]:
import datetime
import pprint
import random
from collections import Counter
from pathlib import Path
from typing import Dict, List

import numpy as np
import pandas as pd

from presidio_evaluator import InputSample
from presidio_evaluator.data_generator import PresidioSentenceFaker
from presidio_evaluator.data_generator.faker_extensions.providers import *

# The bare names below are expression statements with no effect of their own:
# evaluating each one only confirms that the wildcard import above exposed the
# provider (a missing name would raise NameError here).
IpAddressProvider  # Both IPv4 and IPv6 IP addresses
NationalityProvider  # Read countries + nationalities from file
OrganizationProvider  # Read organization names from file
UsDriverLicenseProvider  # Read US driver license numbers from file
AgeProvider  # Age values (unavailable on Faker)
AddressProviderNew  # Extend the default address formats
PhoneNumberProviderNew  # Extend the default phone number formats
ReligionProvider  # Read religions from file
HospitalProvider  # Read hospital names from file
MedicalProvider  # Read medical entities from file
UsPassportProvider  # NOTE(review): presumably US passport numbers -- confirm against the provider

from presidio_evaluator.data_generator.presidio_sentence_faker import PresidioSentenceFaker


# Candidate column headers for each entity type. Every list holds unique names
# so that each header is equally likely to be drawn.
column_names: Dict[str, List[str]] = {
    'PERSON': ["name", "first_name", "last_name", "middle_name", "full_name", "person", "employee"],
    'DATE_TIME': ["dob", "date_of_birth", "date", "birthdate", "date_and_time", "date_time"],
    'EMAIL_ADDRESS': ["email", "email_address", "contact"],
    'PHONE_NUMBER': ["phone", "phone_num", "phone_number", "call", "contact_num"],
    'GENDER': ["gender", "sex", "sexual_orientation"],
    'AGE': ["age", "age_group"],
    'DOSAGE': ["dosage", "dose", "medication"],
    'DRUG': ["drug", "tablets", "pharmaceutical", "capsules", "medicine"],
    'MEDICAL_CONDITION': ["condition", "diagnosis", "illness", "disease"],
    'MEDICAL_PROCEDURE': ["procedure", "surgery", "treatment", "lab_test", "lab_results"],
    'DRUG_FREQUENCY': ["frequency", "recommendation", "consumption"],
    'SYMPTOM': ["symptom", "complaint", "sign"],
    'CREDIT_CARD': ["cc", "cc_num", "card", "card_num", "credit_card", "card_number", "credit_card_num"],
    'US_DRIVER_LICENSE': ["usdl", "us_driver_license", "driver_license", "driver_license_num", "us_driver_license_n"],
    'US_PASSPORT': ["us_passport", "passport", "passport_number", "passport_num", "us_passport_n"],
    'US_SSN': ["ssn", "social_security_number", "social_security_number_n", "ssn_num"],
    'URL': ["url", "domain", "uri", "web", "website"],
    'BANK_NUMBER': ["bank_account", "bank_account_num", "bank_account_number", "bank", "account_num"],
    'IBAN_CODE': ["iban", "iban_code", "iban_number", "international_bank_account"],
    'LOCATION': ["location", "address", "place", "city", "country", "address_full"],
    # "org" used to be listed twice, which doubled its chance of being picked.
    'ORGANIZATION': ["organization", "org", "company", "business", "school", "hospital"],
}


def get_random_column_name_for_entity(entity_type: str) -> str:
    """Pick a random column header for an entity type.

    Args:
        entity_type: Entity label used as a key of ``column_names``,
            e.g. ``"PERSON"``.

    Returns:
        One of the names registered for ``entity_type``; ``"column_name"``
        when the entity type has no entry in ``column_names``; ``"unknown"``
        when its entry is an empty list.
    """
    names = column_names.get(entity_type, ["column_name"])
    return random.choice(names) if names else "unknown"

# Quick check of the lookup for a few entity types (the output varies per run).
for _entity in ('PERSON', "IBAN_CODE", "DRUG_FREQUENCY"):
    print(get_random_column_name_for_entity(_entity))

def generate_sentence(entities: List[str], num_columns: int) -> str:
    """Build a quoted, comma-separated template from randomly drawn entities.

    Args:
        entities: Pool of entries to draw from.
        num_columns: Number of entries to draw (without replacement).

    Returns:
        The drawn entries, each wrapped in single quotes and joined by ", ",
        e.g. ``"'{{PERSON}}', '{{AGE}}'"``.

    Raises:
        ValueError: If ``num_columns`` is larger than ``len(entities)``.
    """
    if len(entities) < num_columns:
        raise ValueError("Number of columns cannot exceed the number of available entities")

    picked = random.sample(entities, num_columns)
    joined = "', '".join(picked)
    return f"'{joined}'"
# Given a list of entities, generate a sentence with those entities based on the number of columns input
import datetime
import pprint

# Generation settings. number_of_samples drives both the number of datasets
# and the number of rows generated per dataset in the loop below.
number_of_samples = 10
lower_case_ratio = 0.05
locale = "en"

# Today's date as <full month name>_<zero-padded day>_<year>, used in the file name.
cur_time = format(datetime.date.today(), "%B_%d_%Y")
output_file = f"../data/generated_size_date_{cur_time}.json"

# Template placeholders that generate_sentence() draws from.
entities_list = [
    "{{PERSON}}",
    "{{EMAIL_ADDRESS}}",
    "{{PHONE_NUMBER}}",
    "{{US_DRIVER_LICENSE}}",
    "{{AGE}}",
    "{{US_PASSPORT}}",
    "{{GENDER}}",
    "{{IBAN_CODE}}",
    "{{CREDIT_CARD}}",
    "{{BANK_NUMBER}}",
    "{{ORGANIZATION}}",
    "{{MEDICAL_CONDITION}}",
    "{{MEDICAL_PROCEDURE}}",
    "{{DOSAGE}}",
    "{{DRUG_FREQUENCY}}",
    "{{US_SSN}}",
    "{{DATE_TIME}}",
    "{{DRUG}}",
    "{{URL}}",
    "{{SYMPTOM}}",
    "{{LOCATION}}",
]
# One entry per generated dataset: the encrypted rows plus column metadata.
fake_records_1 = []

for _ in range(number_of_samples):
    # Every dataset gets its own template: a quoted, comma-separated row made of
    # between 4 and len(entities_list) (inclusive) randomly chosen placeholders.
    sentence = generate_sentence(entities_list, random.randint(4, len(entities_list)))
    sentence_templates = [sentence]

    # A new faker is needed per dataset because the template changes each time.
    # Only one faker is built and only one batch is generated; the earlier
    # throw-away instance, its discarded sentences and the unused DataFrame
    # preview did extra work without affecting the output.
    sentence_faker = PresidioSentenceFaker(
        "en_US", lower_case_ratio=lower_case_ratio, sentence_templates=sentence_templates
    )
    fake_records = sentence_faker.generate_new_fake_sentences(num_samples=number_of_samples)

    # Only the encrypted form of each generated row is stored.
    full_text = [encrypt(record.full_text, eKey) for record in fake_records]

    # Column metadata is taken from the first row only, with its spans walked in
    # reverse order (a reversed copy, so the generated record is left untouched).
    # The positions refer to that row's plaintext, not to the encrypted strings.
    fake_records_1.append({
        "full_text": full_text,
        "spans": [
            {
                "entity_type": span.entity_type,
                "entity_value": get_random_column_name_for_entity(span.entity_type),
                "start_position": span.start_position,
                "end_position": span.end_position
            }
            for span in fake_records[0].spans[::-1]
        ]
    })
print(f"Generated records: {len(fake_records_1)}")
#### Which entities did we generate?
# Tally the entity type of every span across all generated datasets.
count_per_entity = Counter(
    span["entity_type"]
    for record in fake_records_1
    for span in record["spans"]
)

count_per_entity
#### Save as json
from pathlib import Path

from presidio_evaluator import InputSample

# Make sure the target folder exists before writing, in case the writer does
# not create missing directories itself (output_file points into ../data).
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
InputSample.structured_data_to_json(dataset=fake_records_1, output_file=output_file)
output_file