In [1]:
from hashlib import sha256
from datetime import datetime, timedelta
import random
import pandas as pd
import pydbgen
from pydbgen import pydbgen
from random_address import real_random_address
from cryptography.fernet import Fernet

In [2]:
addr = real_random_address()

In [3]:
addr

{'address1': '161 Creek Road',
 'address2': '',
 'city': 'Middlebury',
 'state': 'VT',
 'postalCode': '05753',
 'coordinates': {'lat': 44.0025571, 'lng': -73.16035579999999}}

In [4]:
random.sample(range(100), 10)

[66, 44, 19, 12, 26, 81, 49, 64, 43, 30]

Encryption
● Hashing
● Masking
● De Identification
● Anonymization
● PseudoAnonymization

In [5]:
def tokenize(data: str) -> str:
    """ 
    This removes the data from an organization’s internal systems entirely and exchanges it for a randomly generated nonsensitive placeholder.
    The primary differences and benefits of using tokenization vs. encryption is that tokenized data cannot be returned to its original form.
    """
    pass

In [6]:
KEY = Fernet.generate_key()
cipher_suite = Fernet(KEY)

In [7]:
cipher_suite

<cryptography.fernet.Fernet at 0x124944740>

In [8]:
def encrypt(data: str) -> str:
    ssn_bytes = data.encode('utf-8')
    encrypted_ssn = cipher_suite.encrypt(ssn_bytes)
    return encrypted_ssn.decode('utf-8')

encrypted = encrypt("data")
decrypted = cipher_suite.decrypt(encrypted)
print(encrypted, decrypted)

gAAAAABmlsXYKHnjR8l9HmXS2nXjowHrzeMt3NPk9oTeBDy6dk2crSErWM1VYxuK4XkUEX8ALMPVOmx6p26ynr4qRojp16Fl1g== b'data'


In [9]:
def hash(data: str) -> str:
    sha256_hash = sha256()
    sha256_hash.update(data.encode('utf-8'))
    hashed_string = sha256_hash.hexdigest()
    return hashed_string

hash("09/29/1998")

'1b981233042d15d7cd1ea00a84bb5df3bc7a4f53a3498829d8cc41364b0cf6af'

In [10]:
def mask_email(data: str) -> str:
    """ replace the data with random codes """
    email_domain = data.split('@')
    email_domain[0] = hash(email_domain[0]) #TODO: use encryption instead
    anonymized_email = "@".join(email_domain)
    return anonymized_email

mask_email("rob@gmail.com")

'84313ef39b0a979f0608491608870b3f2065f447d73e4373ba75ae2330aa82b5@gmail.com'

In [11]:
'0' * 5

'00000'

In [75]:
def anonymize_id(student_id: int) -> str:
    """ anonymize the data generate string n in a similar format """
    n_digits = len(str(student_id))
    low_bound = int('1' + '0' * (n_digits - 1))
    upper_bound = int('9' * n_digits)
    new_id = random.randint(low_bound, upper_bound)
    
    return new_id

In [30]:
def pseudo_anonymize_address(address: str) -> str:
    """ 
    ADDRESS:
    pseudo_anonymization, the data is still considered (PII); however, it is 
    accomplished by substituting PII values like name, ID number, or date of birth
    with a random code. Pseudo anonymization is reversable where data masking is not 
      
    But there are numerous other methods of pseudonymization, including the use of: 
    - Cryptographic hash techniques, that arbitrarily input strings to fixed 
    length outputs and then apply them directly to the identifier  
    - Random number generators, that create a random number and then assign it to an identifier
    - Message authentication codes, which are keyed-hash functions that require a secret key to generate the 
    pseudonym for each data field 
    - Monotonic counters, that substitute an identifier with a unique, non-repeating value
    - Encryption, that safeguards identifiers as long as the encryption key remains uncompromised  
    """
    return encrypt(address)

In [31]:
def de_identify(data: str) -> str:
    """ 
    EX: 
    replace Robert Campbell with Robert C. 
    replace 3.2 with a range 3.0 - 3.5
    No individual can be individually identified based on their record
    """
    pass

In [32]:
def replace_last_name_with_initial(last_name: str) -> str:
    """ replace last name with only the first letter """
    return last_name[0]

In [33]:
def tokenize_ssn(ssn):
    ssn_int = random.randint(100000000, 999999999)
    ssn_str = str(ssn_int)

    tokenized_ssn = ssn_str[:3] + '-' + ssn_str[3:5] + '-' + ssn_str[5:]
    return tokenized_ssn


def encrypt_tokenize(ssn):
    """ 
    first tokenize the data by replacing ssn with random integers in the correct format
    then encrypt the tokenized SSN
    """
    tokenized = tokenize_ssn(ssn)
    encrypted = encrypt(tokenized)
    return encrypted


In [34]:
def return_same(x):
    return x

In [78]:
col_names = ["first_name", "last_name", "name", "dob", "age", "email", "ssn", "medical_record_number", "student_id"]
agg_map = {
    "name": return_same,
    "first_name": return_same, 
    "last_name": replace_last_name_with_initial, 
    "dob": hash, 
    "age": return_same, 
    "email": mask_email,
    "address": pseudo_anonymize_address,
    "ssn": encrypt_tokenize,
    "medical_record_number": encrypt,
    "student_id": anonymize_id,
    "city": return_same,
    "state": return_same,
    "zip": return_same

}

In [63]:
possible_ids = range(1_000_000, 9_999_999)

def generate_medical_number(x):
    n_digits = 7
    low_bound = int('1' + '0' * (n_digits - 1))
    upper_bound = int('9' * n_digits)
    return str(random.randint(low_bound, upper_bound))


def generate_student_id(x) -> int:
    sample = random.sample(possible_ids, 1)
    return sample[0]


def generate_age(x) -> int:
    return random.randint(18, 70)


def get_location(row):
    addr = real_random_address()
    state = addr["state"]
    address = addr['address1']
    city, zip = None, None

    try: city = addr["city"] 
    except Exception as e: print(e)
    
    try: zip = addr["postalCode"] 
    except Exception as e: print(e)
    
    return pd.Series([address, city, state, zip], index=["address", "city", "state", "zip"])

In [64]:
# create synthetic dataset
src_db = pydbgen.pydb()
pydb_df = src_db.gen_dataframe(1000, fields=col_names, phone_simple=True)
pydb_df.head()

Unnamed: 0,first_name,last_name,name,dob,age,email,ssn,medical_record_number,student_id
0,,,Debbie Ryan,,,Ryan_Debbie82@hotmail.com,664-10-9052,,
1,,,Jodi Dawson,,,Jodi.Dawson@gmail.com,350-07-9643,,
2,,,Jessica Smith,,,JSmith@hotmail.com,047-10-6611,,
3,,,Keith Stewart,,,Keith.S70@yahoo.com,565-04-1872,,
4,,,Richard Wells,,,Richard_Wells76@hotmail.com,450-49-1096,,


In [65]:
pydb_df["student_id"] = pydb_df["student_id"].apply(generate_student_id)
# pydb_df["age"] = pydb_df["age"].apply(generate_age) TODO: get age by dob
loc_info = pydb_df.apply(get_location, result_type='expand', axis=1)
pydb_df = pd.concat([pydb_df, loc_info], axis=1)

'city'
'city'
'city'
'city'


In [66]:
def generate_dob(start_year=1950, end_year=2000):
    """
    Generates a random date between January 1, 1950, and December 31, 2000.
    :param start_year: The start year for the date range (inclusive)
    :param end_year: The end year for the date range (inclusive)
    :return: A random date as a string in the format YYYY-MM-DD
    """
    start_date = datetime(start_year, 1, 1)
    end_date = datetime(end_year, 12, 31)
    
    delta = end_date - start_date
    random_days = random.randint(0, delta.days)
    
    random_date = start_date + timedelta(days=random_days)
    
    return random_date.strftime('%Y-%m-%dT%H:%M:%S.%fZ')

In [67]:
pydb_df["dob"] = pydb_df["dob"].apply(lambda x: generate_dob())

In [68]:
def get_first_name(name: str) -> str:
    names = name.split(" ")
    names.pop(-1)
    return " ".join(names)

def get_last_name(name: str) -> str:
    names = name.split(" ")
    return names.pop(-1)

In [69]:
pydb_df["first_name"] = pydb_df["name"].apply(get_first_name)
pydb_df["last_name"] = pydb_df["name"].apply(get_last_name)
pydb_df["medical_record_number"] = pydb_df["medical_record_number"].apply(generate_medical_number)

In [57]:
pydb_df.agg(agg_map)

  pydb_df.agg(agg_map)
  pydb_df.agg(agg_map)
  pydb_df.agg(agg_map)
  pydb_df.agg(agg_map)
  pydb_df.agg(agg_map)
  pydb_df.agg(agg_map)


AttributeError: 'Series' object has no attribute 'encode'

In [70]:
pydb_df.columns

Index(['first_name', 'last_name', 'name', 'dob', 'age', 'email', 'ssn',
       'medical_record_number', 'student_id', 'address', 'city', 'state',
       'zip'],
      dtype='object')

In [71]:
final_df = pydb_df.copy(deep=True)

In [80]:
for col in final_df.columns:
    final_df[col] = final_df[col].apply(agg_map[col])

In [81]:
final_df

Unnamed: 0,first_name,last_name,name,dob,age,email,ssn,medical_record_number,student_id,address,city,state,zip
0,Debbie,R,Debbie Ryan,b74f48bb1fadf26b520fc64af29faa712551dfe02255b9...,,cfedea060190d5bb6d816774d5f75e9885d8d0ba2df215...,gAAAAABmls3X5ICViDaOQ12F1yUm2PjVtDNzikB_qP392L...,gAAAAABmls3X5d6gMa43arSP-4IWyn7_qfZmW6Ssx_4xpl...,4112073,gAAAAABmls3X-qE-r7j7yiVcdRPDSdFPwvoY1WitW2YLjo...,Louisville,KY,40222
1,Jodi,D,Jodi Dawson,24c9e018654dadf1fcbe73122e37f4f3e7540c8c24ed13...,,91a0263ecac36a14cca2221400590a8b6c3617bd0356c8...,gAAAAABmls3XN_fa_QokIVsLCKEUYMWzdfZn4ih6dX9OXW...,gAAAAABmls3Xt3rdV9TGT_Eql91E9QVn4AqcoYNOv6UWrt...,3050983,gAAAAABmls3XmUBsIoyJa8p1h5C3oiWbRVyPmcn5Y_uENd...,Montgomery,AL,36107
2,Jessica,S,Jessica Smith,f15f9469985280d23ea33f9e05b616d425906941efda5b...,,b1cbeeafbceff3268b5f16db4349b2443e96384f201499...,gAAAAABmls3XlAY4PC1e7s5cfNZuq2G8wtJk_0SgUC7qvl...,gAAAAABmls3XT6T72k2vQDTsXnBxULP8OD8_W2ll4aBaWb...,1134240,gAAAAABmls3XZvPODJBhJ3hCSwWYfgAKZKO_0qgju1_28y...,Panama City,FL,32404
3,Keith,S,Keith Stewart,75fc372006c3a6b5e40e9ec31fcc1d600d8e62fa56da98...,,d1e0bfe98698d7be74cf4df420e69f8fda0fd7903230bf...,gAAAAABmls3Xpuy2JC16DfXn6mYKFkUNkChG5mjqrD6e78...,gAAAAABmls3Xsdnd9UqVOtbJz3SjToltE0FlY318PnznDE...,2925863,gAAAAABmls3XaEdShHwJJUYkCu1jHYXdXAcEeUEWP3Wf7y...,Edmond,OK,73012
4,Richard,W,Richard Wells,2634e401eeceb34cf2d10f0d9bf81d6fb07aa698217a2f...,,730bccbf76d7bf6b760d637252616b437c141e7f492aca...,gAAAAABmls3XMQan-lMgVkWEG2XgmWS2IIXZAF3p4JINJJ...,gAAAAABmls3X8HwnRceqDv_N3XDEivLxDK2YkGT2ucbRFS...,1251733,gAAAAABmls3X4e1ehQ5m8thE3n101WCEhV-WyM6XIRRt50...,Montpelier,VT,05602
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Jason,F,Jason Franklin,60c79fa4a52677cca183a2096a3d37e6494cd9f1bec492...,,37cda40331fbdbb1125e06461a5d4bbc0c02b60e97ed1b...,gAAAAABmls3XJy2-8B0ZR9jIG9jRzcOcfoAuR6yQk8bbNz...,gAAAAABmls3XTybxWhiTMaXQ1_eojhcWF1XdChqTnOqJo9...,4533122,gAAAAABmls3XQJ8x2E0DKgM7f-q4pAqhxKIQeY-kOvhkyQ...,Laurel,MD,20724
996,Jennifer,P,Jennifer Perez,5c12930c2fcaa70f93d520230e65eb4077ff90e4ea6a24...,,1ecbe08ce0a03a08416e6c1819170261d8cf41d6c78d51...,gAAAAABmls3XlAEVm8i3AtREa4xGKiOkYmlbUWaQaF_5j_...,gAAAAABmls3XqqoUehnIdhx91N7PZvqRYuAgdgtM0GDi8V...,6439408,gAAAAABmls3XnjXbT0k-DeWQwMTyo5gNI8YOnrclhFqkmw...,Prospect,KY,40059
997,Jimmy,P,Jimmy Pacheco,eaf6ffa574f6a6ce9e29e5928fddf51661962453942cec...,,5d6a1f54154da32dfab32f9ebf0bec1d83b178992e2d0d...,gAAAAABmls3XmpeMqYgiilg69yK-iinrOkftbgSD30awhB...,gAAAAABmls3XhtSTZ11TDtZ2hJr3Vz6OkGzW-ztbOZznnH...,9590789,gAAAAABmls3XQtKfjZ6q2YBIGVJKZf7zTiIdiVr8EdodZ7...,Nashville,TN,37217
998,Bradley,R,Bradley Reilly,e9b3d0507d610e11c793f737b7a0733e3b4602714e1054...,,4bb1f321a7b3b7a8e54ee0ee937a32eb19b17d007a17eb...,gAAAAABmls3XsORvETWEjlppwdIF2KFs4sfXrJd3spy3rf...,gAAAAABmls3X0pY9Taju-jrOdRWP38_GqoNZXWUZiRjX0Z...,8546173,gAAAAABmls3XEwjAdnLsuYoi3xPmvDQbaG3SK0mdir9obm...,Groton,CT,06340
