In [1]:
import os
import math
import string
import hashlib
import numpy as np
import pandas as pd

# Input/output locations, all relative to the current working directory.
datasets_path = os.path.join(os.curdir, "datasets")
csv_path = os.path.join(datasets_path, "dataset - Privacy-Engineering.csv")
zip_path = os.path.join(datasets_path, "matched_zips.csv")

save_final_path = os.path.join(datasets_path, "anonymized_hospital.csv")

# Read zipcodes as text: parsed as integers they lose their leading zeros
# ("01234" -> 1234), which corrupts the 3-character prefix used to look up
# the state in `matched_zips`.
df = pd.read_csv(csv_path, dtype={"zipcode": "string"})
df_zip = pd.read_csv(zip_path, dtype={"zip": 'string'})
# Lookup table built from matched_zips.csv: `zip` value -> `state` value.
matched_zips = df_zip.set_index('zip')['state'].to_dict()

Keep useful columns
---

In [2]:
# Columns used by the anonymization steps of this notebook.
columns_to_keep = ["id", "zipcode", "dob", "disease"]
# Take an explicit copy: later cells assign columns on `df`, and writing to a
# selection of the raw frame is what triggers pandas' SettingWithCopyWarning.
df = df[columns_to_keep].copy()
df.head(5)

Unnamed: 0,id,zipcode,dob,disease
0,Amelia Morrow,26904,12/1950,Alzheimer's disease
1,Lillian Cole,78088,4/1940,multiple sclerosis
2,Lillian Simpson,51502,12/1966,heart disease
3,Avery Richards,54080,4/1943,endometriosis
4,Sophia Alvarado,68785,10/1947,gastritis


Generate a unique id from the salted SHA-512 hash of the initial id, the zipcode and the date of birth
---

In [3]:

def generate_salt(length:int) -> str:
    """
        Generate a random salt of {length} characters drawn from string.printable.

        The characters are picked with the `secrets` module, which is meant for
        security-sensitive randomness, rather than with numpy's pseudo-random
        generator, whose output is not suitable for protecting hashed identifiers.

        Raises ValueError if {length} is negative.
    """
    # Local import so this cell works without touching the notebook's import cell.
    import secrets

    if length < 0:
        raise ValueError("length must be non-negative")
    return "".join(secrets.choice(string.printable) for _ in range(length))

# A fresh random salt is drawn on every run and is not stored anywhere in this
# cell, so the resulting ids differ from one run to the next.
salt = generate_salt(50)

# Join the fields with a separator before hashing. Plain concatenation is
# ambiguous: zipcode 1234 + dob "11/1950" and zipcode 12341 + dob "1/1950"
# both produce "123411/1950", so two different records sharing a name would
# receive the same id. "\x1f" (ASCII unit separator) is not part of
# string.printable, so it cannot be confused with salt characters either.
df["id"] = df.apply(
    lambda row: hashlib.sha512(
        "\x1f".join([str(row["id"]), str(row["zipcode"]), str(row["dob"]), salt]).encode()
    ).hexdigest(),
    axis=1,
)

print(f"All id unique ? {len(df.id) == df.id.nunique()}")

df.head(5)

All id unique ? True


Unnamed: 0,id,zipcode,dob,disease
0,7fe1cfe586f6f7865762a7dd4567721176092fe6016c84...,26904,12/1950,Alzheimer's disease
1,797e01b2f02c9944c043dc482447700a7f2a7440a5b421...,78088,4/1940,multiple sclerosis
2,1f43cd5962fd09813bf727850101899bd0235e98ef8e39...,51502,12/1966,heart disease
3,65168546582933b734037e5908fd93fd2c221c14a0c62f...,54080,4/1943,endometriosis
4,3a6f5eabf34660c5dc90dcd90903e244c99bb1dba8ebef...,68785,10/1947,gastritis


Replace each zipcode with its corresponding state
---

In [4]:
def find_state(zip_code, zip_to_state=None):
    """
        Return the state corresponding to the zipcode, or "Unknown" if there is none.

        The state is looked up from the first 3 characters of the zipcode.
        Zipcodes that were parsed as numbers are normalised first: a float such
        as 1234.0 is turned back into 1234, and the value is left-padded with
        zeros to 5 characters so that 1234 is looked up as "012", not "123".

        {zip_to_state} is an optional prefix -> state mapping; when omitted the
        notebook-level `matched_zips` dictionary is used.
    """
    lookup = matched_zips if zip_to_state is None else zip_to_state

    if zip_code is None:
        return "Unknown"
    if isinstance(zip_code, float):
        # A missing value in a numeric column shows up as NaN.
        if math.isnan(zip_code):
            return "Unknown"
        # A numeric column containing missing values is parsed as float.
        if zip_code.is_integer():
            zip_code = int(zip_code)

    prefix = str(zip_code).strip().zfill(5)[:3]
    return lookup.get(prefix, "Unknown")

# Replace every zipcode by its state label, then rename the column so that
# its name matches what it now contains.
df["zipcode"] = df["zipcode"].apply(find_state)
df = df.rename(columns={"zipcode": "state"})
df.head(5)

Unnamed: 0,id,state,dob,disease
0,7fe1cfe586f6f7865762a7dd4567721176092fe6016c84...,Unknown,12/1950,Alzheimer's disease
1,797e01b2f02c9944c043dc482447700a7f2a7440a5b421...,(TX) Texas,4/1940,multiple sclerosis
2,1f43cd5962fd09813bf727850101899bd0235e98ef8e39...,(IA) Iowa,12/1966,heart disease
3,65168546582933b734037e5908fd93fd2c221c14a0c62f...,(WI) Wisconsin,4/1943,endometriosis
4,3a6f5eabf34660c5dc90dcd90903e244c99bb1dba8ebef...,(NE) Nebraska,10/1947,gastritis


Replace each date of birth with its corresponding five-year range
--- 

In [5]:
def change_dob(dob, power_ranger=5):
    """
        Return the year range of length {power_ranger} containing the birth year of {dob}.

        {dob} is a string whose last '/'-separated field is the year, so
        "12/1950", "12/05/1950" and "1950" are all accepted.

        The range starts at the largest multiple of {power_ranger} that is not
        greater than the year and is rendered as "[start - start+power_ranger]".
        The printed upper bound is the start of the next range: 1955 belongs to
        "[1955 - 1960]", not to "[1950 - 1955]".

        Raises ValueError if the year field is not an integer.
    """
    # Only the last field matters; rsplit tolerates any number of leading fields.
    year = int(dob.rsplit('/', 1)[-1])
    # Floor to the start of the bucket. abs() gives the same result as
    # decrementing the year until it is divisible by power_ranger.
    start = year - year % abs(power_ranger)
    return f"[{start} - {start+power_ranger}]"

# Generalize every date of birth into its year range.
df["dob"] = df["dob"].apply(change_dob)
df.head(5)

Unnamed: 0,id,state,dob,disease
0,7fe1cfe586f6f7865762a7dd4567721176092fe6016c84...,Unknown,[1950 - 1955],Alzheimer's disease
1,797e01b2f02c9944c043dc482447700a7f2a7440a5b421...,(TX) Texas,[1940 - 1945],multiple sclerosis
2,1f43cd5962fd09813bf727850101899bd0235e98ef8e39...,(IA) Iowa,[1965 - 1970],heart disease
3,65168546582933b734037e5908fd93fd2c221c14a0c62f...,(WI) Wisconsin,[1940 - 1945],endometriosis
4,3a6f5eabf34660c5dc90dcd90903e244c99bb1dba8ebef...,(NE) Nebraska,[1945 - 1950],gastritis


Drop all rows whose combination of quasi-identifiers is unique or does not satisfy 2-diversity
---

In [6]:
g = ["state", "dob"] # Quasi-identifier columns
gbqi = df[g+["disease"]].groupby(g) # Group by quasi-identifiers

# Index of every row whose quasi-identifier group holds fewer than 2 distinct
# diseases. A single-row group can hold at most 1, so unique quasi-identifier
# combinations are caught by the same test. `< 2` is used instead of `== 1`
# because nunique() ignores missing values: a group whose diseases are all
# missing counts 0 and must be dropped as well.
indexes_to_drop = gbqi.filter(lambda x: x.disease.nunique() < 2).index
final_df = df[["id"]+g+["disease"]].drop(indexes_to_drop).reset_index(drop=True) # Keep the rows respecting the 2-diversity

# Guard against an empty frame so the report does not divide by zero.
dropped_pct = 100*len(indexes_to_drop)/len(df) if len(df) else 0.0
print(f"Percentage dropped to respect 2-anonimity and 2-diversity = {dropped_pct} %")

# Interactive confirmation: this cell waits for keyboard input, so it blocks
# when the notebook is executed non-interactively.
if input(f"Save the dataset to {save_final_path} ? [y/n]").strip().lower() in ["y", "yes"]:
    final_df.to_csv(save_final_path, index=False)
    print(f"CSV saved to {save_final_path}")

final_df.head(5)

Percentage drop to respect 2-anonimity and 2-diversity = 8.25 %
CSV saved to ./datasets/anonymized_hospital.csv


Unnamed: 0,id,state,dob,disease
0,7fe1cfe586f6f7865762a7dd4567721176092fe6016c84...,Unknown,[1950 - 1955],Alzheimer's disease
1,797e01b2f02c9944c043dc482447700a7f2a7440a5b421...,(TX) Texas,[1940 - 1945],multiple sclerosis
2,1f43cd5962fd09813bf727850101899bd0235e98ef8e39...,(IA) Iowa,[1965 - 1970],heart disease
3,65168546582933b734037e5908fd93fd2c221c14a0c62f...,(WI) Wisconsin,[1940 - 1945],endometriosis
4,3a6f5eabf34660c5dc90dcd90903e244c99bb1dba8ebef...,(NE) Nebraska,[1945 - 1950],gastritis
