In [1]:
import hashlib
import os
import secrets
import string

import numpy as np
import pandas as pd

# Every file read or written by this notebook lives under ./datasets
datasets_path = os.path.join(os.curdir, "datasets")
save_final_path = os.path.join(datasets_path, "final", "anonymized_hospital.csv")
zip_path = os.path.join(datasets_path, "matched_zips.csv")
csv_path = os.path.join(datasets_path, "dataset - Privacy-Engineering.csv")

# Raw records, read with pandas' default dtype inference
df = pd.read_csv(csv_path)

# Lookup table: the "zip" column is read as text and becomes the key of the
# zip -> state dictionary
df_zip = pd.read_csv(zip_path, dtype={"zip": "string"})
matched_zips = df_zip.set_index("zip")["state"].to_dict()

Keep useful columns
---

In [2]:
# The only columns the rest of the notebook works with
columns_to_keep = ["id", "zipcode", "dob", "disease"]
df = df.loc[:, columns_to_keep]
df.head()

Unnamed: 0,id,zipcode,dob,disease
0,Amelia Morrow,26904,12/1950,Alzheimer's disease
1,Lillian Cole,78088,4/1940,multiple sclerosis
2,Lillian Simpson,51502,12/1966,heart disease
3,Avery Richards,54080,4/1943,endometriosis
4,Sophia Alvarado,68785,10/1947,gastritis


Generate a unique id based on the hash of the initial id, the date of birth, the zipcode and a random salt
---

In [3]:
def generate_salt(length:int) -> str:
    """
        Generate a random salt of {length} printable ASCII characters.

        Characters are drawn from ``string.printable`` through the ``secrets``
        module, which relies on the operating system's cryptographically
        secure random source. The salt is mixed into the hashed ids, so it
        must not be predictable the way a general-purpose PRNG stream can be.

        Parameters
        ----------
        length : int
            Number of characters in the salt; 0 yields an empty string.

        Returns
        -------
        str
            The generated salt.

        Raises
        ------
        ValueError
            If {length} is negative.
    """
    if length < 0:
        raise ValueError(f"length must be non-negative, got {length}")
    return "".join(secrets.choice(string.printable) for _ in range(length))

salt = generate_salt(50)

def _hash_identity(row) -> str:
    """
        Return the SHA-512 hex digest of the row's id, zipcode and dob followed by the salt
    """
    material = "".join(map(str, (row.id, row.zipcode, row.dob))) + salt
    return hashlib.sha512(material.encode()).hexdigest()

# Replace the original identifier by its salted digest, row by row
df["id"] = df.apply(_hash_identity, axis=1)

print(f"All id unique ? {len(df.id) == df.id.nunique()}")

df.head()

All id unique ? True


Unnamed: 0,id,zipcode,dob,disease
0,5b454f15d0181fab6923e261a967641cf83a1fa5d5e514...,26904,12/1950,Alzheimer's disease
1,4acf5f0c0196c18a572b075328bf6b28d19cf2e7936bd3...,78088,4/1940,multiple sclerosis
2,98d365eec92c7e174095d38b329442a03c92ddcfd2b4da...,51502,12/1966,heart disease
3,daeb261d4c063875429b170c73e7c85cd003f8de665600...,54080,4/1943,endometriosis
4,204fd1fd4dd472f09d615d5a08ef3ae2cb9e7e421a0997...,68785,10/1947,gastritis


Replace each zipcode with its corresponding state
---

In [4]:
def find_state(zip_code, zip_to_state=None):
    """
        Return the state corresponding to the zipcode

        The lookup key is the first three characters of the zipcode once it
        has been written as a 5-character, zero-padded string. The padding
        matters because a zipcode column parsed as numbers loses its leading
        zeros: 01234 arrives here as 1234 and would otherwise be looked up
        under "123" instead of "012".

        Parameters
        ----------
        zip_code : int, float or str
            The zipcode, either as text or as the number pandas inferred.
        zip_to_state : dict, optional
            Mapping from zipcode prefix to state label. Defaults to the
            notebook-level `matched_zips` dictionary (presumably keyed by
            3-character prefixes -- confirm against matched_zips.csv).

        Returns
        -------
        The state label found for the prefix, or "Unknown" when there is none.
    """
    mapping = matched_zips if zip_to_state is None else zip_to_state

    # A column containing missing values is parsed as floats: 1234.0 must be
    # read as 1234, not as the text "1234.0"
    if isinstance(zip_code, float) and zip_code.is_integer():
        zip_code = int(zip_code)

    prefix = str(zip_code).strip().zfill(5)[:3]
    return mapping.get(prefix, "Unknown")

# Swap every zipcode for its state label, then name the column after its new content
df["zipcode"] = df["zipcode"].apply(find_state)
df = df.rename(columns={"zipcode": "state"})
df.head()

Unnamed: 0,id,state,dob,disease
0,5b454f15d0181fab6923e261a967641cf83a1fa5d5e514...,Unknown,12/1950,Alzheimer's disease
1,4acf5f0c0196c18a572b075328bf6b28d19cf2e7936bd3...,(TX) Texas,4/1940,multiple sclerosis
2,98d365eec92c7e174095d38b329442a03c92ddcfd2b4da...,(IA) Iowa,12/1966,heart disease
3,daeb261d4c063875429b170c73e7c85cd003f8de665600...,(WI) Wisconsin,4/1943,endometriosis
4,204fd1fd4dd472f09d615d5a08ef3ae2cb9e7e421a0997...,(NE) Nebraska,10/1947,gastritis


Replace each date of birth with its corresponding five-year range
--- 

In [5]:
def change_dob(dob, power_ranger=5):
    """
        Return the year range of length {power_ranger} that contains the birth year.

        Ranges start on multiples of {power_ranger}; with the default of 5,
        "4/1943" becomes "[1940 - 1944]".

        Parameters
        ----------
        dob : str
            Date of birth whose last "/"-separated field is the year, e.g.
            "12/1950". "03/12/1950" and a bare "1950" are accepted as well.
        power_ranger : int
            Number of years covered by each range; must be at least 1.

        Returns
        -------
        str
            The range formatted as "[first_year - last_year]", both bounds included.

        Raises
        ------
        ValueError
            If {power_ranger} is smaller than 1 or the year field is not an integer.
    """
    if power_ranger < 1:
        raise ValueError(f"power_ranger must be at least 1, got {power_ranger}")

    # Only the year matters: read the last field instead of requiring exactly two
    year = int(str(dob).rsplit("/", 1)[-1])
    # Round down to the closest multiple of the range length
    first_year = year - year % power_ranger
    return f"[{first_year} - {first_year+power_ranger-1}]"

# Generalize every birth date into its year range
df["dob"] = df["dob"].apply(change_dob)
df.head()

Unnamed: 0,id,state,dob,disease
0,5b454f15d0181fab6923e261a967641cf83a1fa5d5e514...,Unknown,[1950 - 1954],Alzheimer's disease
1,4acf5f0c0196c18a572b075328bf6b28d19cf2e7936bd3...,(TX) Texas,[1940 - 1944],multiple sclerosis
2,98d365eec92c7e174095d38b329442a03c92ddcfd2b4da...,(IA) Iowa,[1965 - 1969],heart disease
3,daeb261d4c063875429b170c73e7c85cd003f8de665600...,(WI) Wisconsin,[1940 - 1944],endometriosis
4,204fd1fd4dd472f09d615d5a08ef3ae2cb9e7e421a0997...,(NE) Nebraska,[1945 - 1949],gastritis


Drop the rows whose quasi-identifier combination is unique or does not satisfy 2-diversity
---

In [6]:
g = ["state", "dob"] # Quasi-identifiers
min_distinct_diseases = 2 # Distinct diseases a group needs in order to be kept

gbqi = df[g+["disease"]].groupby(g) # Group by quasi-identifiers
# Index of the rows belonging to a group with fewer than 2 distinct diseases.
# A group made of a single row is always caught here. "< 2" rather than "== 1"
# also catches groups whose disease values are all missing: nunique() ignores
# missing values and reports 0 for them.
indexes_to_drop = gbqi.filter(lambda x: x.disease.nunique() < min_distinct_diseases).index
final_df = df[["id"]+g+["disease"]].drop(indexes_to_drop).reset_index(drop=True) # Keep the rows respecting the 2-diversity

if input(f"Save the dataset to {save_final_path} ? [y/n]").strip().lower() in ["y", "yes"]:
    # to_csv does not create missing folders, so make sure the target one exists
    target_dir = os.path.dirname(save_final_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    final_df.to_csv(save_final_path, index=False)
    print(f"CSV saved to {save_final_path}")

print(f"Percentage dropped to respect 2-anonimity and 2-diversity = {100*len(indexes_to_drop)/len(df)} %")
final_df.head(5)

Percentage dropped to respect 2-anonimity and 2-diversity = 8.25 %


Unnamed: 0,id,state,dob,disease
0,5b454f15d0181fab6923e261a967641cf83a1fa5d5e514...,Unknown,[1950 - 1954],Alzheimer's disease
1,4acf5f0c0196c18a572b075328bf6b28d19cf2e7936bd3...,(TX) Texas,[1940 - 1944],multiple sclerosis
2,98d365eec92c7e174095d38b329442a03c92ddcfd2b4da...,(IA) Iowa,[1965 - 1969],heart disease
3,daeb261d4c063875429b170c73e7c85cd003f8de665600...,(WI) Wisconsin,[1940 - 1944],endometriosis
4,204fd1fd4dd472f09d615d5a08ef3ae2cb9e7e421a0997...,(NE) Nebraska,[1945 - 1949],gastritis


Verify that the final dataset is 2-anonymous and 2-diverse
---

In [7]:
from utils import is_k_anonym, is_l_diverse # Project-local helper functions

# Every column except the hashed id and the disease is treated as a quasi-identifier
quasi_identifiers = list(set(final_df.columns.values).difference({"id", "disease"}))

k = 2
l = 2
_is_k_anonym = is_k_anonym(final_df, quasi_identifiers, k)
_is_l_diverse = is_l_diverse(final_df, "disease", quasi_identifiers, l)

print(f"Is the final dataset {k}-anonym? {_is_k_anonym}")
print(f"Is the final dataset {l}-diverse? {_is_l_diverse}")

Is the final dataset 2-anonym? True
Is the final dataset 2-diverse? True
