In [1]:
import os
import string
import hashlib
import numpy as np
import pandas as pd

# Input locations: every file lives under ./datasets, relative to the working directory.
datasets_path = os.path.join(os.curdir, "datasets")
csv_path, zip_path = (
    os.path.join(datasets_path, file_name)
    for file_name in ("dataset - Privacy-Engineering.csv", "matched_zips.csv")
)

# Destination of the anonymized export written by the "save" cell.
save_final_path = os.path.join(datasets_path, "final", "anonymized_hospital.csv")

# raw_df and df are two names for the same freshly loaded frame.
df = pd.read_csv(csv_path)
raw_df = df

# dtype="string" keeps the 'zip' column as text instead of letting pandas infer a numeric type.
df_zip = pd.read_csv(zip_path, dtype={"zip": "string"})
# Lookup table: value of the 'zip' column -> value of the 'state' column.
matched_zips = dict(zip(df_zip["zip"], df_zip["state"]))

Keep useful columns
---

In [2]:
# Columns used by the rest of the notebook; every other column of the raw data is discarded.
columns_to_keep = ["id", "zipcode", "dob", "disease"]
# Select from raw_df (not df) so this cell can be re-run after later cells have
# renamed or rewritten columns of df, and take an explicit copy so the column
# assignments in the following cells write to an independent frame rather than
# to a slice of the raw data (which triggers SettingWithCopyWarning).
df = raw_df[columns_to_keep].copy()
df.head(5)

Unnamed: 0,id,zipcode,dob,disease
0,Amelia Morrow,26904,12/1950,Alzheimer's disease
1,Lillian Cole,78088,4/1940,multiple sclerosis
2,Lillian Simpson,51502,12/1966,heart disease
3,Avery Richards,54080,4/1943,endometriosis
4,Sophia Alvarado,68785,10/1947,gastritis


Generate a unique id based on the hash of the initial id, the date of birth, the zipcode and a random salt
---

In [3]:
def _salted_digest(row):
    """
        Return the SHA-512 hex digest of the row's id, zipcode and dob (concatenated as text)
        followed by 256 fresh random bytes
    """
    payload = "".join(map(str, (row.id, row.zipcode, row.dob))).encode()
    # A new salt is drawn on every call, so the digest differs from one run to the next.
    return hashlib.sha512(payload + os.urandom(256)).hexdigest()

df.id = df.apply(_salted_digest, axis=1)

# Sanity check: as many distinct ids as there are rows.
ids_are_unique = df.id.nunique() == len(df.id)
print(f"All id unique ? {ids_are_unique}")

df.head(5)

All id unique ? True


Unnamed: 0,id,zipcode,dob,disease
0,acca8006f5ea3612008a334725b46072188c4768da162d...,26904,12/1950,Alzheimer's disease
1,962178310ebab3ca4bf3bc1f844034c5bc56f5fefb098e...,78088,4/1940,multiple sclerosis
2,399eb8bd888120fc3c63157c05fdc1cd29ce415ae6f704...,51502,12/1966,heart disease
3,e68f162b121686cc13485585be270e1c6789fd0d0afdac...,54080,4/1943,endometriosis
4,075209297829549515c45fa3763485eb493179bc63044e...,68785,10/1947,gastritis


Replace each zipcode with its corresponding state
---

In [4]:
def find_state(zip_code, zip_to_state=None):
    """
        Return the state corresponding to the zipcode

        zip_code     : zipcode as int, integral float or str; values shorter than 5 characters
                       are left-padded with zeros, so a zipcode that lost its leading zeros
                       when parsed as a number (e.g. 1234 for "01234") still resolves
        zip_to_state : optional mapping {3-character zip prefix: state}; defaults to the
                       notebook-level matched_zips table
        returns      : the mapped state, or "Unknown" when the prefix is not in the mapping
    """
    if zip_to_state is None:
        zip_to_state = matched_zips

    # A column containing missing values is parsed as float: 1234.0 must be read as 1234.
    if isinstance(zip_code, float) and zip_code.is_integer():
        zip_code = int(zip_code)

    prefix = str(zip_code).strip().zfill(5)[:3]
    return zip_to_state.get(prefix, "Unknown")

# Overwrite every zipcode with the value returned by find_state, then rename
# the column so its name matches its new content.
df['zipcode'] = df['zipcode'].map(find_state)
df.rename({'zipcode': 'state'}, axis="columns", inplace=True)
df.head(5)

Unnamed: 0,id,state,dob,disease
0,acca8006f5ea3612008a334725b46072188c4768da162d...,Unknown,12/1950,Alzheimer's disease
1,962178310ebab3ca4bf3bc1f844034c5bc56f5fefb098e...,(TX) Texas,4/1940,multiple sclerosis
2,399eb8bd888120fc3c63157c05fdc1cd29ce415ae6f704...,(IA) Iowa,12/1966,heart disease
3,e68f162b121686cc13485585be270e1c6789fd0d0afdac...,(WI) Wisconsin,4/1943,endometriosis
4,075209297829549515c45fa3763485eb493179bc63044e...,(NE) Nebraska,10/1947,gastritis


Replace each date of birth with its corresponding five-year range
--- 

In [5]:
def change_dob(dob, power_ranger=5):
    """
        Return the corresponding year range with the range of length {power_ranger}

        dob          : date as text whose last '/'-separated field is the year (e.g. "12/1950")
        power_ranger : length of the range in years, must be a positive integer
        returns      : "[start - end]" (both bounds inclusive), where start is the largest
                       multiple of power_ranger that is <= the year
        raises       : ValueError if power_ranger < 1 or if the year field is not an integer
    """
    if power_ranger < 1:
        raise ValueError(f"power_ranger must be a positive integer, got {power_ranger}")

    # Only the year is used; taking the last field also accepts "dd/mm/yyyy" or a bare "yyyy".
    year = int(dob.rpartition('/')[2])
    # Round down to the start of the bucket without looping year by year.
    start = year - year % power_ranger
    return f"[{start} - {start + power_ranger - 1}]"

# Generalize every date of birth to its year range, using change_dob's default range length.
df['dob'] = df['dob'].map(change_dob)
df.head(5)

Unnamed: 0,id,state,dob,disease
0,acca8006f5ea3612008a334725b46072188c4768da162d...,Unknown,[1950 - 1954],Alzheimer's disease
1,962178310ebab3ca4bf3bc1f844034c5bc56f5fefb098e...,(TX) Texas,[1940 - 1944],multiple sclerosis
2,399eb8bd888120fc3c63157c05fdc1cd29ce415ae6f704...,(IA) Iowa,[1965 - 1969],heart disease
3,e68f162b121686cc13485585be270e1c6789fd0d0afdac...,(WI) Wisconsin,[1940 - 1944],endometriosis
4,075209297829549515c45fa3763485eb493179bc63044e...,(NE) Nebraska,[1945 - 1949],gastritis


Drop all rows whose quasi-identifier combination is unique and those that do not satisfy 2-diversity
---

In [6]:
quasi_identifiers = ["state", "dob"]
sensitive_column = "disease"
# Minimum number of distinct sensitive values each quasi-identifier group must contain.
min_distinct_sensitive = 2

# Group by quasi-identifiers
gbqi = df[quasi_identifiers + [sensitive_column]].groupby(quasi_identifiers)
# Row labels of every group with too few distinct sensitive values. A group made of a
# single row necessarily has at most one distinct value, so unique quasi-identifier
# combinations are caught here as well. The column is read through sensitive_column
# (not a hard-coded name) and "<" also catches groups whose sensitive values are all missing.
indexes_to_drop = gbqi.filter(
    lambda group: group[sensitive_column].nunique() < min_distinct_sensitive
).index
# Keep the rows respecting the 2-diversity
df = df[["id"] + quasi_identifiers + [sensitive_column]].drop(indexes_to_drop)

To ensure that it's impossible to do a skewness attack, we need to drop 2 people
---

In [7]:
# Row labels (index values, not positions) of the two records removed by hand.
# NOTE(review): how these two labels were chosen is not shown in this notebook —
# they are tied to the current input file and must be re-derived if the data changes.
idx_to_drop_sorry_guys = [346, 846]
# Rebinding df with errors="ignore" keeps the cell re-runnable: a second execution
# finds the labels already gone and is a no-op instead of raising KeyError.
df = df.drop(index=idx_to_drop_sorry_guys, errors="ignore")

Save the final dataset
---

In [8]:
# Ask for confirmation before writing; surrounding whitespace in the answer is ignored.
if input(f"Save the dataset to {save_final_path} ? [y/n]").strip().lower() in ["y", "yes"]:
    # to_csv does not create missing folders, so make sure the target folder exists first.
    target_dir = os.path.dirname(save_final_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    df.to_csv(save_final_path, index=False)
    print(f"CSV saved to {save_final_path}")

# Share of the raw rows listed in indexes_to_drop; rows removed by the explicit
# manual drop are not part of indexes_to_drop and therefore not counted here.
print(f"Percentage dropped to respect 2-anonimity and 2-diversity = {100*len(indexes_to_drop)/len(raw_df)} %")
df.head(5)

CSV saved to ./datasets/final/anonymized_hospital.csv
Percentage dropped to respect 2-anonimity and 2-diversity = 8.25 %


Unnamed: 0,id,state,dob,disease
0,acca8006f5ea3612008a334725b46072188c4768da162d...,Unknown,[1950 - 1954],Alzheimer's disease
1,962178310ebab3ca4bf3bc1f844034c5bc56f5fefb098e...,(TX) Texas,[1940 - 1944],multiple sclerosis
2,399eb8bd888120fc3c63157c05fdc1cd29ce415ae6f704...,(IA) Iowa,[1965 - 1969],heart disease
3,e68f162b121686cc13485585be270e1c6789fd0d0afdac...,(WI) Wisconsin,[1940 - 1944],endometriosis
4,075209297829549515c45fa3763485eb493179bc63044e...,(NE) Nebraska,[1945 - 1949],gastritis


Verify that the final dataset is 2-anonymous and 2-diverse
---

In [9]:
from utils import is_k_anonym, is_l_diverse  # helpers implemented in the project's own utils module

# Thresholds to verify on the final dataset.
k = 2
l = 2

_is_k_anonym = is_k_anonym(df, quasi_identifiers, k)
_is_l_diverse = is_l_diverse(df, sensitive_column, quasi_identifiers, l)

# Report both checks.
print(f"Is the final dataset {k}-anonym? {_is_k_anonym}")
print(f"Is the final dataset {l}-diverse? {_is_l_diverse}")

Is the final dataset 2-anonym? True
Is the final dataset 2-diverse? True
