In [1]:
import os
import hashlib
import pandas as pd
import numpy as np
from collections import Counter
import math, statistics

# Locations of the input files and of the anonymized output, relative to the
# current working directory.
datasets_path = os.path.join(os.curdir, "datasets")
csv_path = os.path.join(datasets_path, "dataset - Privacy-Engineering.csv")
zip_path = os.path.join(datasets_path, "matched_zips.csv")

save_final_path = os.path.join(datasets_path, "final", "anonymized_dataset.csv")

# raw_df keeps the data exactly as loaded; df is an independent working copy.
# (Binding both names to the same object would let every in-place edit of df
# silently alter raw_df as well.)
raw_df = pd.read_csv(csv_path)
df = raw_df.copy()

# The 'zip' column is read as text so that its values keep any leading zeros;
# the lookup table maps each 'zip' value to its 'state' value.
df_zip = pd.read_csv(zip_path, dtype={"zip": 'string'})
matched_zips = df_zip.set_index('zip')['state'].to_dict()

Drop useless columns
---

In [2]:
# Columns that play no role in the rest of the notebook.
columns_to_drop = ["ancestry"]
df.drop(columns=columns_to_drop, inplace=True)
df.head(5)

Unnamed: 0,id,gender,dob,zipcode,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease
0,Amelia Morrow,female,12/1950,26904,High School,Retired,2,married,2,0.0,Own house,Alzheimer's disease
1,Lillian Cole,female,4/1940,78088,PhD/md,Retired,1,married,1,0.0,Rent flat,multiple sclerosis
2,Lillian Simpson,female,12/1966,51502,Bachelor,Employed,1,married,0,0.1,Rent flat,heart disease
3,Avery Richards,female,4/1943,54080,Bachelor,Retired,1,married,0,0.0,Rent room,endometriosis
4,Sophia Alvarado,female,10/1947,68785,High School,Retired,1,married,0,0.0,Rent flat,gastritis


Generate a unique id based on the hash of the initial id, the date of birth, the zipcode and a salt
---

In [3]:

def _salted_digest(row):
    """Return the SHA-512 hex digest of the row's id, dob and zipcode plus a random salt.

    A fresh 256-byte salt is drawn from os.urandom for every row and is not
    stored, so the digest cannot be recomputed from the original values.
    """
    payload = "".join(map(str, (row.id, row.dob, row.zipcode))).encode()
    return hashlib.sha512(payload + os.urandom(256)).hexdigest()

df["id"] = df.apply(_salted_digest, axis=1)

all_ids_unique = len(df["id"]) == df["id"].nunique()
print(f"All id unique ? {all_ids_unique}")

df.head(5)

All id unique ? True


Unnamed: 0,id,gender,dob,zipcode,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease
0,71b16769bddf0440ff64ec84b0b45b6195d6615bdd4081...,female,12/1950,26904,High School,Retired,2,married,2,0.0,Own house,Alzheimer's disease
1,6b74fc359232226f86832a5a410c4bff578b9ff4b27ee0...,female,4/1940,78088,PhD/md,Retired,1,married,1,0.0,Rent flat,multiple sclerosis
2,848cebbf8e346ae57828741171c11be134f3ce55d3bb71...,female,12/1966,51502,Bachelor,Employed,1,married,0,0.1,Rent flat,heart disease
3,1c245b3234d6dc888021e3002d01a3593d20e51606ebf0...,female,4/1943,54080,Bachelor,Retired,1,married,0,0.0,Rent room,endometriosis
4,4212e9f5b1b5596f78cd3ffe1f7298d4a44cb2d0011658...,female,10/1947,68785,High School,Retired,1,married,0,0.0,Rent flat,gastritis


Change all zipcodes to corresponding state
---

In [4]:
def find_state(zip_code, zip_map=None):
    """
    Return the state corresponding to the zipcode.

    The state is looked up by the first three characters of the zipcode.
    The zipcode is left-padded with zeros to five characters first, so that a
    zipcode stored as an integer (which drops leading zeros, e.g. 1234 for
    "01234") still yields the right prefix ("012" rather than "123").

    Parameters
    ----------
    zip_code : int or str
        The zipcode to look up.
    zip_map : dict, optional
        Mapping from three-character prefix to state. Defaults to the
        notebook-level ``matched_zips`` table.

    Returns
    -------
    The mapped state, or "Unknown" when the prefix is not in the mapping.
    """
    if zip_map is None:
        zip_map = matched_zips
    prefix = str(zip_code).zfill(5)[:3]
    return zip_map.get(prefix, "Unknown")

# Replace every zipcode by its state, then rename the column to match its new content.
df['zipcode'] = df['zipcode'].apply(find_state)
df.rename(columns={'zipcode': 'state'}, inplace=True)
df.head(5)

Unnamed: 0,id,gender,dob,state,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease
0,71b16769bddf0440ff64ec84b0b45b6195d6615bdd4081...,female,12/1950,Unknown,High School,Retired,2,married,2,0.0,Own house,Alzheimer's disease
1,6b74fc359232226f86832a5a410c4bff578b9ff4b27ee0...,female,4/1940,(TX) Texas,PhD/md,Retired,1,married,1,0.0,Rent flat,multiple sclerosis
2,848cebbf8e346ae57828741171c11be134f3ce55d3bb71...,female,12/1966,(IA) Iowa,Bachelor,Employed,1,married,0,0.1,Rent flat,heart disease
3,1c245b3234d6dc888021e3002d01a3593d20e51606ebf0...,female,4/1943,(WI) Wisconsin,Bachelor,Retired,1,married,0,0.0,Rent room,endometriosis
4,4212e9f5b1b5596f78cd3ffe1f7298d4a44cb2d0011658...,female,10/1947,(NE) Nebraska,High School,Retired,1,married,0,0.0,Rent flat,gastritis


Generalise columns by replacing precise values with coarser categories
---

In [5]:
def has_child(child):
    """Return False when the number of children equals 0, True otherwise."""
    return not (child == 0)

def more_than_1_hour(x):
    """Bucket a commute time into '0', '[0-1]' or '1+'.

    Strings are returned unchanged. A value that matches none of the
    comparisons (such as NaN) yields None.
    """
    if isinstance(x, str):
        return x
    if x == 0.0:
        return '0'
    if x <= 1.0:
        return '[0-1]'
    if x > 1.0:
        return '1+'
    return None

def change_accom(accomod):
    """Return the first word of the accommodation label (e.g. "Own house" -> "Own").

    Only the first space-separated token is kept, whatever the number of
    words. A label that is already generalised (a single word) is returned
    unchanged, so applying the function twice is harmless.
    """
    return accomod.split(' ', 1)[0]

def change_education(educ):
    """Merge "PhD/md" and "Masters" into "Masters and higher"; keep other values as they are."""
    if educ in ("PhD/md", "Masters"):
        return "Masters and higher"
    return educ

def change_dob(dob):
    """Turn a "month/year" string into a three-year range such as "[1950 - 1952]".

    The range starts at the largest multiple of 3 that does not exceed the year.
    """
    _month, year_text = dob.split('/')
    start = int(year_text)
    start -= start % 3
    return f"[{start} - {start + 2}]"

# Each column is replaced by the output of its generalising function.
generalisers = {
    "dob": change_dob,
    "accommodation": change_accom,
    "education": change_education,
    "children": has_child,
    "commute_time": more_than_1_hour,
}

for column_name, generalise in generalisers.items():
    df[column_name] = df[column_name].apply(generalise)

df.head(5)

Unnamed: 0,id,gender,dob,state,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease
0,71b16769bddf0440ff64ec84b0b45b6195d6615bdd4081...,female,[1950 - 1952],Unknown,High School,Retired,True,married,2,0,Own,Alzheimer's disease
1,6b74fc359232226f86832a5a410c4bff578b9ff4b27ee0...,female,[1938 - 1940],(TX) Texas,Masters and higher,Retired,True,married,1,0,Rent,multiple sclerosis
2,848cebbf8e346ae57828741171c11be134f3ce55d3bb71...,female,[1965 - 1967],(IA) Iowa,Bachelor,Employed,True,married,0,[0-1],Rent,heart disease
3,1c245b3234d6dc888021e3002d01a3593d20e51606ebf0...,female,[1941 - 1943],(WI) Wisconsin,Bachelor,Retired,True,married,0,0,Rent,endometriosis
4,4212e9f5b1b5596f78cd3ffe1f7298d4a44cb2d0011658...,female,[1947 - 1949],(NE) Nebraska,High School,Retired,True,married,0,0,Rent,gastritis


Compute the entropy for every possible combination of columns in order to find the combination with the best entropy
---

In [6]:
def entropy(C, N=2000):
    """Return the base-2 entropy of the counts in C, rounded to 4 decimals.

    Each count is divided by N to obtain its probability; N defaults to 2000.
    """
    probabilities = (count / N for count in C)
    return round(-sum(p * math.log(p, 2) for p in probabilities), 4)
# Max : 11 (presumably log2(2000) ~ 10.97, i.e. N = 2000 groups of one row each)

In [7]:
from itertools import *
from tqdm import tqdm


covariates = ["education", "gender", "dob", "state", "children", "employment", "marital_status", "number_vehicles", "commute_time", "accommodation"]

results = []

# Try every non-empty subset of the covariates, always grouped together with
# the sensitive "disease" column, and record group statistics for each subset.
for c in tqdm(range(1, len(covariates)+1)):
    for used in combinations(covariates, c):
        used = list(used) + ["disease"]

        # C: size of every group; CR: sizes of the groups with more than one row.
        C = df.groupby(used).size().to_numpy()
        CR = C[C > 1]

        len_unique = int((C < 2).sum())

        # Totals are derived from the counts themselves rather than from a
        # hard-coded dataset size, so the probabilities passed to entropy()
        # sum to 1 whatever the number of rows.
        size_full = int(C.sum())
        size_cr = int(CR.sum())

        results.append(
            {
                "used": used,
                "len_groups": len(C),
                "len_unique": len_unique,
                "diff": size_cr,
                "entropy_full": entropy(C, size_full),
                "entropy_cut": entropy(CR, size_cr)
            }
        )

100%|██████████| 10/10 [00:05<00:00,  1.97it/s]


In [8]:
def _ranking_key(entry):
    """Sort key: rows kept, then number of groups, then the two entropies."""
    return (entry['diff'], entry['len_groups'], entry['entropy_cut'], entry['entropy_full'])

results_sort = sorted(results, key=_ranking_key, reverse=True)

# Keep combinations with between 1 and 99 singleton groups and a full entropy above 5.
results_filter = [
    entry
    for entry in results_sort
    if 0 < entry['len_unique'] < 100 and entry['entropy_full'] > 5
]

# Final ranking by entropy_cut; sorted() is stable, so ties keep the order of results_sort.
results_filter_sorted = sorted(results_filter, key=lambda entry: entry['entropy_cut'], reverse=True)

results_filter_sorted[:5]

[{'used': ['gender', 'dob', 'disease'],
  'len_groups': 533,
  'len_unique': 80,
  'diff': 1920,
  'entropy_full': 8.8033,
  'entropy_cut': 8.6543},
 {'used': ['education',
   'gender',
   'number_vehicles',
   'accommodation',
   'disease'],
  'len_groups': 430,
  'len_unique': 94,
  'diff': 1906,
  'entropy_full': 8.3062,
  'entropy_cut': 8.1056},
 {'used': ['dob', 'children', 'disease'],
  'len_groups': 394,
  'len_unique': 90,
  'diff': 1910,
  'entropy_full': 8.2104,
  'entropy_cut': 8.0142},
 {'used': ['education', 'number_vehicles', 'commute_time', 'disease'],
  'len_groups': 392,
  'len_unique': 76,
  'diff': 1924,
  'entropy_full': 8.1299,
  'entropy_cut': 7.962},
 {'used': ['dob', 'disease'],
  'len_groups': 304,
  'len_unique': 16,
  'diff': 1984,
  'entropy_full': 7.9867,
  'entropy_cut': 7.9511}]

Drop all rows whose combination of quasi-identifiers is unique or does not satisfy 2-diversity
---

In [9]:
quasi_identifiers = ["education", "gender", "number_vehicles", "accommodation", "employment", "children", "commute_time"]
sensitive_column = "disease"

def _has_single_sensitive_value(group):
    """True when every row of the group shares one sensitive value (this includes one-row groups)."""
    return group[sensitive_column].nunique() == 1

kept_columns = quasi_identifiers + [sensitive_column]

# Group on the quasi-identifiers and collect the index labels of all rows
# belonging to a group with a single distinct sensitive value.
gbqi = df[kept_columns].groupby(quasi_identifiers)
indexes_to_drop = gbqi.filter(_has_single_sensitive_value).index

# Keep only the id, the quasi-identifiers and the sensitive column, minus those rows.
df = df[["id"] + kept_columns].drop(indexes_to_drop)

To ensure that it's impossible to do a skewness attack, we need to drop 2 people
---

In [10]:
# Hand-picked row labels to remove; raises KeyError if either label is absent from df.
idx_to_drop_sorry_guys = [346, 846]
df.drop(index=idx_to_drop_sorry_guys, inplace=True)

Save the final dataset
---

In [11]:
# Ask before writing so that a run never overwrites the output by accident.
if input(f"Save the dataset to {save_final_path} ? [y/n]").strip().lower() in ["y", "yes"]:
    # Create the destination folder if needed; to_csv does not create it.
    os.makedirs(os.path.dirname(save_final_path), exist_ok=True)
    df.to_csv(save_final_path, index=False)
    print(f"CSV saved to {save_final_path}")

# The share of dropped rows is measured against the dataset as loaded
# (raw_df), not against the already reduced frame.
print(f"Percentage dropped to respect 2-anonymity and 2-diversity = {100*len(indexes_to_drop)/len(raw_df)} %")
df.head(5)

CSV saved to ./datasets/final/anonymized_dataset.csv
Percentage dropped to respect 2-anonimity and 2-diversity = 7.4 %


Unnamed: 0,id,education,gender,number_vehicles,accommodation,employment,children,commute_time,disease
0,71b16769bddf0440ff64ec84b0b45b6195d6615bdd4081...,High School,female,2,Own,Retired,True,0,Alzheimer's disease
2,848cebbf8e346ae57828741171c11be134f3ce55d3bb71...,Bachelor,female,0,Rent,Employed,True,[0-1],heart disease
3,1c245b3234d6dc888021e3002d01a3593d20e51606ebf0...,Bachelor,female,0,Rent,Retired,True,0,endometriosis
4,4212e9f5b1b5596f78cd3ffe1f7298d4a44cb2d0011658...,High School,female,0,Rent,Retired,True,0,gastritis
5,b1962816e48d9d6f89ce83a7c1fbc364345df4df797d7b...,High School,male,1,Own,Retired,True,0,endometriosis


Verify that the final dataset is 2-anonymous and 2-diverse
---

In [12]:
from utils import is_k_anonym, is_l_diverse # Personnal implemented functions

# Thresholds the final dataset is checked against.
k = 2
l = 2

_is_k_anonym = is_k_anonym(df, quasi_identifiers, k)
_is_l_diverse = is_l_diverse(df, sensitive_column, quasi_identifiers, l)

for label, verdict in ((f"{k}-anonym", _is_k_anonym), (f"{l}-diverse", _is_l_diverse)):
    print(f"Is the final dataset {label}? {verdict}")

Is the final dataset 2-anonym? True
Is the final dataset 2-diverse? True
