In [48]:
import os
import hashlib
import pandas as pd
import numpy as np
from collections import Counter
import math, statistics

# Both input files are looked up in a "datasets" folder under the current working directory.
datasets_dir = os.path.join(os.curdir, "datasets")
csv_path = os.path.join(datasets_dir, "dataset - Privacy-Engineering.csv")
zip_path = os.path.join(datasets_dir, "matched_zips.csv")

# Main table, loaded with pandas' default dtype inference.
df = pd.read_csv(csv_path)

# ZIP lookup table: the "zip" column is forced to a string dtype so that its
# values are never parsed as numbers.
df_zip = pd.read_csv(zip_path, dtype={"zip": 'string'})

# Plain dict mapping each "zip" value to its "state" value.
state_by_zip = df_zip.set_index('zip')['state']
matched_zips = state_by_zip.to_dict()

In [49]:
# Discard the listed columns from df before any further processing.
columns_to_drop = ["ancestry"]
df = df.drop(columns=columns_to_drop)

In [50]:
# NOTE(review): this salt is committed in clear text. Anyone who can read the
# notebook can recompute a digest from (identifier, gender, dob); consider
# loading it from the environment or a secret store instead.
salt = b"\x91\xff=s\x18?JH\x940\xf1$wN\xa4\xbf\x9f\xcdi\x8c\xa2\xf3\xe3I\xeb\xf4\x1f\xc0y\x94\x84\xa8"


def _pseudonym(name, sex, dob):
    """Return the salted SHA-512 hex digest of the concatenation ``name + sex + dob``."""
    return hashlib.sha512(str(name + sex + dob).encode() + salt).hexdigest()


def pseudoAnonymisation(i, row):
    """Replace the first column of row ``i`` of the global ``df`` with its pseudonym.

    Args:
        i: index label of the row inside ``df``.
        row: the row's values; the first three are unpacked as identifier,
            gender and date of birth and must be strings.

    Only the first cell is written: every other value of the row is unchanged,
    so rewriting the whole row is unnecessary.
    """
    name, sex, dob, *_ = row
    df.at[i, df.columns[0]] = _pseudonym(name, sex, dob)


# Hash all identifiers in one pass and assign the column once, instead of
# rewriting every row of df while df.iterrows() is still iterating over it.
_id_col, _sex_col, _dob_col = df.columns[:3]
df[_id_col] = [
    _pseudonym(name, sex, dob)
    for name, sex, dob in zip(df[_id_col], df[_sex_col], df[_dob_col])
]

In [52]:
# Preview the first rows of df (rendered as the cell's output).
df.head()

Unnamed: 0,id,gender,dob,zipcode,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease
0,1df0ffe53bf3f9f933b262768d89fe91254c17f97b6479...,female,12/1950,26904,High School,Retired,2,married,2,0.0,Own house,Alzheimer's disease
1,8500bd676f3124a12cbcc212c4d01f7f3e79705df4d0e1...,female,4/1940,78088,PhD/md,Retired,1,married,1,0.0,Rent flat,multiple sclerosis
2,235ce4408c70dc7edae44b052edb154c658c710176c882...,female,12/1966,51502,Bachelor,Employed,1,married,0,0.1,Rent flat,heart disease
3,e6c07ae522682aea1d375f0bfa33f85a3340baef3aea9c...,female,4/1943,54080,Bachelor,Retired,1,married,0,0.0,Rent room,endometriosis
4,c09207b520ed7d82c04eee506c6be57bcfe3df62598461...,female,10/1947,68785,High School,Retired,1,married,0,0.0,Rent flat,gastritis


In [53]:
def find_state(zip_code, zip_to_state=None):
    """Return the state associated with the first three digits of a ZIP code.

    Args:
        zip_code: ZIP code as a string or a number. Numeric values are
            left-padded with zeros to five digits first, because a ZIP code
            parsed as an integer loses its leading zeros (02139 -> 2139) and
            would otherwise be looked up under the wrong prefix.
        zip_to_state: optional mapping from 3-character prefix to state;
            defaults to the module-level ``matched_zips`` table.

    Returns:
        The mapped state, or "Unknown" when the value is missing or its prefix
        is not in the table.
    """
    lookup = matched_zips if zip_to_state is None else zip_to_state

    if isinstance(zip_code, float):
        if math.isnan(zip_code):
            return "Unknown"
        if zip_code.is_integer():
            # 2139.0 -> 2139, so the text form carries no ".0" suffix.
            zip_code = int(zip_code)

    prefix = str(zip_code).strip().zfill(5)[:3]
    res = lookup.get(prefix)
    return "Unknown" if res is None else res

# Derive a "state" column from each ZIP code, then discard the raw ZIP column.
df = (
    df
    .assign(state=df["zipcode"].apply(find_state))
    .drop(columns="zipcode")
)

In [54]:
def more_than_2_children(x):
    """Bucket a number of children into the labels '0', '1' or '2+'.

    Strings are returned untouched. Any other value that is neither 0, 1 nor
    greater than 1 yields None.
    """
    if isinstance(x, str):
        return x
    if x > 1:
        return '2+'
    # Exact matches only; everything else falls through to None.
    return {0: '0', 1: '1'}.get(x)

def has_child(childs):
    """Return False when ``childs`` equals 0 and True for any other value."""
    return not (childs == 0)

def more_than_1_hour(x):
    """Bucket a commute time into the labels '0', '[0-1]' or '1+'.

    Strings are returned untouched. A value that compares false against every
    threshold (e.g. NaN) yields None.
    """
    if isinstance(x, str):
        return x
    if x > 1.0:
        return '1+'
    if x == 0.0:
        return '0'
    if x <= 1.0:
        return '[0-1]'
    return None

def change_accom(accomod):
    """Reduce an accommodation label to its first word.

    Examples: "Own house" -> "Own", "Rent flat" -> "Rent".

    The text is split at the first space only, so a label with one word (such
    as an already reduced "Own") or with more than two words is handled too,
    instead of failing on a two-name unpack. This makes the function safe to
    apply again to its own output.

    Args:
        accomod: accommodation label (string).

    Returns:
        The part of the label before its first space.
    """
    return accomod.split(' ', 1)[0]

def change_education(educ):
    """Merge the "PhD/md" and "Masters" levels into "Masters and higher".

    Every other value is returned unchanged.
    """
    merged_levels = ("PhD/md", "Masters")
    return "Masters and higher" if educ in merged_levels else educ

def change_dob(dob):
    """Generalise a date of birth to a three-year bracket.

    The year is the text after the last "/" (e.g. "12/1950" -> 1950). It is
    rounded down to a multiple of 3 and reported as "[start - start+2]".

    A value that is already a bracket (starts with "[") is returned as is, so
    applying the function twice, e.g. when the cell is re-run, does not fail.

    Args:
        dob: date string whose last "/"-separated component is the year, or an
            already generalised bracket.

    Returns:
        The bracket label, e.g. "[1950 - 1952]".

    Raises:
        ValueError: if the year component is not an integer.
    """
    if dob.startswith('['):
        return dob
    year = int(dob.rsplit('/', 1)[-1])
    # Round down to the nearest multiple of 3.
    year -= year % 3
    return f"[{year} - {year+2}]"

# Apply each generalisation helper to its column, in the same order as before.
# "children" uses has_child; more_than_2_children is a finer alternative that
# is currently not applied.
for column, generalise in (
    ("dob", change_dob),
    ("accommodation", change_accom),
    ("education", change_education),
    ("children", has_child),
    ("commute_time", more_than_1_hour),
):
    df[column] = df[column].apply(generalise)

In [55]:
def entropy(C, N=2000):
    """Shannon entropy, in bits, of a list of group sizes.

    Each count ``ci`` is turned into a probability ``ci / N`` and the result
    ``-sum(p * log2(p))`` is rounded to 4 decimals.

    Args:
        C: iterable of group counts.
        N: denominator used to turn counts into probabilities (default 2000).

    Returns:
        The rounded entropy; 0 for an empty ``C``.
    """
    # Zero counts are skipped: p * log2(p) tends to 0 as p tends to 0, whereas
    # math.log(0) would raise a math domain error.
    probabilities = (ci / N for ci in C if ci != 0)
    return round(-sum(p * math.log(p, 2) for p in probabilities), 4)
# Upper bound with the default N=2000: log2(2000) ≈ 10.97 bits, reached when
# every count is 1.

In [56]:
# Preview the first rows of df (rendered as the cell's output).
df.head()

Unnamed: 0,id,gender,dob,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease,state
0,1df0ffe53bf3f9f933b262768d89fe91254c17f97b6479...,female,[1950 - 1952],High School,Retired,True,married,2,0,Own,Alzheimer's disease,Unknown
1,8500bd676f3124a12cbcc212c4d01f7f3e79705df4d0e1...,female,[1938 - 1940],Masters and higher,Retired,True,married,1,0,Rent,multiple sclerosis,(TX) Texas
2,235ce4408c70dc7edae44b052edb154c658c710176c882...,female,[1965 - 1967],Bachelor,Employed,True,married,0,[0-1],Rent,heart disease,(IA) Iowa
3,e6c07ae522682aea1d375f0bfa33f85a3340baef3aea9c...,female,[1941 - 1943],Bachelor,Retired,True,married,0,0,Rent,endometriosis,(WI) Wisconsin
4,c09207b520ed7d82c04eee506c6be57bcfe3df62598461...,female,[1947 - 1949],High School,Retired,True,married,0,0,Rent,gastritis,(NE) Nebraska


In [57]:
from itertools import *
from tqdm import tqdm

# Evaluate every non-empty combination of the optional covariates; the three
# columns "employment", "children" and "commute_time" are always included.
covariates = ["education", "gender", "marital_status", "number_vehicles", "accommodation"]

results = []

# Number of records in df. It is the denominator of the full entropy and the
# starting point for counting the records left once singleton groups are
# removed. Taken from the data rather than hard-coded as 2000.
n_records = len(df)

for c in tqdm(range(1, len(covariates)+1)):
    for used in combinations(covariates, c):
        used = [*used, "employment", "children", "commute_time"]

        # One row per distinct combination of values, with its number of records.
        ct = df.groupby(used).size().to_frame("count")
        C = ct['count'].to_numpy()
        CR = C[C > 1]  # sizes of the groups holding at least two records

        len_groups = len(ct)
        len_unique = int((C < 2).sum())  # groups holding a single record
        size_cr = n_records - len_unique  # records left without those groups

        entropy_full = entropy(C, n_records)
        entropy_cut = entropy(CR, size_cr)

        results.append(
            {
                "used": used,
                "len_groups": len_groups,
                "len_unique": len_unique,
                "diff": size_cr,
                "entropy_full": entropy_full,
                "entropy_cut": entropy_cut
            }
        )

100%|██████████| 5/5 [00:00<00:00, 31.33it/s]


In [58]:
def _ranking_key(result):
    """Sort key: entropy_cut first, then diff, len_groups and entropy_full as tie-breakers."""
    return (
        result['entropy_cut'],
        result['diff'],
        result['len_groups'],
        result['entropy_full'],
    )


# Highest values first.
results_sort = sorted(results, key=_ranking_key, reverse=True)

In [59]:
# import csv

# with open(os.path.join(os.curdir, "datasets", "entropy_results.csv"), "w", newline='') as out:
#     dict_writer = csv.DictWriter(out, results_sort[0].keys())
#     dict_writer.writeheader()
#     dict_writer.writerows(results_sort)

In [60]:
def _is_candidate(dic):
    """Keep results with fewer than 150 singleton groups and a full entropy above 5."""
    return dic.get('len_unique') < 150 and dic.get('entropy_full') > 5


results_filter = list(filter(_is_candidate, results_sort))

# Re-rank the remaining results by entropy_cut alone, highest first.
results_filter_sorted = list(results_filter)
results_filter_sorted.sort(key=lambda x: x['entropy_cut'], reverse=True)

In [61]:
# Best candidates? Show the five filtered results with the highest entropy_cut.
results_filter_sorted[0:5]

[{'used': ['education',
   'gender',
   'marital_status',
   'accommodation',
   'employment',
   'children',
   'commute_time'],
  'len_groups': 361,
  'len_unique': 147,
  'diff': 1853,
  'entropy_full': 7.4561,
  'entropy_cut': 7.0676},
 {'used': ['education',
   'gender',
   'number_vehicles',
   'accommodation',
   'employment',
   'children',
   'commute_time'],
  'len_groups': 343,
  'len_unique': 138,
  'diff': 1862,
  'entropy_full': 7.3307,
  'entropy_cut': 6.9581},
 {'used': ['education',
   'marital_status',
   'number_vehicles',
   'employment',
   'children',
   'commute_time'],
  'len_groups': 309,
  'len_unique': 113,
  'diff': 1887,
  'entropy_full': 7.1426,
  'entropy_cut': 6.8298},
 {'used': ['education',
   'gender',
   'number_vehicles',
   'employment',
   'children',
   'commute_time'],
  'len_groups': 230,
  'len_unique': 59,
  'diff': 1941,
  'entropy_full': 6.8563,
  'entropy_cut': 6.6881},
 {'used': ['gender',
   'marital_status',
   'number_vehicles',
   'ac

In [63]:
# Grouping columns used for the final table.
g = ["education", "gender", "number_vehicles", "accommodation", "employment", "children", "commute_time"]

# Don't ask why the names are ct..
# ct = df.groupby(g).size().to_frame("count")
# Removes the rows whose combination of g values is unique (disease is not
# taken into account), then orders the rest by g.
has_twin = df.duplicated(g, keep=False)
ctt = df[has_twin].sort_values(by=g)

# Same columns with "disease" appended.
g2 = g + ["disease"]

g3 = []

# Number of records per (g values, disease) combination.
cttt = ctt.groupby(g2).size().to_frame("count")


def checkForUnique(grouped_frame):
    """Flag, for each key prefix of a grouped frame, whether it occurs more than once.

    The index entries of ``grouped_frame`` are unpacked as tuples; their last
    element is set aside and the remaining ones, joined with ":", identify a
    group.

    Returns:
        dict mapping each identifier string to
        ``{"found": bool, "index": int}`` where ``found`` is True once the
        identifier has been seen on a second row, and ``index`` is the position
        of its first row plus 2.
        NOTE(review): the +2 offset presumably matches a 1-based row number
        below a header line in an exported file -- confirm.
    """
    checked = {}
    for position, (key, _values) in enumerate(grouped_frame.iterrows()):
        *parts, _last = key
        label = ':'.join(str(part) for part in parts)
        entry = checked.get(label)
        if entry is None:
            # First row seen for this identifier.
            checked[label] = {"found": False, "index": position + 2}
        else:
            # Any further row means the identifier spans several last-level values.
            entry["found"] = True
    return checked

# Groups of identical g values whose records all carry the same disease,
# reported by the "index" value returned by checkForUnique. Printed for
# inspection only.
false_checked = [v.get('index') for k, v in checkForUnique(cttt).items() if v.get("found") is False]
print(false_checked)

g_final = ["id", "gender", "education", "employment", "children", "number_vehicles", "commute_time", "accommodation", "disease"]

# Remove the records that are not good for l-diversity: keep only the records
# whose group (identical g values) contains at least two distinct diseases.
# The values in false_checked refer to rows of cttt (one row per group/disease
# pair), not to rows of ctt (one row per record), so they cannot be used to
# drop rows of ctt. Every record of a non-diverse group has to go, not one.
distinct_diseases = ctt.groupby(g)["disease"].transform("nunique")
base_1 = ctt.loc[distinct_diseases > 1, g_final].reset_index(drop=True)

# base_1.to_csv(os.path.join(os.curdir, "datasets", "final", "base_1_sorted.csv"), index=False)
# base_1.sort_values(by="id").to_csv(os.path.join(os.curdir, "datasets", "final", "base_1_final.csv"), index=False)

[106, 202, 227, 873, 917]
