In [1]:
import os
import hashlib
import pandas as pd
import numpy as np
from collections import Counter
import math, statistics

# Both input files sit in a "datasets" folder under the current directory.
datasets_dir = os.path.join(os.curdir, "datasets")
csv_path = os.path.join(datasets_dir, "dataset - Privacy-Engineering.csv")
zip_path = os.path.join(datasets_dir, "matched_zips.csv")

# Main dataset, loaded with pandas' default dtype inference.
df = pd.read_csv(csv_path)

# The "zip" column is read as text so it can serve as a string key below.
df_zip = pd.read_csv(zip_path, dtype={"zip": 'string'})

# Lookup table: value of the "zip" column -> value of the "state" column.
state_by_zip = df_zip.set_index('zip')['state']
matched_zips = state_by_zip.to_dict()

In [2]:
# Columns removed from the dataset before any further processing.
columns_to_drop = ["ancestry"]
df = df.drop(columns=columns_to_drop)

In [3]:
# Constant salt appended to every id before hashing.
# NOTE(review): the salt is stored in the notebook itself, so anyone with the
# notebook can recompute the digest of a guessed id - confirm this is acceptable.
salt = b"\x91\xff=s\x18?JH\x940\xf1$wN\xa4\xbf\x9f\xcdi\x8c\xa2\xf3\xe3I\xeb\xf4\x1f\xc0y\x94\x84\xa8"


def _pseudonymize(raw_id):
    """Return the hex SHA-512 digest of ``str(raw_id)`` followed by the salt."""
    digest = hashlib.sha512()
    digest.update(str(raw_id).encode())
    digest.update(salt)
    return digest.hexdigest()


# Replace every id with its salted digest.
df["id"] = df["id"].apply(_pseudonymize)

In [4]:
def find_state(zip_code, lookup=None):
    """Return the state whose 3-character ZIP prefix matches ``zip_code``.

    Args:
        zip_code: ZIP code as a string or number. Numeric values that lost
            their leading zeros during parsing (e.g. ``2134`` for ``"02134"``)
            are padded back to 5 digits before the prefix is taken.
            Assumes 5-digit US ZIP codes - TODO confirm against the dataset.
        lookup: Optional mapping from 3-character prefix to state. Defaults to
            the module-level ``matched_zips`` table.

    Returns:
        The mapped state, or ``"Unknown"`` when the prefix is not in the table.
    """
    table = matched_zips if lookup is None else lookup

    # A column containing missing values is parsed as float, so 2134 arrives
    # as 2134.0; turn whole-number floats back into ints before formatting.
    if isinstance(zip_code, float) and zip_code.is_integer():
        zip_code = int(zip_code)

    text = str(zip_code)
    # Restore leading zeros dropped by numeric parsing; without this "02134"
    # read as 2134 would be looked up under "213" instead of "021".
    if text.isdigit():
        text = text.zfill(5)

    res = table.get(text[0:3])
    if res is None:
        return "Unknown"
    return res

# Add the state derived from each ZIP code, then drop the ZIP code column itself.
df = df.assign(state=df["zipcode"].apply(find_state)).drop(columns="zipcode")

In [5]:
def more_than_2_children(x):
    """Bucket a number of children into the labels '0', '1' or '2+'.

    Strings are returned unchanged. Any other value that is neither 0, 1 nor
    greater than 1 (for instance a negative number or NaN) yields ``None``.
    """
    if isinstance(x, str):
        return x
    if x == 0:
        return '0'
    if x == 1:
        return '1'
    return '2+' if x > 1 else None

def has_child(childs):
    """Return ``False`` when ``childs`` equals 0 and ``True`` otherwise.

    Anything that does not compare equal to 0 - including missing values
    such as NaN - is reported as ``True``.
    """
    return not (childs == 0)

def more_than_1_hour(x):
    """Bucket a commute time into the labels '0', '1' or '1+'.

    Strings are returned unchanged. Exactly 0 maps to '0', any other value up
    to and including 1.0 maps to '1', and values above 1.0 map to '1+'.
    Values that satisfy none of the comparisons (e.g. NaN) yield ``None``.
    """
    if isinstance(x, str):
        return x
    if x == 0.0:
        return '0'
    if x <= 1.0:
        return '1'
    return '1+' if x > 1.0 else None

def change_accom(accomod):
    """Keep only the first space-separated word of an accommodation label.

    Args:
        accomod: Accommodation description such as ``"Own house"``.

    Returns:
        The text before the first space. A label without any space is
        returned whole, and a label with several spaces no longer raises.
        Non-string values (e.g. a missing value) are returned unchanged.
    """
    # Missing values arrive as non-strings; leave them untouched.
    if not isinstance(accomod, str):
        return accomod
    # Splitting at most once avoids the ValueError that two-name unpacking
    # raised whenever the label did not contain exactly one space.
    return accomod.split(' ', 1)[0]

def change_education(educ):
    """Merge the "PhD/md" and "Masters" levels into "Masters and higher".

    Every other value is returned unchanged.
    """
    merged_levels = ("PhD/md", "Masters")
    if educ in merged_levels:
        return "Masters and higher"
    return educ

def change_dob(dob):
    """Generalise a date of birth to a three-year interval.

    ``dob`` must contain exactly one '/'; the part after it is parsed as the
    year (the part before it is ignored - presumably the month). The interval
    starts at the largest multiple of 3 that is not greater than the year.

    Returns:
        A string of the form ``"[start - start+2]"``.
    """
    _ignored, year_text = dob.split('/')
    start = int(year_text)
    # Round down to the nearest multiple of 3.
    start -= start % 3
    end = start + 2
    return f"[{start} - {end}]"

# Generalisation applied to each column, in the order listed.
# "children" is reduced to a boolean with has_child; more_than_2_children is
# the finer-grained alternative that is currently not used.
column_transforms = {
    "dob": change_dob,
    "accommodation": change_accom,
    "education": change_education,
    "children": has_child,
    "commute_time": more_than_1_hour,
}

for column, transform in column_transforms.items():
    df[column] = df[column].apply(transform)

In [6]:
def entropy(C, N=2000):
    """Shannon entropy (in bits) of group counts ``C`` out of ``N`` records.

    Args:
        C: Iterable of group sizes.
        N: Total used to turn each count into a probability ``ci / N``.

    Returns:
        ``-sum(p * log2(p))`` rounded to 4 decimals. Zero counts contribute
        nothing (the usual 0 * log 0 = 0 convention) instead of raising a
        math domain error. With N = 2000 the maximum is log2(2000) ~ 10.97,
        reached when every count equals 1.
    """
    total = sum((ci / N) * math.log(ci / N, 2) for ci in C if ci > 0)
    return round(-total, 4)

In [7]:
from itertools import *
from tqdm import tqdm


covariates = ["education", "gender", "dob", "state", "children", "employment", "marital_status", "number_vehicles", "commute_time", "accommodation"]

# Total number of records, used to turn group counts into probabilities.
# NOTE(review): this replaces a hard-coded 2000. Results are identical for a
# 2000-row dataset and stay consistent if the row count differs.
n_rows = len(df)

results = []

# Evaluate every non-empty subset of the covariates, always grouped together
# with the "disease" column.
for c in tqdm(range(1, len(covariates)+1)):
    for used in combinations(covariates, c):
        used = list(used) + ["disease"]

        # Size of each distinct combination of values.
        ct = df.groupby(used).size().to_frame("count")
        C = ct['count'].to_numpy()

        len_groups = len(ct)
        # Groups holding fewer than 2 records (vectorised count).
        len_unique = int((C < 2).sum())

        # Counts of the groups that hold more than one record.
        CR = C[C > 1]

        # Records left once one record per such small group is removed.
        size_cr = n_rows - len_unique

        entropy_full = entropy(C, n_rows)
        entropy_cut = entropy(CR, size_cr)

        results.append(
            {
                "used": used,
                "len_groups": len_groups,
                "len_unique": len_unique,
                "diff": size_cr,
                "entropy_full": entropy_full,
                "entropy_cut": entropy_cut
            }
        )

100%|██████████| 10/10 [00:08<00:00,  1.20it/s]


In [10]:
def _ranking_key(row):
    """Sort key: diff first, then len_groups, entropy_cut and entropy_full."""
    return (row['diff'], row['len_groups'], row['entropy_cut'], row['entropy_full'])


# Largest values first on every component of the key.
results_sort = sorted(results, key=_ranking_key, reverse=True)

In [45]:
# import csv

# with open(os.path.join(os.curdir, "datasets", "entropy_results.csv"), "w", newline='') as out:
#     dict_writer = csv.DictWriter(out, results_sort[0].keys())
#     dict_writer.writeheader()
#     dict_writer.writerows(results_sort)

In [11]:
def _is_candidate(row):
    """True when 0 < len_unique < 100 and entropy_full is above 5."""
    n_unique = row.get('len_unique')
    return 0 < n_unique < 100 and row.get('entropy_full') > 5


# Keep only the candidate combinations, then rank them by entropy_cut, highest first.
results_filter = list(filter(_is_candidate, results_sort))
results_filter_sorted = sorted(results_filter, key=lambda row: row['entropy_cut'], reverse=True)

In [12]:
# Best candidates? Show the first ten entries of the filtered, sorted list.
results_filter_sorted[:10]

[{'used': ['gender', 'dob', 'disease'],
  'len_groups': 533,
  'len_unique': 80,
  'diff': 1920,
  'entropy_full': 8.8033,
  'entropy_cut': 8.6543},
 {'used': ['education',
   'gender',
   'number_vehicles',
   'accommodation',
   'disease'],
  'len_groups': 430,
  'len_unique': 94,
  'diff': 1906,
  'entropy_full': 8.3062,
  'entropy_cut': 8.1056},
 {'used': ['dob', 'children', 'disease'],
  'len_groups': 394,
  'len_unique': 90,
  'diff': 1910,
  'entropy_full': 8.2104,
  'entropy_cut': 8.0142},
 {'used': ['education', 'number_vehicles', 'commute_time', 'disease'],
  'len_groups': 392,
  'len_unique': 76,
  'diff': 1924,
  'entropy_full': 8.1299,
  'entropy_cut': 7.962},
 {'used': ['dob', 'disease'],
  'len_groups': 304,
  'len_unique': 16,
  'diff': 1984,
  'entropy_full': 7.9867,
  'entropy_cut': 7.9511},
 {'used': ['gender',
   'number_vehicles',
   'commute_time',
   'accommodation',
   'disease'],
  'len_groups': 385,
  'len_unique': 86,
  'diff': 1914,
  'entropy_full': 8.0614,

In [49]:
# Column combination inspected in detail.
g = ["education", "gender", "number_vehicles", "accommodation","disease"]

# Number of records for each distinct combination of values.
ct = df.groupby(g).size().to_frame("count")

# Drop the records whose combination of values is unique, then order by the
# grouping columns.
shares_group = df.duplicated(subset=g, keep=False)
ctt = df.loc[shares_group].sort_values(by=g)
ctt

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count
education,gender,number_vehicles,accommodation,disease,Unnamed: 5_level_1
Bachelor,female,0,Own,HIV/AIDS,5
Bachelor,female,0,Own,breast cancer,2
Bachelor,female,0,Own,diabetes,2
Bachelor,female,0,Own,endometriosis,1
Bachelor,female,0,Own,gastritis,5
...,...,...,...,...,...
Masters and more,male,3,Own,hypertension,1
Masters and more,male,3,Own,kidney disease,2
Masters and more,male,3,Own,prostate cancer,1
Masters and more,male,3,Own,skin cancer,2
