In [6]:
import os
import hashlib
import pandas as pd
import numpy as np
from collections import Counter
import math, statistics

# Both input files live in ./datasets, resolved against the current working directory.
datasets_dir = os.path.join(os.curdir, "datasets")
csv_path = os.path.join(datasets_dir, "dataset - Privacy-Engineering.csv")
zip_path = os.path.join(datasets_dir, "matched_zips.csv")

# Main records table; column dtypes are left to pandas' inference.
df = pd.read_csv(csv_path)

# ZIP lookup table. The "zip" column is forced to string dtype so its values stay
# textual; find_state queries the resulting dict with a 3-character string key.
df_zip = pd.read_csv(zip_path, dtype={"zip": 'string'})
zip_to_state = df_zip.set_index('zip')['state']
matched_zips = zip_to_state.to_dict()

In [7]:
# Columns removed from the working frame before any further processing.
columns_to_drop = ["ancestry"]
df = df.drop(columns=columns_to_drop)

In [8]:
# Fixed 32-byte salt appended to every id before hashing.
# NOTE(review): the salt is stored in the notebook itself, so anyone holding the
# notebook can recompute the digest of a guessed id — consider loading it from
# outside the repository.
salt = b"\x91\xff=s\x18?JH\x940\xf1$wN\xa4\xbf\x9f\xcdi\x8c\xa2\xf3\xe3I\xeb\xf4\x1f\xc0y\x94\x84\xa8"

# Replace each id with the hex SHA-512 digest of str(id) followed by the salt.
df["id"] = df["id"].apply(
    lambda raw_id: hashlib.sha512(str(raw_id).encode() + salt).hexdigest()
)

# Keep only the second "/"-separated field of dob (the preview below shows a
# four-digit year in that position).
df["dob"] = df["dob"].apply(lambda raw_dob: raw_dob.split("/")[1])

In [9]:
# Preview the first five rows (head() defaults to n=5) after the id/dob rewrite.
df.head()

Unnamed: 0,id,gender,dob,zipcode,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease
0,97eb7a57528a1baea5a7be646149f9d75cc53f3c5ba87f...,female,1950,26904,High School,Retired,2,married,2,0.0,Own house,Alzheimer's disease
1,730151ea90e546b7b8fe5911d91ac943a9095ebb6691ba...,female,1940,78088,PhD/md,Retired,1,married,1,0.0,Rent flat,multiple sclerosis
2,ef150a1bf99c1188ff526b1408d959e2cdb1729c4f1153...,female,1966,51502,Bachelor,Employed,1,married,0,0.1,Rent flat,heart disease
3,d9240d1730615fa39e50b22de35439c79814e162454e9e...,female,1943,54080,Bachelor,Retired,1,married,0,0.0,Rent room,endometriosis
4,a445d7cdb7efbbc10c59a27331869c8151ddb55313fcd0...,female,1947,68785,High School,Retired,1,married,0,0.0,Rent flat,gastritis


In [10]:
def find_state(zip_code, lookup=None):
    """Map a ZIP code to its state label via the code's 3-digit prefix.

    Parameters
    ----------
    zip_code : int, float or str
        ZIP code to resolve. The value is stringified and left-padded with
        zeros to five characters before the prefix is taken: a ZIP that was
        parsed as a number has lost its leading zeros (``02139`` -> ``2139``)
        and would otherwise produce the prefix ``"213"`` instead of ``"021"``.
    lookup : mapping, optional
        Prefix -> state mapping to query. Defaults to the notebook-level
        ``matched_zips`` dictionary.

    Returns
    -------
    The value stored for the prefix, or ``"Unknown"`` when the prefix is
    missing from the mapping (or is mapped to ``None``).
    """
    if lookup is None:
        lookup = matched_zips

    # A whole-number float such as 2139.0 would stringify as "2139.0" and defeat
    # the zero padding, so collapse it to an int first.
    if isinstance(zip_code, float) and zip_code.is_integer():
        zip_code = int(zip_code)

    prefix = str(zip_code).strip().zfill(5)[:3]
    res = lookup.get(prefix)
    if res is None:
        return "Unknown"
    return res

# Derive a "state" column from each ZIP code, then discard the ZIP column itself.
df["state"] = df["zipcode"].apply(find_state)
df = df.drop(columns="zipcode")

In [11]:
def more_than_2_children(x):
    """Bucket a child count into the labels '0', '1' or '2+'.

    Strings are returned unchanged. A value that is neither 0, nor 1, nor
    greater than 1 (a negative number, a fraction below 1, NaN, ...) yields None.
    """
    if isinstance(x, str):
        return x
    if x == 0:
        return '0'
    if x == 1:
        return '1'
    return '2+' if x > 1 else None

def has_child(childs):
    """Return False when ``childs`` compares equal to 0, True for anything else."""
    return not (childs == 0)

def more_than_1_hour(x):
    """Bucket a commute-time value into the labels '0', '1' or '1+'.

    * strings pass through untouched;
    * a value equal to 0 -> '0';
    * any other value up to and including 1.0 -> '1';
    * a value above 1.0 -> '1+';
    * a value satisfying none of the comparisons (e.g. NaN) -> None.
    """
    if isinstance(x, str):
        return x
    if x == 0.0:
        return '0'
    if x <= 1.0:
        return '1'
    if x > 1.0:
        return '1+'
    return None

def change_accom(accomod):
    """Return the first space-separated word of an accommodation label.

    ``"Own house"`` -> ``"Own"``, ``"Rent flat"`` -> ``"Rent"``.

    Only the leading token is extracted, so the label may contain any number
    of words. In particular an already-shortened value (``"Own"``) is returned
    unchanged, which keeps the cell that applies this function re-runnable;
    unpacking the split into exactly two names would raise ValueError there.
    """
    return accomod.split(' ', 1)[0]

def change_education(educ):
    """Merge the "PhD/md" and "Masters" levels into "Masters and more".

    Every other value is returned unchanged.
    """
    if educ in ("PhD/md", "Masters"):
        return "Masters and more"
    return educ

# Coarsen the columns one at a time with the helpers defined above.
df["accommodation"] = df["accommodation"].apply(change_accom)
df["education"] = df["education"].apply(change_education)

# children is reduced to a boolean flag via has_child; the finer-grained
# more_than_2_children bucketing ('0' / '1' / '2+') is not applied here.
df["children"] = df["children"].apply(has_child)

df["commute_time"] = df["commute_time"].apply(more_than_1_hour)
df.head()

Unnamed: 0,id,gender,dob,education,employment,children,marital_status,number_vehicles,commute_time,accommodation,disease,state
0,97eb7a57528a1baea5a7be646149f9d75cc53f3c5ba87f...,female,1950,High School,Retired,True,married,2,0,Own,Alzheimer's disease,Unknown
1,730151ea90e546b7b8fe5911d91ac943a9095ebb6691ba...,female,1940,Masters and more,Retired,True,married,1,0,Rent,multiple sclerosis,(TX) Texas
2,ef150a1bf99c1188ff526b1408d959e2cdb1729c4f1153...,female,1966,Bachelor,Employed,True,married,0,1,Rent,heart disease,(IA) Iowa
3,d9240d1730615fa39e50b22de35439c79814e162454e9e...,female,1943,Bachelor,Retired,True,married,0,0,Rent,endometriosis,(WI) Wisconsin
4,a445d7cdb7efbbc10c59a27331869c8151ddb55313fcd0...,female,1947,High School,Retired,True,married,0,0,Rent,gastritis,(NE) Nebraska


In [12]:
def entropy(C, N=2000):
    """Entropy in bits of the group sizes ``C``, rounded to 4 decimals.

    Computes ``-sum((ci / N) * log2(ci / N))`` over the counts in ``C``.

    Parameters
    ----------
    C : iterable of numbers
        Group sizes. Every count must be positive: a zero count makes
        ``math.log`` raise ValueError.
    N : number, default 2000
        Denominator that turns each count into a share.
    """
    shares = (ci / N for ci in C)
    return round(-sum(p * math.log(p, 2) for p in shares), 4)
# Max ~ 11: log2(2000) ~ 10.97, reached when each of 2000 records is its own group.

In [30]:
from itertools import *
from tqdm import tqdm


# Columns whose subsets are evaluated; "disease" is always appended to each subset.
covariates = ["education", "gender", "dob", "state", "children", "employment", "marital_status", "number_vehicles", "commute_time", "accommodation"]

# Total number of records, taken from the frame instead of a hard-coded 2000 so
# the entropy denominators stay correct if the dataset size changes.
n_records = len(df)

results = []

# Walk every non-empty subset of covariates, grouped by subset size c.
for c in tqdm(range(1, len(covariates)+1)):
    for used in combinations(covariates, c):
        used = list(used) + ["disease"]

        # One row per distinct value combination, with its number of records.
        ct = df.groupby(used).size().to_frame("count")
        C = ct['count'].to_numpy()

        len_groups = len(ct)
        # Groups holding a single record (a group size is never below 1).
        len_unique = int((C < 2).sum())

        # Group sizes once the single-record groups are removed, and the number
        # of records remaining after that removal.
        CR = C[C > 1]
        size_cr = n_records - len_unique

        entropy_full = entropy(C, n_records)
        entropy_cut = entropy(CR, size_cr)

        results.append(
            {
                "used": used,
                "len_groups": len_groups,
                "len_unique": len_unique,
                # Stored as "new_size": the later sort cell reads x['new_size'],
                # so writing it under any other key makes that cell raise KeyError.
                "new_size": size_cr,
                "entropy_full": entropy_full,
                "entropy_cut": entropy_cut
            }
        )

100%|██████████| 10/10 [00:09<00:00,  1.06it/s]


In [58]:
# Order the result rows in descending order of the tuple
# (new_size, len_groups, entropy_cut, entropy_full).
results_sort = sorted(
    results,
    key=lambda row: (
        row['new_size'],
        row['len_groups'],
        row['entropy_cut'],
        row['entropy_full'],
    ),
    reverse=True,
)

In [45]:
# import csv

# with open(os.path.join(os.curdir, "datasets", "entropy_results.csv"), "w", newline='') as out:
#     dict_writer = csv.DictWriter(out, results_sort[0].keys())
#     dict_writer.writeheader()
#     dict_writer.writerows(results_sort)

In [70]:
# Keep rows with between 1 and 99 single-record groups and a full entropy above 5,
# then rank the survivors by entropy_cut, highest first.
results_filter = [
    row
    for row in results_sort
    if 0 < row.get('len_unique') < 100 and row.get('entropy_full') > 5
]
results_filter_sorted = sorted(
    results_filter,
    key=lambda row: row['entropy_cut'],
    reverse=True,
)

In [71]:
# Best candidates? Show the ten rows with the highest entropy_cut.
results_filter_sorted[:10]

[{'used': ['education',
   'gender',
   'number_vehicles',
   'accommodation',
   'disease'],
  'len_groups': 430,
  'len_unique': 94,
  'new_size': 1906,
  'entropy_full': 8.3062,
  'entropy_cut': 8.1056},
 {'used': ['education', 'number_vehicles', 'commute_time', 'disease'],
  'len_groups': 392,
  'len_unique': 76,
  'new_size': 1924,
  'entropy_full': 8.1299,
  'entropy_cut': 7.962},
 {'used': ['gender',
   'number_vehicles',
   'commute_time',
   'accommodation',
   'disease'],
  'len_groups': 385,
  'len_unique': 86,
  'new_size': 1914,
  'entropy_full': 8.0614,
  'entropy_cut': 7.8675},
 {'used': ['marital_status', 'number_vehicles', 'commute_time', 'disease'],
  'len_groups': 392,
  'len_unique': 88,
  'new_size': 1912,
  'entropy_full': 7.996,
  'entropy_cut': 7.7944},
 {'used': ['education', 'gender', 'number_vehicles', 'disease'],
  'len_groups': 277,
  'len_unique': 27,
  'new_size': 1973,
  'entropy_full': 7.7541,
  'entropy_cut': 7.6905},
 {'used': ['education', 'gender', 