In [1]:
#import packages
import pandas as pd
import os
import numpy as np
import sys
import math

#retrieve race keys
%store -r races
%store -r races2

In [2]:
###########################################################################################################################
#CREATE_DATA: LOAD IN DATASET
###########################################################################################################################
# INPUT: 
#      path: path csv containing block data
#      age_buckets: value for number of items in each age bucket

#  -reads in csv file to pandas dataframe
#  -replaces race string with equivalent ints 0-63 race if applicable
#  -replaces ages with appropriate age bucket labels if applicable

# OUTPUT: dataframe containing dataset with modified age and race values
##########################################################################################################################

def create_data(path, age_groupsize, max_age):
    #import original data
    df_orig = pd.read_csv (path)
    df_orig = df_orig.loc[:, ~df_orig.columns.str.contains('^Unnamed')]
    
    #adjust values of necessary
    for index, row in df_orig.iterrows():    
        
        #RACE
        d = row['race']
        h = row['hispanic']
        #replace race w/ value 0-63
        if d in races:
            df_orig.at[index, 'race'] = races.index(d)
        elif d in races2:
            df_orig.at[index, 'race'] = races2.index(d)        
            
        #AGE
        #if age buckets are implemented:
        if age_groupsize > 1: 
            a = row['age']
            #get number of times groupsize goes into age:
            #ex. get new value for 12 with a group size of 5:
            #    12/5 = 2 w/remainder of 3 --> would return 2
            #set value in original dataset to this age bucket value
            df_orig.at[index, 'age'] = np.floor(a/age_groupsize)
                 
    return df_orig

In [3]:
###########################################################################################################################
#CREATE ATTRIBUTE COMBOS TO LOOK FOR IN DATASETS
###########################################################################################################################
#INPUT:
#     age_range: all possible values for age
#     race_range: all possible values for race
#     size_range: all possible values for household_size

#  -create all possible combos of age, race, household size

#OUTPUT:
#  -M: list containing all age, race, household size combos
###########################################################################################################################

def create_combos(age_range, race_range, size_range):
    
    #list containing all age, race, household size combos
    M = []
    
    #create all combos; append to M
    for age in age_range:
            for race in race_range:
                for size in size_range:
                    M.append([age, race, size])
                    
    return M

In [4]:
###########################################################################################################################
#FIND UNIQUE ROWS
###########################################################################################################################
#INPUT:
#     M: set of attributes shared between D and P ['age', 'race', 'household_size']
#     D: dataframe to search for unique rows
#     theta_d: attribute set size limit in D. we only examine attribute tuples with at most theta_d matching entries in D

#  -get how many times each attribute combination occurs in the dataframe
#  -if only occurs once, add id to unique list; if occurs under the given theta_d threshold, add id to under_threshold list
#  -return these lists of rows that are unique or under threshold 

#OUTPUT:
#  -unique: list of ids of rows that are unique for (age, race, household_size) in the dataset
#  -under_threshold: list of ids of rows that are under the given occurence threshold in the dataset
###########################################################################################################################
    
def get_unique_rows(M, D, theta_d):

    #vals to be returned:
    #1. unique rows in D 
    unique = []
    #2. those under theta_d threshold
    under_threshold = []
   
    #for attribute combination in M
    for row in M:   
        
        #true case - get how many times that the attribute combination occurs in the deidentified dataset 
        att_val_combos = D.loc[(D['age'] == row[0]) & (D['race'] == row[1]) & (D['household_size'] == row[2])]
        
        #if combo occurs at least once and less than threshold in deidentified dataset
        if len(att_val_combos) <= theta_d and len(att_val_combos) > 0:
            #add IDs to under_threshold list
            for index, row in att_val_combos.iterrows():
                under_threshold.append([row['id'], [row['age'], row['race'], row['household_size']]])
            
        if len(att_val_combos) == 1:
            #add IDs to unique list
            for index, row in att_val_combos.iterrows():
                unique.append([row['id'], [row['age'], row['race'], row['household_size']]])
    
    return unique, under_threshold

In [5]:
###########################################################################################################################
#FIND ROW MATCHES
###########################################################################################################################
#INPUT:
#     public_dataset: publically published dataset
#     P_unique: ids of unique rows in public dataset
#     de_id_dataset: deidentified dataset
#     D_unique: ids of unique rows in de-identified dataset

#  -find all unique rows with matching row values
#  -return ids of matches

#OUTPUT:
#     matches: list of ids of all matching unique values [id in public, id in de_id]
###########################################################################################################################
def find_matches(P_unique, D_unique):
    
    #create list of unique row values only
    D_unique_rows = []
    for d in D_unique:
        D_unique_rows.append(d[1])
        
    #LIST OF ROWS WITH MATCHES
    matches = []
    
    #for each row in P_unique, see if there's a matching row in D_unique
    for p_row in P_unique:
        #see if values of [age, race, sex] associated with unique row in p (p_row) are also a unique row in D
        if p_row[1] in D_unique_rows:
            # if so, find associated ID
            for d_row in D_unique:
                if d_row[1] == p_row[1]:
                    matches.append([p_row[0], d_row[0]])
                    
    return matches

In [6]:
###########################################################################################################################
#CHECK VALIDITY OF MATCHES
###########################################################################################################################
#INPUT:
#     public_dataset: publically published dataset
#     de_id_dataset: deidentified dataset
#     matches: list of ids of all matching unique values [id in public, id in de_id]
#     minority_races: list containing minority races for county 

#  -for each match, check if var predicted is the same, and if vars are the same

#OUTPUT:
#     correct_matches: matches with same id (meaning the same person's data)
#     correct_values: all reported matches that had same values (not necessarily an exact ID match)
###########################################################################################################################


def check_true_vals(matches, public_dataset, de_id_dataset, minority_races):
    correct_matches = []
    correct_values = []
    correct_matches_minority = []
    correct_values_minority = []
    
    #for each match
    for match in matches:
        
        #get values for p_id and d_id
        p_id = match[0]
        d_id = match[1]
        
        #get row values are all the same for p_id row and d_id row
        p_row = public_dataset.loc[public_dataset['id'] == d_id]
        d_row = de_id_dataset.loc[de_id_dataset['id'] == p_id]
        #check if all relevant column values match for p_id and d_id rows
        if ((p_row.iloc[0]['hispanic']==d_row.iloc[0]['hispanic'])):
            #if all values match, add to correct_values
            correct_values.append(match)
            if p_row.iloc[0]['race'] in minority_races:
                correct_values_minority.append(match)
        if p_row.iloc[0]['id']==d_row.iloc[0]['id']:
            #if ids match, add to correct_matches
            correct_matches.append(match)   
            if p_row.iloc[0]['race'] in minority_races:
                correct_matches_minority.append(match)
    
    return correct_matches, correct_values, correct_matches_minority, correct_values_minority

In [7]:

def evaluate_privacy(attribute_ranges, original_data_path, minority_races, de_id_data_directory, age_groupsize, theta_d, k1, k2):
    
    correct = []
    total = []
    correct_minority = []
    total_minority = []
    
    #create list of all combinations of attributes that could be present in either the public or de-identified dataset
    attribute_combos = create_combos(attribute_ranges[0], attribute_ranges[1], attribute_ranges[2])

    #create public dataset
    public_dataset = create_data(original_data_path, age_groupsize, len(attribute_ranges[0])-1)
        
    #find unique rows in public dataset
    P_unique, P_under_threshold = get_unique_rows(attribute_combos, public_dataset, theta_d)
    
    #these are the swap rates to run privacy analysis on
    swaprates = np.arange(k1, k2, .01, float).tolist()
    
    #for each de-id in given directory:
    for swaprate in swaprates:
        
        correct_matches_all =[]
        total_matches_all = []
        correct_matches_minority_all =[]
        total_matches_minority_all = []
        
        #for all de-id files in directory:
        for filename in os.listdir(de_id_data_directory):
            
            #check if file is a match for the given swaprate:
            swap_file = filename[(filename.find("_"))+1:filename.rfind("_")]
            
            #if match:
            if filename.endswith(".csv") and math.isclose(float(swaprate), float(swap_file)):
                
                #create deid-ed dataset 
                de_id_dataset = create_data(de_id_data_directory+filename, age_groupsize, len(attribute_ranges[0])-1)
                
                #find unique rows in de-ided
                D_unique, D_under_threshold = get_unique_rows(attribute_combos, de_id_dataset, theta_d)

                #find matches for unique rows
                matches = find_matches(P_unique, D_unique)

                #use ID to check true values (predict var & see if var is correct)
                correct_matches, total_matches, correct_matches_minority, total_matches_minority = check_true_vals(matches, public_dataset, de_id_dataset, minority_races)

                #add to long result list for given swaprate 
                correct_matches_all.append(len(correct_matches))
                total_matches_all.append(len(total_matches))
                correct_matches_minority_all.append(len(correct_matches_minority))
                total_matches_minority_all.append(len(total_matches_minority))
        
        
        #get average value for each swap rate
        print(correct_matches_all)
        print(correct_matches_minority_all)
        #sum values for total:
        #format: [match value, # of runs at swaprate]
        correct_matches_total = [0,0]
        total_matches_total = [0,0]
        #sum values for minority groups:
        #format: [match value, # of runs at swaprate]
        correct_matches_minority_total = [0,0]
        total_matches_minority_total = [0,0]
        
        for s in correct_matches_all:
            correct_matches_total[0] = correct_matches_total[0] + s
            correct_matches_total[1] = correct_matches_total[1] + 1

        for s in total_matches_all:
            total_matches_total[0] = total_matches_total[0] + s
            total_matches_total[1] = total_matches_total[1] + 1
            
        for s in correct_matches_minority_all:
            correct_matches_minority_total[0] = correct_matches_minority_total[0] + s
            correct_matches_minority_total[1] = correct_matches_minority_total[1] + 1

        for s in total_matches_minority_all:
            total_matches_minority_total[0] = total_matches_minority_total[0] + s
            total_matches_minority_total[1] = total_matches_minority_total[1] + 1
            
            
        #find average value (sum divided by # of values)
        corr_matches = correct_matches_total[0]/correct_matches_total[1]
        tot_matches = total_matches_total[0]/total_matches_total[1]
        corr_minority_matches = correct_matches_minority_total[0]/correct_matches_minority_total[1]
        tot_minority_matches = total_matches_minority_total[0]/total_matches_minority_total[1]
        
        correct.append([swaprate, corr_matches])
        total.append([swaprate, tot_matches])
        correct_minority.append([swaprate, corr_minority_matches])
        total_minority.append([swaprate, tot_minority_matches])
        
        
        print([swaprate, corr_matches], [swaprate, tot_matches], [swaprate, corr_minority_matches], [swaprate, tot_minority_matches])
    
    return correct, total, correct_minority, total_minority
    

In [9]:
#RUNNER FUNCTION

def runner_function(county, swapping_directory_type, age_groupsize, theta_d, start_rate, stop_rate, minority_races):
    attribute_ranges = [range(int(90/age_groupsize)), range(len(races)), [1,2,3,4]]
    correct, total, minority_correct, minority_total = evaluate_privacy(attribute_ranges, '../homemade_data/' + county + '.csv', minority_races,
                 '../swapping/swap_runs/'+county+'/'+swapping_directory_type+'/',
                 age_groupsize, theta_d, start_rate, stop_rate)
    with open(county+"_"+swapping_directory_type+"_"+age_groupsize+'_privacy.txt', "a") as f:
        f.write("Total Correct Matches:\n\n")
        for c in correct:
            f.write(c + "\n")
        f.write("Total Value Matches:\n\n")
        for t in total:
            f.write(t + "\n")
        f.write("Minority Correct Matches:\n\n")
        for c in minority_correct:
            f.write(c + "\n")
        f.write("Minority Value Matches:\n\n")
        for t in minority_total:
            f.write(t + "\n")
            

#WILL NEED TO ADJUST THIS FROM COUNTY TO COUNTY
alameda_minority_races = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62]


            
#UNCOMMENT THIS FOR VMS
# county = str(sys.argv[1])
# swapping_type = str(sys.arg[2])
# groupsize = int((sys.argv[3]))
# theta_d = int((sys.argv[4]))
# start_rate = float((sys.argv[5]))
# stop_rate = float ((sys.argv[6]))


#COMMENT THIS OUT FOR VMS
county = 'alameda'
swapping_type = 'similar_3'
groupsize = 15
theta_d = 3
start_rate = .01
stop_rate = 1.0

runner_function(county, swapping_type, groupsize, theta_d, start_rate, stop_rate, alameda_minority_races)

[23, 28, 27]
[20, 22, 21]
[0.01, 26.0] [0.01, 26.333333333333332] [0.01, 21.0] [0.01, 21.0]
[25, 27, 27]
[19, 21, 21]
[0.02, 26.333333333333332] [0.02, 26.333333333333332] [0.02, 20.333333333333332] [0.02, 20.333333333333332]


KeyboardInterrupt: 