In [1]:
#import packages
import csv
import pandas as pd
import numpy as np
from diffprivlib import tools as dp
from diffprivlib.mechanisms import Geometric
import os
from collections import Counter
import math

#import race data label lists
%store -r races
%store -r races2

In [2]:
#This method imports the block data and creates a pandas dataframe containing county block group data
#-------------------------------------------------
#INPUT:
# path: file path to county block group data
#-------------------------------------------------
#OUTPUT: 
# df: dataframe containing county block group data 

def create_block_data(path):
    """Load a county block-group CSV into a DataFrame, dropping 'Unnamed' columns.

    Parameters
    ----------
    path : str or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents without any column whose name starts with
        'Unnamed' (presumably index columns left over from an earlier
        export -- TODO confirm).
    """
    raw = pd.read_csv(path)
    # True for every column whose header begins with "Unnamed".
    unnamed_mask = raw.columns.str.contains('^Unnamed')
    return raw.loc[:, ~unnamed_mask]

In [3]:
#This method creates the list of all possible combinations of attributes
#-------------------------------------------------
#INPUT:
# lists pertaining to the possible values for each variable (ex. [0,1] for nonhispanic, hispanic or range(63) for 63 races)
#-------------------------------------------------
#OUTPUT: 
# combos: list containing all possible combinations/iterations of data and a secondary value
    # [[age, hisp, race, size], count_value]
# noisy queries: empty 4D numpy array for storing query values 

def create_combos(age_list, hispanic_list, race_list, housesize_list):
    """Enumerate every attribute combination and allocate a matching zero array.

    Parameters
    ----------
    age_list, hispanic_list, race_list, housesize_list : sequence
        Possible values of each attribute (must support ``len``).

    Returns
    -------
    combos : list
        One ``[[age, hispanic, race, household_size], 0]`` entry per
        combination, with age varying slowest and household size fastest.
    noisy_queries : numpy.ndarray
        4D array of zeros. The last axis has ``len(housesize_list) + 1``
        slots, so household sizes starting at 1 can be used directly as
        indices (slot 0 stays unused).
    """
    dims = (
        len(age_list),
        len(hispanic_list),
        len(race_list),
        len(housesize_list) + 1,
    )
    noisy_queries = np.zeros(shape=dims)

    # Each entry gets its own inner lists so counts can be filled in place later.
    combos = [
        [[age, hispanic, race, household_size], 0]
        for age in age_list
        for hispanic in hispanic_list
        for race in race_list
        for household_size in housesize_list
    ]

    return combos, noisy_queries

In [4]:
# This method computes the real population sizes for each (voting age Y/N)*(hispanic Y/N)*(race) combination
#INPUT:
# dft: dataframe corresponding to a block group
# shape: combos structure all possible values for [voting age][hisp][race][hsize]
        # includes 0 for household size, even though no values in dataset for
        # sake of simplicity
#-------------------------------------------------
#OUTPUT: 
# queries: list where each entry is [[voting age, hisp, race, size], count_value]
#          count_value is the size of the population with attributes [voting age, hisp, race, size]

def get_true_counts_binary(dft, shape):
    """Count the rows of ``dft`` matching each attribute combination, using a binary age split.

    Age value 0 selects rows with ``age`` < 18; any other age value selects
    rows with ``age`` >= 18.

    Parameters
    ----------
    dft : pandas.DataFrame
        Must provide the columns 'age', 'hispanic', 'race' and 'household_size'.
    shape : sequence
        ``[age values, hispanic values, race values, household-size values]``,
        forwarded to ``create_combos``.

    Returns
    -------
    list
        The combination list from ``create_combos`` with every entry filled in
        as ``[[age, hisp, race, size], count]``.
    """
    # Only the combination list is needed here; the zero array is discarded.
    queries, _ = create_combos(shape[0], shape[1], shape[2], shape[3])

    for entry in queries:
        age_value, hispanic_value, race_value, size_value = entry[0]

        # age: 0 means under voting age, anything else means 18 and over
        if age_value == 0:
            matches = dft['age'] < 18
        else:
            matches = dft['age'] >= 18

        # narrow the selection by the remaining three attributes
        matches = matches & (dft['hispanic'] == hispanic_value)
        matches = matches & (dft['race'] == race_value)
        matches = matches & (dft['household_size'] == size_value)

        # number of rows satisfying all four constraints
        entry[1] = int(matches.sum())

    return queries

In [5]:
# This method computes the real population sizes for each (age group of group_size yrs)*(hispanic Y/N)*(race) combination
#INPUT:
# dft: dataframe corresponding to a block group
# shape: combos structure all possible values for [age group][hisp][race][hsize]
        # includes 0 for household size, even though no values in dataset for
        # sake of simplicity
#-------------------------------------------------
#OUTPUT: 
# queries: list where each entry is [[age group, hisp, race, size], count_value]
#          count_value is the number of individuals with attributes [age group, hisp, race, size]

def get_true_counts_age_race(dft, shape, group_size):
    """Count the rows of ``dft`` matching each attribute combination, with ages bucketed into groups.

    Age group ``g`` covers ``g * group_size <= age < (g + 1) * group_size``.
    Rows whose age falls outside every listed group are not counted.

    Parameters
    ----------
    dft : pandas.DataFrame
        Must provide the columns 'age', 'hispanic', 'race' and 'household_size'.
    shape : sequence
        ``[age-group values, hispanic values, race values, household-size values]``,
        forwarded to ``create_combos``.
    group_size : number
        Width of each age group, in years.

    Returns
    -------
    list
        The combination list from ``create_combos`` with every entry filled in
        as ``[[age group, hisp, race, size], count]``.
    """
    # Only the combination list is needed here; the zero array is discarded.
    queries, _ = create_combos(shape[0], shape[1], shape[2], shape[3])

    for entry in queries:
        age_group, hispanic_value, race_value, size_value = entry[0]

        # age: keep rows inside the half-open interval covered by this group
        matches = dft['age'] >= age_group * group_size
        matches = matches & (dft['age'] < (age_group + 1) * group_size)

        # narrow the selection by the remaining three attributes
        matches = matches & (dft['hispanic'] == hispanic_value)
        matches = matches & (dft['race'] == race_value)
        matches = matches & (dft['household_size'] == size_value)

        # number of rows satisfying all four constraints
        entry[1] = int(matches.sum())

    return queries

In [6]:
#This method computes noisy population sizes for each (age group)*(hispanic Y/N)*race combination using
# a geometric mechanism.
#-------------------------------------------------
#INPUT:
# queries: list where each entry is [[age group, hisp, race, size], count_value]
#          count_value is the number of individuals with attributes [age group, hisp, race, size]
# shape: combos structure all possible values for [age group][hisp][race][hsize]
        # includes 0 for household size, even though no values in dataset for
        # sake of simplicity
# ep: epsilon value for the county-wide mechanism
#-------------------------------------------------
#OUTPUT: 
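# race_histogram: numpy 2D array of shape (2, 63)
#          race_histogram[hispan][ra] sums noisy_queries[ag][hispan][ra][hs] over age indices 0 and 1 and all household sizes,
#          where noisy_queries[ag][hispan][ra][hs] is a noisy count of the number of individuals with attributes (ag, hispan, ra, hs)
#          the noisy count is obtained using a geometric mechanism with epsilon value ep, sensitivity=1
# randomizeds: (for testing purposes) list where each entry is [true count, noisy count] for one entry of queries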
def get_noisy_queries(queries, shape, ep):
    """Add geometric noise to every true count and build a (hispanic x race) histogram.

    Parameters
    ----------
    queries : list
        Entries of the form ``[[age, hisp, race, size], true_count]``. The
        attribute values are used directly as array indices.
    shape : sequence
        ``[age values, hispanic values, race values, household-size values]``,
        forwarded to ``create_combos`` to size the noisy array.
    ep : float
        Epsilon for the geometric mechanism (sensitivity is fixed at 1).

    Returns
    -------
    race_histogram : numpy.ndarray
        Array of shape (2, 63); entry ``[hisp][race]`` is the sum of the noisy
        counts at age indices 0 and 1 over all household sizes.
    randomizeds : list
        ``[true_count, noisy_count]`` for every query, in query order.
    """
    # differentially private geometric mechanism shared by all queries
    geo = Geometric(epsilon=ep, sensitivity=1)

    # only the zero-filled 4D array is needed here
    _, noisy_queries = create_combos(shape[0], shape[1], shape[2], shape[3])

    # [true, noisy] pairs, kept for inspection
    randomizeds = []

    for (ag, hispan, ra, hs), true_count in queries:
        noisy_count = geo.randomise(true_count)
        randomizeds.append([true_count, noisy_count])
        # store the noisy value at the position given by its attributes
        noisy_queries[ag, hispan, ra, hs] = noisy_count

    race_histogram = np.zeros(shape=(2, 63))

    # collapse the two age indices and every household size into hispanic x race totals
    for hisp_idx in shape[1]:
        for race_idx in shape[2]:
            for size_idx in shape[3]:
                running = race_histogram[hisp_idx, race_idx]
                race_histogram[hisp_idx, race_idx] = (
                    running
                    + noisy_queries[0, hisp_idx, race_idx, size_idx]
                    + noisy_queries[1, hisp_idx, race_idx, size_idx]
                )

    return race_histogram, randomizeds


In [7]:
#This method computes noisy population sizes for each (age group)*(hispanic Y/N)*race combination using
# a geometric mechanism.
#-------------------------------------------------
#INPUT:
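# queries: list where each entry is [[age group, hisp, race, size], count_value]
#          count_value is the number of individuals with attributes [age group, hisp, race, size]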
# shape: combos structure all possible values for [age group][hisp][race][hsize]
        # includes 0 for household size, even though no values in dataset for
        # sake of simplicity
# ep: epsilon value for the county-wide mechanism
#-------------------------------------------------
#OUTPUT: 
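# race_histogram: numpy 2D array of shape (2, 63); race_histogram[hispan][ra] is the noisy pop count for
#          (hispanic value, race group 0-62), summed over all age groups and household sizes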
# noisy_queries: numpy 4D array 
#          noisy_queries[ag][hispan][ra][hs] is a noisy count of the number of individuals with attributes (ag, hispan, ra, hs)
#          the noisy count is obtained by taking the true pop count and adding geo noise with epsilon=ep, sensitivity = 1
# randomizeds: (for testing purposes). 2D array, where each entry is [true count, noisy count] for a combo of 
#             (age group) x (hisp) x (race) x (size)


def get_noisy_queries_age_race(queries, shape, ep):
    """Add geometric noise to every true count and aggregate over all age groups.

    Parameters
    ----------
    queries : list
        Entries of the form ``[[age group, hisp, race, size], true_count]``.
        The attribute values are used directly as array indices.
    shape : sequence
        ``[age-group values, hispanic values, race values, household-size values]``,
        forwarded to ``create_combos`` to size the noisy array.
    ep : float
        Epsilon for the geometric mechanism (sensitivity is fixed at 1).

    Returns
    -------
    race_histogram : numpy.ndarray
        Array of shape (2, 63); entry ``[hisp][race]`` is the sum of the noisy
        counts over every age group and household size in ``shape``.
    noisy_queries : numpy.ndarray
        4D array holding the noisy count of each (age group, hisp, race, size).
    randomizeds : list
        ``[true_count, noisy_count]`` for every query, in query order.
    """
    # differentially private geometric mechanism shared by all queries
    geo = Geometric(epsilon=ep, sensitivity=1)

    # only the zero-filled 4D array is needed here
    _, noisy_queries = create_combos(shape[0], shape[1], shape[2], shape[3])

    # [true, noisy] pairs, kept for inspection
    randomizeds = []

    for (ag, hispan, ra, hs), true_count in queries:
        noisy_count = geo.randomise(true_count)
        randomizeds.append([true_count, noisy_count])
        # store the noisy value at the position given by its attributes
        noisy_queries[ag, hispan, ra, hs] = noisy_count

    race_histogram = np.zeros(shape=(2, 63))

    # collapse every age group and household size into hispanic x race totals
    for age_idx in shape[0]:
        for hisp_idx in shape[1]:
            for race_idx in shape[2]:
                for size_idx in shape[3]:
                    race_histogram[hisp_idx, race_idx] += noisy_queries[age_idx, hisp_idx, race_idx, size_idx]

    return race_histogram, noisy_queries, randomizeds


In [8]:
# Returns noisy queries for age, with people in age groups of size group_size
# shape is the iterable of age group indices (e.g., if age goes up to 90 and group_size is 15, shape should be range(6))

#This method computes noisy population sizes for each (age group)
#-------------------------------------------------
#INPUT:
# dft: dataframe corresponding to a block group
# shape: iterable of age group indices (e.g. range(number of age groups))
# group_size: size (number of ages included) of each age group
# ep: epsilon value for the county-wide mechanism
#-------------------------------------------------
#OUTPUT:
# noisy_queries: list with one entry per age group in shape, where each entry is [age_group, noisy_pop_size]
# true_queries: list with one entry per age group in shape, where each entry is [age_group, true_pop_size]

def get_noisy_queries_age(dft, shape, group_size, ep):
    """Compute true and geometric-noised population counts per age group.

    Age group ``g`` covers ``g * group_size <= age < (g + 1) * group_size``.

    Parameters
    ----------
    dft : pandas.DataFrame
        Must provide an 'age' column.
    shape : iterable
        Age-group indices to evaluate (it is iterated, so pass e.g.
        ``range(6)`` rather than ``6``).
    group_size : number
        Width of each age group, in years.
    ep : float
        Epsilon for the geometric mechanism (sensitivity is fixed at 1).

    Returns
    -------
    noisy_queries : list
        ``[age_group, noisy_count]`` per group, in iteration order.
    true_queries : list
        ``[age_group, true_count]`` per group, in iteration order.
    """
    geo = Geometric(epsilon=ep, sensitivity=1)

    noisy_queries = []
    true_queries = []

    for group in shape:
        # rows whose age lies in this group's half-open interval
        in_group = dft['age'] >= group * group_size
        in_group = in_group & (dft['age'] < (group + 1) * group_size)
        true_count = int(in_group.sum())

        true_queries.append([group, true_count])

        # noisy version of the same count
        noisy_queries.append([group, geo.randomise(true_count)])

    return noisy_queries, true_queries

In [9]:









################################################### RUNNING THE CODE #####################################################











In [None]:
def runner_function(county, group_size):
    """Run the epsilon sweep for one county and write noisy race histograms to CSV.

    For every epsilon in ``np.arange(.01, 10, .05)`` this produces 25
    independent noisy (Nonhispanic, Hispanic) x race histograms and writes
    each one to its own CSV file.

    Parameters
    ----------
    county : str
        County name; the input is read from ``'../homemade_data/<county>.csv'``.
    group_size : int
        Age-group width in years. The value 2 selects the binary
        (under 18 / 18 and over) path instead of age bucketing.

    Notes
    -----
    Relies on the module-level ``races`` list (restored with ``%store -r``)
    to turn race labels into integer indices; a label missing from ``races``
    raises ``ValueError``.

    Output locations (created if missing):
      * binary path: ``new_dp_runs/<county>/binary_groupsize/``
      * otherwise:   ``dp_runs/<county>/<group_size>_groupsize/``
    """
    #set epsilon values
    epsilon = np.arange(.01, 10, .05, float)

    #set values for list options (make each value equal to number of possible options)
    voting_list = range(7)
    hispanic_list = range(2)
    race_list = range(63)
    housesize_list = [1,2,3,4]
    shape = [voting_list, hispanic_list, race_list, housesize_list]

    #create original dataset
    df_raw = create_block_data('../homemade_data/'+county+'.csv')

    #fix races to numeric values: resolve each distinct label once instead of
    #once per row, and build a new frame rather than mutating the loaded one
    race_to_index = {label: races.index(label) for label in df_raw['race'].unique()}
    df = df_raw.assign(race=df_raw['race'].map(race_to_index))

    #only the true counts needed for the selected path are computed
    is_binary = group_size == 2
    if is_binary:
        #the binary histogram only reads age indices 0 and 1, so two age
        #groups are enough; extra groups would just repeat the 18+ count
        binary_shape = [range(2), hispanic_list, race_list, housesize_list]
        true_counts = get_true_counts_binary(df, binary_shape)
        out_dir = "new_dp_runs/"+county+"/binary_groupsize/"
    else:
        true_counts = get_true_counts_age_race(df, shape, group_size)
        out_dir = "dp_runs/"+county+"/"+str(group_size)+"_groupsize/"

    #to_csv does not create missing directories
    os.makedirs(out_dir, exist_ok=True)

    #for each epsilon val, make histograms
    for ep in epsilon:

        # NOTE(review): this truncates rather than rounds; confirm float error in
        # np.arange cannot yield e.g. 0.3099... (label '0.30'). Kept as-is so
        # existing file names do not change.
        ep_label = str(ep)[0:4]
        print(ep_label)

        for k in range(0,25):

            if is_binary:
                #FOR RACE BINARY
                racehist, _ = get_noisy_queries(true_counts, binary_shape, ep)
            else:
                #FOR AGE_RACE
                racehist, _, _ = get_noisy_queries_age_race(true_counts, shape, ep)

            #row index is the race group, one column per hispanic value
            df_final = pd.DataFrame({'Nonhispanic': racehist[0], 'Hispanic': racehist[1]})
            filename = out_dir+"dprun_"+ep_label+"_"+str(k)+".csv"
            df_final.to_csv(filename, index = True)
            
            
#UNCOMMENT THIS FOR VMS
# (command-line use; needs `import sys`, which the import cell does not include)
# county = str(sys.argv[1])
# groupsize = int(sys.argv[2])  # second CLI argument; argv[1] is the county

#COMMENT THIS OUT FOR VMS
# Fixed values used when running interactively in the notebook.
county = 'alameda'
groupsize = 15

# Runs the full epsilon sweep for the selected county and writes the CSV outputs.
runner_function(county, groupsize)