In [56]:
#import packages
import csv
import pandas as pd
import numpy as np
from diffprivlib import tools as dp
from diffprivlib.mechanisms import Geometric
import os
from collections import Counter
import math

#import race data label lists
%store -r races
%store -r races2

In [37]:
#Load one county's block-level records from a CSV file into a pandas dataframe
#-------------------------------------------------
#INPUT:
# path: location of the county block data (passed straight to pandas.read_csv)
#-------------------------------------------------
#OUTPUT: 
# the parsed dataframe without any column whose name starts with "Unnamed"

def create_block_data(path):
    county_frame = pd.read_csv(path)
    # pandas labels header-less columns "Unnamed: N" (e.g. an index that was written out with the data)
    is_unnamed = county_frame.columns.str.contains('^Unnamed')
    return county_frame.loc[:, ~is_unnamed]

In [38]:
#Enumerate every (voting age, hispanic, race, household size) category and allocate the
#array that callers later fill with one value per category
#-------------------------------------------------
#INPUT:
# voting_list, hispanic_list, race_list, housesize_list: the possible values of each variable
#   (ex. [0,1] for nonhispanic, hispanic or range(63) for 63 races); each must support len()
#-------------------------------------------------
#OUTPUT: 
# combos: list with one [[voting_age, hispanic, race, household_size], 0] entry per category,
#         household size varying fastest; the trailing 0 is a placeholder for a count
# noisy_queries: all-zero numpy array of shape
#         (len(voting_list), len(hispanic_list), len(race_list), len(housesize_list)+1)

def create_combos(voting_list, hispanic_list, race_list, housesize_list):
    # the household-size axis gets one extra slot: the callers in this notebook index it with the
    # household size itself, and the sample run uses sizes 1-4
    dims = (len(voting_list), len(hispanic_list), len(race_list), len(housesize_list) + 1)
    noisy_queries = np.zeros(shape=dims)

    combos = [
        [[voting_age, hispanic, race, household_size], 0]
        for voting_age in voting_list
        for hispanic in hispanic_list
        for race in race_list
        for household_size in housesize_list
    ]

    return combos, noisy_queries

In [39]:
# Count the real number of rows of a block dataframe that fall in each
# (voting age Y/N)*(hispanic Y/N)*(race)*(household size) category
#-------------------------------------------------
#INPUT:
# dft: dataframe corresponding to a BLOCK; needs 'age', 'hispanic', 'race' and 'household_size' columns
# shape: [voting_list, hispanic_list, race_list, housesize_list], forwarded to create_combos
#        a voting value of 0 selects age < 18, ANY other voting value selects age >= 18
#-------------------------------------------------
#OUTPUT: 
# queries: the list built by create_combos with every placeholder replaced by the true count,
#          i.e. one [[voting, hisp, race, hsize], count] entry per category

def get_true_counts(dft, shape):
    #only the list of categories is needed here; the zero array create_combos also returns is discarded
    queries, _ = create_combos(shape[0], shape[1], shape[2], shape[3])

    for entry in queries:
        voting_age, hispanic, race, household_size = entry[0]

        #age: keep either the under-18 rows or the 18-and-over rows
        if voting_age == 0:
            matching = dft[dft['age'] < 18]
        else:
            matching = dft[dft['age'] >= 18]

        #narrow down one column at a time: hispanic, then race, then household size
        wanted_values = (
            ('hispanic', hispanic),
            ('race', race),
            ('household_size', household_size),
        )
        for column, wanted in wanted_values:
            matching = matching[matching[column] == wanted]

        #number of rows fitting all four constraints
        entry[1] = len(matching)

    return queries

In [40]:
# Count the real number of rows of a block dataframe that fall in each
# (age group of group_size yrs)*(hispanic Y/N)*(race)*(household size) category
#-------------------------------------------------
#INPUT:
# dft: dataframe corresponding to a BLOCK; needs 'age', 'hispanic', 'race' and 'household_size' columns
# shape: [age_group_list, hispanic_list, race_list, housesize_list], forwarded to create_combos
# group_size: width of an age group in years; group g holds g*group_size <= age < (g+1)*group_size
#             (ages outside every listed group are not counted anywhere)
#-------------------------------------------------
#OUTPUT: 
# queries: the list built by create_combos with every placeholder replaced by the true count,
#          i.e. one [[age group, hisp, race, hsize], count] entry per category

def get_true_counts_age_race(dft, shape, group_size):
    #only the list of categories is needed here; the zero array create_combos also returns is discarded
    queries, _ = create_combos(shape[0], shape[1], shape[2], shape[3])

    for entry in queries:
        age_group, hispanic, race, household_size = entry[0]

        #age: keep the rows inside this group's half-open interval [lower, upper)
        lower = age_group * group_size
        upper = (age_group + 1) * group_size
        matching = dft[dft['age'] >= lower]
        matching = matching[matching['age'] < upper]

        #narrow down one column at a time: hispanic, then race, then household size
        wanted_values = (
            ('hispanic', hispanic),
            ('race', race),
            ('household_size', household_size),
        )
        for column, wanted in wanted_values:
            matching = matching[matching[column] == wanted]

        #number of rows fitting all four constraints
        entry[1] = len(matching)

    return queries

In [41]:
#This method adds geometric-mechanism noise to every true category count and collapses the noisy
# counts into a (hispanic Y/N)*(race) histogram for the binary voting-age setup.
#-------------------------------------------------
#INPUT:
# queries: list of [[voting, hisp, race, hsize], true_count] entries (the format get_true_counts returns)
# shape: [voting_list, hispanic_list, race_list, housesize_list]
#        the values are used directly as array indices, so voting/hispanic/race values must be
#        0-based positions and household sizes must not exceed len(housesize_list)
# ep: epsilon value handed to the geometric mechanism (sensitivity is fixed at 1)
#-------------------------------------------------
#OUTPUT: 
# race_histogram: numpy array of shape (len(hispanic_list), len(race_list)); entry [h][r] is the sum of
#          the noisy counts of voting groups 0 and 1 over every household size
# randomizeds: list of [true_count, noisy_count] pairs, in the same order as queries

def get_noisy_queries(queries, shape, ep):
    
    #create geometric mechanism
    geo = Geometric(epsilon=ep, sensitivity=1)   
    
    #only the zero-filled array is needed here; the category list comes in through `queries`
    _, noisy_queries = create_combos(shape[0], shape[1], shape[2], shape[3])
    
    #orig and random values
    randomizeds = []
    
    for (ag, hispan, ra, hs), count_num in queries:
        #randomize the count value using differentially private geometric mechanism 
        # NOTE(review): nothing here clips the result, so a noisy count can presumably be negative
        randomized = geo.randomise(count_num)
        
        randomizeds.append([count_num, randomized])
        
        #store the noisy value at the position given by the category values themselves
        noisy_queries[ag][hispan][ra][hs] = randomized
    
    #sized from `shape` (this used to be a fixed (2, 63), which only fit 2 hispanic values and 63 races)
    race_histogram = np.zeros(shape=(len(shape[1]), len(shape[2])))
    
    #sum noisy_queries over voting groups 0 and 1 and all household sizes to get race x hispanic totals
    # (any further voting groups present in `shape` are deliberately left out of the histogram)
    for i in shape[1]:
        for j in shape[2]:
            for k in shape[3]:
                race_histogram[i][j] += noisy_queries[0][i][j][k] + noisy_queries[1][i][j][k]
    
    return race_histogram, randomizeds


In [42]:
#This method adds geometric-mechanism noise to every true category count of the
# (age group)*(hispanic Y/N)*(race)*(household size) setup and collapses the noisy counts into a
# (hispanic Y/N)*(race) histogram.
#-------------------------------------------------
#INPUT:
# queries: list of [[age group, hisp, race, hsize], true_count] entries
#          (the format get_true_counts_age_race returns)
# shape: [age_group_list, hispanic_list, race_list, housesize_list]
#        the values are used directly as array indices, so age group/hispanic/race values must be
#        0-based positions and household sizes must not exceed len(housesize_list)
# ep: epsilon value handed to the geometric mechanism (sensitivity is fixed at 1)
#-------------------------------------------------
#OUTPUT: 
# race_histogram: numpy array of shape (len(hispanic_list), len(race_list)); entry [h][r] is the sum of
#          the noisy counts over every age group and every household size
# noisy_queries: numpy array of shape (len(age_group_list), len(hispanic_list), len(race_list),
#          len(housesize_list)+1) holding the noisy count of each category
# randomizeds: list of [true_count, noisy_count] pairs, in the same order as queries

def get_noisy_queries_age_race(queries, shape, ep):
    
    #create geometric mechanism
    geo = Geometric(epsilon=ep, sensitivity=1)   
    
    #only the zero-filled array is needed here; the category list comes in through `queries`
    _, noisy_queries = create_combos(shape[0], shape[1], shape[2], shape[3])
    
    #orig and random values
    randomizeds = []
    
    for (ag, hispan, ra, hs), count_num in queries:
        #randomize the count value using differentially private geometric mechanism 
        # NOTE(review): nothing here clips the result, so a noisy count can presumably be negative
        randomized = geo.randomise(count_num)
        
        randomizeds.append([count_num, randomized])
        
        #store the noisy value at the position given by the category values themselves
        noisy_queries[ag][hispan][ra][hs] = randomized
    
    #sized from `shape` (this used to be a fixed (2, 63), which only fit 2 hispanic values and 63 races)
    race_histogram = np.zeros(shape=(len(shape[1]), len(shape[2])))
    
    #sum noisy_queries over all age groups and household sizes to get race x hispanic totals
    for l in shape[0]:
        for i in shape[1]:
            for j in shape[2]:
                for k in shape[3]:
                    race_histogram[i][j] += noisy_queries[l][i][j][k]
    
    return race_histogram, noisy_queries, randomizeds


In [43]:
# Returns noisy and true head counts per age group, with people in age groups of size group_size
#-------------------------------------------------
#INPUT:
# dft: dataframe with an 'age' column
# shape: the age groups to query, given either as the number of groups (e.g., if age goes up to 90 and
#        group_size is 15, shape should be 6) or as an iterable of group indices such as range(6)
# group_size: width of an age group in years; group i holds i*group_size <= age < (i+1)*group_size
# ep: epsilon value handed to the geometric mechanism (sensitivity is fixed at 1)
#-------------------------------------------------
#OUTPUT:
# noisy_queries: list of [group index, noisy count], one entry per age group
# true_queries: list of [group index, true count], one entry per age group
def get_noisy_queries_age(dft, shape, group_size, ep):
    geo = Geometric(epsilon=ep, sensitivity=1)

    # a plain group count used to fail in the loop below (an int is not iterable) even though the
    # description above asks for one, so expand it to the group indices 0..shape-1
    if isinstance(shape, (int, np.integer)):
        group_indices = range(shape)
    else:
        group_indices = shape

    true_queries = []
    noisy_queries = []
    for i in group_indices:
        
        # compute true number of ppl in age group i
        count = dft[dft['age'] >= i*group_size]
        count = count[count['age'] < (i+1)*group_size]
        count_num = count.shape[0]
        
        # append true number to true_queries
        true_queries.append([i, count_num])
        
        # compute noisy query
        r = geo.randomise(count_num)
        noisy_queries.append([i, r])
    return noisy_queries, true_queries

In [44]:









################################################### RUNNING THE CODE #####################################################











In [61]:
# SAMPLE RUN FOR NEW CODE
# TESTED TO MAKE SURE THAT FOR ep=.01, NOISY QUERIES AND TRUE QUERIES ARE VERY DIFFERENT
# FOR ep=100, NOISY QUERIES AND TRUE QUERIES ARE ESSENTIALLY THE SAME
# SEEMS TO BE WORKING!
# NEEDS HELP CONVERTING ARRAYS TO CSVS
#
# Writes, for every county and every epsilon, runs_per_epsilon noisy (hispanic x race) histograms to
# new_dp_runs/<county>/binary_age/dprun_<epsilon>_<run>.csv
# Relies on `races` (restored with %store -r in the first cell) and on the functions defined above.

counties = ['alameda']

#set epsilon values
epsilon = np.arange(.01, 10, .05, float)

#number of independent noisy histograms written per epsilon value
runs_per_epsilon = 25

#set values for list options (make each value equal to number of possible options)
voting_list = range(7)   #seven age groups of group_size years, used by the commented-out age variants
hispanic_list = range(2)
race_list = range(63)
housesize_list = [1,2,3,4]
shape = [voting_list, hispanic_list, race_list, housesize_list]

#the binary voting-age run only has two voting groups (under 18 / 18 and over): get_true_counts treats
#every nonzero voting value as "18 and over" and get_noisy_queries only sums groups 0 and 1, so handing
#them the 7-group shape just recounts and re-noises the adult rows five extra times
binary_shape = [range(2), hispanic_list, race_list, housesize_list]

group_size = 15


for county in counties:
    print(county)
    #create original dataset 
    df_first = create_block_data('../homemade_data/'+county+'.csv')

    #copy dataset, and fix races to numeric values (position of each label in `races`);
    #an unknown label still raises ValueError, exactly like races.index() did row by row
    orig_df = df_first.copy()
    orig_df['race'] = orig_df['race'].map(races.index)
    
    df = orig_df.copy()

    #make sure the output folder exists, otherwise the first to_csv below fails
    binary_dir = "new_dp_runs/"+county+"/binary_age/"
    os.makedirs(binary_dir, exist_ok=True)
        
#     true_counts_age_race = get_true_counts_age_race(df, shape, group_size)
    true_counts = get_true_counts(df, binary_shape)

    #for each epsilon val, make histograms
    for ep in epsilon:
        #np.arange accumulates float error, and cutting str(ep) to 4 characters turned the 2.16 step
        #into the label "2.15" (see the recorded output); round to two decimals instead.
        #NOTE(review): anything that reads these files must build the label the same way
        ep_label = "{:.2f}".format(ep)
        print(ep_label)
        for k in range(runs_per_epsilon):
            
            #FOR RACE BINARY
            racehist, randomizeds = get_noisy_queries(true_counts, binary_shape, ep)
            df_final = pd.DataFrame(list(zip(racehist[0], racehist[1])), columns = ['Nonhispanic', 'Hispanic'])
            filename = binary_dir+"dprun_"+ep_label+"_"+str(k)+".csv"
            df_final.to_csv(filename, index = True)
            
            
#             #FOR AGE_RACE
#             racehist_age_race, full_noisy_age_race, randomizeds = get_noisy_queries_age_race(true_counts_age_race, shape, ep)
#             df_final_race = pd.DataFrame(list(zip(racehist_age_race[0], racehist_age_race[1])), columns = ['Nonhispanic', 'Hispanic'])
#             filename = "new_dp_runs/"+county+"/15_age_race/dprun_"+ep_label+"_"+str(k)+".csv"
#             csv_orig_data = df_final_race.to_csv(filename, index = True)

            #FOR AGE ONLY
            #NOTE(review): get_noisy_queries_age returns a list of [group, count] pairs, so zipping
            #entries 0 and 1 into Nonhispanic/Hispanic columns below looks wrong - fix before enabling
#             hist_ageonly, true_ageonly = get_noisy_queries_age(df, voting_list, group_size, ep)    
#             df_final_race = pd.DataFrame(list(zip(hist_ageonly[0], hist_ageonly[1])), columns = ['Nonhispanic', 'Hispanic'])            
#             filename = "new_dp_runs/"+county+"3/dprun_age_"+ep_label+"_"+str(k)+".csv"
#             csv_orig_data = df_final_race.to_csv(filename, index = True)


alameda
0.01
0.06
0.11
0.16
0.21
0.26
0.31
0.36
0.41
0.46
0.51
0.56
0.61
0.66
0.71
0.76
0.81
0.86
0.91
0.96
1.01
1.06
1.11
1.16
1.21
1.26
1.31
1.36
1.41
1.46
1.51
1.56
1.61
1.66
1.71
1.76
1.81
1.86
1.91
1.96
2.01
2.06
2.11
2.15
2.21
2.26
2.31
2.36
2.41
2.46
2.51
2.56
2.61
2.66
2.71
2.76
2.81
2.86
2.91
2.96
3.01
3.06
3.11
3.16
3.21
3.26
3.31
3.36
3.41
3.46
3.51
3.56
3.61
3.66
3.71
3.76
3.81
3.86
3.91
3.96
4.01
4.06
4.11
4.16
4.21
4.26
4.31
4.36
4.41
4.46
4.51
4.56
4.61
4.66
4.71
4.76
4.81
4.86
4.91
4.96
5.01
5.06
5.11
5.16
5.21
5.26
5.31
5.36
5.41
5.46
5.51
5.56
5.61
5.66
5.71
5.76
5.81
5.86
5.91
5.96
6.01
6.06
6.11
6.16
6.21
6.26
6.31
6.36
6.41
6.46
6.51
6.56
6.61
6.66
6.71
6.76
6.81
6.86
6.91
6.96
7.01
7.06
7.11
7.16
7.21
7.26
7.31
7.36
7.41
7.46
7.51
7.56
7.61
7.66
7.71
7.76
7.81
7.86
7.91
7.96
8.01
8.06
8.11
8.16
8.21
8.26
8.31
8.36
8.41
8.46
8.51
8.56
8.61
8.66
8.71
8.76
8.81
8.86
8.91
8.96
9.01
9.06
9.11
9.16
9.21
9.26
9.31
9.36
9.41
9.46
9.51
9.56
9.61
9.66
9.71
9.76
9.81
9.86
9.

In [None]:
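#TESTING (stale scratch cell: it calls get_noisy_queries(df, shape, e) with a dataframe as the first
# argument, which no longer matches get_noisy_queries(queries, shape, ep), and it reads `county`
# without defining it in this cell - update both before uncommenting)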

# #get counties and states
# states = ['california']
# counties = ['alameda']

# #set epsilon values
# epsilon = [.01, 1, 2, 3, 4, 5]

# #set values for list options (make each value equal to number of possible options)
# voting_list = range(2)
# hispanic_list = range(2)
# race_list = range(63)
# housesize_list = [1,2,3,4]
# shape = [voting_list, hispanic_list, race_list, housesize_list]

# #create original dataset 
# df_first = create_block_data('../homemade_data/'+county+'.csv')

# #copy dataset, and fix races to numeric values
# orig_df = df_first.copy()
# for index, row in orig_df.iterrows():
#     d = row['race']
#     orig_df.at[index, 'race'] = races.index(d)

# #for each epsilon val, make histograms
# for e in epsilon:
#     print(str(e)[0:4])              
#     df = orig_df.copy()
#     racehist, randomizeds = get_noisy_queries(df, shape, e)
#     df_final = pd.DataFrame(list(zip(racehist[0], racehist[1])), columns = ['Nonhispanic', 'Hispanic'])
#     #creates dp run file containing the new differentially private dataset
#     filename = "test_data/dprun_"+str(e)[0:4]+".csv"
#     csv_orig_data = df_final.to_csv(filename, index = True)
#     df_randoms = pd.Series(randomizeds)
#     #creates randoms file containing the old and new values (after randomization)
#     filename = "test_data/randoms_"+str(e)[0:4]+".csv"
#     csv_rand_data = df_randoms.to_csv(filename, index = True)

In [None]:
# #REAL RUN 

# #county and state data
# states = ["california", "pennsylvania", "newmexico", "georgia", "northdakota", "hawaii", "missouri", "massachussets", "vermont"]
# states = ["missouri", "massachussets", "vermont"]
# counties = ['alameda', 'armstrong', 'cibola', 'fayette', 'grandforks', 'hawaii', 'jefferson', 'nantucket', 'washington']
# counties = ['jefferson', 'nantucket', 'washington']

# #set epsilon values
# epsilon = np.arange(.01, 10, .05, float)


# #set values for list options (make each value equal to number of possible options)
# voting_list = range(2)
# hispanic_list = range(2)
# race_list = range(63)
# housesize_list = [1,2,3,4]
# shape = [voting_list, hispanic_list, race_list, housesize_list]


# for county in counties:
#     print(county)
#     #create original dataset 
#     df_first = create_block_data('../homemade_data/'+county+'.csv')

#     #copy dataset, and fix races to numeric values
#     orig_df = df_first.copy()
#     for index, row in orig_df.iterrows():
#         d = row['race']
#         orig_df.at[index, 'race'] = races.index(d)
       
    
#     df = orig_df.copy()
        
#     true_counts = get_true_counts(df, shape)

#     #for each epsilon val, make histograms
#     for e in epsilon:
#         print(str(e)[0:4]) 
#         for i in range(100):
#             racehist, randomizeds = get_noisy_queries(true_counts, shape, e)
#             df_final = pd.DataFrame(list(zip(racehist[0], racehist[1])), columns = ['Nonhispanic', 'Hispanic'])
#             #creates dp run file containing the new differentially private dataset
#             filename = "dp_runs/"+county+"/dprun_"+str(e)[0:4]+"_"+str(i)+".csv"
#             csv_orig_data = df_final.to_csv(filename, index = True)