In [25]:
#import packages
import csv
import pandas as pd
import numpy as np
from diffprivlib import tools as dp
from diffprivlib.mechanisms import Geometric
import os
from collections import Counter

#import race data label lists
%store -r races
%store -r races2

In [26]:
def create_block_data(path):
    """Load county block data from a CSV file into a pandas dataframe.

    INPUT:
      path: file path (or file-like object) handed to pandas.read_csv
    OUTPUT:
      dataframe holding the county block data, without any column whose
      name starts with 'Unnamed'
    """
    raw = pd.read_csv(path)
    # Columns named 'Unnamed...' are discarded (pandas gives that name to
    # header-less columns such as a saved index)
    unnamed_mask = raw.columns.str.contains('^Unnamed')
    return raw.loc[:, ~unnamed_mask]

In [27]:
def create_combos(voting_list, hispanic_list, race_list, housesize_list):
    """Enumerate every category combination and allocate storage for noisy counts.

    INPUT:
      voting_list, hispanic_list, race_list, housesize_list: the possible
      values of each variable (ex. [0,1] for nonhispanic/hispanic or
      range(63) for 63 races)
    OUTPUT:
      combos: list with one entry per combination, in nested-loop order
              (voting age outermost, household size innermost):
              [[voting_age, hispanic, race, household_size], 0]
              where the trailing 0 is a placeholder for a count
      noisy_queries: zero-filled numpy array of shape
              (len(voting_list), len(hispanic_list), len(race_list),
               len(housesize_list) + 1)
    """
    # The last axis has one more slot than housesize_list has entries, so that
    # household sizes 1..n can be used directly as indices (slot 0 stays unused)
    dims = (
        len(voting_list),
        len(hispanic_list),
        len(race_list),
        len(housesize_list) + 1,
    )
    noisy_queries = np.zeros(shape=dims)

    combos = [
        [[voting_age, hispanic, race, household_size], 0]
        for voting_age in voting_list
        for hispanic in hispanic_list
        for race in race_list
        for household_size in housesize_list
    ]

    return combos, noisy_queries

In [28]:
def get_true_counts(dft, shape):
    """Compute the real population size of every
    (voting age Y/N)*(hispanic Y/N)*(race)*(household size) combination.

    INPUT:
      dft: dataframe corresponding to a BLOCK; the columns 'age', 'hispanic',
           'race' and 'household_size' are read
      shape: [voting_list, hispanic_list, race_list, housesize_list], passed
             straight to create_combos
    OUTPUT:
      queries: the combos list from create_combos with every count filled in,
               i.e. entries of the form
               [[voting_age, hispanic, race, household_size], count]
               where count is the number of rows of dft in that category
    """
    # Only the list of combinations is needed; the zero array is discarded
    queries, _ = create_combos(shape[0], shape[1], shape[2], shape[3])

    age = dft['age']
    is_adult = age >= 18
    # Rows whose age satisfies neither comparison (e.g. a missing age) belong
    # to no category at all
    has_age_group = is_adult | (age < 18)
    block = dft[has_age_group]

    # Tally every row once instead of re-filtering the dataframe for each
    # combination; categories that never occur simply read back as 0
    tallies = Counter(zip(
        is_adult[has_age_group],
        block['hispanic'],
        block['race'],
        block['household_size'],
    ))

    for query in queries:
        voting_age, hispanic, race, household_size = query[0]
        # voting_age 0 selects ages under 18; any other value selects 18 and over
        query[1] = tallies[(voting_age != 0, hispanic, race, household_size)]

    return queries

In [33]:
def get_noisy_queries(queries, shape, ep):
    """Compute noisy population sizes for each category combination using a
    geometric mechanism, and aggregate them into a hispanic-by-race histogram.

    INPUT:
      queries: list of [[voting_age, hispanic, race, household_size], count]
               entries, as returned by get_true_counts
      shape: [voting_list, hispanic_list, race_list, housesize_list]; the
             values are used as array indices (household size 0 is a valid
             index even though no query uses it)
      ep: epsilon value for the geometric mechanism
    OUTPUT:
      race_histogram: numpy array of shape (len(hispanic_list), len(race_list));
               entry [h][r] is the sum of the noisy counts for hispanic value h
               and race r over all voting-age groups and over the household
               sizes listed in shape[3]
      randomizeds: list of [true_count, noisy_count] pairs, one per query, in
               query order
    """
    #create geometric mechanism
    geo = Geometric(epsilon=ep, sensitivity=1)

    # Only the zero-filled array is needed here; the combos list is discarded
    _, noisy_queries = create_combos(shape[0], shape[1], shape[2], shape[3])

    #orig and random values
    randomizeds = []

    for (voting_age, hispanic, race, household_size), count_num in queries:
        #randomize the count value using differentially private geometric mechanism
        randomized = geo.randomise(count_num)
        randomizeds.append([count_num, randomized])

        #set noisy queries value to new randomized value
        noisy_queries[voting_age][hispanic][race][household_size] = randomized

    # Collapse voting age (axis 0) and the listed household sizes (axis 3).
    # The histogram dimensions follow from `shape` instead of a fixed (2, 63).
    race_histogram = noisy_queries[:, :, :, list(shape[3])].sum(axis=(0, 3))

    return race_histogram, randomizeds


In [34]:









################################################### RUNNING THE CODE #####################################################











In [35]:
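#TESTING (disabled and stale: `county` is never defined in this cell, and get_noisy_queries is called with the old (df, shape, e) arguments instead of (queries, shape, ep))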

# #get counties and states
# states = ['california']
# counties = ['alameda']

# #set epsilon values
# epsilon = [.01, 1, 2, 3, 4, 5]

# #set values for list options (make each value equal to number of possible options)
# voting_list = range(2)
# hispanic_list = range(2)
# race_list = range(63)
# housesize_list = [1,2,3,4]
# shape = [voting_list, hispanic_list, race_list, housesize_list]

# #create original dataset 
# df_first = create_block_data('../homemade_data/'+county+'.csv')

# #copy dataset, and fix races to numeric values
# orig_df = df_first.copy()
# for index, row in orig_df.iterrows():
#     d = row['race']
#     orig_df.at[index, 'race'] = races.index(d)

# #for each epsilon val, make histograms
# for e in epsilon:
#     print(str(e)[0:4])              
#     df = orig_df.copy()
#     racehist, randomizeds = get_noisy_queries(df, shape, e)
#     df_final = pd.DataFrame(list(zip(racehist[0], racehist[1])), columns = ['Nonhispanic', 'Hispanic'])
#     #creates dp run file containing the new differentially private dataset
#     filename = "test_data/dprun_"+str(e)[0:4]+".csv"
#     csv_orig_data = df_final.to_csv(filename, index = True)
#     df_randoms = pd.Series(randomizeds)
#     #creates randoms file containing the old and new values (after randomization)
#     filename = "test_data/randoms_"+str(e)[0:4]+".csv"
#     csv_rand_data = df_randoms.to_csv(filename, index = True)

In [40]:
#REAL RUN
# For every active county: load the block data, compute the true category
# counts once, then write `runs_per_epsilon` noisy race histograms per epsilon
# to dp_runs/<county>/dprun_<epsilon>_<run>.csv

#county and state data
# Full study lists, kept for reference (presumably paired by position — confirm)
all_states = ["california", "pennsylvania", "newmexico", "georgia", "northdakota", "hawaii", "missouri", "massachussets", "vermont"]
all_counties = ['alameda', 'armstrong', 'cibola', 'fayette', 'grandforks', 'hawaii', 'jefferson', 'nantucket', 'washington']
# Subset that is actually processed by this run
states = ["missouri", "massachussets", "vermont"]
counties = ['jefferson', 'nantucket', 'washington']

#set epsilon values
epsilon = np.arange(.01, 10, .05, float)

#number of independent noisy histograms written for each epsilon
runs_per_epsilon = 100

#set values for list options (make each value equal to number of possible options)
voting_list = range(2)
hispanic_list = range(2)
race_list = range(63)
housesize_list = [1,2,3,4]
shape = [voting_list, hispanic_list, race_list, housesize_list]


for county in counties:
    print(county)
    #create original dataset
    df_first = create_block_data('../homemade_data/'+county+'.csv')

    #copy dataset, and fix races to numeric values (position of the label in `races`);
    # an unknown label raises ValueError, df_first itself is left untouched
    orig_df = df_first.assign(race=df_first['race'].map(races.index))

    true_counts = get_true_counts(orig_df, shape)

    # to_csv does not create missing directories, so make sure the target exists
    os.makedirs("dp_runs/"+county, exist_ok=True)

    #for each epsilon val, make histograms
    for e in epsilon:
        # NOTE(review): str(e)[0:4] truncates instead of rounding, so an epsilon
        # stored as 2.1599... is labelled '2.15'. Left unchanged because the
        # file names may be consumed elsewhere — confirm before switching to
        # a rounded label.
        eps_label = str(e)[0:4]
        print(eps_label)
        for i in range(runs_per_epsilon):
            racehist, randomizeds = get_noisy_queries(true_counts, shape, e)
            df_final = pd.DataFrame({'Nonhispanic': racehist[0], 'Hispanic': racehist[1]})
            #creates dp run file containing the new differentially private dataset
            filename = "dp_runs/"+county+"/dprun_"+eps_label+"_"+str(i)+".csv"
            df_final.to_csv(filename, index = True)

alameda
0.01
0.06
0.11
0.16
0.21
0.26
0.31
0.36
0.41
0.46
0.51
0.56
0.61
0.66
0.71
0.76
0.81
0.86
0.91
0.96
1.01
1.06
1.11
1.16
1.21
1.26
1.31
1.36
1.41
1.46
1.51
1.56
1.61
1.66
1.71
1.76
1.81
1.86
1.91
1.96
2.01
2.06
2.11
2.15
2.21
2.26
2.31
2.36
2.41
2.46
2.51
2.56
2.61
2.66
2.71
2.76
2.81
2.86
2.91
2.96
3.01
3.06
3.11
3.16
3.21
3.26
3.31
3.36
3.41
3.46
3.51
3.56
3.61
3.66
3.71
3.76
3.81
3.86
3.91
3.96


KeyboardInterrupt: 