# In[2]:
#import packages
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import Counter

# Every [voting_age, hispanic, race] category, listed with voting_age varying
# slowest and race varying fastest. voting_age and hispanic take the values
# 0 and 1, and race runs over 0..62, giving 2 * 2 * 63 = 252 entries.
combos = [
    [voting_age, hispanic, race]
    for voting_age in (0, 1)
    for hispanic in (0, 1)
    for race in range(63)
]

def get_noisy_queries(dft, ep):
    """Compute noisy population sizes for one block using a geometric mechanism.

    One count is produced for each
    (voting age Y/N) * (hispanic Y/N) * race combination.

    Parameters
    ----------
    dft
        Dataframe corresponding to a BLOCK.
    ep
        Epsilon value for the county-wide mechanism.

    Returns
    -------
    noisy_queries
        numpy 3D integer array with dimensions (2, 2, 63).
        noisy_queries[voting_age][hispanic][race] is the (noisy) population
        size for that category.

    Notes
    -----
    The counting step is not implemented yet (see the TODO below), so every
    entry of the returned array is currently 0.
    """
    # here, we use epsilon = ep*0.12*0.5, as per Table 2.1 in https://www.nap.edu/read/25978/chapter/3#13
    # NOTE(review): Geometric is not imported in the visible part of this file;
    # presumably it is diffprivlib.mechanisms.Geometric -- confirm and import it.
    geo = Geometric(epsilon=ep * 0.12 * 0.5, sensitivity=1)

    # np.zeros expects the shape as ONE tuple argument. Passing the dimensions
    # as separate positional arguments makes NumPy treat the second one as the
    # dtype and raises TypeError. dtype=int because the entries are counts.
    noisy_queries = np.zeros((2, 2, 63), dtype=int)

    # TODO: fill in the noisy counts. Planned logic:
    #   for voting_age in (0, 1):
    #       for hispanic in (0, 1):
    #           for race in range(63):
    #               count = num entries in dft with those values of
    #                       voting age, hispanic, race
    #               noisy_queries[voting_age][hispanic][race] = geo.randomise(count)

    return noisy_queries

def compute_race_histogram(county, ep):
    """Compute a (noisy) histogram of population size by race for a county.

    The histogram is meant to be the sum, over all blocks in the county, of
    the output of get_noisy_queries, with the voting-age dimension summed out.

    Parameters
    ----------
    county
        Dataframe containing county data.
    ep
        Epsilon value for the county-wide mechanism.

    Returns
    -------
    race_histogram
        numpy 2D array with dimensions (2, 63). For column j,
        row 0 contains the population size of non-Hispanic individuals with
        race j, and row 1 contains the population size of Hispanic
        individuals with race j.

    Notes
    -----
    The per-block accumulation is not implemented yet (see the TODO below),
    so the returned histogram is currently all zeros.
    """
    # np.zeros expects the shape as ONE tuple argument; np.zeros(2, 2, 63)
    # raises TypeError because the extra arguments are read as dtype/order.
    # Axes are (voting_age, hispanic, race).
    noisy_queries_total = np.zeros((2, 2, 63), dtype=int)

    # TODO: iterate through the blocks in the county and accumulate:
    #   for each block in county:
    #       noisy_queries_total = noisy_queries_total + get_noisy_queries(block, ep)

    # Sum out the voting-age axis (axis 0). This is the vectorized form of
    #   race_histogram[i][j] = total[0][i][j] + total[1][i][j]
    # and leaves a (2, 63) array indexed by [hispanic][race].
    race_histogram = noisy_queries_total.sum(axis=0)

    return race_histogram
    
    