## Disaggregation

Inputs: for each old geography, an estimate and its margin of error (the output of the census aggregator), together with the proportion of that old geography that falls in each of the new geographies.


Outputs: estimates and margins of error for the new geographies

In [29]:
import math
import numpy
# Example inputs for disaggregate_sum: one single-record list per old geography,
# each record carrying the estimate (n) and its margin of error (moe).
household_income_2013_acs5 = [{"n": 900, "moe": 8}]
household_income_la_2013_acs1 = [{"n": 1927, "moe": 50}]

test_input = [
    household_income_2013_acs5,
    household_income_la_2013_acs1,
]

# Row i gives the proportion of old geography i assigned to each new geography:
# 80% of old geography 1 goes to new geography 1 and 20% to new geography 2.
makeup_1 = [0.8, 0.2]
makeup_2 = [0.3, 0.7]

makeup_input = [makeup_1, makeup_2]


In [30]:
def disaggregate_sum(data_list, makeup, deterministic=True, simulations=50):
    """Redistribute counts from old geographies onto new geographies.

    Each old geography's count is split across the new geographies according
    to its row of ``makeup``; the pieces are then summed per new geography.

    Args:
        data_list: One entry per old geography. Each entry is a sequence whose
            first element is a mapping with key ``'n'`` (the estimate) and,
            when ``deterministic`` is False, ``'moe'`` (its margin of error).
            Only the first element of each entry is read.
        makeup: One row per old geography; ``makeup[i][j]`` is the proportion
            of old geography ``i`` that goes to new geography ``j``.
        deterministic: When True, return the point estimate only and ignore
            the margins of error.
        simulations: Number of random draws used to propagate the margins of
            error when ``deterministic`` is False. Ignored otherwise.

    Returns:
        A tuple ``(estimate, margin_of_error)``. ``estimate`` is a float array
        with one value per new geography. ``margin_of_error`` is ``None`` in
        deterministic mode; otherwise it is an array holding, per new
        geography, the larger of the distances from the mean to the 5% and
        95% quantiles of the simulated totals.

    Raises:
        ValueError: If ``makeup`` is not a rectangular table with exactly one
            row per entry of ``data_list``, or if ``simulations`` is below 1
            in simulated mode.
    """
    shares = numpy.asarray(makeup, dtype=float)
    if shares.ndim != 2 or shares.shape[0] != len(data_list):
        raise ValueError(
            "makeup must be a rectangular table with one row per entry in data_list"
        )

    counts = numpy.array([entry[0]['n'] for entry in data_list], dtype=float)

    if deterministic:
        # Scale every makeup row by its old-geography count, then add up each
        # column to get the total landing in each new geography.
        return (shares * counts[:, numpy.newaxis]).sum(axis=0), None

    if simulations < 1:
        raise ValueError("simulations must be at least 1 when deterministic is False")

    # 1.645 is the z-score of a two-sided 90% interval; this assumes the input
    # margins of error are stated at the 90% confidence level.
    std_errors = numpy.array([entry[0]['moe'] for entry in data_list], dtype=float) / 1.645

    # One row per simulation, one column per old geography: perturb each count
    # by its standard error and round to a whole number of units.
    simulated_counts = numpy.rint(
        numpy.random.normal(counts, std_errors, size=(simulations, counts.size))
    )

    # Row s holds the new-geography totals for simulation s.
    simulated_totals = numpy.dot(simulated_counts, shares)

    est = simulated_totals.mean(axis=0)
    upper_gap = numpy.quantile(simulated_totals, 0.95, axis=0) - est
    lower_gap = est - numpy.quantile(simulated_totals, 0.05, axis=0)
    # Take the wider side of the interval to be conservative.
    margin_of_error = numpy.maximum(upper_gap, lower_gap)
    return est, margin_of_error

In [31]:
# Point estimate only: the margin-of-error slot of the result is None.
disaggregate_sum(
    test_input,
    makeup_input,
    deterministic=True,
    simulations=50,
)

(array([1298.1, 1528.9]), None)

In [32]:
# Simulated run: returns both the estimate and a margin of error, and the
# values vary from run to run because the draws are random.
disaggregate_sum(
    test_input,
    makeup_input,
    deterministic=False,
    simulations=50,
)

(array([1298.502, 1528.938]), array([13.933, 27.603]))

In [33]:
def disaggregate_mean(data_list, makeup, deterministic=True, simulations=50):
    """Compute count-weighted means for new geographies from old geographies.

    Each old geography contributes its value ``val`` weighted by the share of
    its count ``n`` that lands in a new geography. The result for a new
    geography is the weighted sum of values divided by the sum of weights.

    Args:
        data_list: One entry per old geography. Each entry is a sequence whose
            first element is a mapping with keys ``'n'`` (the count used as
            the weight), ``'val'`` (the value being averaged) and, when
            ``deterministic`` is False, ``'moe'`` (margin of error of ``n``).
            Only the first element of each entry is read.
        makeup: One row per old geography; ``makeup[i][j]`` is the proportion
            of old geography ``i`` that goes to new geography ``j``.
        deterministic: When True, return the point estimate only and ignore
            the margins of error.
        simulations: Number of random draws used to propagate the margins of
            error when ``deterministic`` is False. Ignored otherwise.

    Returns:
        A tuple ``(estimate, margin_of_error)``. ``estimate`` is a float array
        with one weighted mean per new geography. ``margin_of_error`` is
        ``None`` in deterministic mode; otherwise it is an array holding, per
        new geography, the larger of the distances from the mean to the 5%
        and 95% quantiles of the simulated weighted means. A new geography
        whose total weight is zero yields a non-finite value.

    Raises:
        ValueError: If ``makeup`` is not a rectangular table with exactly one
            row per entry of ``data_list``, or if ``simulations`` is below 1
            in simulated mode.
    """
    shares = numpy.asarray(makeup, dtype=float)
    if shares.ndim != 2 or shares.shape[0] != len(data_list):
        raise ValueError(
            "makeup must be a rectangular table with one row per entry in data_list"
        )

    counts = numpy.array([entry[0]['n'] for entry in data_list], dtype=float)
    values = numpy.array([entry[0]['val'] for entry in data_list], dtype=float)

    if deterministic:
        # Denominator: total weight arriving in each new geography.
        weights = (shares * counts[:, numpy.newaxis]).sum(axis=0)
        # Numerator: weighted values arriving in each new geography.
        weighted_values = (shares * (counts * values)[:, numpy.newaxis]).sum(axis=0)
        return weighted_values / weights, None

    if simulations < 1:
        raise ValueError("simulations must be at least 1 when deterministic is False")

    # 1.645 is the z-score of a two-sided 90% interval; this assumes the input
    # margins of error are stated at the 90% confidence level.
    std_errors = numpy.array([entry[0]['moe'] for entry in data_list], dtype=float) / 1.645

    # One row per simulation, one column per old geography. The same simulated
    # counts must feed both the numerator and the denominator, so the draws
    # are made once here. Only the counts are perturbed; 'val' is held fixed.
    simulated_counts = numpy.rint(
        numpy.random.normal(counts, std_errors, size=(simulations, counts.size))
    )

    simulated_weights = numpy.dot(simulated_counts, shares)  # denominators
    simulated_weighted_values = numpy.dot(simulated_counts * values, shares)  # numerators
    simulated_means = simulated_weighted_values / simulated_weights

    est = simulated_means.mean(axis=0)
    upper_gap = numpy.quantile(simulated_means, 0.95, axis=0) - est
    lower_gap = est - numpy.quantile(simulated_means, 0.05, axis=0)
    # Take the wider side of the interval to be conservative.
    margin_of_error = numpy.maximum(upper_gap, lower_gap)
    return est, margin_of_error


In [34]:
# Example inputs for disaggregate_mean: one single-record list per old
# geography, each record carrying the value being averaged (val), the count
# used as its weight (n) and the margin of error of that count (moe).
household_income_2013_acs5 = [{"val": 34999, "n": 900, "moe": 8}]
household_income_la_2013_acs1 = [{"val": 49929, "n": 1757, "moe": 50}]

test_input = [
    household_income_2013_acs5,
    household_income_la_2013_acs1,
]

# Row i gives the proportion of old geography i assigned to each new geography.
makeup_1 = [0.8, 0.2]
makeup_2 = [0.3, 0.7]

makeup_input = [makeup_1, makeup_2]

In [35]:
# Point estimate only: the margin-of-error slot of the result is None.
disaggregate_mean(
    test_input,
    makeup_input,
    deterministic=True,
    simulations=5,
)

(array([41309.32234785, 48022.90736932]), None)

In [36]:
# Simulated run: returns both the estimate and a margin of error, and the
# values vary from run to run because the draws are random.
disaggregate_mean(
    test_input,
    makeup_input,
    deterministic=False,
    simulations=5,
)

(array([41311.78934568, 48023.90355135]), array([61.01254397, 27.74911393]))