## Disaggregation

Inputs: each old geography has an estimate and a margin of error (output from census aggregator), then it has the proportion of the old geography that goes in each of the new geographies


Outputs: estimates and margins of error for the new geographies

In [1]:
import math
import numpy
household_income_2013_acs5 = [
            
        dict( n=900, moe=8)
            
        ]

household_income_la_2013_acs1 = [
      
      dict( n=1927, moe = 50)
  ]

## note aggregator will need to provide n and moe n as outputs somehow for this


test_input = [household_income_2013_acs5,household_income_la_2013_acs1]

makeup_1 = [0.8, 0.2] ## 80% of old geography 1 goes to new geography 1, 20% of old geography 1 goes to new geograph 2
makeup_2 =  [0.3, 0.7]

makeup_input = [makeup_1, makeup_2] ## should have a test that each sums to 1


In [5]:
def disaggregate_sum(data_list, makeup, deterministic = True, simulations=50):
    
    #  if you just want an estimate and ignore margin of error
    if deterministic:
        results_sum = []
        for i in range(len(data_list)):
            total = []
            results_sum.append([data_list[i][0]['n']*z for z in makeup[i]]) #  get proportion of n going to each new geography
        rr = numpy.vstack(results_sum) #  get into array format
        est = numpy.apply_along_axis(sum, 0, rr) #  sum each column (total for each new geography)
        return est, None
    # otherwise deal with margin of error via simulation
    else:
        simulation_results = []
        for s in range(simulations):
            results_sum = []
            for i in range(len(data_list)):
                total = []
                se = data_list[i][0]['moe'] / 1.645 #  convert moe to se
                new_n = round(numpy.random.normal(data_list[i][0]['n'], se)) # u se moe to introduce randomness into number in bin
                new_n = int(new_n) #  clean it up
                results_sum.append([new_n*z for z in makeup[i]]) #  get proportion of simulated n going to each new geography
            rr = numpy.vstack(results_sum) # get into array format
            simulation_results.append(numpy.apply_along_axis(sum, 0, rr)) #  sum each column (total for each new geography)
        ss = numpy.vstack(simulation_results) #  get simulation into array format
        est = numpy.apply_along_axis(numpy.mean, 0, ss) #  mean across simulations
        t1 = numpy.apply_along_axis(numpy.quantile, axis=0, arr =ss, q=0.95) - est #  higher quantile across simulations
        t2 = est - numpy.apply_along_axis(numpy.quantile, axis=0, arr = ss, q=0.05) #  lower quantile across simulations
        margin_of_error = numpy.amax(numpy.column_stack((t1,t2)), 1) #  take the larger of each row to be conservative
        return est, margin_of_error

In [31]:
disaggregate_sum(test_input, makeup_input, deterministic = True, simulations=50)

(array([1298.1, 1528.9]), None)

In [32]:
disaggregate_sum(test_input, makeup_input, deterministic = False, simulations=50)

(array([1298.502, 1528.938]), array([13.933, 27.603]))

In [96]:
def disaggregate_mean(data_list, makeup, deterministic = True, simulations=50):
    #  if you just want an estimate and ignore margin of error
    if deterministic:
        denom = disaggregate_sum(data_list, makeup, deterministic = True, simulations=50)[0] #  get the weighted sum for denominator of weighted mean
        results = []
        grand_total = []
        for i in range(len(data_list)):
            total = data_list[i][0]['n'] * data_list[i][0]['val'] #  weighted value
            results.append([total*z for z in makeup[i]])  #  partition by new geograph
        rr = numpy.vstack(results) #  get in array format
        rr2 = numpy.apply_along_axis(sum, 0, rr) #  weighted sum
        est = numpy.divide(rr2, denom) #  get the weighted average
        return est, None
    #  otherwise deal with margin of error via simulation
    else:
        simulation_results = [[] for i in range(simulations)]
        #simulation_results
        for s in range(simulations):
            results=[[] for i in range(len(makeup))]
            for i in range(len(data_list)):
                se_n = data_list[i][0]['moe_n'] / 1.645 #  convert moe to se
                se_val=data_list[i][0]['moe_val'] / 1.645 #  convert moe to se
                new_n = round(numpy.random.normal(data_list[i][0]['n'], se_n))  
                #  use moe to introduce randomness into number in bin
                new_n = int(new_n) #  clean it up
                for j in range(len(makeup)):
                    new_n_piece = round(new_n * makeup[i][j])
                    #  use moe to introduce randomness into value
                    #  draw individuals based on this new estimate, partitioned into new geographies
                    results[j].append(numpy.random.normal(data_list[i][0]['val'], se_val, new_n_piece)) 
            mean_results = [] ## for new geographies
            for j in range(len(makeup)):
                mean_results.append(numpy.mean(numpy.concatenate(results[j])))
            simulation_results[s]=mean_results
        ss = numpy.vstack(simulation_results) #  get simulation into array format
        est = numpy.apply_along_axis(numpy.nanmean, 0, ss) #  take mean of means across simulations
        t1 = numpy.apply_along_axis(numpy.nanquantile, axis=0, arr =ss, q=0.95) - est #  upper quantile of means across simulations
        t2 = est - numpy.apply_along_axis(numpy.nanquantile, axis=0, arr = ss, q=0.05) #  lower quantile of means across simulations
        margin_of_error = numpy.amax(numpy.column_stack((t1,t2)), 1) #  take max across rows to be conservative
        return est, margin_of_error

In [93]:
household_income_2013_acs5 = [
            
            dict( n=90, moe_n=1,val=34999, moe_val=25)
            
        ]

household_income_la_2013_acs1 = [
      
      dict( n=50, moe_n = 1, val=49929, moe_val=25)
  ]


test_input = [household_income_2013_acs5,household_income_la_2013_acs1]

makeup_1 = [0.8, 0.2,0]
makeup_2 =  [0.3, 0.6,0.1]
makeup_3 = [0.5, 0.4, 0.1]

## test that these proportions times n round to at least one
## else warning, moe might be deceptively small

makeup_input = [makeup_1, makeup_2, makeup_3]

In [94]:
disaggregate_mean(test_input, makeup_input, deterministic = True, simulations=5)

(array([37573.13793103, 44330.25      , 49929.        ]), None)

In [97]:
test=disaggregate_mean(test_input, makeup_input, deterministic = False, simulations=50)
#len(test)
test

(array([37578.37249058, 44332.10329477, 49928.56874733]),
 array([ 34.30912835, 121.01428162,  10.5982792 ]))

In [77]:
## median

def disaggregate_median(data_list, makeup, deterministic = True, simulations=50):
    #  if you just want an estimate and ignore margin of error
    if deterministic:
        #denom = disaggregate_sum(data_list, makeup, deterministic = True, simulations=50)[0] #  get the weighted sum for denominator of weighted mean
        #results = []
        new_medians = []
        for j in range(len(makeup)):  
            results = []
            for i in range(len(data_list)):
                results.append(numpy.repeat(data_list[i][0]['val'], round(numpy.prod([data_list[i][0]['n'],makeup[i][j]])), axis=0))
            ## make results work in median
            new_medians.append(numpy.median(results))
        return est, None
    #  otherwise deal with margin of error via simulation
    else:
        #  have to redo simulation can't just use sum call because randomness won't match up
        simulation_results = [] #  weighted sum
        simulation_results_v = [] #  weighted sum and values
        for s in range(simulations):
            results = [] #   intermediate weighted sum
            results_v = [] #  intermediate weighted sum and values
            for j in range(len(makeup)):
                for i in range(len(data_list)):
                    se = data_list[i][0]['moe'] / 1.645 #  convert moe to se
                    new_n = round(numpy.random.normal(data_list[i][0]['n'], se))
                    #  use moe to introduce randomness into number in bin
                    new_n = int(new_n) #  clean it up
                results.append([new_n * z for z in makeup[i]]) 
                results_v.append([new_n * data_list[i][0]['val']*z  for z in makeup[i]])
            rr = numpy.vstack(results) #  get results into array format
            rr_v = numpy.vstack(results_v) #  get results into array format
            simulation_results.append(numpy.apply_along_axis(sum, 0, rr)) #  do sum for numerator of weighted mean
            simulation_results_v.append(numpy.apply_along_axis(sum, 0, rr_v)) #  do sum for denominator of weighted mean
        ss = numpy.vstack(simulation_results) #  get simulation into array format
        ss_v = numpy.vstack(simulation_results_v) #  get simulation into array format
        ssd = numpy.divide(ss_v, ss) #  get weighted mean
        est = numpy.apply_along_axis(numpy.mean, 0, ssd) #  take mean of weighted means across simulations
        t1 = numpy.apply_along_axis(numpy.quantile, axis=0, arr =ssd, q=0.95) - est #  upper quantile of weighted means across simulations
        t2 = est - numpy.apply_along_axis(numpy.quantile, axis=0, arr = ssd, q=0.05) #  lower quantile of weighted means across simulations
        margin_of_error = numpy.amax(numpy.column_stack((t1,t2)), 1) #  take max across rows to be conservative
        return est, margin_of_error


In [78]:
disaggregate_median(test_input, makeup_input, deterministic = True, simulations=5)

[array([34999, 34999, 34999, 34999, 34999, 34999, 34999]), array([49929, 49929])]
<class 'list'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[34999 34999 34999 34999 34999 34999 34999]


ValueError: operands could not be broadcast together with shapes (2,) (7,) 

In [53]:
import numpy
numpy.median([numpy.array([1,2]),numpy.array([3,4])])

2.5

In [76]:
numpy.median([numpy.ndarray([1,2]),numpy.ndarray([3,4])])

ValueError: operands could not be broadcast together with shapes (3,4) (1,2) 

In [56]:
numpy.ndarray([1,2])

array([[0.e+000, 5.e-324]])

In [57]:
numpy.array([1,2])

array([1, 2])

In [31]:
numpy.median([[34999, 34999, 34999, 34999, 34999, 34999, 34999], [49929, 49929]])

TypeError: unsupported operand type(s) for /: 'list' and 'int'

In [118]:
test=[[] for i in range(10)]
test
test[0].append(1)
test

[[1], [], [], [], [], [], [], [], [], []]

In [63]:
numpy.concatenate([numpy.array([1,2]),numpy.array([])])

array([1., 2.])