## Disaggregation

Inputs: for each old geography, an estimate and its margin of error (as output by the census aggregator), plus the proportion of that old geography that falls into each of the new geographies.


Outputs: estimates and margins of error for the new geographies

In [2]:
import math
import numpy
# Example inputs for disaggregate_sum. Each old geography is a one-element
# list wrapping a dict with the estimate `n` and its margin of error `moe`.
household_income_2013_1 = [{"n": 900, "moe": 8}]

household_income_la_2013_2 = [{"n": 1927, "moe": 50}]

# NOTE: the aggregator will need to provide n and the moe of n as outputs
# somehow for this to work.

test_input = [household_income_2013_1, household_income_la_2013_2]

# Each row splits one old geography across the new geographies: 80% of old
# geography 1 goes to new geography 1 and 20% goes to new geography 2.
makeup_1 = [0.8, 0.2]
makeup_2 = [0.3, 0.7]

# TODO: there should be a test that each row sums to 1.
makeup_input = [makeup_1, makeup_2]




In [5]:
def disaggregate_sum(data_list, makeup, deterministic = True, simulations=50):
    """
    Disaggregate sums for a set of old geographies into sums for a set of new geographies.

    Args:
        data_list (list): one entry per old geography. Each entry is a one-element
            list wrapping a dictionary (only element ``[0]`` of the entry is read)
            with keys:
                * n (int): The number of people, households or other unit in the old region
                * moe (float): the margin of error on the n for the old region
                  (only read when ``deterministic`` is False)
        makeup (list of lists): inner list is length of new geographies, outer list is
            length of old geographies; the values in inner list j partition old region j
            into each of the new regions (each inner list should sum to 1)
        deterministic (bool): if True, return point estimates only and ignore the
            margins of error; if False, propagate them by simulation
        simulations (int): number of simulation rounds; unused when ``deterministic`` is True
    Returns:
        tuple ``(est, margin_of_error)``: ``est`` is an array with one sum per new
        geography. ``margin_of_error`` is None on the deterministic path; otherwise it
        is an array holding, per new geography, the larger of (95th percentile - mean)
        and (mean - 5th percentile) of the simulated sums.
    Raises:
        ValueError: if ``data_list`` is empty, if ``data_list`` and ``makeup`` differ
            in length, or if ``simulations`` is less than 1 on the simulation path.
    Warns:
        PartitionWarning if that name is defined in the module namespace, otherwise
        UserWarning, once for every inner list of ``makeup`` that does not sum to 1
        (to three decimal places).
    Examples:
        >>> data_list = [[dict(n=900, moe=8)], [dict(n=1927, moe=50)]]
        >>> makeup = [[0.8, 0.2], [0.3, 0.7]]
        >>> disaggregate_sum(data_list, makeup)
        (array([1298.1, 1528.9]), None)
        >>> disaggregate_sum(data_list, makeup, deterministic=False)  # doctest: +SKIP
        (array([1298.502, 1528.938]), array([13.933, 27.603]))
    """
    import warnings  # function-scope import: the notebook's import cell only provides math and numpy

    if not data_list:
        raise ValueError("data_list must contain at least one old geography")
    if len(data_list) != len(makeup):
        raise ValueError("data_list and makeup must have the same length (one entry per old geography)")

    # Use the session's PartitionWarning when one is defined; fall back to UserWarning otherwise.
    warn_category = globals().get("PartitionWarning", UserWarning)
    for index, shares in enumerate(makeup):
        if not math.isclose(sum(shares), 1, abs_tol=10**-3):  # to three decimal places
            warnings.warn(
                "makeup[{}] sums to {!r} rather than 1".format(index, sum(shares)),
                warn_category,
                stacklevel=2,
            )

    records = [entry[0] for entry in data_list]
    share_matrix = numpy.asarray(makeup, dtype=float)  # rows: old geographies, columns: new geographies
    counts = numpy.array([rec['n'] for rec in records], dtype=float)

    # if you just want an estimate and ignore margin of error
    if deterministic:
        # split every old count by its shares, then total each column (new geography)
        est = (counts[:, None] * share_matrix).sum(axis=0)
        return est, None

    # otherwise deal with margin of error via simulation
    if simulations < 1:
        raise ValueError("simulations must be at least 1")

    # 1.645 is the standard normal 95th percentile, so this converts moe to se
    std_errors = numpy.array([rec['moe'] for rec in records], dtype=float) / 1.645
    # one row of perturbed whole-number counts per simulation round
    simulated_counts = numpy.rint(
        numpy.random.normal(counts, std_errors, size=(simulations, len(records)))
    )
    simulated_sums = simulated_counts @ share_matrix  # (simulations, new geographies)

    est = simulated_sums.mean(axis=0)  # mean across simulations
    upper = numpy.quantile(simulated_sums, 0.95, axis=0) - est
    lower = est - numpy.quantile(simulated_sums, 0.05, axis=0)
    margin_of_error = numpy.maximum(upper, lower)  # take the larger side to be conservative
    return est, margin_of_error

In [31]:
# Point estimates only (no margin of error) for the example inputs.
disaggregate_sum(
    test_input,
    makeup_input,
    deterministic=True,
    simulations=50,
)

(array([1298.1, 1528.9]), None)

In [32]:
# Simulation path: estimates plus margins of error from 50 simulation rounds.
disaggregate_sum(
    test_input,
    makeup_input,
    deterministic=False,
    simulations=50,
)

(array([1298.502, 1528.938]), array([13.933, 27.603]))

In [5]:
def disaggregate_mean(data_list, makeup, deterministic = True, simulations=50):
    """
    Disaggregate means for a set of old geographies into means for a set of new geographies.

    Args:
        data_list (list): one entry per old geography. Each entry is a one-element
            list wrapping a dictionary (only element ``[0]`` of the entry is read)
            with keys:
                * n (int): The number of people, households or other unit in the old region
                * val (float): The estimate (mean) in the old region
                * moe_n (float): The moe for the n (simulation path only)
                * moe_val (float): The moe for the value (simulation path only)
        makeup (list of lists): inner list is length of new geographies, outer list is
            length of old geographies; the values in inner list j partition old region j
            into each of the new regions (each inner list should sum to 1)
        deterministic (bool): if True, return the n-weighted means only and ignore the
            margins of error; if False, propagate them by simulation
        simulations (int): number of simulation rounds; unused when ``deterministic`` is True
    Returns:
        tuple ``(est, margin_of_error)``: ``est`` is an array with one mean per new
        geography. ``margin_of_error`` is None on the deterministic path; otherwise it
        is an array holding, per new geography, the larger of (95th percentile - mean)
        and (mean - 5th percentile) of the simulated means. A new geography that
        receives no units yields nan.
    Raises:
        ValueError: if ``data_list`` is empty, if ``data_list`` and ``makeup`` differ
            in length, or if ``simulations`` is less than 1 on the simulation path.
    Warns:
        PartitionWarning if that name is defined in the module namespace, otherwise
        UserWarning, once for every inner list of ``makeup`` that does not sum to 1
        (to three decimal places).
    Examples:
        >>> data_list = [[dict(n=90, moe_n=1, val=34999, moe_val=25)],
        ...              [dict(n=50, moe_n=1, val=49929, moe_val=25)]]
        >>> makeup = [[0.8, 0.2, 0], [0.3, 0.6, 0.1]]
        >>> disaggregate_mean(data_list, makeup)
        (array([37573.13793103, 44330.25      , 49929.        ]), None)
        >>> disaggregate_mean(data_list, makeup, deterministic=False)  # doctest: +SKIP
        (array([37571.20936   , 44317.40688447, 49928.91120178]),
         array([ 33.46206937, 127.01398643,   9.02957785]))
    """
    import warnings  # function-scope import: the notebook's import cell only provides math and numpy

    if not data_list:
        raise ValueError("data_list must contain at least one old geography")
    if len(data_list) != len(makeup):
        raise ValueError("data_list and makeup must have the same length (one entry per old geography)")

    # Use the session's PartitionWarning when one is defined; fall back to UserWarning otherwise.
    warn_category = globals().get("PartitionWarning", UserWarning)
    for index, shares in enumerate(makeup):
        if not math.isclose(sum(shares), 1, abs_tol=10**-3):  # to three decimal places
            warnings.warn(
                "makeup[{}] sums to {!r} rather than 1".format(index, sum(shares)),
                warn_category,
                stacklevel=2,
            )

    records = [entry[0] for entry in data_list]
    n_new = len(makeup[0])  # number of new polygons is how many the old polygons are split into

    # if you just want an estimate and ignore margin of error
    if deterministic:
        share_matrix = numpy.asarray(makeup, dtype=float)  # rows: old, columns: new
        counts = numpy.array([rec['n'] for rec in records], dtype=float)
        values = numpy.array([rec['val'] for rec in records], dtype=float)
        # denominator of the weighted mean: units arriving in each new geography
        denom = (counts[:, None] * share_matrix).sum(axis=0)
        # numerator: n * val of each old geography, partitioned by new geography
        weighted_total = ((counts * values)[:, None] * share_matrix).sum(axis=0)
        est = numpy.divide(weighted_total, denom)  # weighted average
        return est, None

    # otherwise deal with margin of error via simulation
    if simulations < 1:
        raise ValueError("simulations must be at least 1")

    simulated_means = numpy.full((simulations, n_new), numpy.nan)
    for s in range(simulations):
        drawn = [[] for _ in range(n_new)]  # simulated individual values per new geography
        for rec, shares in zip(records, makeup):
            se_n = rec['moe_n'] / 1.645  # convert moe to se
            se_val = rec['moe_val'] / 1.645  # convert moe to se
            # use moe to introduce randomness into number in bin; a count cannot be
            # negative, and a negative size would make the value draw below raise
            sim_n = max(0, int(round(float(numpy.random.normal(rec['n'], se_n)))))
            for j in range(n_new):
                piece = int(round(float(sim_n * shares[j])))
                # draw individuals based on this new estimate, partitioned into new geographies
                drawn[j].append(numpy.random.normal(rec['val'], se_val, piece))
        for j in range(n_new):
            pooled = numpy.concatenate(drawn[j])
            if pooled.size:  # an empty bucket stays nan and is skipped by the nan-aware summaries
                simulated_means[s, j] = pooled.mean()

    est = numpy.nanmean(simulated_means, axis=0)  # mean of means across simulations
    upper = numpy.nanquantile(simulated_means, 0.95, axis=0) - est
    lower = est - numpy.nanquantile(simulated_means, 0.05, axis=0)
    margin_of_error = numpy.maximum(upper, lower)  # take the larger side to be conservative
    return est, margin_of_error

In [3]:
# Example inputs for disaggregate_mean / disaggregate_median. Each old
# geography is a one-element list wrapping a dict with the count `n`, the
# value `val`, and a margin of error for each of them.
household_income_2013_acs5 = [{"n": 90, "moe_n": 1, "val": 34999, "moe_val": 25}]

household_income_la_2013_acs1 = [{"n": 50, "moe_n": 1, "val": 49929, "moe_val": 25}]

test_input = [household_income_2013_acs5, household_income_la_2013_acs1]

# Two old geographies split across three new ones.
makeup_1 = [0.8, 0.2, 0]
makeup_2 = [0.3, 0.6, 0.1]
# makeup_3 = [0.5, 0.4, 0.1]

# TODO: test that these proportions times n round to at least one;
# otherwise warn, because the moe might be deceptively small.

makeup_input = [makeup_1, makeup_2]

In [107]:
# Weighted means only (no margin of error) for the example inputs.
disaggregate_mean(
    test_input,
    makeup_input,
    deterministic=True,
    simulations=5,
)

(array([37573.13793103, 44330.25      , 49929.        ]), None)

In [108]:
# Simulation path: keep the (estimates, margins of error) pair and display it.
test = disaggregate_mean(
    test_input,
    makeup_input,
    deterministic=False,
    simulations=50,
)
# len(test)
test

(array([37571.20936   , 44317.40688447, 49928.91120178]),
 array([ 33.46206937, 127.01398643,   9.02957785]))

In [115]:
## median

def disaggregate_median(data_list, makeup, deterministic = True, simulations=50):
    """
    Disaggregate medians for a set of old geographies into medians for a set of new geographies.

    Args:
        data_list (list): one entry per old geography. Each entry is a one-element
            list wrapping a dictionary (only element ``[0]`` of the entry is read)
            with keys:
                * n (int): The number of people, households or other unit in the old region
                * val (float): The estimate (median) in the old region
                * moe_n (float): The moe for the n (simulation path only)
                * moe_val (float): The moe for the value (simulation path only)
        makeup (list of lists): inner list is length of new geographies, outer list is
            length of old geographies; the values in inner list j partition old region j
            into each of the new regions (each inner list should sum to 1)
        deterministic (bool): if True, return point estimates only and ignore the
            margins of error; if False, propagate them by simulation
        simulations (int): number of simulation rounds; unused when ``deterministic`` is True
    Returns:
        tuple ``(est, margin_of_error)``.
        Deterministic path: ``est`` is a *list* with one median per new geography,
        obtained by repeating each old value round(n * share) times and taking the
        median of the pooled values; ``margin_of_error`` is None.
        Simulation path: ``est`` is an array holding the mean of the simulated medians,
        and ``margin_of_error`` is an array holding, per new geography, the larger of
        (95th percentile - mean) and (mean - 5th percentile) of the simulated medians.
        A new geography that receives no units yields nan.
    Raises:
        ValueError: if ``data_list`` is empty, if ``data_list`` and ``makeup`` differ
            in length, or if ``simulations`` is less than 1 on the simulation path.
    Warns:
        PartitionWarning if that name is defined in the module namespace, otherwise
        UserWarning, once for every inner list of ``makeup`` that does not sum to 1
        (to three decimal places).
    Examples:
        >>> data_list = [[dict(n=90, moe_n=1, val=34999, moe_val=25)],
        ...              [dict(n=50, moe_n=1, val=49929, moe_val=25)]]
        >>> makeup = [[0.8, 0.2, 0], [0.3, 0.6, 0.1]]
        >>> disaggregate_median(data_list, makeup)
        ([34999.0, 49929.0, 49929.0], None)
        >>> disaggregate_median(data_list, makeup, deterministic=False)  # doctest: +SKIP
        (array([35003.25758268, 49917.76306533, 49925.44251865]),
         array([2.16161205, 3.19795328, 9.34154849]))
    """
    import warnings  # function-scope import: the notebook's import cell only provides math and numpy

    if not data_list:
        raise ValueError("data_list must contain at least one old geography")
    if len(data_list) != len(makeup):
        raise ValueError("data_list and makeup must have the same length (one entry per old geography)")

    # Use the session's PartitionWarning when one is defined; fall back to UserWarning otherwise.
    warn_category = globals().get("PartitionWarning", UserWarning)
    for index, shares in enumerate(makeup):
        if not math.isclose(sum(shares), 1, abs_tol=10**-3):  # to three decimal places
            warnings.warn(
                "makeup[{}] sums to {!r} rather than 1".format(index, sum(shares)),
                warn_category,
                stacklevel=2,
            )

    records = [entry[0] for entry in data_list]
    n_new = len(makeup[0])  # number of new polygons is how many the old polygons are split into

    # if you just want an estimate and ignore margin of error
    if deterministic:
        new_medians = []
        for j in range(n_new):
            # every unit moved into new geography j carries its old region's value
            repeated = [
                numpy.repeat(rec['val'], int(round(float(rec['n'] * shares[j]))), axis=0)
                for rec, shares in zip(records, makeup)
            ]
            pooled = numpy.concatenate(repeated)
            new_medians.append(numpy.median(pooled) if pooled.size else numpy.nan)
        return new_medians, None

    # otherwise deal with margin of error via simulation
    if simulations < 1:
        raise ValueError("simulations must be at least 1")

    simulated_medians = numpy.full((simulations, n_new), numpy.nan)
    for s in range(simulations):
        drawn = [[] for _ in range(n_new)]  # simulated individual values per new geography
        for rec, shares in zip(records, makeup):
            se_n = rec['moe_n'] / 1.645  # convert moe to se
            se_val = rec['moe_val'] / 1.645  # convert moe to se
            # use moe to introduce randomness into number in bin; a count cannot be
            # negative, and a negative size would make the value draw below raise
            sim_n = max(0, int(round(float(numpy.random.normal(rec['n'], se_n)))))
            for j in range(n_new):
                piece = int(round(float(sim_n * shares[j])))
                # draw individuals based on this new estimate, partitioned into new geographies
                drawn[j].append(numpy.random.normal(rec['val'], se_val, piece))
        for j in range(n_new):
            pooled = numpy.concatenate(drawn[j])
            if pooled.size:  # an empty bucket stays nan and is skipped by the nan-aware summaries
                simulated_medians[s, j] = numpy.median(pooled)

    est = numpy.nanmean(simulated_medians, axis=0)  # mean of medians across simulations
    upper = numpy.nanquantile(simulated_medians, 0.95, axis=0) - est
    lower = est - numpy.nanquantile(simulated_medians, 0.05, axis=0)
    margin_of_error = numpy.maximum(upper, lower)  # take the larger side to be conservative
    return est, margin_of_error

In [116]:
# Medians only (no margin of error) for the example inputs.
disaggregate_median(
    test_input,
    makeup_input,
    deterministic=True,
    simulations=5,
)

([34999.0, 49929.0, 49929.0], None)

In [119]:
# Simulation path: medians plus margins of error from 5 simulation rounds.
disaggregate_median(
    test_input,
    makeup_input,
    deterministic=False,
    simulations=5,
)


(array([35003.25758268, 49917.76306533, 49925.44251865]),
 array([2.16161205, 3.19795328, 9.34154849]))

Warnings to include somewhere in the functions:

In [4]:
import warnings  # the notebook's import cell only provides math and numpy

# Use the session's PartitionWarning when one is defined; fall back to UserWarning otherwise.
partition_category = globals().get("PartitionWarning", UserWarning)
for index, shares in enumerate(makeup_input):
    if not math.isclose(sum(shares), 1, abs_tol = .001):
        # at least one of your partitions does not sum to one
        warnings.warn(
            "makeup_input[{}] sums to {!r} rather than 1".format(index, sum(shares)),
            partition_category,
        )


In [None]:
## test that these proportions times n round to at least one
## else warning, moe might be deceptively small

## open questions:
##   * use the min in makeup that is not zero, times all n values it will interact with?
##     but others could be contributing to the same bucket, which would make it fine
##   * total in a bucket can't be too close to zero
##   * also when drawing from normal, the draw could be negative if mean is too small
import warnings  # the notebook's import cell only provides math and numpy

numpy.concatenate(makeup_input)  # exploratory: all shares flattened; the result is not stored

# Use the session's SmallPopulationWarning when one is defined; fall back to UserWarning otherwise.
small_population_category = globals().get("SmallPopulationWarning", UserWarning)
for j in range(len(makeup_input[0])):
    # whole units landing in new geography j, summed over every old geography
    bucket_total = sum(
        round(entry[0]['n'] * shares[j]) for entry, shares in zip(test_input, makeup_input)
    )
    if bucket_total < 1:
        # moe might be deceptively small if a region has too few people
        warnings.warn(
            "new geography {} receives {} units; its moe might be deceptively small".format(j, bucket_total),
            small_population_category,
        )

array([0.8, 0.2, 0. , 0.3, 0.6, 0.1])