In [8]:
## standardized measure of uncertainty
## share of the estimate that the error represents
## higher number means more uncertainty

def coefficient_of_variation(estimate, moe):
    """
    Estimate a coefficient of variation to help interpret the uncertainty of an estimate.

    The margin of error is first converted to a standard error by dividing it by 1.645,
    and the standard error is then expressed as a share of the estimate. A higher value
    means more uncertainty. The 1.645 divisor assumes the margin of error was published
    at the 90 percent confidence level.

    This diagnostic comes from `Spielman and Folch '15'`_ and references `American Community Survey materials`_.

    Args:
        estimate (float): The estimated value. Must be nonzero.
        moe (float): The margin of error on the estimate.
    Returns:
        The coefficient of variation as a single float (standard error divided by estimate).
    Raises:
        ZeroDivisionError: If ``estimate`` is a plain Python number equal to zero.
    Examples:
        Coefficient of variation for a median household income estimate.

        >>> coefficient_of_variation(42211.096153846156, 10153.200960954948)
        0.14622123567658166

    .. _Spielman and Folch '15':
        https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0115626#abstract0
    .. _American Community Survey materials:
        http://www.loc.gov/catdir/toc/ecip0720/2007024090.html
    """
    z_score_90 = 1.645  # divisor that turns a 90 percent margin of error into a standard error
    standard_error = moe / z_score_90
    cv = standard_error / estimate
    # NOTE: a cv above 0.12 exceeds the "reasonable standard of precision for an estimate";
    # emitting a CVWarning in that case was sketched here but is not implemented yet.
    return cv
    

In [9]:
# Sanity check: same inputs as the docstring example (estimate first, margin of error second).
coefficient_of_variation(42211.096153846156, 10153.200960954948)

0.14622123567658166

In [45]:
# quantify information loss through aggregation

def information_loss(old_region_data, makeup, new_region_data):
    """
    Measure whether the new estimates for a given variable are within the margins of error of their original components.

    Information is considered to be lost if the 90 percent confidence interval of original geography value
    estimates do not overlap with the new geography's estimate. This diagnostic comes from `Spielman and Folch '15'`_.

    For every new region, each old region that contributes to it (non-zero entry in ``makeup``)
    is checked: does the new estimate fall strictly within ``val`` +/- ``moe`` of that old region?
    The score of a new region is the share of its contributing old regions that pass the check,
    so 1.0 means no detected loss and 0.0 means every contributing interval misses the new estimate.

    Args:
        old_region_data (list of one-item lists of dictionaries):
            One entry per old region; the dictionary at index 0 of each entry should have two keys:
                * val (float): The estimate for the old region
                * moe (float): The margin of error on the estimate for the old region
        makeup (list of lists): inner lists are length of new geographies, outer list is length of old geographies,
                                the values in inner list j partition old region j into each of the new regions
        new_region_data (list): The estimate for each new region
    Returns:
        A two-item tuple: a list with one score per new geographic region, followed by the mean of
        those scores across all new regions. A new region that no old region contributes to gets
        a score of NaN and is left out of the mean; if no score is defined the mean is NaN.
        ([0.0, 0.0], 0.0)
    Examples:
        >>> information_loss(test_input, makeup_input, disaggregate_sum(test_input, makeup_input, deterministic = True)[0])
        ([0.0, 0.0], 0.0)

    .. _Spielman and Folch '15':
        https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0115626#abstract0
    """
    check_by_new_region = []
    defined_scores = []
    for j in range(len(new_region_data)):
        # One boolean per old region that was at least partially partitioned into new region j:
        # True when the new estimate lies inside the old region's confidence interval.
        # NOTE: maybe too stringent, see what the values look like for LA, should this also account for new geography's moe?
        within_old_interval = [
            abs(old_region_data[i][0]['val'] - new_region_data[j]) < old_region_data[i][0]['moe']
            for i in range(len(old_region_data))
            if makeup[i][j] != 0
        ]
        if within_old_interval:
            score = sum(within_old_interval) / len(within_old_interval)
            defined_scores.append(score)
        else:
            # Nothing feeds this new region, so there is nothing to compare against.
            score = float('nan')
        check_by_new_region.append(score)

    # Aggregate metric across all new geographies that have a defined score.
    if defined_scores:
        overall = sum(defined_scores) / len(defined_scores)
    else:
        overall = float('nan')
    return check_by_new_region, overall

In [50]:
import math
import numpy
# Sample inputs. Each old geography is a one-item list holding a dict with
# n (count), val (estimate) and moe (margin of error). In the special case of a
# sum, val is the same as n, but the functions expect the two different names.
household_income_2013_1 = [{"n": 900, "val": 900, "moe": 20}]
household_income_la_2013_2 = [{"n": 1927, "val": 1927, "moe": 30}]

# NOTE: the aggregator will need to provide n and the moe of n as outputs somehow for this.

# Partition shares, one row per old geography and one column per new geography.
makeup_1 = [0.8, 0.2]  # old geography 1: 80% goes to new geography 1, 20% to new geography 2
makeup_2 = [0.3, 0.7]  # old geography 2: 30% goes to new geography 1, 70% to new geography 2
makeup_input = [makeup_1, makeup_2]  # TODO: add a test that each row sums to 1

test_input = [household_income_2013_1, household_income_la_2013_2]


In [13]:
# as of 2019-08-26
def disaggregate_sum(data_list, makeup, deterministic = True, simulations=50):
    """
    Take aggregated sums for a set of geographies and disaggregate into new sums for a different set of geographies

    Args:
        data_list (list of one-item lists of dictionaries):
            One entry per old region; the dictionary at index 0 of each entry should have two keys:
                * n (int): The number of people, households or other unit in the old region
                * moe (float): the margin of error on the n for the old region
        makeup (list of lists): inner lists are length of new geographies, outer list is length of old geographies,
                                the values in inner list j partition old region j into each of the new regions
        deterministic (bool): When True (the default) only the point estimates are computed and the
            margin of error is returned as None. When False the margins of error are approximated by simulation.
        simulations (int): Number of random draws used when ``deterministic`` is False; ignored otherwise.
    Returns:
        A two-item tuple (estimate of n values for new regions, margins of error for new n values).
        The first item is a numpy array; the second is a numpy array, or None when ``deterministic`` is True.
    Raises:
        ValueError: If ``deterministic`` is False and ``simulations`` is less than 1.
    Warns:
        PartitionWarning: If any inner list of ``makeup`` does not sum to one (within 0.001).
    Examples:
        >>> disaggregate_sum(data_list, makeup)
        (array([1298.1, 1528.9]), None)
        >>> disaggregate_sum(data_list, makeup, deterministic = False)
        (array([1298.502, 1528.938]), array([13.933, 27.603]))
    """
    # Validate the partitions that were actually passed in (not a notebook-level global).
    for shares in makeup:
        if not math.isclose(sum(shares), 1, abs_tol = .001):
            warnings.warn("at least one of your partitions does not sum to one", PartitionWarning)

    def _totals_for_new_regions(counts):
        # Rows are old regions, columns are new regions; each cell is the part of an old
        # region's count that goes to a new region, so column sums are the new-region totals.
        allocations = numpy.vstack(
            [[count * share for share in makeup[i]] for i, count in enumerate(counts)]
        )
        return allocations.sum(axis=0)

    # if you just want an estimate and ignore margin of error
    if deterministic:
        return _totals_for_new_regions([region[0]['n'] for region in data_list]), None

    # otherwise deal with margin of error via simulation
    if simulations < 1:
        raise ValueError("simulations must be at least 1 when deterministic is False")

    simulation_results = []
    for _ in range(simulations):
        simulated_counts = []
        for region in data_list:
            se = region[0]['moe'] / 1.645  # convert moe to se
            # use the se to introduce randomness into the count, then clean it up to a whole number;
            # draws come from numpy's global random state, so seed that for reproducible results
            simulated_counts.append(int(round(numpy.random.normal(region[0]['n'], se))))
        simulation_results.append(_totals_for_new_regions(simulated_counts))

    ss = numpy.vstack(simulation_results)  # one row per simulation, one column per new region
    est = ss.mean(axis=0)  # mean across simulations
    upper = numpy.quantile(ss, 0.95, axis=0) - est  # distance up to the higher quantile
    lower = est - numpy.quantile(ss, 0.05, axis=0)  # distance down to the lower quantile
    margin_of_error = numpy.maximum(upper, lower)  # take the larger of the two to be conservative
    return est, margin_of_error

In [53]:
# Point estimates only: with deterministic = True the second item of the result is None
# and the `simulations` argument has no effect.
test =disaggregate_sum(test_input, makeup_input, deterministic = True, simulations=50)
test

(array([1298.1, 1528.9]), None)

In [52]:
# Compare the disaggregated estimates in test[0] against the original regions' val and moe.
information_loss(test_input, makeup_input, test[0])

([0.0, 0.0], 0.0)