# Proof of Concept: Approximate Mean

In [None]:
# Household-income brackets for the example table. Each row of the inner table is
# (bracket minimum, bracket maximum, household count `n`, margin of error `moe`);
# the comprehension turns every row into the dictionary shape used by the cells below.
range_list = [
    dict(min=lower, max=upper, n=count, moe=margin)
    for lower, upper, count, margin in (
        (2499, 9999, 7942251, 17662),
        (10000, 14999, 5768114, 16409),
        (15000, 19999, 5727180, 16801),
        (20000, 24999, 5910725, 17864),
        (25000, 29999, 5619002, 16113),
        (30000, 34999, 5711286, 15891),
        (35000, 39999, 5332778, 16488),
        (40000, 44999, 5354520, 15415),
        (45000, 49999, 4725195, 16890),
        (50000, 59999, 9181800, 20965),
        (60000, 74999, 11818514, 30723),
        (75000, 99999, 14636046, 49159),
        (100000, 124999, 10273788, 47842),
        (125000, 149999, 6428069, 37952),
        (150000, 199999, 6931136, 37236),
        (200000, 250001, 7465517, 42206),
    )
]


The example data above come from table B19001 of the 2017 ACS 5-year variance replicate estimates, available [here](https://www2.census.gov/programs-surveys/acs/replicate_estimates/2017/data/5-year/010/).

HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2017 INFLATION-ADJUSTED DOLLARS)

## Generate Own Replicates

In [None]:
import numpy

# Number of Monte Carlo iterations; each iteration produces one simulated mean.
N_SIMULATIONS = 50

# Arbitrary fixed seed so the simulated means (and every bound derived from them)
# come out the same on each Restart & Run All.
rng = numpy.random.RandomState(2017)

range_list.sort(key=lambda x: x['min'])


simR = []
for _ in range(N_SIMULATIONS):  ## simulation
    test = []
    track_n = []
    for range_ in range_list:
        se = range_['moe'] / 1.645  ## convert the 90% moe to a standard error
        ## use the moe to introduce randomness into the number of households in the bin;
        ## clamp at zero because a negative sample size cannot be drawn
        nn = max(0, int(round(rng.normal(range_['n'], se))))
        ## draw one value per household within the bin, assuming a uniform distribution
        test.append(rng.uniform(range_['min'], range_['max'], size=nn).sum())
        track_n.append(nn)
    simR.append(sum(test) / sum(track_n))  ## mean for this simulation iteration
## Why so slow? Every iteration draws one uniform value per household, roughly
## 119 million draws per pass (the sum of the bin counts), repeated N_SIMULATIONS times.

In [None]:
numpy.mean(simR) ## mean of means


In [None]:
numpy.quantile(simR,0.05) ## uncertainty lower bound 90%


In [None]:
numpy.quantile(simR,0.95) ## Uncertainty upper bound 90%

In [None]:
# The 5th and 95th percentiles bound a 90% interval, so the larger distance from the
# mean to either percentile is a 90% margin of error (1.645 standard errors), not a
# standard error. Divide by 1.645 so `se` really is a standard error and multiplying
# it by 1.645 afterwards does not apply the factor twice.
sim_mean = numpy.mean(simR)
se = max(numpy.quantile(simR, 0.95) - sim_mean, sim_mean - numpy.quantile(simR, 0.05)) / 1.645
se

In [None]:
moe=1.645*se
moe

Compare to median:

In [20]:
# The notebook never defines the exception and warning classes that
# approximate_median raises, so they are declared here to keep the function usable
# on a fresh kernel.
class DataError(Exception):
    """Raised when the supplied ranges cannot support the requested estimate."""


class DesignFactorWarning(Warning):
    """Issued when no design factor is supplied, so no margin of error is returned."""


class SamplingPercentageWarning(Warning):
    """Issued when no sampling percentage is supplied, so no margin of error is returned."""


def _interpolate_bound(ranges, n, p, label):
    """Linearly interpolate the data value found at cumulative proportion ``p``.

    ``ranges`` must be sorted and carry the ``n_min``/``n_max`` cumulative counts;
    ``label`` ("lower" or "upper") is only used in the error message.
    """
    p_n = n * p
    match = next(
        ((i, d) for i, d in enumerate(ranges) if d['n_min'] <= p_n <= d['n_max']),
        None
    )
    if match is None:
        raise DataError(f"The n's {label} p value {p_n} does not fall within a data range.")
    i, range_ = match

    # The interval runs from this range's minimum to the next range's minimum
    # (or to this range's own maximum when it is the last range).
    a1 = range_['min']
    a2 = ranges[i + 1]['min'] if i + 1 < len(ranges) else range_['max']

    # Cumulative proportions at the two edges. The next range's n_min always equals
    # this range's n_max, so no lookup of the neighbour is needed.
    c1 = range_['n_min'] / n
    c2 = range_['n_max'] / n
    return ((p - c1) / (c2 - c1)) * (a2 - a1) + a1


def approximate_median(range_list, design_factor=None, sampling_percentage=None):
    """
    Estimate a median and approximate the margin of error.

    Follows the U.S. Census Bureau's `official guidelines`_ for estimation using a design factor.
    Useful for generating medians for measures like household income and age when aggregating census geographies.

    The input list and its dictionaries are left untouched: sorting and the cumulative
    bookkeeping are performed on copies.

    Args:
        range_list (list): A list of dictionaries that divide the full range of data values into continuous categories.
            Each dictionary should have three keys:
                * min (int): The minimum value of the range
                * max (int): The maximum value of the range
                * n (int): The number of people, households or other unit in the range
            The minimum value in the first range and the maximum value in the last range can be tailored to the dataset
            by using the "jam values" provided in the `American Community Survey's technical documentation`_.
        design_factor (float, optional): A statistical input used to tailor the standard error to the
            variance of the dataset. The Census Bureau publishes design factors as part of its PUMS Accuracy statement.
            Find the value for the dataset you are estimating by referring to `the bureau's reference material`_.
            If you do not provide this input, a margin of error will not be returned.
        sampling_percentage (float, optional): A statistical input used to correct variance for finite population.
            For example, the 1-year ACS is designed to be a 2.5% sample of the population, and the 1-year PUMS is
            designed to be a 1% sample of the population. You can multiply these percentages by 5 for the 5-year versions.
            If you do not provide this input, a margin of error will not be returned.

    Returns:
        A two-item tuple with the median followed by the approximated margin of error.
        The margin of error is None when design_factor or sampling_percentage is not provided.

        (42211.096153846156, 10153.200960954948)

    Raises:
        DataError: If the ranges contain no observations, or if the lower or upper
            p value does not fall within any data range.

    Warns:
        DesignFactorWarning: If design_factor is not provided.
        SamplingPercentageWarning: If sampling_percentage is not provided.

    Examples:
        Estimating the median for a range of median household incomes.

        >>> income = [
        ...     dict(min=2499, max=9999, n=186),
        ...     dict(min=10000, max=14999, n=78),
        ...     dict(min=15000, max=19999, n=98),
        ...     dict(min=20000, max=24999, n=287),
        ...     dict(min=25000, max=29999, n=142),
        ...     dict(min=30000, max=34999, n=90),
        ...     dict(min=35000, max=39999, n=107),
        ...     dict(min=40000, max=44999, n=104),
        ...     dict(min=45000, max=49999, n=178),
        ...     dict(min=50000, max=59999, n=106),
        ...     dict(min=60000, max=74999, n=177),
        ...     dict(min=75000, max=99999, n=262),
        ...     dict(min=100000, max=124999, n=77),
        ...     dict(min=125000, max=149999, n=100),
        ...     dict(min=150000, max=199999, n=58),
        ...     dict(min=200000, max=250001, n=18)
        ... ]
        >>> approximate_median(income, design_factor=1.5, sampling_percentage=5)
        (42211.096153846156, 10153.200960954948)

    .. _official guidelines:
        https://www.documentcloud.org/documents/6165603-2013-2017AccuracyPUMS.html#document/p18
    .. _American Community Survey's technical documentation:
        https://www.documentcloud.org/documents/6165752-2017-SummaryFile-Tech-Doc.html#document/p20/a508561
    .. _the bureau's reference material:
        https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.html
    """
    # Imported locally because the notebook does not import these modules before
    # this function is defined.
    import math
    import warnings

    # Sort shallow copies so the caller's list order and dictionaries stay untouched
    ranges = sorted((dict(d) for d in range_list), key=lambda x: x['min'])

    # For each range calculate its min and max value along the universe's scale
    cumulative_n = 0
    for range_ in ranges:
        range_['n_min'] = cumulative_n
        cumulative_n += range_['n']
        range_['n_max'] = cumulative_n

    # What is the total number of observations in the universe?
    n = cumulative_n

    # An empty list or all-zero counts has no median; fail with the module's own error type
    if n <= 0:
        raise DataError("The ranges contain no observations, so a median cannot be estimated.")

    # What is the estimated midpoint of the n?
    n_midpoint = n / 2.0

    # Now use those to determine which group contains the midpoint.
    n_midpoint_range = next(d for d in ranges if d['n_min'] <= n_midpoint <= d['n_max'])

    # How many households in the midrange are needed to reach the midpoint?
    n_midrange_gap = n_midpoint - n_midpoint_range['n_min']

    # What is the proportion of the group that would be needed to get the midpoint?
    n_midrange_gap_percent = n_midrange_gap / n_midpoint_range['n']

    # Apply this proportion to the width of the midrange
    n_midrange_gap_adjusted = (n_midpoint_range['max'] - n_midpoint_range['min']) * n_midrange_gap_percent

    # Estimate the median
    estimated_median = n_midpoint_range['min'] + n_midrange_gap_adjusted

    # If there's no design factor, we can't calculate a margin of error
    if not design_factor:
        # Let's throw a warning, but still return the median
        warnings.warn("", DesignFactorWarning)
        return estimated_median, None

    # If there's no sampling percentage, we can't calculate a margin of error
    if not sampling_percentage:
        # Let's throw a warning, but still return the median
        warnings.warn("", SamplingPercentageWarning)
        return estimated_median, None

    # Get the standard error for this dataset (expressed as a proportion, hence the / 100)
    standard_error = (design_factor * math.sqrt(((100 - sampling_percentage) / (n * sampling_percentage)) * (50**2))) / 100

    # Use the standard error to calculate the p values
    p_lower = (.5 - standard_error)
    p_upper = (.5 + standard_error)

    # Interpolate the data values at the lower and upper p values
    lower_bound = _interpolate_bound(ranges, n, p_lower, "lower")
    upper_bound = _interpolate_bound(ranges, n, p_upper, "upper")

    # Calculate the standard error of the median
    standard_error_median = 0.5 * (upper_bound - lower_bound)

    # Calculate the margin of error at the 90% confidence level
    margin_of_error = 1.645 * standard_error_median

    # Return the result
    return estimated_median, margin_of_error


In [None]:
import math
approximate_median(range_list, design_factor=1.5, sampling_percentage=1)

## Using Variance Replicates

The Census Bureau's variance replicate estimates can replace the simulated bin counts (`nn`) used above.

In [2]:
import pandas

In [3]:
df = pandas.read_csv('B19001.csv')
# Keep only the variance-replicate columns and drop the first three rows by position
# (presumably title/total rows rather than income brackets — TODO confirm against the CSV).
var_rep = df.filter(regex='Var_Rep').iloc[3:]
var_rep

Unnamed: 0,Var_Rep1,Var_Rep2,Var_Rep3,Var_Rep4,Var_Rep5,Var_Rep6,Var_Rep7,Var_Rep8,Var_Rep9,Var_Rep10,...,Var_Rep71,Var_Rep72,Var_Rep73,Var_Rep74,Var_Rep75,Var_Rep76,Var_Rep77,Var_Rep78,Var_Rep79,Var_Rep80
3,7947131.0,7939797.0,7940458.0,7935285.0,7936316.0,7944845.0,7942954.0,7942681.0,7933266.0,7940823.0,...,7936880.0,7939616.0,7944919.0,7936129.0,7938589.0,7942106.0,7931527.0,7941768.0,7951035.0,7940870.0
4,5767793.0,5771394.0,5765966.0,5765894.0,5770267.0,5759185.0,5757174.0,5771644.0,5765285.0,5772535.0,...,5778645.0,5769788.0,5768429.0,5765843.0,5761868.0,5754665.0,5774419.0,5769981.0,5763020.0,5774246.0
5,5733718.0,5722926.0,5718443.0,5728392.0,5733420.0,5733969.0,5722348.0,5722920.0,5720309.0,5726584.0,...,5724386.0,5726665.0,5733084.0,5728016.0,5723107.0,5732806.0,5727032.0,5720968.0,5732669.0,5722820.0
6,5913655.0,5904578.0,5912424.0,5912590.0,5905584.0,5907430.0,5913763.0,5912571.0,5907015.0,5903192.0,...,5910439.0,5915728.0,5921530.0,5913854.0,5911197.0,5914823.0,5896565.0,5915167.0,5914580.0,5918477.0
7,5620944.0,5619209.0,5619735.0,5622877.0,5617609.0,5620789.0,5623380.0,5626248.0,5625514.0,5620982.0,...,5616189.0,5623396.0,5622701.0,5620332.0,5622363.0,5617253.0,5619061.0,5630177.0,5611937.0,5616960.0
8,5711723.0,5706211.0,5698929.0,5718930.0,5712721.0,5718687.0,5712087.0,5707267.0,5712064.0,5705839.0,...,5712767.0,5712702.0,5715423.0,5713763.0,5711406.0,5714805.0,5720609.0,5706239.0,5713273.0,5717054.0
9,5335921.0,5331746.0,5339884.0,5330815.0,5331101.0,5330901.0,5329483.0,5334145.0,5332606.0,5330824.0,...,5330708.0,5331466.0,5324041.0,5338324.0,5334169.0,5342365.0,5343135.0,5331498.0,5328856.0,5328251.0
10,5357712.0,5356783.0,5348232.0,5361187.0,5350706.0,5340816.0,5357452.0,5353906.0,5352459.0,5361624.0,...,5348177.0,5345627.0,5359709.0,5355147.0,5359372.0,5359496.0,5359973.0,5361390.0,5351739.0,5351352.0
11,4725411.0,4735920.0,4730954.0,4724805.0,4724105.0,4722969.0,4728018.0,4718018.0,4734879.0,4721769.0,...,4729634.0,4731646.0,4728507.0,4722964.0,4729020.0,4726803.0,4724054.0,4725492.0,4735518.0,4724608.0
12,9186182.0,9177871.0,9180241.0,9175400.0,9183269.0,9187609.0,9182623.0,9187261.0,9180524.0,9181429.0,...,9203967.0,9184719.0,9178978.0,9173057.0,9186432.0,9182415.0,9178613.0,9185266.0,9182987.0,9180805.0


Sums

In [None]:
# Total of each variance-replicate column, in column order.
rep_sum = [sum(var_rep[replicate]) for replicate in var_rep.columns]


In [None]:
# Total of the published estimates, skipping the same three leading rows
# that are excluded from the replicate columns.
est_sum = sum(df['estimate'].iloc[3:])

In [None]:
import math

# Squared deviation of each replicate total from the published total.
test = [(replicate_total - est_sum) ** 2 for replicate_total in rep_sum]
# ACS variance-replicate formula: 4/80 times the sum of squared deviations
# across the 80 replicates.
variance = sum(test) * 4 / 80
# 1.645 standard errors gives the margin of error at the 90% confidence level.
moe = 1.645 * math.sqrt(variance)


In [None]:
est_sum

In [None]:
moe

Median

In [None]:
range_list = [
    dict(min=2499, max=9999, n=7942251, moe=17662),
    dict(min=10000, max=14999, n=5768114, moe=16409),
    dict(min=15000, max=19999, n=5727180, moe=16801),
    dict(min=20000, max=24999, n=5910725, moe=17864),
    dict(min=25000, max=29999, n=5619002, moe=16113),
    dict(min=30000, max=34999, n=5711286, moe=15891),
    dict(min=35000, max=39999, n=5332778, moe=16488),
    dict(min=40000, max=44999, n=5354520, moe=15415),
    dict(min=45000, max=49999, n=4725195, moe=16890),
    dict(min=50000, max=59999, n=9181800, moe=20965),
    dict(min=60000, max=74999, n=11818514, moe=30723),
    dict(min=75000, max=99999, n=14636046, moe=49159),
    dict(min=100000, max=124999, n=10273788, moe=47842),
    dict(min=125000, max=149999, n=6428069, moe=37952),
    dict(min=150000, max=199999, n=6931136, moe=37236),
    dict(min=200000, max=250001, n=7465517, moe=42206)
]
# Re-number the replicate rows from 0 so they can be paired with range_list by
# position, here and in the Mean section below. Rebinding the name instead of
# using inplace=True keeps this cell safe to re-run.
var_rep = var_rep.reset_index(drop=True)

# Median of the published estimates. Only design_factor is passed, so
# approximate_median returns None for the margin of error; the replicate-based
# margin of error is computed in the following cells.
med_est = approximate_median(range_list, design_factor=1.5)

# Median for every variance replicate.
med_rep = []
for col in var_rep.columns:
    # Build fresh bracket dictionaries for this replicate instead of overwriting
    # range_list[i]['n'], which would leave range_list holding the last replicate's
    # counts rather than the published estimates.
    replicate_ranges = [
        dict(range_, n=count)
        for range_, count in zip(range_list, var_rep[col])
    ]
    med_rep.append(approximate_median(replicate_ranges, design_factor=1.5)[0])

In [None]:
# Accumulate the squared deviation of each replicate median from the
# median of the published estimates (first element of med_est).
val = 0
for replicate_median in med_rep:
    val += (replicate_median - med_est[0]) ** 2

In [None]:
moe = 1.645 * math.sqrt(val*4/80)

In [None]:
med_est

In [None]:
moe

Mean

In [4]:
range_list = [
    dict(min=2499, max=9999, n=7942251, moe=17662),
    dict(min=10000, max=14999, n=5768114, moe=16409),
    dict(min=15000, max=19999, n=5727180, moe=16801),
    dict(min=20000, max=24999, n=5910725, moe=17864),
    dict(min=25000, max=29999, n=5619002, moe=16113),
    dict(min=30000, max=34999, n=5711286, moe=15891),
    dict(min=35000, max=39999, n=5332778, moe=16488),
    dict(min=40000, max=44999, n=5354520, moe=15415),
    dict(min=45000, max=49999, n=4725195, moe=16890),
    dict(min=50000, max=59999, n=9181800, moe=20965),
    dict(min=60000, max=74999, n=11818514, moe=30723),
    dict(min=75000, max=99999, n=14636046, moe=49159),
    dict(min=100000, max=124999, n=10273788, moe=47842),
    dict(min=125000, max=149999, n=6428069, moe=37952),
    dict(min=150000, max=199999, n=6931136, moe=37236),
    dict(min=200000, max=250001, n=7465517, moe=42206)
]


range_list.sort(key=lambda x: x['min'])

# Arbitrary fixed seed so the estimated mean is the same on every Restart & Run All.
rng = numpy.random.RandomState(19001)

test = []
track_n = []
for range_ in range_list:
    ## draw one value per household within the bin, assuming a uniform distribution,
    ## and keep only the bin total
    test.append(rng.uniform(range_['min'], range_['max'], size=range_['n']).sum())
    track_n.append(range_['n'])

# Mean = total simulated income across all bins / total households.
mean_est = sum(test) / sum(track_n)


74585.82871389529

In [17]:
# Arbitrary fixed seed so the replicate means are the same on every Restart & Run All.
rng = numpy.random.RandomState(80)

simR = []
for col in var_rep.columns:
    test = []
    track_n = []
    for i in range(len(range_list)):
        # Pair bracket i with the i-th replicate row by position (.iloc). A label
        # lookup is only correct if var_rep was re-indexed from 0 by an earlier cell.
        # NOTE(review): the original author flagged this pairing as suspect —
        # confirm that the replicate rows line up one-to-one with range_list.
        count = int(var_rep[col].iloc[i])
        ## draw one value per household within the bin, assuming a uniform distribution
        test.append(rng.uniform(range_list[i]['min'], range_list[i]['max'], size=count).sum())
        # Use the same whole-number count for the denominator as for the draws.
        track_n.append(count)
    simR.append(sum(test) / sum(track_n))  ## mean for this replicate

In [18]:
# Squared deviation of each replicate mean from the estimated mean.
test = [(replicate_mean - mean_est) ** 2 for replicate_mean in simR]
# ACS variance-replicate formula: 4/80 times the sum of squared deviations
# across the 80 replicates.
variance = sum(test) * 4 / 80
# 1.645 standard errors gives the margin of error at the 90% confidence level.
moe = 1.645 * math.sqrt(variance)

117.74477505394293

In [None]:
mean_est

In [None]:
moe