# Stratification

In [2]:
# devtools::install_github("sdaza/sampler")
library(sampler)
library(data.table)
library(survey)

Loading required package: grid

Loading required package: Matrix

Loading required package: survival


Attaching package: ‘survey’


The following object is masked from ‘package:graphics’:

    dotchart




# `sampler` package

- Allocation for stratification
- Function to get the margin of error (MOE)

In [3]:
# Turn the `chile` example data (presumably shipped with `sampler` -- confirm)
# into a data.table and display it
chile = as.data.table(chile)
chile

reg,pob,pr
<int>,<dbl>,<dbl>
1,328782,0.3
2,613328,0.4
3,308247,0.5
4,759228,0.5
5,1808300,0.5
6,910577,0.6
7,1035593,0.3
8,2100494,0.1
9,983499,0.2
10,834714,0.5


# Allocation strata function

In [4]:
# Open the help page for astrata(), the stratum allocation function
?astrata

astrata                package:sampler                 R Documentation

_S_t_r_a_t_u_m _a_l_l_o_c_a_t_i_o_n

_D_e_s_c_r_i_p_t_i_o_n:

     Allocates cases to strata using different methods.

_U_s_a_g_e:

     astrata(samplesize, N, method = "mixed", min = 1, wp = 1, e = NULL,
       deff = 1, rr = 1, p = 0.5)
     
_A_r_g_u_m_e_n_t_s:

samplesize: Total sample size expected. Not needed when the method used
          is ‘error’.

       N: Population size. If the population size is not specified,
          MOEs are estimated assumming an infinite population.

  method: Allocation can be assigned using different methods. The
          default method is *mixed*.

            • *mixed*: Combines equal and proportional allocation
              across strata. The weight of the proportional assigment
              needs to be defined using ‘wp’. If ‘wp = 1’ (default
              value), the strata allocation is proportional. If ‘wp =
              0’

# MOE estimation

- Function `serrst`

In [33]:
# Proportional allocation of 1000 cases (wp = 1), using the same
# maximum-variance proportion (0.5) in every stratum
chile[, `:=`(
    ssize = astrata(samplesize = 1000, N = pob, wp = 1),
    same_pr = 0.5
)]

# MOE from the stratified (STR) formula
with(chile, serrst(n = ssize, N = pob, p = same_pr))

In [34]:
# MOE from the simple-random-sampling (SRS) formula for the same
# total sample size and the whole population
serr(1000, N = chile[, sum(pob)], p = 0.5)

## Fixed allocation


In [8]:
# Fixed (simple) allocation: wp = 0 gives no weight to the proportional part
chile[, ssize := astrata(samplesize = 1000, N = pob, wp = 0)]
# MOE using the stratum-specific proportions stored in `pr`
with(chile, serrst(n = ssize, N = pob, p = pr))

In [35]:
# Same MOE calculation, now passing a design effect of 1.3
with(chile, serrst(n = ssize, N = pob, p = same_pr, deff = 1.3))

In [36]:
# Effective sample size: the nominal sample size divided by the design effect
deff = 1.3
n = 1000
n / deff

## Error allocation

In [40]:
# Error-based allocation: stratum sizes derived from e = 0.11 (presumably the
# target margin of error per stratum -- confirm) and the proportions in `pr`
chile[, ssize := astrata(N = pob, method = "error", e = 0.11, p = pr)]
chile

reg,pob,pr,ssize,same_pr
<int>,<dbl>,<dbl>,<dbl>,<dbl>
1,328782,0.3,67,0.5
2,613328,0.4,76,0.5
3,308247,0.5,79,0.5
4,759228,0.5,79,0.5
5,1808300,0.5,79,0.5
6,910577,0.6,76,0.5
7,1035593,0.3,67,0.5
8,2100494,0.1,29,0.5
9,983499,0.2,51,0.5
10,834714,0.5,79,0.5


In [41]:
# Total sample size implied by the error-based allocation
chile[, sum(ssize)]

In [43]:
# Overall MOE for the error-based allocation
with(chile, serrst(n = ssize, N = pob, p = pr))

# Simulate some data

In [44]:
# Simulate a population made of four strata (a-d) that differ in size,
# mean and spread; the fixed seed makes the draws reproducible.
set.seed(12212022)
a = rnorm(13000, mean=4.1, sd=0.1)
b = rnorm(1500, mean=8.3, sd=0.3)
c = rnorm(7500, mean=1.7, sd=0.5)
d = rnorm(1000, mean=5.9, sd=0.1)

# One row per unit: id, stratum label and simulated value
values = c(a, b, c, d)
labels = rep(c("a", "b", "c", "d"), times = lengths(list(a, b, c, d)))
dt = data.table(id = seq_along(values), label = labels, values)
dt[, pop_strat := .N, by = label]   # population size of the unit's stratum
dt[, total_population := .N]        # overall population size
total_population = nrow(dt)
print(total_population)

# Sampling fraction for a target of 1000 cases, derived from the population
# size instead of a hard-coded 23000
dt[, sample_rate := 1000 / total_population]
# ceiling() rounds every stratum up, so the realised total can exceed 1000
dt[, sample_strat := ceiling(pop_strat * sample_rate)]

[1] 23000


In [45]:
# Show the first rows of the simulated population table
head(dt)

id,label,values,pop_strat,total_population,sample_rate,sample_strat
<int>,<chr>,<dbl>,<int>,<int>,<dbl>,<dbl>
1,a,4.13592,13000,23000,0.04347826,566
2,a,4.241181,13000,23000,0.04347826,566
3,a,4.029086,13000,23000,0.04347826,566
4,a,3.945607,13000,23000,0.04347826,566
5,a,4.094104,13000,23000,0.04347826,566
6,a,4.062808,13000,23000,0.04347826,566


In [46]:
# Stratified sample: within each label draw `sample_strat` rows at random,
# capped at the number of rows the stratum holds
sdt = dt[, .SD[sample(.N, size = min(sample_strat, .N))], by = label]

# Sanity checks: repeated ids and realised sample size
print(paste0("Any duplicates?: ", anyDuplicated(sdt[["id"]])))
print(paste0("Sample size: ", nrow(sdt)))

[1] "Any duplicates?: 0"
[1] "Sample size: 1003"


## Intro to package `survey`

In [47]:
# Declare two designs for the same sample:
#   d_str_0 declares no strata, with the whole population as fpc;
#   d_str_1 declares `label` as strata, with stratum sizes as fpc.
# NOTE(review): `sample_rate` is the nominal rate, while sample_strat was
# rounded up with ceiling(); confirm the small mismatch in probs is acceptable.
d_str_0 = svydesign(
    id = ~0,
    data = sdt,
    probs = ~sample_rate,
    fpc = ~total_population
)
d_str_1 = svydesign(
    id = ~0,
    data = sdt,
    strata = ~label,
    probs = ~sample_rate,
    fpc = ~pop_strat
)

In [48]:
# Display the stratified design object
d_str_1

Stratified Independent Sampling design
svydesign(id = ~0, data = sdt, strata = ~label, probs = ~sample_rate, 
    fpc = ~pop_strat)

In [49]:
# Mean of `values` with its design effect, under the design without strata
svymean(x = ~values, design = d_str_0, deff = TRUE)

           mean       SE   DEff
values 3.681517 0.054287 0.9999

In [50]:
# Mean of `values` with its design effect, under the stratified design
svymean(x = ~values, design = d_str_1, deff = TRUE)

            mean        SE   DEff
values 3.6815172 0.0093089 0.0294

## Why is this happening?

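The design effect is far below 1 because the strata are internally homogeneous (small standard deviations) while their means are far apart, so stratifying removes most of the variance. This is the code used to generate the strata: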

```
a = rnorm(13000, mean=4.1, sd=0.1)
b = rnorm(1500, mean=8.3, sd=0.3)
c = rnorm(7500, mean=1.7, sd=0.5)
d = rnorm(1000, mean=5.9, sd=0.1)
```

In [51]:
# Share of each stratum label in the full population
prop.table(table(dt[["label"]]))


         a          b          c          d 
0.56521739 0.06521739 0.32608696 0.04347826 

In [52]:
# Simple random sample of 1003 rows and its label shares
srs = dt[sample(.N, size = 1003)]
prop.table(table(srs[["label"]]))


         a          b          c          d 
0.56131605 0.05782652 0.33998006 0.04087737 

In [53]:
# Stratified sample (by label) and its label shares
strs = dt[, .SD[sample(.N, size = min(sample_strat, .N))], by = label]
prop.table(table(strs[["label"]]))


         a          b          c          d 
0.56430708 0.06580259 0.32602193 0.04386839 

# Simple allocation (same size per stratum)

In [54]:
# Equal allocation: split 1000 cases evenly across the distinct labels
dt[, sample_size_fixed := 1000 / uniqueN(label)]
# Sampling rate implied for each stratum
dt[, sample_rate_fixed := sample_size_fixed / pop_strat]
# Draw the per-stratum sample, capped at the stratum size
sdts = dt[, .SD[sample(.N, size = min(sample_size_fixed, .N))], by = label]

prop.table(table(sdts[["label"]]))


   a    b    c    d 
0.25 0.25 0.25 0.25 

In [55]:
# Unweighted mean of the equal-allocation sample
mean(sdts[["values"]])

In [56]:
# Weights: inverse of each row's stratum sampling rate; then their sum
sdts[, w := 1 / sample_rate_fixed]
sum(sdts[["w"]])

In [57]:
# Mean of `values` weighted by `w`
with(sdts, weighted.mean(values, w))

In [58]:
# Stratified design for the equal-allocation sample, using the
# stratum-specific sampling rates and stratum sizes as fpc
d_str_2 = svydesign(
    id = ~0,
    data = sdts,
    strata = ~label,
    probs = ~sample_rate_fixed,
    fpc = ~pop_strat
)

In [59]:
# Mean of `values` with its design effect, under the equal-allocation design
svymean(x = ~values, design = d_str_2, deff = TRUE)

           mean       SE   DEff
values 3.661131 0.010599 0.0373

# Note: Systematic sampling

In [60]:
# Systematic-sampling illustration: start at unit 3 and step by the
# interval k, rounding after every step.
# NOTE(review): k looks like N / n = 340 / 39 -- confirm.
k = 340/39

# Preallocate the selected units instead of growing vectors inside the loop
n_units = 38
unit = numeric(n_units)
unit[1] = 3

for (i in 2:n_units) {
    # The previous position is already rounded, so adding k (about 8.72) and
    # rounding again always moves it by 9; the fraction never accumulates.
    # NOTE(review): if a fractional interval was intended, round the
    # cumulative unrounded positions (start + j * k) instead -- confirm.
    v = round(unit[i - 1] + k)
    unit[i] = v
}

# Step between consecutive selected units
diff = unit[-1] - unit[-n_units]

In [61]:
# Show the sampling interval, then the first and last selected units
print(k)
head(unit)
tail(unit)

[1] 8.717949
