Skip to content

Commit

Permalink
Merge branch 'next_release' into basic_raking
Browse files Browse the repository at this point in the history
  • Loading branch information
smishr committed Apr 14, 2023
2 parents fa5bf0a + 0703cfc commit b22e222
Show file tree
Hide file tree
Showing 11 changed files with 360 additions and 32 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Survey"
uuid = "c1a98b4d-6cd2-47ec-b9e9-69b59c35373c"
authors = ["Ayush Patnaik <ayushpatnaik@gmail.com>"]
version = "0.1.0"
version = "0.2.0"

[deps]
AlgebraOfGraphics = "cbdf2221-f076-402e-a563-3d30da359d67"
Expand Down
2 changes: 2 additions & 0 deletions docs/src/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ SurveyDesign
ReplicateDesign
load_data
bootweights
jackknifeweights
jackknife_variance
mean
total
quantile
Expand Down
2 changes: 2 additions & 0 deletions src/Survey.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ include("boxplot.jl")
include("show.jl")
include("ratio.jl")
include("by.jl")
include("jackknife.jl")
include("raking.jl")

export load_data
Expand All @@ -36,6 +37,7 @@ export hist, sturges, freedman_diaconis
export boxplot
export bootweights
export ratio
export jackknifeweights, jackknife_variance
export raking

end
42 changes: 26 additions & 16 deletions src/bootstrap.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,27 +25,18 @@ replicates: 1000
function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwister(1234))
stratified = groupby(design.data, design.strata)
H = length(keys(stratified))
substrata_dfs = DataFrame[]
substrata_dfs = Vector{DataFrame}(undef, H)
for h = 1:H
substrata = DataFrame(stratified[h])
cluster_sorted = sort(substrata, design.cluster)
psus = unique(cluster_sorted[!, design.cluster])
npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus]
nh = length(psus)
cluster_sorted_designcluster = cluster_sorted[!, design.cluster]
cluster_weights = cluster_sorted[!, design.weights]
for replicate = 1:replicates
randinds = rand(rng, 1:(nh), (nh - 1))
cluster_sorted[!, "replicate_"*string(replicate)] =
vcat(
[
fill((count(==(i), randinds)) * (nh / (nh - 1)), npsus[i]) for
i = 1:nh
]...,
) .* cluster_weights
end
push!(substrata_dfs, cluster_sorted)
# Perform the inner loop in a type-stable function to improve runtime.
_bootweights_cluster_sorted!(cluster_sorted, cluster_weights,
cluster_sorted_designcluster, replicates, rng)
substrata_dfs[h] = cluster_sorted
end
df = vcat(substrata_dfs...)
df = reduce(vcat, substrata_dfs)
return ReplicateDesign(
df,
design.cluster,
Expand All @@ -60,3 +51,22 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis
[Symbol("replicate_"*string(replicate)) for replicate in 1:replicates]
)
end

function _bootweights_cluster_sorted!(cluster_sorted,
cluster_weights, cluster_sorted_designcluster, replicates, rng)

psus = unique(cluster_sorted_designcluster)
npsus = [count(==(i), cluster_sorted_designcluster) for i in psus]
nh = length(psus)
for replicate = 1:replicates
randinds = rand(rng, 1:(nh), (nh - 1))
cluster_sorted[!, "replicate_"*string(replicate)] =
reduce(vcat,
[
fill((count(==(i), randinds)) * (nh / (nh - 1)), npsus[i]) for
i = 1:nh
]
) .* cluster_weights
end
cluster_sorted
end
7 changes: 3 additions & 4 deletions src/by.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
function bydomain(x::Symbol, domain::Symbol, design::SurveyDesign, func::Function)
function bydomain(x::Symbol, domain, design::SurveyDesign, func::Function)
gdf = groupby(design.data, domain)
nd = length(unique(design.data[!, domain]))
X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic)
return X
end

function bydomain(x::Symbol, domain::Symbol, design::ReplicateDesign, func::Function)
function bydomain(x::Symbol, domain, design::ReplicateDesign, func::Function)
gdf = groupby(design.data, domain)
nd = length(unique(design.data[!, domain]))
nd = length(gdf)
X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic)
Xt_mat = Array{Float64,2}(undef, (nd, design.replicates))
for i = 1:design.replicates
Expand Down
159 changes: 159 additions & 0 deletions src/jackknife.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
"""
jackknifeweights(design::SurveyDesign)
Delete-1 Jackknife algorithm for replication weights from sampling weights. The replicate weights are calculated using the following formula.
```math
w_{i(hj)} =
\\begin{cases}
w_i\\quad\\quad &\\text{if observation unit }i\\text{ is not in stratum }h\\\\
0\\quad\\quad &\\text{if observation unit }i\\text{ is in psu }j\\text{of stratum }h\\\\
\\dfrac{n_h}{n_h - 1}w_i \\quad\\quad &\\text{if observation unit }i\\text{ is in stratum }h\\text{ but not in psu }j\\\\
\\end{cases}
```
In the above formula, ``w_i`` represent the original weights, ``w_{i(hj)}`` represent the replicate weights when the ``j``th PSU from cluster ``h`` is removed, and ``n_h`` represents the number of unique PSUs within cluster ``h``. Replicate weights are added as columns to `design.data`, and these columns have names of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weight columns.
# Examples
```jldoctest
julia> using Survey;
julia> apistrat = load_data("apistrat");
julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
julia> rstrat = jackknifeweights(dstrat)
ReplicateDesign:
data: 200×244 DataFrame
strata: stype
[E, E, E … M]
cluster: none
popsize: [4420.9999, 4420.9999, 4420.9999 … 1018.0]
sampsize: [100, 100, 100 … 50]
weights: [44.21, 44.21, 44.21 … 20.36]
allprobs: [0.0226, 0.0226, 0.0226 … 0.0491]
type: jackknife
replicates: 200
```
# Reference
pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
"""
function jackknifeweights(design::SurveyDesign)
sort!(design.data, [design.strata, design.cluster])
df = design.data

stratified_gdf = groupby(df, design.strata)
replicate_index = 0
for (key, subgroup) in pairs(stratified_gdf)
stratum = key[design.strata] # current stratum
psus_in_stratum = unique(subgroup[!, design.cluster])
nh = length(psus_in_stratum)

for psu in psus_in_stratum
replicate_index += 1
colname = "replicate_"*string(replicate_index)

# Initialise replicate_i with original sampling weights
df[!, colname] = Vector(df[!, design.weights])

# getting indexes
same_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .== psu)
different_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .!== psu)

# scaling weights appropriately
df[same_psu, colname] .*= 0
df[different_psu, colname] .*= nh/(nh - 1)
end
end

return ReplicateDesign(
df,
design.cluster,
design.popsize,
design.sampsize,
design.strata,
design.weights,
design.allprobs,
design.pps,
"jackknife",
UInt(replicate_index),
[Symbol("replicate_"*string(index)) for index in 1:replicate_index]
)
end

"""
jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign)
Compute variance of column `x` for the given `func` using the Jackknife method. The formula to compute this variance is the following.
```math
\\hat{V}_{\\text{JK}}(\\hat{\\theta}) = \\sum_{h = 1}^H \\dfrac{n_h - 1}{n_h}\\sum_{j = 1}^{n_h}(\\hat{\\theta}_{(hj)} - \\hat{\\theta})^2
```
Above, ``\\hat{\\theta}`` represents the estimator computed using the original weights, and ``\\hat{\\theta_{(hj)}}`` represents the estimator computed from the replicate weights obtained when PSU ``j`` from cluster ``h`` is removed.
# Examples
```jldoctest
julia> using Survey, StatsBase
julia> apistrat = load_data("apistrat");
julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
julia> rstrat = jackknifeweights(dstrat)
ReplicateDesign:
data: 200×244 DataFrame
strata: stype
[E, E, E … M]
cluster: none
popsize: [4420.9999, 4420.9999, 4420.9999 … 1018.0]
sampsize: [100, 100, 100 … 50]
weights: [44.21, 44.21, 44.21 … 20.36]
allprobs: [0.0226, 0.0226, 0.0226 … 0.0491]
type: jackknife
replicates: 200
julia> weightedmean(x, y) = mean(x, weights(y));
julia> jackknife_variance(:api00, weightedmean, rstrat)
1×2 DataFrame
Row │ estimator SE
│ Float64 Float64
─────┼────────────────────
1 │ 662.287 9.53613
```
# Reference
pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
"""
function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign)
df = design.data
# sort!(df, [design.strata, design.cluster])
stratified_gdf = groupby(df, design.strata)

# estimator from original weights
θ = func(df[!, x], df[!, design.weights])

variance = 0
replicate_index = 1
for subgroup in stratified_gdf
psus_in_stratum = unique(subgroup[!, design.cluster])
nh = length(psus_in_stratum)
cluster_variance = 0
for psu in psus_in_stratum
# get replicate weights corresponding to current stratum and psu
rep_weights = df[!, "replicate_"*string(replicate_index)]

# estimator from replicate weights
θhj = func(df[!, x], rep_weights)

cluster_variance += ((nh - 1)/nh)*(θhj - θ)^2
replicate_index += 1
end
variance += cluster_variance
end

return DataFrame(estimator = θ, SE = sqrt(variance))
end

22 changes: 14 additions & 8 deletions src/mean.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,19 @@ julia> mean(:api00, bclus1)
```
"""
function mean(x::Symbol, design::ReplicateDesign)
X = mean(design.data[!, x], weights(design.data[!, design.weights]))
Xt = [
mean(design.data[!, x], weights(design.data[!, "replicate_"*string(i)])) for
i = 1:design.replicates
]
variance = sum((Xt .- X) .^ 2) / design.replicates
DataFrame(mean = X, SE = sqrt(variance))
if design.type == "bootstrap"
θ̂ = mean(design.data[!, x], weights(design.data[!, design.weights]))
θ̂t = [
mean(design.data[!, x], weights(design.data[!, "replicate_"*string(i)])) for
i = 1:design.replicates
]
variance = sum((θ̂t .- θ̂) .^ 2) / design.replicates
return DataFrame(mean = θ̂, SE = sqrt(variance))
# Jackknife integration
elseif design.type == "jackknife"
weightedmean(x, y) = mean(x, weights(y))
return jackknife_variance(x, weightedmean, design)
end
end

"""
Expand Down Expand Up @@ -130,7 +136,7 @@ julia> mean(:api00, :cname, bclus1)
11 │ Mendocino 623.25 1.09545e-13
```
"""
function mean(x::Symbol, domain::Symbol, design::AbstractSurveyDesign)
function mean(x::Symbol, domain, design::AbstractSurveyDesign)
weighted_mean(x, w) = mean(x, StatsBase.weights(w))
df = bydomain(x, domain, design, weighted_mean)
rename!(df, :statistic => :mean)
Expand Down
2 changes: 1 addition & 1 deletion src/total.jl
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ julia> total(:api00, :cname, bclus1)
11 │ Mendocino 84380.6 80215.9
```
"""
function total(x::Symbol, domain::Symbol, design::AbstractSurveyDesign)
function total(x::Symbol, domain, design::AbstractSurveyDesign)
df = bydomain(x, domain, design, wsum)
rename!(df, :statistic => :total)
end
Loading

0 comments on commit b22e222

Please sign in to comment.