Merge branch 'next_release' into basic_raking

smishr · Apr 14, 2023 · b22e222 · b22e222
2 parents fa5bf0a + 0703cfc
commit b22e222
Show file tree

Hide file tree

Showing 11 changed files with 360 additions and 32 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "Survey"
 uuid = "c1a98b4d-6cd2-47ec-b9e9-69b59c35373c"
 authors = ["Ayush Patnaik <ayushpatnaik@gmail.com>"]
-version = "0.1.0"
+version = "0.2.0"
 
 [deps]
 AlgebraOfGraphics = "cbdf2221-f076-402e-a563-3d30da359d67"

diff --git a/docs/src/api.md b/docs/src/api.md
@@ -14,6 +14,8 @@ SurveyDesign
 ReplicateDesign
 load_data
 bootweights
+jackknifeweights
+jackknife_variance
 mean
 total
 quantile

diff --git a/src/Survey.jl b/src/Survey.jl
@@ -25,6 +25,7 @@ include("boxplot.jl")
 include("show.jl")
 include("ratio.jl")
 include("by.jl")
+include("jackknife.jl")
 include("raking.jl")
 
 export load_data
@@ -36,6 +37,7 @@ export hist, sturges, freedman_diaconis
 export boxplot
 export bootweights
 export ratio
+export jackknifeweights, jackknife_variance
 export raking
 
 end
diff --git a/src/bootstrap.jl b/src/bootstrap.jl
@@ -25,27 +25,18 @@ replicates: 1000
 function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwister(1234))
     stratified = groupby(design.data, design.strata)
     H = length(keys(stratified))
-    substrata_dfs = DataFrame[]
+    substrata_dfs = Vector{DataFrame}(undef, H)
     for h = 1:H
         substrata = DataFrame(stratified[h])
         cluster_sorted = sort(substrata, design.cluster)
-        psus = unique(cluster_sorted[!, design.cluster])
-        npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus]
-        nh = length(psus)
+        cluster_sorted_designcluster = cluster_sorted[!, design.cluster]
         cluster_weights = cluster_sorted[!, design.weights]
-        for replicate = 1:replicates
-            randinds = rand(rng, 1:(nh), (nh - 1))
-            cluster_sorted[!, "replicate_"*string(replicate)] =
-                vcat(
-                    [
-                        fill((count(==(i), randinds)) * (nh / (nh - 1)), npsus[i]) for
-                        i = 1:nh
-                    ]...,
-                ) .* cluster_weights
-        end
-        push!(substrata_dfs, cluster_sorted)
+        # Perform the inner loop in a type-stable function to improve runtime.
+        _bootweights_cluster_sorted!(cluster_sorted, cluster_weights,
+            cluster_sorted_designcluster, replicates, rng)
+        substrata_dfs[h] = cluster_sorted
     end
-    df = vcat(substrata_dfs...)
+    df = reduce(vcat, substrata_dfs)
     return ReplicateDesign(
         df,
         design.cluster,
@@ -60,3 +51,22 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis
         [Symbol("replicate_"*string(replicate)) for replicate in 1:replicates]
     )
 end
+
+function _bootweights_cluster_sorted!(cluster_sorted,
+        cluster_weights, cluster_sorted_designcluster, replicates, rng)
+
+    psus = unique(cluster_sorted_designcluster)
+    npsus = [count(==(i), cluster_sorted_designcluster) for i in psus]
+    nh = length(psus)
+    for replicate = 1:replicates
+        randinds = rand(rng, 1:(nh), (nh - 1))
+        cluster_sorted[!, "replicate_"*string(replicate)] =
+            reduce(vcat,
+                [
+                    fill((count(==(i), randinds)) * (nh / (nh - 1)), npsus[i]) for
+                    i = 1:nh
+                ]
+            ) .* cluster_weights
+    end
+    cluster_sorted
+end
diff --git a/src/by.jl b/src/by.jl
@@ -1,13 +1,12 @@
-function bydomain(x::Symbol, domain::Symbol, design::SurveyDesign, func::Function)
+function bydomain(x::Symbol, domain, design::SurveyDesign, func::Function)
     gdf = groupby(design.data, domain)
-    nd = length(unique(design.data[!, domain]))
     X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic)
     return X
 end
 
-function bydomain(x::Symbol, domain::Symbol, design::ReplicateDesign, func::Function)
+function bydomain(x::Symbol, domain, design::ReplicateDesign, func::Function)
     gdf = groupby(design.data, domain)
-    nd = length(unique(design.data[!, domain]))
+    nd = length(gdf)
     X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic)
     Xt_mat = Array{Float64,2}(undef, (nd, design.replicates))
     for i = 1:design.replicates

diff --git a/src/jackknife.jl b/src/jackknife.jl
@@ -0,0 +1,159 @@
+"""
+    jackknifeweights(design::SurveyDesign)
+
+Delete-1 Jackknife algorithm for replication weights from sampling weights. The replicate weights are calculated using the following formula.
+
+```math
+w_{i(hj)} =
+\\begin{cases}
+    w_i\\quad\\quad &\\text{if observation unit }i\\text{ is not in stratum }h\\\\
+    0\\quad\\quad &\\text{if observation unit }i\\text{ is in psu }j\\text{of stratum }h\\\\
+    \\dfrac{n_h}{n_h - 1}w_i \\quad\\quad &\\text{if observation unit }i\\text{ is in stratum }h\\text{ but not in psu }j\\\\
+\\end{cases}
+```
+In the above formula, ``w_i`` represent the original weights, ``w_{i(hj)}`` represent the replicate weights when the ``j``th PSU from cluster ``h`` is removed, and ``n_h`` represents the number of unique PSUs within cluster ``h``. Replicate weights are added as columns to `design.data`, and these columns have names of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weight columns.
+
+# Examples
+```jldoctest
+julia> using Survey;
+
+julia> apistrat = load_data("apistrat");
+
+julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
+
+julia> rstrat = jackknifeweights(dstrat)
+ReplicateDesign:
+data: 200×244 DataFrame
+strata: stype
+    [E, E, E  …  M]
+cluster: none
+popsize: [4420.9999, 4420.9999, 4420.9999  …  1018.0]
+sampsize: [100, 100, 100  …  50]
+weights: [44.21, 44.21, 44.21  …  20.36]
+allprobs: [0.0226, 0.0226, 0.0226  …  0.0491]
+type: jackknife
+replicates: 200
+
+```
+
+# Reference
+pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
+"""
+function jackknifeweights(design::SurveyDesign)
+    sort!(design.data, [design.strata, design.cluster])
+    df = design.data
+
+    stratified_gdf = groupby(df, design.strata)
+    replicate_index = 0
+    for (key, subgroup) in pairs(stratified_gdf)
+        stratum = key[design.strata]    # current stratum
+        psus_in_stratum = unique(subgroup[!, design.cluster])
+        nh = length(psus_in_stratum)
+
+        for psu in psus_in_stratum
+            replicate_index += 1
+            colname = "replicate_"*string(replicate_index)
+
+            # Initialise replicate_i with original sampling weights
+            df[!, colname] = Vector(df[!, design.weights])
+
+            # getting indexes
+            same_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .== psu)
+            different_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .!== psu)
+
+            # scaling weights appropriately
+            df[same_psu, colname] .*= 0
+            df[different_psu, colname] .*= nh/(nh - 1)
+        end
+    end
+
+    return ReplicateDesign(
+        df,
+        design.cluster,
+        design.popsize,
+        design.sampsize,
+        design.strata,
+        design.weights,
+        design.allprobs,
+        design.pps,
+        "jackknife",
+        UInt(replicate_index),
+        [Symbol("replicate_"*string(index)) for index in 1:replicate_index]
+    )
+end
+
+"""
+    jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign)
+
+Compute variance of column `x` for the given `func` using the Jackknife method. The formula to compute this variance is the following.
+
+```math
+\\hat{V}_{\\text{JK}}(\\hat{\\theta}) = \\sum_{h = 1}^H \\dfrac{n_h - 1}{n_h}\\sum_{j = 1}^{n_h}(\\hat{\\theta}_{(hj)} - \\hat{\\theta})^2
+```
+
+Above, ``\\hat{\\theta}`` represents the estimator computed using the original weights, and ``\\hat{\\theta_{(hj)}}`` represents the estimator computed from the replicate weights obtained when PSU ``j`` from cluster ``h`` is removed.
+
+# Examples
+```jldoctest
+julia> using Survey, StatsBase
+
+julia> apistrat = load_data("apistrat");
+
+julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
+
+julia> rstrat = jackknifeweights(dstrat)
+ReplicateDesign:
+data: 200×244 DataFrame
+strata: stype
+    [E, E, E  …  M]
+cluster: none
+popsize: [4420.9999, 4420.9999, 4420.9999  …  1018.0]
+sampsize: [100, 100, 100  …  50]
+weights: [44.21, 44.21, 44.21  …  20.36]
+allprobs: [0.0226, 0.0226, 0.0226  …  0.0491]
+type: jackknife
+replicates: 200
+
+julia> weightedmean(x, y) = mean(x, weights(y));
+
+julia> jackknife_variance(:api00, weightedmean, rstrat)
+1×2 DataFrame
+ Row │ estimator  SE
+     │ Float64    Float64
+─────┼────────────────────
+   1 │   662.287  9.53613
+
+```
+# Reference
+pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
+"""
+function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign)
+    df = design.data
+    # sort!(df, [design.strata, design.cluster])
+    stratified_gdf = groupby(df, design.strata)
+
+    # estimator from original weights
+    θ = func(df[!, x], df[!, design.weights])
+
+    variance = 0
+    replicate_index = 1
+    for subgroup in stratified_gdf
+        psus_in_stratum = unique(subgroup[!, design.cluster])
+        nh = length(psus_in_stratum)
+        cluster_variance = 0
+        for psu in psus_in_stratum
+            # get replicate weights corresponding to current stratum and psu
+            rep_weights = df[!, "replicate_"*string(replicate_index)]
+
+            # estimator from replicate weights
+            θhj = func(df[!, x], rep_weights)
+
+            cluster_variance += ((nh - 1)/nh)*(θhj - θ)^2
+            replicate_index += 1
+        end
+        variance += cluster_variance
+    end
+
+    return DataFrame(estimator = θ, SE = sqrt(variance))
+end
+
diff --git a/src/mean.jl b/src/mean.jl
@@ -46,13 +46,19 @@ julia> mean(:api00, bclus1)
 ```
 """
 function mean(x::Symbol, design::ReplicateDesign)
-    X = mean(design.data[!, x], weights(design.data[!, design.weights]))
-    Xt = [
-        mean(design.data[!, x], weights(design.data[!, "replicate_"*string(i)])) for
-        i = 1:design.replicates
-    ]
-    variance = sum((Xt .- X) .^ 2) / design.replicates
-    DataFrame(mean = X, SE = sqrt(variance))
+    if design.type == "bootstrap"
+        θ̂ = mean(design.data[!, x], weights(design.data[!, design.weights]))
+        θ̂t = [
+            mean(design.data[!, x], weights(design.data[!, "replicate_"*string(i)])) for
+            i = 1:design.replicates
+        ]
+        variance = sum((θ̂t .- θ̂) .^ 2) / design.replicates
+        return DataFrame(mean = θ̂, SE = sqrt(variance))
+    # Jackknife integration
+    elseif design.type == "jackknife"
+        weightedmean(x, y) = mean(x, weights(y))
+        return jackknife_variance(x, weightedmean, design)
+    end
 end
 
 """
@@ -130,7 +136,7 @@ julia> mean(:api00, :cname, bclus1)
   11 │ Mendocino    623.25    1.09545e-13
 ```
 """
-function mean(x::Symbol, domain::Symbol, design::AbstractSurveyDesign)
+function mean(x::Symbol, domain, design::AbstractSurveyDesign)
     weighted_mean(x, w) = mean(x, StatsBase.weights(w))
     df = bydomain(x, domain, design, weighted_mean)
     rename!(df, :statistic => :mean)

diff --git a/src/total.jl b/src/total.jl
@@ -120,7 +120,7 @@ julia> total(:api00, :cname, bclus1)
   11 │ Mendocino     84380.6        80215.9
 ```
 """
-function total(x::Symbol, domain::Symbol, design::AbstractSurveyDesign)
+function total(x::Symbol, domain, design::AbstractSurveyDesign)
     df = bydomain(x, domain, design, wsum)
     rename!(df, :statistic => :total)
 end