From 84da895b465fccb0c79322d2f4f6fa4deaca4810 Mon Sep 17 00:00:00 2001 From: Bernhard Meindl Date: Fri, 18 Sep 2020 08:51:04 +0200 Subject: [PATCH] remove `LLmodGlobalRisk()` and replace with `modRisk()` --- DESCRIPTION | 1 - NAMESPACE | 2 +- R/0classes.r | 25 +++-- R/LLmodGlobalRisk.R | 152 ------------------------------ R/aux_functions.r | 10 +- R/modRisk.R | 198 +++++++++++++++++++++++---------------- R/sdcMicro-package.R | 34 ++++--- man/LLmodGlobalRisk.Rd | 84 ----------------- man/modRisk.Rd | 32 +++++-- man/sdcMicro-package.Rd | 28 ++++-- man/sdcMicroObj-class.Rd | 24 +++-- 11 files changed, 226 insertions(+), 364 deletions(-) delete mode 100644 R/LLmodGlobalRisk.R delete mode 100644 man/LLmodGlobalRisk.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 67f5097e..65611a4d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -64,7 +64,6 @@ Collate: 'groupAndRename.R' 'GUIfunctions.R' 'indivRisk.R' - 'LLmodGlobalRisk.R' 'LocalRecProg.R' 'localSupp.R' 'localSuppression.R' diff --git a/NAMESPACE b/NAMESPACE index 87803684..057b1257 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,7 +15,6 @@ S3method(summary,freqCalc) S3method(summary,micro) S3method(summary,pram) export("strataVar<-") -export(LLmodGlobalRisk) export(LocalRecProg) export(addGhostVars) export(addNoise) @@ -120,6 +119,7 @@ importFrom(stats,lm) importFrom(stats,mad) importFrom(stats,median) importFrom(stats,na.omit) +importFrom(stats,runif) importFrom(stats,sd) importFrom(stats,terms) importFrom(stats,var) diff --git a/R/0classes.r b/R/0classes.r index ca5c714a..895ef46d 100644 --- a/R/0classes.r +++ b/R/0classes.r @@ -44,6 +44,7 @@ #' @importFrom stats sd #' @importFrom stats terms #' @importFrom stats var +#' @importFrom stats runif #' @importFrom utils data setClassUnion("dataframeOrNULL", c("data.frame", "NULL")) setClassUnion("numericOrNULL", c("numeric", "NULL")) @@ -136,19 +137,31 @@ setClassUnion("sdcmicroOrNULL", c("NULL")) #' sdc <- topBotCoding(sdc, value=60000000, replacement=62000000, column="income") #' head(get.sdcMicroObj(sdc, type="manipNumVars")) #' sdc@@risk$numeric +#' #' ### LocalRecProg #' data(testdata2) +#' keyVars <- c("urbrur", "roof", "walls", "water", "sex") +#' w <- "sampling_weight" #' sdc <- createSdcObj(testdata2, -#' keyVars=c("urbrur", "roof", "walls", "water", "sex", "relat")) +#' keyVars = keyVars, +#' weightVar = w) #' sdc@@risk$global #' sdc <- LocalRecProg(sdc) #' sdc@@risk$global -#' ### LLmodGlobalRisk -#' sdc <- undolast(sdc) -#' sdc <- LLmodGlobalRisk(sdc, inclProb=0.001) -#' sdc@@risk$model +#' ### model-based risks +#' #' formula +#' form <- as.formula(paste("~", paste(keyVars, collapse = "+"))) +#' sdc <- modRisk(sdc, method = "default", formulaM = form) +#' get.sdcMicroObj(sdc, "risk")$model +#' sdc <- modRisk(sdc, method = "CE", formulaM = form) +#' get.sdcMicroObj(sdc, "risk")$model +#' sdc <- modRisk(sdc, method = "PLM", formulaM = form) +#' get.sdcMicroObj(sdc, "risk")$model +#' sdc <- modRisk(sdc, method = "weightedLLM", formulaM = form) +#' get.sdcMicroObj(sdc, "risk")$model +#' sdc <- modRisk(sdc, method = "IPF", formulaM = form) +#' get.sdcMicroObj(sdc, "risk")$model #' } -#' setClass(Class = "sdcMicroObj", representation = representation( origData = "dataframeOrNULL", diff --git a/R/LLmodGlobalRisk.R b/R/LLmodGlobalRisk.R deleted file mode 100644 index ce1b5009..00000000 --- a/R/LLmodGlobalRisk.R +++ /dev/null @@ -1,152 +0,0 @@ -#' Global risk using log-linear models. -#' -#' The sample frequencies are assumed to be independent and following a Poisson -#' distribution. The parameters of the corresponding parameters are estimated -#' by a log-linear model including the main effects and possible interactions. -#' -#' This measure aims to (1) calculate the number of sample uniques that are -#' population uniques with a probabilistic Poisson model and (2) to estimate -#' the expected number of correct matches for sample uniques. -#' -#' ad 1) this risk measure is defined over all sample uniques (SU) as \deqn{ -#' \tau_1 = \sum\limits_{SU} P(F_k=1 | f_k=1) \quad , } i.e. the expected -#' number of sample uniques that are population uniques. -#' -#' ad 2) this risk measure is defined over all sample uniques (SU) as \deqn{ -#' \tau_2 = \sum\limits_{SU} P(F_k=1 | f_k=1) \quad , CORRECT! } -#' -#' Since population frequencies \eqn{F_k} are unknown, they has to be -#' estimated. -#' -#' The iterative proportional fitting method is used to fit the parameters of -#' the Poisson distributed frequency counts related to the model specified to -#' fit the frequency counts. The obtained parameters are used to estimate a -#' global risk, defined in Skinner and Holmes (1998). -#' -#' @name LLmodGlobalRisk -#' @docType methods -#' @param obj \code{\link{sdcMicroObj-class}}-object or a \code{data.frame} containing -#' the categorical key variables. -#' @param method At this time, only iterative proportional fitting -#' (\dQuote{IPF}) can be used. -#' @param inclProb Inclusion probabilites (experimental) -#' @param form A formula specifying the model. -#' @param modOutput If TRUE, additional output is given. -#' @return Two global risk measures or the modified risk in the \code{\link{sdcMicroObj-class}} object. -#' @author Matthias Templ -#' @seealso \code{\link{loglm}}, \code{\link{measure_risk}} -#' @references Skinner, C.J. and Holmes, D.J. (1998) \emph{Estimating the -#' re-identification risk per record in microdata}. Journal of Official -#' Statistics, 14:361-372, 1998. -#' -#' Rinott, Y. and Shlomo, N. (1998). \emph{A Generalized Negative Binomial -#' Smoothing Model for Sample Disclosure Risk Estimation}. Privacy in -#' Statistical Databases. Lecture Notes in Computer Science. Springer-Verlag, -#' 82--93. -#' -#' Templ, M. Statistical Disclosure Control for Microdata: Methods and Applications in R. -#' \emph{Springer International Publishing}, 287 pages, 2017. ISBN 978-3-319-50272-4. -#' \doi{10.1007/978-3-319-50272-4} -#' -#' @keywords manip -#' @note LLmodGlobalRisk is depcrecated for \code{\link{modRisk}} and is only -#' provided for compatibility with older versions of this package. It may be removed -#' in future versions. -#' @seealso \code{\link{modRisk}} -#' @export -LLmodGlobalRisk <- function(obj, method="IPF", inclProb=NULL, form=NULL, modOutput=FALSE) { - LLmodGlobalRiskX(obj=obj, method=method, inclProb=inclProb, form=form, modOutput=modOutput) -} - -setGeneric("LLmodGlobalRiskX", function(obj, method="IPF", inclProb=NULL, form=NULL, modOutput=FALSE) { - standardGeneric("LLmodGlobalRiskX") -}) - -setMethod(f="LLmodGlobalRiskX", signature=c("sdcMicroObj"), -definition=function(obj, method="IPF", inclProb=NULL, form=NULL, modOutput=FALSE) { - .Deprecated("modRisk") - if (is.null(form)) { - x <- get.sdcMicroObj(obj, type="manipKeyVars") - form <- as.formula(paste(" ~ ", paste(colnames(x), collapse="+"))) - } else { - vars <- labels(terms(form)) - mk <- get.sdcMicroObj(obj, type="manipKeyVars") - mn <- get.sdcMicroObj(obj, type="manipNumVars") - ok <- get.sdcMicroObj(obj, type="origData") - ok <- ok[, !colnames(ok) %in% c(colnames(mk), colnames(mn)), drop=FALSE] - if (any(colnames(mk) %in% vars)) { - x <- mk[, colnames(mk) %in% vars, drop=FALSE] - } else x <- NULL - if (any(colnames(mn) %in% vars)) { - if (is.null(x)) - x <- mn[, colnames(mn) %in% vars, drop=FALSE] else x <- data.frame(x, mn[, colnames(mn) %in% vars, drop=FALSE]) - } - if (any(colnames(ok) %in% vars)) { - if (is.null(x)) - x <- ok[, colnames(ok) %in% vars, drop=FALSE] else x <- data.frame(x, ok[, colnames(ok) %in% vars, drop=FALSE]) - } - } - if (is.null(inclProb) && !is.null(get.sdcMicroObj(obj, type="weightVar"))) { - inclProb <- 1/get.sdcMicroObj(obj, type="origData")[, get.sdcMicroObj(obj, type="weightVar")] - } - risk <- get.sdcMicroObj(obj, type="risk") - risk$model <- LLmodGlobalRiskWORK(x=x, method=method, inclProb=inclProb, form=form, modOutput=modOutput) - risk$model$inclProb <- inclProb - obj <- set.sdcMicroObj(obj, type="risk", input=list(risk)) - obj -}) - -setMethod(f="LLmodGlobalRiskX", signature=c("data.frame"), -definition=function(obj, method="IPF", inclProb=NULL, form=NULL, modOutput=FALSE) { - .Deprecated("modRisk") - if (is.null(form)) { - form <- as.formula(paste(" ~ ", paste(colnames(obj), collapse="+"))) - } - LLmodGlobalRiskWORK(x=obj, method=method, inclProb=inclProb, form=form, modOutput=modOutput) -}) - -LLmodGlobalRiskWORK <- function(x, method="IPF", inclProb=NULL, form=as.formula(paste(" ~ ", - paste(colnames(x), collapse="+"))), modOutput=FALSE) { - - if (is.null(inclProb)) { - errMsg <- paste0("Please provide the inclusion probabilities, eg.\n"); - errMsg <- paste0(errMsg, "approx by 1/sampling weights.\n") - stop(errMsg) - } - # x risk functions P(F_k=r | f_k=r) - risk1 <- function(l, p) { - v=(1 - p) * l - exp(-v) - } - # E(1/F_k | f_k=1) - risk2 <- function(l, p) { - v=(1 - p) * l - (1 - exp(-v))/v - } - # file level risk measure - file_risk <- function(freq, risk) { - sum(as.numeric(freq == 1) * risk) - } - - ## sample frequencies - tab <- stats::xtabs(form, x) - x <- data.frame(x, inclProb=inclProb) - form2 <- as.formula(paste(c("inclProb", as.character(form)), collapse="")) - tabP <- stats::xtabs(form2, x) - - ## IPF - mod <- loglm(form, data=tab, fitted=TRUE) - lambda <- stats::fitted(mod) - - ## Risk - r1 <- risk1(lambda, tabP) - r2 <- risk2(lambda, tabP) - gr1 <- file_risk(tab, r1)/nrow(x) - gr2 <- file_risk(tab, r2)/nrow(x) - if (modOutput) { - res <- list(gr1=gr1, gr2=gr2, gr1perc=gr1 * 100, gr2perc=gr2 * 100, tab=tab, fitted=lambda) - } else { - res <- list(gr1=gr1, gr2=gr2, gr1perc=gr1 * 100, gr2perc=gr2 * 100) - } - res -} diff --git a/R/aux_functions.r b/R/aux_functions.r index 0d3dbdfe..e38e054e 100644 --- a/R/aux_functions.r +++ b/R/aux_functions.r @@ -338,11 +338,11 @@ setMethod(f="calcRisksX", signature=c("sdcMicroObj"), definition=function(obj, . suda2Set <- (!is.null(risk$suda2)) obj <- measure_risk(obj) if (modelSet) { - inclProb <- NULL - if (!is.null(risk$model$inclProb)) { - inclProb <- risk$model$inclProb - } - obj <- LLmodGlobalRisk(obj, inclProb=inclProb) + kv <- colnames(get.sdcMicroObj(obj, type = "manipKeyVars")) + obj <- modRisk( + obj = obj, + method = "IPF", + formulaM = as.formula(paste(" ~ ", paste(kv, collapse="+")))) } if (suda2Set) { obj <- suda2(obj) diff --git a/R/modRisk.R b/R/modRisk.R index c08dbe10..f12858a0 100644 --- a/R/modRisk.R +++ b/R/modRisk.R @@ -1,5 +1,5 @@ ################################################### -# LLmodRisk Function +# modRisk Function # Calculates two disclosure risk measures with # five possible log-linear models (standard, CE, PSE, weightedLLM, IPF) # 1. estimates the number of sample uniques that are population unique @@ -73,20 +73,35 @@ #' data(testdata2) #' form <- ~sex+water+roof #' w <- "sampling_weight" -#' (modRisk(testdata2, method="default", formulaM=form, weights=w)) -#' (modRisk(testdata2, method="CE", formulaM=form, weights=w)) -#' (modRisk(testdata2, method="PML", formulaM=form, weights=w)) -#' (modRisk(testdata2, method="weightedLLM", formulaM=form, weights=w)) -#' (modRisk(testdata2, method="IPF", formulaM=form, weights=w)) +#' (modRisk(testdata2, method = "default", formulaM = form, weights = w)) +#' (modRisk(testdata2, method = "CE", formulaM = form, weights = w)) +#' (modRisk(testdata2, method = "PML", formulaM = form, weights = w)) +#' (modRisk(testdata2, method = "weightedLLM", formulaM = form, weights = w)) +#' (modRisk(testdata2, method = "IPF", formulaM = form, weights = w)) #' #' ## application to a sdcMicroObj #' data(testdata2) #' sdc <- createSdcObj(testdata2, -#' keyVars=c('urbrur','roof','walls','electcon','relat','sex'), -#' numVars=c('expend','income','savings'), w='sampling_weight') -#' sdc <- modRisk(sdc,form=~sex+water+roof) +#' keyVars = c("urbrur", "roof", "walls", "electcon", "relat", "sex"), +#' numVars = c("expend", "income", "savings"), +#' w = "sampling_weight") +#' sdc <- modRisk(sdc, form = ~sex+water+roof) #' slot(sdc, "risk")$model - +#' +#' \dontrun{ +#' # an example using data from the laeken-pkg +#' library(laeken) +#' data(eusilc) +#' f <- as.formula(paste(" ~ ", "db040 + hsize + rb090 + +#' age + pb220a + age:rb090 + age:hsize + +#' hsize:rb090")) +#' w <- "rb050" +#' (modRisk(eusilc, method = "default", weights = w, formulaM = f, bound = 5)) +#' (modRisk(eusilc, method = "CE", weights = w, formulaM = f, bound = 5)) +#' (modRisk(eusilc, method = "PML", weights = w, formulaM = f, bound = 5)) +#' (modRisk(eusilc, method = "weightedLLM", weights = w, formulaM = f, bound = 5)) +#' } +#' modRisk <- function(obj, method="default", weights, formulaM, bound=Inf, ...) { modRiskX(obj=obj, method=method, weights=weights, formulaM=formulaM, bound=bound, ...) } @@ -98,13 +113,13 @@ setGeneric("modRiskX", function(obj, method="default", weights, formulaM, bound= setMethod(f="modRiskX", signature=c("sdcMicroObj"), definition=function(obj, method="default", weights, formulaM, bound=Inf) { vars <- labels(terms(formulaM)) - mk <- get.sdcMicroObj(obj, type="manipKeyVars") - mn <- get.sdcMicroObj(obj, type="manipNumVars") - orig <- get.sdcMicroObj(obj, type="origData") + mk <- get.sdcMicroObj(obj, type = "manipKeyVars") + mn <- get.sdcMicroObj(obj, type = "manipNumVars") + orig <- get.sdcMicroObj(obj, type = "origData") cn <- colnames(orig) - ok <- orig[, !colnames(orig) %in% c(colnames(mk), colnames(mn)), drop=FALSE] - if ( any(colnames(mk) %in% vars) ) { - x <- mk[, colnames(mk) %in% vars, drop=FALSE] + ok <- orig[, !colnames(orig) %in% c(colnames(mk), colnames(mn)), drop = FALSE] + if (any(colnames(mk) %in% vars)) { + x <- mk[, colnames(mk) %in% vars, drop = FALSE] } else { x <- NULL } @@ -123,16 +138,29 @@ definition=function(obj, method="default", weights, formulaM, bound=Inf) { } } - wV <- get.sdcMicroObj(obj, type="weightVar") + wV <- get.sdcMicroObj(obj, type = "weightVar") weightsVar <- cn[wV] - if ( is.null(wV) ) { - stop("It is not possible to calculate model-based risks for data without sampling weights (slot 'weightVar')!\n") + if (is.null(wV)) { + w <- c( + "model-based risks are computed for data without defined sampling weights.", + "weights are temporarily set to 1." + ) + warning(paste(w, collapse = "\n")) + weightsVar <- paste0("tmpweights_", substr(runif(1), 3, 10)) + x[[weightsVar]] <- 1 + } else { + x[[weightsVar]] <- orig[[wV]] } - x[[weightsVar]] <- orig[[wV]] risk <- get.sdcMicroObj(obj, type="risk") - risk$model <- modRisk(x, method=method, weights=weightsVar, formulaM=formulaM, bound=bound) - obj <- set.sdcMicroObj(obj, type="risk", input=list(risk)) + risk$model <- modRisk( + obj = x, + method = method, + weights = weightsVar, + formulaM = formulaM, + bound = bound + ) + obj <- set.sdcMicroObj(obj, type = "risk", input = list(risk)) obj }) @@ -152,126 +180,132 @@ definition=function(obj, method="default", weights, formulaM, bound=Inf) { . <- inclProb <- counts <- id <- Fk <- NULL x <- obj - if ( !is.data.frame(x) ) { - stop("input 'x' must be a data.frame!\n") + if (!inherits(x, "data.frame")) { + stop("input 'x' must inherit class `data.frame`", call. = FALSE) } - if ( !method %in% c("default","CE","PML","weightedLLM","IPF") ) { - stop("Unknown value for 'method' was detected!\n") + if (!method %in% c("default", "CE", "PML", "weightedLLM", "IPF")) { + stop("Unknown value for 'method' was detected!", call. = FALSE) } - if ( !weights %in% colnames(x) ) { - stop("Please provide a valid variable name that contains sampling weights!\n") + if (!weights %in% colnames(x)) { + stop("Please provide a valid variable name that contains sampling weights!", call. = FALSE) } - if ( length(bound) != 1 & bound[1] <= 0) { - stop("Argument 'bound' must be numeric > 0!\n") + if (length(bound) != 1 & bound[1] <= 0) { + stop("Argument 'bound' must be numeric > 0!", call. = FALSE) } form_info <- terms(formulaM) orders <- attributes(form_info)$order vars <- labels(form_info)[orders==1] - if ( method=="IPF" && any(orders>1) ) { - stop("Sorry, but method 'IPF' cannot be used for models with interactions!\n") + if (method == "IPF" && any(orders > 1)) { + stop("Sorry, but method 'IPF' cannot be used for models with interactions!", call. = FALSE) } - if ( !all(vars %in% colnames(x)) ) { - stop("all variables specified in the formula must exist in the input dataset!\n") + if (!all(vars %in% colnames(x))) { + stop("all variables specified in the formula must exist in the input dataset!", call. = FALSE) } - x <- x[,c(vars, weights)] + x <- x[, c(vars, weights)] colnames(x) <- c(vars, "weights") - y <- data.table(x, key=vars) - y[,inclProb:=1/weights] - y <- y[,.(counts=.N, weights=sum(weights), inclProb=sum(inclProb)), by=key(y)] + y <- data.table(x, key = vars) + y[, inclProb := 1 / weights] + y <- y[, .(counts = .N, weights = sum(weights), inclProb = sum(inclProb)), by = key(y)] grid <- data.table(expand.grid(lapply(1:length(vars), function(t) unique((x[[vars[t]]]))))) setnames(grid, vars) setkeyv(grid, vars) - x <- merge(grid, y, all.x=TRUE) - x[is.na(counts), counts:=0] - x[is.na(weights ), weights :=0] - x[is.na(inclProb), inclProb:=0] + x <- merge(grid, y, all.x = TRUE) + x[is.na(counts), counts := 0] + x[is.na(weights), weights := 0] + x[is.na(inclProb), inclProb := 0] # model selection - if ( method == "default" ) { - form <- as.formula(paste(c("counts", as.character(formulaM)), collapse="")) + if (method == "default") { + form <- as.formula(paste(c("counts", as.character(formulaM)), collapse = "")) } - if ( method == "CE" ) { - EC <- x$counts/x$weights #offset term + if (method == "CE") { + EC <- x$counts / x$weights #offset term EC[EC == "NaN"] <- 0 EC <- log(EC + 0.1) - form <- as.formula(paste(c("counts", c(as.character(formulaM)), "+ offset(EC)"), - collapse="")) + form <- as.formula(paste(c("counts", c(as.character(formulaM)), "+ offset(EC)"), collapse = "")) } - if ( method == "PML" ) { - f <- sum(x$counts)/sum(x$weights) + if (method == "PML") { + f <- sum(x$counts) / sum(x$weights) weights_t <- round(x$weights * f) #round x <- data.frame(x, weights_t) - form <- as.formula(paste(c("weights_t", as.character(formulaM)), collapse="")) + form <- as.formula(paste(c("weights_t", as.character(formulaM)), collapse = "")) } - if ( method == "weightedLLM" ) { - form_zw <- as.formula(paste(c(formulaM, "weights"), collapse="+")) - form <- as.formula(paste(c("counts", as.character(form_zw)), collapse="")) + if (method == "weightedLLM") { + form_zw <- as.formula(paste(c(formulaM, "weights"), collapse = "+")) + form <- as.formula(paste(c("counts", as.character(form_zw)), collapse = "")) } - if ( method == "IPF" ) { - form <- as.formula(paste(c("counts", as.character(formulaM)), collapse="")) - form2 <- as.formula(paste(c("inclProb", as.character(formulaM)), collapse="")) + if (method == "IPF") { + form <- as.formula(paste(c("counts", as.character(formulaM)), collapse = "")) + form2 <- as.formula(paste(c("inclProb", as.character(formulaM)), collapse = "")) tab <- stats::xtabs(form, x) tabP <- stats::xtabs(form2, x) } # running the model with the chosen formula - if ( method == "IPF" ) { - mod <- loglm(form, data=tab, fitted=TRUE) + if (method == "IPF") { + mod <- loglm(form, data = tab, fitted = TRUE) } else { - mod <- glm(form, data=x, family=stats::poisson()) + mod <- glm(form, data = x, family = stats::poisson()) } lambda <- stats::fitted(mod) # calculate risk and estimate # 1. the number of sample uniques that are population unique # 2. the number of correct matches of sample uniques - if ( method != "IPF" ) { + if (method != "IPF") { x <- data.table(x) - x[,id:=1:nrow(x)] + x[, id := 1:nrow(x)] setkey(x, id) - lambda <- data.table(Fk=as.numeric(lambda), id=as.numeric(attributes(lambda)$names)) + lambda <- data.table(Fk = as.numeric(lambda), id = as.numeric(attributes(lambda)$names)) setkey(lambda, id) } else { - x <- data.table(x, key=vars) - lambda <- as.data.frame.table(lambda, stringsAsFactors=FALSE) + x <- data.table(x, key = vars) + lambda <- as.data.frame.table(lambda, stringsAsFactors = FALSE) colnames(lambda)[ncol(lambda)] <- "Fk" - lambda <- data.table(lambda, key=vars) + lambda <- data.table(lambda, key = vars) #lambda <- lambda[,lapply(.SD, as.numeric)] # suggestion by Ying Chen: lambda <- data.table(lambda, key = vars) - for(v in vars){ - if(class(x[[v]])!=class(lambda[[v]])){ - if(is.character(x[[v]])){ + for (v in vars) { + if (class(x[[v]]) != class(lambda[[v]])) { + if (is.character(x[[v]])) { lambda[[v]] <- as.character(lambda[[v]]) - }else if(is.integer(x[[v]])){ + } else if (is.integer(x[[v]])) { lambda[[v]] <- as.integer(lambda[[v]]) - }else if(is.numeric(x[[v]])){ + } else if (is.numeric(x[[v]])) { lambda[[v]] <- as.numeric(lambda[[v]]) - }else if(is.factor(x[[v]])){ + } else if (is.factor(x[[v]])) { lambda[[v]] <- as.factor(lambda[[v]]) } } } } - - x <- merge(x, lambda, all.x=TRUE) - x[is.na(Fk), Fk:=0] + x <- merge(x, lambda, all.x = TRUE) + x[is.na(Fk), Fk := 0] x <- x[0 < x$counts & x$counts <= bound] - x <- x[!(x$Fk==0 & x$counts >0)] + x <- x[!(x$Fk == 0 & x$counts > 0)] r1 <- risk1(x$Fk, x$inclProb) / nrow(x) r2 <- risk2(x$Fk, x$inclProb) / nrow(x) gr1 <- file_risk(x$counts, r1) gr2 <- file_risk(x$counts, r2) - res <- list(gr1=gr1, gr2=gr2, gr1perc=gr1*100, gr2perc=gr2*100, - method=method, model=formulaM, fitted=stats::fitted(mod), inclProb=x$inclProb) + res <- list( + gr1 = gr1, + gr2 = gr2, + gr1perc = gr1 * 100, + gr2perc = gr2 * 100, + method = method, + model = formulaM, + fitted = stats::fitted(mod), + inclProb = x$inclProb + ) class(res) <- "modrisk" res }) @@ -289,9 +323,9 @@ definition=function(obj, method="default", weights, formulaM, bound=Inf) { #' @method print modrisk #' @export print.modrisk <- function(x, ...) { - message(paste0("The estimated model (using method '",x$method,"') was:\n")) + message(paste0("The estimated model (using method '",x$method,"') was:")) message(paste0("\t",paste(as.character(x$model), collapse=" "),"\n")) - message("global risk-measures:\n") - message(paste0("\tRisk-Measure 1: ", prettyF(x$gr1, digits=3)," (",prettyF(x$gr1perc, digits=3)," %)\n")) - message(paste0("\tRisk-Measure 2: ", prettyF(x$gr2, digits=3)," (",prettyF(x$gr2perc, digits=3)," %)\n")) + message("global risk-measures:") + message(paste0("\tRisk-Measure 1: ", prettyF(x$gr1, digits = 3)," (", prettyF(x$gr1perc, digits = 3), " %)")) + message(paste0("\tRisk-Measure 2: ", prettyF(x$gr2, digits = 3)," (", prettyF(x$gr2perc, digits = 3), " %)")) } diff --git a/R/sdcMicro-package.R b/R/sdcMicro-package.R index 710bbb82..11386b57 100644 --- a/R/sdcMicro-package.R +++ b/R/sdcMicro-package.R @@ -33,24 +33,24 @@ #' @author Matthias Templ, Alexander Kowarik, Bernhard Meindl #' #' Maintainer: Matthias Templ -#' @references +#' @references #' Templ, M. Statistical Disclosure Control for Microdata: Methods and Applications in R. #' \emph{Springer International Publishing}, 287 pages, 2017. ISBN 978-3-319-50272-4. #' \doi{10.1007/978-3-319-50272-4} -#' -#' Templ, M. and Kowarik, A. and Meindl, B. -#' Statistical Disclosure Control for Micro-Data Using the R Package sdcMicro. +#' +#' Templ, M. and Kowarik, A. and Meindl, B. +#' Statistical Disclosure Control for Micro-Data Using the R Package sdcMicro. #' \emph{Journal of Statistical Software}, \strong{67} (4), 1--36, 2015. \doi{10.18637/jss.v067.i04} #' #' Templ, M. and Meindl, B. \emph{Practical Applications in #' Statistical Disclosure Control Using R}, Privacy and Anonymity in #' Information Management Systems, Bookchapter, Springer London, pp. 31-62, #' 2010. \doi{10.1007/978-1-84996-238-4_3} -#' +#' #' Kowarik, A. and Templ, M. and Meindl, B. and Fonteneau, F. and Prantner, B.: #' \emph{Testing of IHSN Cpp Code and Inclusion of New Methods into sdcMicro}, #' in: Lecture Notes in Computer Science, J. Domingo-Ferrer, I. Tinnirello -#' (editors.); Springer, Berlin, 2012, ISBN: 978-3-642-33626-3, pp. 63-77. +#' (editors.); Springer, Berlin, 2012, ISBN: 978-3-642-33626-3, pp. 63-77. #' \doi{10.1007/978-3-642-33627-0_6} #' #' Templ, M. \emph{Statistical Disclosure Control for Microdata Using the @@ -241,15 +241,27 @@ #' sdc@@risk$numeric #' ### LocalRecProg #' data(testdata2) +#' keyVars <- c("urbrur", "roof", "walls", "water", "sex") +#' w <- "sampling_weight" #' sdc <- createSdcObj(testdata2, -#' keyVars=c("urbrur", "roof", "walls", "water", "sex", "relat")) +#' keyVars = keyVars, +#' weightVar = w) #' sdc@@risk$global #' sdc <- LocalRecProg(sdc) #' sdc@@risk$global -#' ### LLmodGlobalRisk -#' sdc <- undolast(sdc) -#' sdc <- LLmodGlobalRisk(sdc, inclProb=0.001) -#' sdc@@risk$model +#' ### model-based risks +#' #' formula +#' form <- as.formula(paste("~", paste(keyVars, collapse = "+"))) +#' sdc <- modRisk(sdc, method = "default", formulaM = form) +#' get.sdcMicroObj(sdc, "risk")$model +#' sdc <- modRisk(sdc, method = "CE", formulaM = form) +#' get.sdcMicroObj(sdc, "risk")$model +#' sdc <- modRisk(sdc, method = "PLM", formulaM = form) +#' get.sdcMicroObj(sdc, "risk")$model +#' sdc <- modRisk(sdc, method = "weightedLLM", formulaM = form) +#' get.sdcMicroObj(sdc, "risk")$model +#' sdc <- modRisk(sdc, method = "IPF", formulaM = form) +#' get.sdcMicroObj(sdc, "risk")$model #' } #' NULL diff --git a/man/LLmodGlobalRisk.Rd b/man/LLmodGlobalRisk.Rd deleted file mode 100644 index 88a70d58..00000000 --- a/man/LLmodGlobalRisk.Rd +++ /dev/null @@ -1,84 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LLmodGlobalRisk.R -\docType{methods} -\name{LLmodGlobalRisk} -\alias{LLmodGlobalRisk} -\title{Global risk using log-linear models.} -\usage{ -LLmodGlobalRisk( - obj, - method = "IPF", - inclProb = NULL, - form = NULL, - modOutput = FALSE -) -} -\arguments{ -\item{obj}{\code{\link{sdcMicroObj-class}}-object or a \code{data.frame} containing -the categorical key variables.} - -\item{method}{At this time, only iterative proportional fitting -(\dQuote{IPF}) can be used.} - -\item{inclProb}{Inclusion probabilites (experimental)} - -\item{form}{A formula specifying the model.} - -\item{modOutput}{If TRUE, additional output is given.} -} -\value{ -Two global risk measures or the modified risk in the \code{\link{sdcMicroObj-class}} object. -} -\description{ -The sample frequencies are assumed to be independent and following a Poisson -distribution. The parameters of the corresponding parameters are estimated -by a log-linear model including the main effects and possible interactions. -} -\details{ -This measure aims to (1) calculate the number of sample uniques that are -population uniques with a probabilistic Poisson model and (2) to estimate -the expected number of correct matches for sample uniques. - -ad 1) this risk measure is defined over all sample uniques (SU) as \deqn{ -\tau_1 = \sum\limits_{SU} P(F_k=1 | f_k=1) \quad , } i.e. the expected -number of sample uniques that are population uniques. - -ad 2) this risk measure is defined over all sample uniques (SU) as \deqn{ -\tau_2 = \sum\limits_{SU} P(F_k=1 | f_k=1) \quad , CORRECT! } - -Since population frequencies \eqn{F_k} are unknown, they has to be -estimated. - -The iterative proportional fitting method is used to fit the parameters of -the Poisson distributed frequency counts related to the model specified to -fit the frequency counts. The obtained parameters are used to estimate a -global risk, defined in Skinner and Holmes (1998). -} -\note{ -LLmodGlobalRisk is depcrecated for \code{\link{modRisk}} and is only -provided for compatibility with older versions of this package. It may be removed -in future versions. -} -\references{ -Skinner, C.J. and Holmes, D.J. (1998) \emph{Estimating the -re-identification risk per record in microdata}. Journal of Official -Statistics, 14:361-372, 1998. - -Rinott, Y. and Shlomo, N. (1998). \emph{A Generalized Negative Binomial -Smoothing Model for Sample Disclosure Risk Estimation}. Privacy in -Statistical Databases. Lecture Notes in Computer Science. Springer-Verlag, -82--93. - -Templ, M. Statistical Disclosure Control for Microdata: Methods and Applications in R. -\emph{Springer International Publishing}, 287 pages, 2017. ISBN 978-3-319-50272-4. -\doi{10.1007/978-3-319-50272-4} -} -\seealso{ -\code{\link{loglm}}, \code{\link{measure_risk}} - -\code{\link{modRisk}} -} -\author{ -Matthias Templ -} -\keyword{manip} diff --git a/man/modRisk.Rd b/man/modRisk.Rd index 550ffe08..445c7b05 100644 --- a/man/modRisk.Rd +++ b/man/modRisk.Rd @@ -64,19 +64,35 @@ global risk, defined in Skinner and Holmes (1998). data(testdata2) form <- ~sex+water+roof w <- "sampling_weight" -(modRisk(testdata2, method="default", formulaM=form, weights=w)) -(modRisk(testdata2, method="CE", formulaM=form, weights=w)) -(modRisk(testdata2, method="PML", formulaM=form, weights=w)) -(modRisk(testdata2, method="weightedLLM", formulaM=form, weights=w)) -(modRisk(testdata2, method="IPF", formulaM=form, weights=w)) +(modRisk(testdata2, method = "default", formulaM = form, weights = w)) +(modRisk(testdata2, method = "CE", formulaM = form, weights = w)) +(modRisk(testdata2, method = "PML", formulaM = form, weights = w)) +(modRisk(testdata2, method = "weightedLLM", formulaM = form, weights = w)) +(modRisk(testdata2, method = "IPF", formulaM = form, weights = w)) ## application to a sdcMicroObj data(testdata2) sdc <- createSdcObj(testdata2, - keyVars=c('urbrur','roof','walls','electcon','relat','sex'), - numVars=c('expend','income','savings'), w='sampling_weight') -sdc <- modRisk(sdc,form=~sex+water+roof) + keyVars = c("urbrur", "roof", "walls", "electcon", "relat", "sex"), + numVars = c("expend", "income", "savings"), + w = "sampling_weight") +sdc <- modRisk(sdc, form = ~sex+water+roof) slot(sdc, "risk")$model + +\dontrun{ +# an example using data from the laeken-pkg +library(laeken) +data(eusilc) +f <- as.formula(paste(" ~ ", "db040 + hsize + rb090 + + age + pb220a + age:rb090 + age:hsize + + hsize:rb090")) +w <- "rb050" +(modRisk(eusilc, method = "default", weights = w, formulaM = f, bound = 5)) +(modRisk(eusilc, method = "CE", weights = w, formulaM = f, bound = 5)) +(modRisk(eusilc, method = "PML", weights = w, formulaM = f, bound = 5)) +(modRisk(eusilc, method = "weightedLLM", weights = w, formulaM = f, bound = 5)) +} + } \references{ Skinner, C.J. and Holmes, D.J. (1998) \emph{Estimating the diff --git a/man/sdcMicro-package.Rd b/man/sdcMicro-package.Rd index ddb58016..c181a482 100644 --- a/man/sdcMicro-package.Rd +++ b/man/sdcMicro-package.Rd @@ -213,15 +213,27 @@ head(get.sdcMicroObj(sdc, type="manipNumVars")) sdc@risk$numeric ### LocalRecProg data(testdata2) +keyVars <- c("urbrur", "roof", "walls", "water", "sex") +w <- "sampling_weight" sdc <- createSdcObj(testdata2, - keyVars=c("urbrur", "roof", "walls", "water", "sex", "relat")) + keyVars = keyVars, + weightVar = w) sdc@risk$global sdc <- LocalRecProg(sdc) sdc@risk$global -### LLmodGlobalRisk -sdc <- undolast(sdc) -sdc <- LLmodGlobalRisk(sdc, inclProb=0.001) -sdc@risk$model +### model-based risks +#' formula +form <- as.formula(paste("~", paste(keyVars, collapse = "+"))) +sdc <- modRisk(sdc, method = "default", formulaM = form) +get.sdcMicroObj(sdc, "risk")$model +sdc <- modRisk(sdc, method = "CE", formulaM = form) +get.sdcMicroObj(sdc, "risk")$model +sdc <- modRisk(sdc, method = "PLM", formulaM = form) +get.sdcMicroObj(sdc, "risk")$model +sdc <- modRisk(sdc, method = "weightedLLM", formulaM = form) +get.sdcMicroObj(sdc, "risk")$model +sdc <- modRisk(sdc, method = "IPF", formulaM = form) +get.sdcMicroObj(sdc, "risk")$model } } @@ -230,8 +242,8 @@ Templ, M. Statistical Disclosure Control for Microdata: Methods and Applications \emph{Springer International Publishing}, 287 pages, 2017. ISBN 978-3-319-50272-4. \doi{10.1007/978-3-319-50272-4} -Templ, M. and Kowarik, A. and Meindl, B. -Statistical Disclosure Control for Micro-Data Using the R Package sdcMicro. +Templ, M. and Kowarik, A. and Meindl, B. +Statistical Disclosure Control for Micro-Data Using the R Package sdcMicro. \emph{Journal of Statistical Software}, \strong{67} (4), 1--36, 2015. \doi{10.18637/jss.v067.i04} Templ, M. and Meindl, B. \emph{Practical Applications in @@ -242,7 +254,7 @@ Information Management Systems, Bookchapter, Springer London, pp. 31-62, Kowarik, A. and Templ, M. and Meindl, B. and Fonteneau, F. and Prantner, B.: \emph{Testing of IHSN Cpp Code and Inclusion of New Methods into sdcMicro}, in: Lecture Notes in Computer Science, J. Domingo-Ferrer, I. Tinnirello -(editors.); Springer, Berlin, 2012, ISBN: 978-3-642-33626-3, pp. 63-77. +(editors.); Springer, Berlin, 2012, ISBN: 978-3-642-33626-3, pp. 63-77. \doi{10.1007/978-3-642-33627-0_6} Templ, M. \emph{Statistical Disclosure Control for Microdata Using the diff --git a/man/sdcMicroObj-class.Rd b/man/sdcMicroObj-class.Rd index f0f171b1..46e626d4 100644 --- a/man/sdcMicroObj-class.Rd +++ b/man/sdcMicroObj-class.Rd @@ -159,19 +159,31 @@ sdc@risk$numeric sdc <- topBotCoding(sdc, value=60000000, replacement=62000000, column="income") head(get.sdcMicroObj(sdc, type="manipNumVars")) sdc@risk$numeric + ### LocalRecProg data(testdata2) +keyVars <- c("urbrur", "roof", "walls", "water", "sex") +w <- "sampling_weight" sdc <- createSdcObj(testdata2, - keyVars=c("urbrur", "roof", "walls", "water", "sex", "relat")) + keyVars = keyVars, + weightVar = w) sdc@risk$global sdc <- LocalRecProg(sdc) sdc@risk$global -### LLmodGlobalRisk -sdc <- undolast(sdc) -sdc <- LLmodGlobalRisk(sdc, inclProb=0.001) -sdc@risk$model +### model-based risks +#' formula +form <- as.formula(paste("~", paste(keyVars, collapse = "+"))) +sdc <- modRisk(sdc, method = "default", formulaM = form) +get.sdcMicroObj(sdc, "risk")$model +sdc <- modRisk(sdc, method = "CE", formulaM = form) +get.sdcMicroObj(sdc, "risk")$model +sdc <- modRisk(sdc, method = "PLM", formulaM = form) +get.sdcMicroObj(sdc, "risk")$model +sdc <- modRisk(sdc, method = "weightedLLM", formulaM = form) +get.sdcMicroObj(sdc, "risk")$model +sdc <- modRisk(sdc, method = "IPF", formulaM = form) +get.sdcMicroObj(sdc, "risk")$model } - ## we can also specify ghost (linked) variables ## these variables are linked to some categorical key variables ## and have the sampe suppression pattern as the variable that they