/
GUIfunctions.R
688 lines (645 loc) · 26.3 KB
/
GUIfunctions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
# Adds attributes to dataframe with variabe label information from list created by extractLabels
#
# For each variable in the dataset, a variable label is added in the label attribute,
# which can be exported by read_dta from haven
#
# @keywords internal
# @param datManip a data.frame with manipulated data (variable names should not be changed)
# @param lab list with label information generated by extractLabels
# @return the dataframe datManip with attrributes containing variable label information
# @author Thijs Benschop
addVarLabels <- function(datManip, lab){
if (is.null(lab[[1]])) {
warning("There are no variable labels!")
return(datManip)
} else {
# Collect colnames of datManip and match labels from lab
# Variable names with no matching names get an empty variable label (e.g. a newly created strata variable)
newLabels <- cbind(colnames(datManip), unlist(lapply(colnames(datManip), function(x) {
if (length(which(x == lab[[1]][,"var.name"])) == 0) {
return("")
} else {
lab[[1]][which(x == lab[[1]][,"var.name"]),"var.label"]
}
})))
# Add label attributes to all variables from newLabels
datManip2 <- lapply(colnames(datManip), function(x) {
attr(datManip[[x]], "label") <- as.character(newLabels[which(x == newLabels[,1]),2])
datManip[[x]]
})
# Convert to data.frame
datManip2 <- as.data.frame(datManip2)
colnames(datManip2) <- colnames(datManip)
return(datManip2)
}
}
# Changes a variable label in the list generated by extractLabels
#
# The variable label can be changed for variables that were already in the dataframe used for extractLabels
#
# @keywords internal
# @param lab list with label information generated by extractLabels
# @param varname the variable name
# @param newlabel the new label for variable varname
# @return a list with both variable labels (as matrix) and value labels (as list) (see extractLabels)
# @author Thijs Benschop
changeVarLabel <- function(lab, varname, newlabel){
# Check whether varname is in lab
if (!varname %in% lab[[1]][,"var.name"]) {
stop("The variable is not in the label information/original dataset. Labels can only be changed for variables in the original dataset.")
}
# Replace label with newlabel for variable varname
lab[[1]][which(lab[[1]][,"var.name"] == varname), "var.label"] <- newlabel
return(lab)
}
# Extracts label information from dataframe read by read_dta from haven package.
#
# Both variable labels and value labels are extracted.
# @keywords internal
# @param dat a data.frame loaded by read_dta from have package
# @return a list with both variable labels (as data.frame) and value labels (as list)
# @author Thijs Benschop
extractLabels <- function(dat){
# Check whether there are variable labels available
if (!all(sapply(sapply(dat, function(x) { attr(x, "label") }), is.null))) {
# Save all variable labels
varLab <- as.data.frame(cbind(colnames(dat), lapply(dat, function(x){attr(x, "label")})))
colnames(varLab) <- c("var.name", "var.label")
rownames(varLab) <- NULL
# Set to NA values in var.label that have more than one element (value labels)
varLab[which(sapply(dat, function(x) { length(attr(x, "label")) }) > 1), 2] <- NA
# Set to NULL values in var.label to NA
varLab[which(sapply(sapply(dat, function(x) { attr(x, "label") }), is.null)), 2] <- NA
# Set to NA values in var.label that have more than one element (value labels)
varLab[which(sapply(dat, function(x) { length(attr(x, "label")) }) > 1), 2] <- NA
# Convert all variable labels to UTF-8
nonUTFvarlabel <- NULL
varLab[, 2] <- unlist(varLab[,2], use.names = FALSE)
if(any(!validUTF8(varLab[,2]))){
whichNotUTF8 <- which(!is.na(varLab[,2]) & !validUTF8(varLab[,2]))
nonUTFvarlabel <- varLab[whichNotUTF8, c(1,2)] # Save list of all labels that aren't encoded in UTF-8
varLab[whichNotUTF8, 2] <- enc2utf8(varLab[whichNotUTF8, 2])
varLab[whichNotUTF8, 2] <- iconv(varLab[whichNotUTF8, 2], "UTF-8", "UTF-8", sub='')
nonUTFvarlabel <- cbind(nonUTFvarlabel, varLab[whichNotUTF8, 2])
}
} else {
varLab <- NULL
nonUTFvarlabel <- NULL
}
# Check whether there are value labels available
if (!all(sapply(sapply(dat, function(x) { attr(x, "labels") }), is.null))) {
# Save all value labels for variables of class labelled
valLab <- lapply(dat, function(x){attr(x, "labels")})
} else {
valLab <- NULL
}
return(list(varLab, valLab, nonUTFvarlabel))
}
#' Creates a household level file from a dataset with a household structure.
#'
#' It removes individual level variables and selects one record per household based on a household ID. The function can also be used for other hierachical structures.
#'
#' @note It is of great importance that users select a variable with containing information on household-ids and weights in \code{hhVars}.
#'
#' @param dat a data.frame with the full dataset
#' @param hhId name of the variable with the household (cluster) ID
#' @param hhVars character vector with names of all household level variables
#' @return a data.frame with only household level variables and one record per household
#' @author Thijs Benschop and Bernhard Meindl
#' @export
#' @examples
#' ## ori-hid: household-ids; household_weights: sampling weights for households
#' x_hh <- selectHouseholdData(dat=testdata, hhId="ori_hid",
#' hhVars=c("urbrur", "roof", "walls", "water", "electcon", "household_weights"))
selectHouseholdData <- function(dat, hhId, hhVars) {
# Check whether specified variables are available in the data
if (!all(hhVars %in% colnames(dat))) {
stop("Some selected household variables aren't available in the data.\nRespecify hhVars\n")
}
if (!hhId %in% colnames(dat)) {
stop("The selected household ID isn't available in the data.\nRespecify hhId\n")
}
# Remove any records with missing household ID (these cannot be matched later on in the process)
res <- dat[stats::complete.cases(dat[,hhId]), ]
# Keep only one observation per household
res <- res[which(!duplicated(res[,hhId])),]
# Sort hhVars on the order of the variables in dat
hhVars <- colnames(dat)[which(colnames(dat) %in% hhVars)]
# Drop all variables that are not at the household level
res <- res[,c(hhId, hhVars), drop=FALSE]
invisible(res)
}
#' Replaces the raw household-level data with the anonymized household-level data in the full dataset
#' for anonymization of data with a household structure (or other hierarchical structure).
#' Requires a matching household ID in both files.
#'
#' @param dat a data.frame with the full dataset
#' @param hhId name of the household (cluster) ID (identical in both datasets)
#' @param dathh a dataframe with the treated household level data (generated for example with \link{selectHouseholdData})
#' @return a data.frame with the treated household level variables and the raw individual level variables
#' @author Thijs Benschop and Bernhard Meindl
#' @export
#' @examples
#' ## Load data
#' x <- testdata
#' \donttest{
#' ## donttest is necessary because of
#' ## Examples with CPU time > 2.5 times elapsed time
#' ## caused by using C++ code and/or data.table
#' ## Create household level dataset
#' x_hh <- selectHouseholdData(dat=x, hhId="ori_hid",
#' hhVars=c("urbrur", "roof", "walls", "water", "electcon", "household_weights"))
#' ## Anonymize household level dataset and extract data
#' sdc_hh <- createSdcObj(x_hh, keyVars=c('urbrur','roof'), w='household_weights')
#' sdc_hh <- kAnon(sdc_hh, k = 3)
#' x_hh_anon <- extractManipData(sdc_hh)
#'
#' ## Merge anonymized household level data back into the full dataset
#' x_anonhh <- mergeHouseholdData(x, "ori_hid", x_hh_anon)
#'
#' ## Anonymize full dataset and extract data
#' sdc_full <- createSdcObj(x_anonhh, keyVars=c('sex', 'age', 'urbrur', 'roof'), w='sampling_weight')
#' sdc_full <- kAnon(sdc_full, k = 3)
#' x_full_anon <- extractManipData(sdc_full)
#' }
mergeHouseholdData <- function(dat, hhId, dathh) {
# Check whether household ID is available in both datasets
if (!(hhId %in% colnames(dathh)))
stop("The selected household ID isn't available in the anonymized household data.\nRespecify hhId")
if (!(hhId %in% colnames(dat)))
stop("The selected household ID isn't available in the full dataset.\nRespecify hhId")
# Drop all variables from the untreated dataset that are in the household level dataset except the household ID and the weight variable
# The weights from the household level file aren't exported because they either are contained in the complete dataset or generated and can
# therefore be generated by the user.
toBeDropped <- colnames(dathh)
toBeDropped <- toBeDropped[-which(toBeDropped %in% hhId)]
res <- dat[,-which(colnames(dat) %in% toBeDropped), drop=FALSE]
# Merge the individual level and household level files by the household ID
# Individuals without matching household ID in dathh are kept and have all missings for the household level variables
res <- merge(res, dathh, by = hhId, all.x = TRUE)
# Move hhId to first column
res <- res[,c(hhId, colnames(res)[-which(colnames(res) %in% hhId)])]
invisible(res)
}
#' Generate one strata variable from multiple factors
#'
#' For strata defined by multiple variables (e.g. sex,age,country) one combined
#' variable is generated.
#'
#' @param df a data.frame
#' @param stratavars character vector with variable name
#' @param name name of the newly generated variable
#' @return The original data set with one new column.
#' @author Alexander Kowarik
#' @export
#' @examples
#'
#' x <- testdata
#' x <- generateStrata(x,c("sex","urbrur"),"strataIDvar")
#' head(x)
#'
generateStrata <- function(df, stratavars, name) {
strata <- rep("", nrow(df))
for (i in seq_along(stratavars)) {
strata <- paste(strata, df[, stratavars[i]], sep="")
if (length(stratavars) > i) {
strata <- paste(strata, "-", sep="")
}
}
df <- cbind(df, strata)
colnames(df)[length(colnames(df))] <- name
return(df)
}
#' Remove certain variables from the data set inside a sdc object.
#'
#' Delete variables without changing anything else in the sdcObject (writing
#' NAs).
#'
#'
#' @name removeDirectID
#' @docType methods
#' @param obj object of class \code{\link{sdcMicroObj-class}}
#' @param var name of the variable(s) to be remove
#' @return the modified \code{\link{sdcMicroObj-class}}
#' @author Alexander Kowarik
#' @keywords methods
#' @export
#' @examples
#' ## for objects of class sdcMicro:
#' data(testdata2)
#' sdc <- createSdcObj(testdata, keyVars=c('urbrur','roof'),
#' numVars=c('expend','income','savings'), w='sampling_weight')
#' sdc <- removeDirectID(sdc, var="age")
removeDirectID <- function(obj, var) {
removeDirectIDX(obj=obj, var=var)
}
setGeneric("removeDirectIDX", function(obj, var) {
standardGeneric("removeDirectIDX")
})
setMethod(f="removeDirectIDX", signature=c("sdcMicroObj"),
definition=function(obj, var) {
kV <- colnames(obj@origData)[get.sdcMicroObj(obj, "keyVars")]
nV <- colnames(obj@origData)[get.sdcMicroObj(obj, "numVars")]
wV <- colnames(obj@origData)[get.sdcMicroObj(obj, "weightVar")]
sV <- colnames(obj@origData)[get.sdcMicroObj(obj, "strataVar")]
hV <- colnames(obj@origData)[get.sdcMicroObj(obj, "hhId")]
if (any(var %in% kV))
stop("A direct identifier should not be seleceted as key variable.\n Therefore it can not be removed.")
if (any(var %in% nV))
stop("A direct identifier should not be seleceted as numerical key variable.\n Therefore it can not be removed.")
if (any(var %in% wV))
stop("A direct identifier should not be seleceted as weight variable.\n Therefore it can not be removed.")
if (any(var %in% sV))
stop("A direct identifier should not be seleceted as strata variable.\n Therefore it can not be removed.")
if (any(var %in% hV))
stop("A direct identifier should not be seleceted as cluster ID.\n Therefore it can not be removed.")
o <- obj@origData
if (any(!var %in% colnames(o)))
stop("direct identifier variable not found on data set")
o <- o[, !colnames(o) %in% var, drop=FALSE]
obj <- nextSdcObj(obj)
obj@deletedVars <- c(obj@deletedVars, var)
obj@origData <- o
obj
})
#' Change the a keyVariable of an object of class \code{\link{sdcMicroObj-class}} from Numeric to
#' Factor or from Factor to Numeric
#'
#' Change the scale of a variable
#'
#' @name varToFactor
#' @docType methods
#' @param obj object of class \code{\link{sdcMicroObj-class}}
#' @param var name of the keyVariable to change
#' @return the modified \code{\link{sdcMicroObj-class}}
#' @keywords methods
#' @export
#' @examples
#' ## for objects of class sdcMicro:
#' data(testdata2)
#' sdc <- createSdcObj(testdata2,
#' keyVars=c('urbrur','roof','walls','water','electcon','relat','sex'),
#' numVars=c('expend','income','savings'), w='sampling_weight')
#' sdc <- varToFactor(sdc, var="urbrur")
#'
varToFactor <- function(obj, var) {
varToFactorX(obj=obj, var=var)
}
setGeneric("varToFactorX", function(obj, var) {
standardGeneric("varToFactorX")
})
setMethod(f="varToFactorX", signature=c("sdcMicroObj"),
definition=function(obj, var) {
obj <- nextSdcObj(obj)
x <- get.sdcMicroObj(obj, type="manipKeyVars")
x2 <- varToFactor(x, var=var)
obj <- set.sdcMicroObj(obj, type="manipKeyVars", input=list(as.data.frame(x2)))
obj
})
setMethod(f="varToFactorX", signature=c("data.frame"),
definition=function(obj, var) {
#if ( length(var)!=1) {
# stop("More than 1 variable specified in 'var'!\n")
#}
if (!all(var %in% colnames(obj))) {
stop("at least one variable specified in 'var' is not available in 'obj'!\n")
}
for (vv in var) {
obj[[vv]] <- as.factor(obj[[vv]])
}
obj
})
#' @export
#' @rdname varToFactor
varToNumeric <- function(obj, var) {
varToNumericX(obj=obj, var=var)
}
setGeneric("varToNumericX", function(obj, var) {
standardGeneric("varToNumericX")
})
setMethod(f="varToNumericX", signature=c("sdcMicroObj"),
definition=function(obj, var) {
obj <- nextSdcObj(obj)
x <- get.sdcMicroObj(obj, type="manipKeyVars")
suppressWarnings(tmpvar <- as.numeric(as.character(x[, var])))
x2 <- varToNumeric(x, var=var)
obj <- set.sdcMicroObj(obj, type="manipKeyVars", input=list(as.data.frame(x2)))
obj
})
setMethod(f="varToNumericX", signature=c("data.frame"),
definition=function(obj, var) {
if (!all(var %in% colnames(obj))) {
stop("at least one variable specified in 'var' is not available in 'obj'!\n")
}
for (vv in var) {
if (inherits(obj[[vv]], "factor")) {
obj[[vv]] <- as.numeric(levels(obj[[vv]]))[obj[[vv]]]
} else{
obj[[vv]] <- as.numeric(obj[[vv]])
}
}
obj
})
# wrapper for tryCatch()
tryCatchFn <- function(expr) {
result <- tryCatch({expr},
error=function(e) {
return(e)
})
return(result)
}
#' readMicrodata
#'
#' reads data from various formats into R. Used in \code{\link{sdcApp}}.
#'
#' @param path a file path
#' @param type which format does the file have. currently allowed values are
#' \itemize{
#' \item \code{sas}
#' \item \code{spss}
#' \item \code{stata}
#' \item \code{R}
#' \item \code{rdf}
#' \item \code{csv}
#' }
#' @param convertCharToFac (logical) if TRUE, all character vectors are automatically
#' converted to factors
#' @param drop_all_missings (logical) if TRUE, all variables that contain NA-values only
#' will be dropped
#' @param ... additional parameters. Currently used only if \code{type='csv'} to pass
#' arguments to \code{read.table()}.
#'
#' @note if \code{type} is either \code{'sas'}, \code{'spss'} or \code{'stata'}, values read in as \code{NaN}
#' will be converted to \code{NA}.
#' @return a data.frame or an object of class 'simple.error'. If a stata file was read in, the resulting \code{data.frame}
#' has an additional attribute \code{lab} in which variable and value labels are stored.
#' @author Bernhard Meindl
#' @export
readMicrodata <- function(path, type, convertCharToFac=TRUE, drop_all_missings=TRUE, ...) {
nonUTFvarname <- NULL
if (type=="sas") {
res <- tryCatchFn(haven::read_sas(data_file=path))
# Convert column names to utf8
nonUTFvarname <- cbind(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))], iconv(enc2utf8(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))]), "UTF-8", "UTF-8", sub='')) # Save list of all variable names that aren't encoded in UTF-8
colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))] <- nonUTFvarname[,2]
}
if (type=="spss") {
res <- tryCatchFn(haven::read_spss(file=path))
# Convert column names to utf8
nonUTFvarname <- cbind(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))], iconv(enc2utf8(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))]), "UTF-8", "UTF-8", sub='')) # Save list of all variable names that aren't encoded in UTF-8
colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))] <- nonUTFvarname[,2]
}
if (type=="stata") {
res <- tryCatchFn(haven::read_dta(file=path))
# Convert column names to utf8
nonUTFvarname <- cbind(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))], iconv(enc2utf8(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))]), "UTF-8", "UTF-8", sub='')) # Save list of all variable names that aren't encoded in UTF-8
colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))] <- nonUTFvarname[,2]
lab <- extractLabels(res)
}
if (type=="R") {
res <- tryCatchFn(get(load(file=path)))
}
if (type=="rdf") {
res <- tryCatchFn(get(paste(path)))
}
if (type=="csv") {
opts <- list(...)
header <- ifelse(opts$header==TRUE, TRUE, FALSE)
sep <- opts$sep
quote <- "\""
comment.char <- ""
res <- tryCatchFn(utils::read.table(path, sep=sep, header=header, quote=quote, comment.char=comment.char))
}
if (inherits(res, "simpleError")) {
return(res)
}
if (!inherits(res, "data.frame")) {
res$message <- paste0(res$message,"\ndata read into the system was not of class 'data.frame'!")
return(res)
}
# convert result to clas 'data.frame' if it is a 'tbl_df'...
if (inherits(res, "tbl_df")) {
class(res) <- "data.frame"
}
# convert NaN to NA if data was read in with haven
if (type %in% c("sas","spss","stata")) {
res[is.na(res)] <- NA
}
# check if any variable has class 'labelled' or 'haven_labelled' (from haven 2.0.0) and convert it to factors.
# this might happen if we read data with read_xxx() from haven
cl_lab <- which(sapply(res, inherits, "labelled") | sapply(res, inherits, "haven_labelled"))
if (length(cl_lab) > 0) {
if (length(cl_lab)==1) {
res[[cl_lab]] <- haven::as_factor(res[[cl_lab]], levels="default")
} else {
res[,cl_lab] <- lapply(res[,cl_lab] , function(x) {
haven::as_factor(x, levels="default")
})
}
}
if (convertCharToFac) {
# convert character-variables to factors
cl_char <- which(sapply(res, class)=="character")
if (length(cl_char) >0) {
if (length(cl_char) == 1) {
res[[cl_char]] <- as.factor(res[[cl_char]])
} else {
res[,cl_char] <- lapply(res[,cl_char], as.factor)
}
}
}
if (drop_all_missings) {
# drop all variables that are NA-only
keep <- which(sapply(res, function(x) sum(is.na(x))!=length(x)))
dropped <- colnames(res)[-keep]
res <- res[,keep,drop=FALSE]
# save names of dropped variables
if(length(dropped) > 0){
attr(res, "dropped") <- dropped
}
}
# Convert levels in factor and character variables to utf8
nonUTFvallabels <- data.frame(varName = character(), initLabel = character(0), convLabel = character(0), stringsAsFactors = FALSE)
for (i in 1:dim(res)[2]) {
# Character strings
if (inherits(res[, i], "character")) {
if (any(!validUTF8(res[,i]))) {
nonUTFvallabels <- rbind(nonUTFvallabels, cbind(rep(colnames(res)[i], length(unique(res[which(!validUTF8(res[,i])),i]))),
unique(res[which(!validUTF8(res[,i])),i]),
iconv(enc2utf8(unique(res[which(!validUTF8(res[,i])),i])), "UTF-8", "UTF-8", sub='')))
res[which(!validUTF8(res[,i])),i] <- enc2utf8(res[which(!validUTF8(res[,i])),i])
# Remove any non UTF8 characters
res[which(!validUTF8(res[,i])),i] <- iconv(res[which(!validUTF8(res[,i])),i], "UTF-8", "UTF-8", sub='')
}
}
# Factor variables
if (inherits(res[, i], "factor")) {
if (any(!validUTF8(levels(res[,i])))) {
nonUTFvallabels <- rbind(nonUTFvallabels, cbind(rep(colnames(res)[i], length(levels(res[,i])[which(!validUTF8(levels(res[,i])))])),
levels(res[,i])[which(!validUTF8(levels(res[,i])))],
iconv(enc2utf8(levels(res[,i])[which(!validUTF8(levels(res[,i])))]), "UTF-8", "UTF-8", sub='')))
# Convert to UTF8, encoding unknown
levels(res[,i])[which(!validUTF8(levels(res[,i])))] <- enc2utf8(levels(res[,i])[which(!validUTF8(levels(res[,i])))])
# Remove any non UTF8 characters
levels(res[,i])[which(!validUTF8(levels(res[,i])))] <- iconv(levels(res[,i])[which(!validUTF8(levels(res[,i])))], "UTF-8", "UTF-8", sub='')
}
}
}
if (!is.null(nonUTFvarname)) {
if (dim(nonUTFvarname)[1] == 0) {
nonUTFvarname <- NULL
}
} # Set to NULL if no changed labels
if (!is.null(nonUTFvallabels)) {
if (dim(nonUTFvallabels)[1] == 0) {
nonUTFvallabels <- NULL
}
} # Set to NULL if no changed labels
if (type=="stata") {
attr(res, "lab") <- lab
}
# Collect variable names, variable labels and value labels that were encoded to UTF8
if (type=="stata") {
attr(res, "nonUTF") <- list(nonUTFvarname, nonUTFvallabels, lab[[3]])
} else{
attr(res, "nonUTF") <- list(nonUTFvarname, nonUTFvallabels)
}
res
}
#' importProblem
#'
#' reads an sdcProblem with code that has been exported within \code{\link{sdcApp}}.
#'
#' @param path a file path
#' @return an object of class \code{sdcMicro_GUI_export} or an object of class 'simple.error'
#' @author Bernhard Meindl
#' @export
importProblem <- function(path) {
res <- tryCatchFn(get(load(file=path)))
if (inherits(res, "simpleError")) {
return(res)
}
if (!inherits(res, "sdcMicro_GUI_export")) {
res$message <- paste0(
res$message,
"\ndata read into the system was not of class 'sdcMicro_GUI_export'!"
)
return(res)
}
res
}
#' subsetMicrodata
#'
#' allows to restrict original data to only a subset. This may be useful to test some anonymization
#' methods. This function will only be used in the graphical user interface \code{\link{sdcApp}}.
#'
#' @param obj an object of class \code{\link{data.frame}} containing micro data
#' @param type algorithm used to sample from original microdata. Currently supported choices are
#' \describe{
#' \item{\code{n_perc}}{ the restricted microdata will be a \code{n-percent} sample of the original microdata.}
#' \item{\code{first_n}}{ only the first \code{n} observations will be used.}
#' \item{\code{every_n}}{ the restricted microdata set consists of every \code{n-th} record.}
#' \item{\code{size_n}}{ a total of \code{n} observations will be randomly drawn.}
#' }
#' @param n numeric vector of length 1 specifying the specific parameter with respect to argument \code{type}.
#' @return an object of class \code{\link{sdcMicroObj-class}} with modified slot \code{@origData}.
#' @author Bernhard Meindl
#' @rdname subsetMicrodata
subsetMicrodata <- function(obj, type, n) {
if (!type %in% c("n_perc","first_n","every_n","size_n")) {
stop("invalid value in argument 'type'\n")
}
if (n < 1) {
stop("argument 'n' must be >=1\n")
}
dat <- obj
nrObs <- nrow(dat)
if (type=="n_perc") {
ssize <- ceiling((nrObs/100)*n)
dat <- dat[sample(1:nrObs, ssize),,drop=FALSE]
}
if (type=="first_n") {
dat <- dat[1:n,,drop=F]
}
if (type=="every_n") {
ssize <- (1:nrObs)%%n==1
dat <- dat[ssize,,drop=F]
}
if (type=="size_n") {
dat <- dat[sample(1:nrObs, n),,drop=F]
}
dim(dat)
return(dat)
}
#' writeSafeFile
#'
#' writes an anonymized dataset to a file. This function should be used in the
#' graphical user interface [sdcApp()] only.
#'
#' @param obj a `data.frame` containing micro data
#' @param randomizeRecords (logical) specifies, if the output records should
#' be randomized. The following options are possible:
#' - `"no"`: default, no randomization takes place
#' - `"simple"`: records are randomly swapped
#' - `"byHH"`: if slot `"hhId"` is not `NULL`, the clusters defined by this
#' variable are randomized across the dataset. If slot `"hhId"` is `NULL`, the
#' records or the dataset are randomly changed.
#' - `"withinHH"`: if slot `"hhId"` is not `NULL`, the clusters defined by
#' this variable are randomized across the dataset and additionally, the order
#' of records within the clusters are also randomly changed. If slot `"hhId"`
#' is `NULL`, the records or the dataset are randomly changed.
#'
#' @param format (character) specifies the output file format. Accepted
#' values are:
#' - `"rdata"`: output will be saved in the R binary file-format
#' - `"sav"`: output will be saved as SPSS-file
#' - `"dta"`: ouput will be saved as STATA-file
#' - `"csv"`: output will be saved as comma seperated (text)-file
#' - `"sas"`: output will be saved as SAS-file (sas7bdat)
#' @param fileOut (character) file to which output should be written
#' @param ... optional arguments used for [utils::write.table()] if
#' argument `"format"` equals `"csv"`
#' @return invisible `NULL` if the file was successfully written
#' @author Bernhard Meindl
#' @rdname writeSafeFile
#' @md
#' @export
writeSafeFile <- function(obj, format, randomizeRecords, fileOut, ...) {
if (!inherits(obj, "sdcMicroObj")) {
stop("invalid input in argument 'obj'\n")
}
dat <- extractManipData(obj, randomizeRecords=randomizeRecords)
if (format=="rdata") {
save(dat, file=fileOut)
}
if (format=="sav") {
haven::write_sav(data=dat, path=fileOut)
}
if (format=="sas") {
haven::write_sas(data=dat, path=fileOut)
}
if (format=="dta") {
# add label information
inp <- list(...)
new_labs <- inp$lab
if (!is.null(new_labs)) {
# restrict to existing variables in anonymized dataset
ll1 <- new_labs[[1]]
ii <- which(ll1$var.name %in% colnames(dat))
ll1 <- ll1[ii,]
new_labs[[1]] <- ll1
ll2 <- new_labs[[2]]
if (!is.null(ll2)) {
ii <- which(names(ll2) %in% colnames(dat))
ll2 <- ll2[ii]
new_labs[[2]] <- ll2
}
dat <- addVarLabels(dat, lab=new_labs)
}
haven::write_dta(data=dat, path=fileOut, version=inp$version)
}
if (format=="csv") {
utils::write.table(dat, file=fileOut, ...)
}
return(invisible(NULL))
}