-
Notifications
You must be signed in to change notification settings - Fork 3
/
ipf.r
850 lines (789 loc) · 31.8 KB
/
ipf.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
#' @rdname ipf_step
#' @name ipf_step
#' @md
#'
#' @param dat a `data.frame` containing the factor variables to be combined.
#'
#' @export
combine_factors <- function(dat, targets) {
x <- as.data.frame(targets)
x$ID_ipu <- seq_len(nrow(x))
x <- merge(dat, x, by = names(dimnames(targets)), sort = FALSE, all.x = TRUE)
factor(x$ID_ipu, levels = seq_along(targets))
}
getMeanFun <- function(meanHH) {
if (isTRUE(meanHH))
meanHH <- "arithmetic"
if (identical(meanHH, FALSE))
meanHH <- "none"
meanfun <- switch(meanHH,
arithmetic = arithmetic_mean,
geometric = geometric_mean,
none = function(x, w) {
x
}
)
if (is.null(meanfun))
stop("invalid value for meanHH")
meanfun
}
boundsFak <- function(g1, g0, f, bound = 4, minMaxTrim=NULL) {
# Berechnet die neuen Gewichte (innerhalb 4, .25 Veraenderungsraten)
g1 <- g1 * f
if(!is.null(bound)){
TF <- which((g1 / g0) > bound)
TF[is.na(TF)] <- FALSE
g1[TF] <- bound * g0[TF]
TF <- which((g1 / g0) < (1 / bound))
TF[is.na(TF)] <- FALSE
g1[TF] <- (1 / bound) * g0[TF]
}
if(!is.null(minMaxTrim)){
g1[g1<minMaxTrim[1]] <- minMaxTrim[1]
g1[g1>minMaxTrim[2]] <- minMaxTrim[2]
}
return(g1)
}
boundsFakHH <- function(g1, g0, eps, orig, p, bound = 4, minMaxTrim=NULL) {
# Berechnet die neuen Gewichte fuer Unter- und Obergrenze (innerhalb 4,
# .25 Veraenderungsraten)
u <- orig * (1 - eps)
o <- orig * (1 + eps)
pbo <- which(p > o)
psu <- which(p < u)
g1[pbo] <- g1[pbo] * o[pbo] / p[pbo]
g1[psu] <- g1[psu] * u[psu] / p[psu]
if(!is.null(bound)){
TF <- which((g1 / g0) > bound)
g1[TF] <- bound * g0[TF]
TF <- which((g1 / g0) < (1 / bound))
g1[TF] <- (1 / bound) * g0[TF]
}
if(!is.null(minMaxTrim)){
g1[g1<minMaxTrim[1]] <- minMaxTrim[1]
g1[g1>minMaxTrim[2]] <- minMaxTrim[2]
}
return(g1)
}
check_eps <- function(con, eps, type){
l <- length(con)
if(is.list(eps)){
if(length(eps)!=l){
stop(paste("Provided",type,"eps argument does not fit",type,"constraints."))
}
for(i in 1:length(eps)){
if(is.array(eps[[i]])){
if(!identical(dim(eps[[i]]),dim(con[[i]]))){
stop(paste("Provided",type,"eps argument",i,"does not fit in dimension to ",type,"constraints",i,"."))
}
}
}
}else if(length(eps)>1){
stop("Individual eps arguments for each constraints must be defined as list.")
}
}
check_population_totals <- function(con, dat, type = "personal") {
# do not apply this check for numerical calibration
if (is.null(names(con))) {
ind <- seq_along(con)
} else {
ind <- which(names(con) == "")
}
# do not apply this check for constraints that only cover the population
# partially
ind <- ind[vapply(
ind,
function(i) {
constraint <- con[[i]]
for (variable in names(dimnames(constraint))) {
if (!(variable %in% names(dat)))
stop("variable ", variable, " appears in a constraint but not ",
"in the dataset")
if (!identical(sort(unique(as.character(dat[[variable]]))),
sort(dimnames(constraint)[[variable]]))) {
message(type, " constraint ", i, " only covers a subset of the ",
"population")
return(FALSE)
}
}
return(TRUE)
},
TRUE)]
if (length(ind) == 0)
return(NULL)
pop_totals <- vapply(
ind,
function(index) {
sum(con[[index]])
},
0)
rel_errors <- abs(pop_totals - pop_totals[1]) / pop_totals[1]
# use a 1% tolerance. Maybe it would be better to make the tolerance
# dependent on conH?
if (any(rel_errors > 1e-2))
stop("population totals for different constraints do not match")
}
calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter,
numericalWeighting, numericalWeightingVar, w, cw, minMaxTrim) {
selectGroupNotConverged <- epsPcur <- maxFac <- OriginalSortingVariable <- V1 <-
epsvalue <- fVariableForCalibrationIPF <- NULL
temporary_hvar <- value <-
wValue <- representativeHouseholdForCalibration <- NULL
variableKeepingTheBaseWeight <- w
variableKeepingTheCalibWeight <- cw
combined_factors <- dat[[paste0("combined_factors_", i)]]
setnames(dat, valueP[i], "value")
setnames(dat, paste0("epsP_", i), "epsPcur")
tmp <- data.table(x = factor(levels(combined_factors)))
setnames(tmp, "x", paste0("combined_factors_", i))
paste0("combined_factors_h_", i)
con_current <- dat[tmp, on = paste0("combined_factors_", i),
mult = "first", value]
if (!is.null(numericalWeightingVar)) {
## numerical variable to be calibrated
## use name of conP list element to define numerical variable
set(dat, j = "fVariableForCalibrationIPF",
value = ipf_step_f(dat[[variableKeepingTheCalibWeight]] *
dat[[numericalWeightingVar]],
combined_factors, con_current))
set(dat, j = "wValue", value = dat[["value"]] /
dat[["fVariableForCalibrationIPF"]])
# try to divide the weight between units with larger/smaller value in the
# numerical variable linear
dat[, fVariableForCalibrationIPF := numericalWeighting(
head(wValue, 1), head(value, 1), get(numericalWeightingVar),
get(variableKeepingTheCalibWeight)),
by = eval(paste0("combined_factors_", i))]
} else {
# categorical variable to be calibrated
set(dat, j = "fVariableForCalibrationIPF", value = ipf_step_f(
dat[[variableKeepingTheCalibWeight]], combined_factors, con_current))
}
if (dat[!is.na(fVariableForCalibrationIPF),
any(abs(1 / fVariableForCalibrationIPF - 1) > epsPcur)]) {
## sicherheitshalber abs(epsPcur)? Aber es wird schon niemand negative eps
## Werte uebergeben??
if (verbose && calIter %% 10 == 0) {
message(calIter, ":Not yet converged for P-Constraint", i, "\n")
if (calIter %% 100 == 0) {
dat[, selectGroupNotConverged := any(!is.na(fVariableForCalibrationIPF) &
(abs(1 / fVariableForCalibrationIPF - 1) > epsPcur)), by= c(pColNames[[i]])]
tmp <- dat[
selectGroupNotConverged == TRUE,
list(
maxFac = max(abs(1 / fVariableForCalibrationIPF - 1)), .N,
epsP = head(epsPcur, 1),
CalibMargin = {
if (!is.null(numericalWeightingVar)) {
sum(get(variableKeepingTheCalibWeight) *
get(numericalWeightingVar))
}else{
sum(get(variableKeepingTheCalibWeight))
}
},
PopMargin = head(value, 1)),
by = eval(pColNames[[i]])]
dat[, selectGroupNotConverged := NULL]
print(tmp[order(maxFac, decreasing = TRUE), ])
message("-----------------------------------------\n")
}
}
if (!is.null(bound) || !is.null(minMaxTrim)) {
dat[!is.na(fVariableForCalibrationIPF),
c(variableKeepingTheCalibWeight) :=
boundsFak(
get(variableKeepingTheCalibWeight),
get(variableKeepingTheBaseWeight), fVariableForCalibrationIPF,
bound = bound, minMaxTrim = minMaxTrim)]
#,by=eval(pColNames[[i]])]
} else {
dat[!is.na(fVariableForCalibrationIPF),
c(variableKeepingTheCalibWeight) := fVariableForCalibrationIPF *
get(variableKeepingTheCalibWeight),
by = eval(paste0("combined_factors_", i))]
}
error <- TRUE
}
setnames(dat, "value", valueP[i])
setnames(dat, "epsPcur", paste0("epsP_", i))
return(error)
}
calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter,
looseH, numericalWeighting, numericalWeightingVar, w, cw, minMaxTrim) {
variableKeepingTheBaseWeight <- w
variableKeepingTheCalibWeight <- cw
epsHcur <- OriginalSortingVariable <- V1 <-
epsvalue <- fVariableForCalibrationIPF <- NULL
maxFac <- temporary_hvar <-
value <- wValue <- representativeHouseholdForCalibration <- NULL
setnames(dat, valueH[i], "value")
setnames(dat, paste0("epsH_", i), "epsHcur")
combined_factors <- dat[[paste0("combined_factors_h_", i)]]
tmp <- data.table(x = factor(levels(combined_factors)))
setnames(tmp, "x", paste0("combined_factors_h_", i))
paste0("combined_factors_h_", i)
con_current <- dat[tmp, on = paste0("combined_factors_h_", i),
mult = "first", value]
if (!is.null(numericalWeightingVar)) {
## numerical variable to be calibrated
## use name of conH list element to define numerical variable
set(dat, j = "fVariableForCalibrationIPF", value = ipf_step_f(
dat[[variableKeepingTheCalibWeight]] *
dat[["representativeHouseholdForCalibration"]] *
dat[[numericalWeightingVar]], combined_factors, con_current))
set(dat, j = "wValue", value = dat[["value"]] /
dat[["fVariableForCalibrationIPF"]])
# try to divide the weight between units with larger/smaller value in the
# numerical variable linear
dat[, fVariableForCalibrationIPF := numericalWeighting(
head(wValue, 1), head(value, 1), get(numericalWeightingVar),
get(variableKeepingTheCalibWeight)),
by = eval(paste0("combined_factors_h_", i))]
} else {
# categorical variable to be calibrated
set(dat, j = "fVariableForCalibrationIPF", value = ipf_step_f(
dat[[variableKeepingTheCalibWeight]] *
dat[["representativeHouseholdForCalibration"]],
combined_factors, con_current))
}
set(dat, j = "wValue", value = dat[["value"]] /
dat[["fVariableForCalibrationIPF"]])
if (dat[!is.na(fVariableForCalibrationIPF),
any(abs(1 / fVariableForCalibrationIPF - 1) > epsHcur)]) {
if (verbose && calIter %% 10 == 0) {
message(calIter, ":Not yet converged for H-Constraint", i, "\n")
if (calIter %% 100 == 0) {
tmp <- dat[
!is.na(fVariableForCalibrationIPF) &
(abs(1 / fVariableForCalibrationIPF - 1) > epsHcur),
list(maxFac = max(abs(1 / fVariableForCalibrationIPF - 1)), .N,
epsH = head(epsHcur, 1),
sumCalibWeight = sum(get(variableKeepingTheCalibWeight) *
representativeHouseholdForCalibration),
PopMargin = head(value, 1)),
by = eval(hColNames[[i]])]
print(tmp[order(maxFac, decreasing = TRUE), ])
message("-----------------------------------------\n")
}
}
if (!is.null(bound) || !is.null(minMaxTrim)) {
if (!looseH) {
set(dat, j = variableKeepingTheCalibWeight, value = boundsFak(
g1 = dat[[variableKeepingTheCalibWeight]],
g0 = dat[[variableKeepingTheBaseWeight]],
f = dat[["fVariableForCalibrationIPF"]],
bound = bound, minMaxTrim = minMaxTrim))
}else{
set(dat, j = variableKeepingTheCalibWeight, value = boundsFakHH(
g1 = dat[[variableKeepingTheCalibWeight]],
g0 = dat[[variableKeepingTheBaseWeight]],
eps = dat[["epsHcur"]], orig = dat[["value"]],
p = dat[["wValue"]], bound = bound, minMaxTrim = minMaxTrim)
)
}
} else {
dat[, c(variableKeepingTheCalibWeight) := fVariableForCalibrationIPF *
get(variableKeepingTheCalibWeight),
by = eval(paste0("combined_factors_h_", i))]
}
error <- TRUE
}
setnames(dat, "value", valueH[i])
setnames(dat, "epsHcur", paste0("epsH_", i))
return(error)
}
## recreate the formula argument to xtabs based on conP, conH
getFormulas <- function(con, w) {
formOut <- NULL
for (i in seq_along(con)) {
lhs <- names(con)[i]
if (is.null(lhs) || lhs == "") {
lhs <- w
} else {
lhs <- paste(lhs, "*", w)
}
rhs <- paste(names(dimnames(con[[i]])), collapse = "+")
formOut[[i]] <- formula(paste(lhs, "~", rhs), env = .GlobalEnv)
}
formOut
}
## enrich dat_original with the calibrated weights and assign attributes
addWeightsAndAttributes <- function(dat, conP, conH, epsP, epsH, dat_original,
maxIter, calIter, returnNA, cw, verbose) {
variableKeepingTheCalibWeight <- cw
representativeHouseholdForCalibration <- OriginalSortingVariable <-
outTable <- copy(dat_original)
formP <- getFormulas(conP, w = variableKeepingTheCalibWeight)
formH <- getFormulas(conH, w = variableKeepingTheCalibWeight)
# general information
setattr(outTable, "iterations", min(maxIter, calIter))
# input constraints
setattr(outTable, "conP", conP)
setattr(outTable, "conH", conH)
# adjusted constraints (conP, conH according to the calibrated weights)
conP_adj <- lapply(formP, xtabs, dat)
conH_adj <- lapply(
formH, xtabs, dat[representativeHouseholdForCalibration == 1])
setattr(outTable, "conP_adj", conP_adj)
setattr(outTable, "conH_adj", conH_adj)
# tolerances
setattr(outTable, "epsP", epsP)
setattr(outTable, "epsH", epsH)
setkey(dat, OriginalSortingVariable)
# convergence
conP_converged <- sapply(seq_along(conP), function(i) {
epsP_current <- switch(is.list(epsP) + 1, epsP, epsP[[i]])
all(abs(conP[[i]] - conP_adj[[i]]) <= epsP_current * conP[[i]])
})
conH_converged <- sapply(seq_along(conH), function(i) {
epsH_current <- switch(is.list(epsH) + 1, epsH, epsH[[i]])
all(abs(conH[[i]] - conH_adj[[i]]) <= epsH_current * conH[[i]])
})
converged <- all(conP_converged) && all(conH_converged)
setattr(outTable, "converged", converged)
if (verbose) {
if (converged)
message("Convergence reached")
else
message("No convergence reached")
}
# add calibrated weights. Use setkey to make sure the indexes match
setkey(dat, OriginalSortingVariable)
if (!converged & returnNA) {
outTable[, c(variableKeepingTheCalibWeight) := NA]
} else {
outTable[, c(variableKeepingTheCalibWeight) :=
dat[[variableKeepingTheCalibWeight]]]
}
# formulas
setattr(outTable, "formP", formP)
setattr(outTable, "formH", formH)
# not used yet
#class(outTable) <- c("ipf", class(outTable))
invisible(outTable)
}
#' Iterative Proportional Fitting
#'
#' Adjust sampling weights to given totals based on household-level and/or
#' individual level constraints.
#'
#' This function implements the weighting procedure described
#' [here](https://doi.org/10.17713/ajs.v45i3.120).
#' Usage examples can be found in the corresponding vignette
#' (`vignette("ipf")`).
#'
#' `conP` and `conH` are contingency tables, which can be created with `xtabs`.
#' The `dimnames` of those tables should match the names and levels of the
#' corresponding columns in `dat`.
#'
#' `maxIter`, `epsP` and `epsH` are the stopping criteria. `epsP` and `epsH`
#' describe relative tolerances in the sense that
#' \deqn{1-epsP < \frac{w_{i+1}}{w_i} < 1+epsP}{1-epsP < w(i+1)/w(i) < 1+epsP}
#' will be used as convergence criterium. Here i is the iteration step and wi is
#' the weight of a specific person at step i.
#'
#' The algorithm
#' performs best if all varables occuring in the constraints (`conP` and `conH`)
#' as well as the household variable are coded as `factor`-columns in `dat`.
#' Otherwise, conversions will be necessary which can be monitored with the
#' `conversion_messages` argument. Setting `check_hh_vars` to `FALSE` can also
#' incease the performance of the scheme.
#'
#' @name ipf
#' @md
#' @aliases ipf
#' @param dat a `data.table` containing household ids (optionally), base
#' weights (optionally), household and/or personal level variables (numerical
#' or categorical) that should be fitted.
#' @param hid name of the column containing the household-ids within `dat` or
#' NULL if such a variable does not exist.
#' @param w name if the column containing the base weights within `dat` or NULL
#' if such a variable does not exist. In the latter case, every observation
#' in `dat` is assigned a starting weight of 1.
#' @param conP list or (partly) named list defining the constraints on person
#' level. The list elements are contingency tables in array representation
#' with dimnames corresponding to the names of the relevant calibration
#' variables in `dat`. If a numerical variable is to be calibrated, the
#' respective list element has to be named with the name of that numerical
#' variable. Otherwise the list element shoud NOT be named.
#' @param conH list or (partly) named list defining the constraints on
#' household level. The list elements are contingency tables in array
#' representation with dimnames corresponding to the names of the relevant
#' calibration variables in `dat`. If a numerical variable is to be
#' calibrated, the respective list element has to be named with the name of
#' that numerical variable. Otherwise the list element shoud NOT be named.
#' @param epsP numeric value or list (of numeric values and/or arrays)
#' specifying the convergence limit(s) for `conP`. The list can contain
#' numeric values and/or arrays which must appear in the same order as the
#' corresponding constraints in `conP`. Also, an array must have the same
#' dimensions and dimnames as the corresponding constraint in `conP`.
#' @param epsH numeric value or list (of numeric values and/or arrays)
#' specifying the convergence limit(s) for `conH`. The list can contain
#' numeric values and/or arrays which must appear in the same order as the
#' corresponding constraints in `conH`. Also, an array must have the same
#' dimensions and dimnames as the corresponding constraint in `conH`.
#' @param verbose if TRUE, some progress information will be printed.
#' @param bound numeric value specifying the multiplier for determining the
#' weight trimming boundary if the change of the base weights should be
#' restricted, i.e. if the weights should stay between 1/`bound`*`w`
#' and `bound`*\code{w}.
#' @param minMaxTrim numeric vector of length2, first element a minimum value
#' for weights to be trimmed to, second element a maximum value for weights to
#' be trimmed to.
#' @param maxIter numeric value specifying the maximum number of iterations
#' that should be performed.
#' @param meanHH if TRUE, every person in a household is assigned the mean of
#' the person weights corresponding to the household. If `"geometric"`, the
#' geometric mean is used rather than the arithmetic mean.
#' @param allPthenH if TRUE, all the person level calibration steps are
#' performed before the houshold level calibration steps (and `meanHH`, if
#' specified). If FALSE, the houshold level calibration steps (and `meanHH`,
#' if specified) are performed after everey person level calibration step.
#' This can lead to better convergence properties in certain cases but also
#' means that the total number of calibration steps is increased.
#' @param returnNA if TRUE, the calibrated weight will be set to NA in case of
#' no convergence.
#' @param looseH if FALSE, the actual constraints `conH` are used for
#' calibrating all the hh weights. If TRUE, only the weights for which the
#' lower and upper thresholds defined by `conH` and `epsH` are exceeded are
#' calibrated. They are however not calibrated against the actual constraints
#' `conH` but against these lower and upper thresholds, i.e.
#' `conH`-`conH`*`epsH` and `conH`+`conH`*\code{epsH}.
#' @param numericalWeighting See [numericalWeighting]
#' @param check_hh_vars If `TRUE` check for non-unique values inside of a
#' household for variables in household constraints
#' @param conversion_messages show a message, if inputs need to be reformatted.
#' This can be useful for speed optimizations if ipf is called several times
#' with similar inputs (for example bootstrapping)
#' @param nameCalibWeight character defining the name of the variable for the
#' newly generated calibrated weight.
#' @return The function will return the input data `dat` with the calibrated
#' weights `calibWeight` as an additional column as well as attributes. If no
#' convergence has been reached in `maxIter` steps, and `returnNA` is `TRUE`
#' (the default), the column `calibWeights` will only consist of `NA`s. The
#' attributes of the table are attributes derived from the `data.table` class
#' as well as the following.
#' \tabular{ll}{
#' `converged` \tab Did the algorithm converge in `maxIter` steps? \cr
#' `iterations` \tab The number of iterations performed. \cr
#' `conP`, `conH`, `epsP`, `epsH` \tab See Arguments. \cr
#' `conP_adj`, `conH_adj` \tab Adjusted versions of `conP` and `conH` \cr
#' `formP`, `formH` \tab Formulas that were used to calculate `conP_adj` and
#' `conH_adj` based on the output table.
#' }
#' @export ipf
#' @author Alexander Kowarik, Gregor de Cillia
#' @examples
#' \dontrun{
#'
#' # load data
#' eusilc <- demo.eusilc(n = 1, prettyNames = TRUE)
#'
#' # personal constraints
#' conP1 <- xtabs(pWeight ~ age, data = eusilc)
#' conP2 <- xtabs(pWeight ~ gender + region, data = eusilc)
#' conP3 <- xtabs(pWeight*eqIncome ~ gender, data = eusilc)
#'
#' # household constraints
#' conH1 <- xtabs(pWeight ~ hsize + region, data = eusilc)
#'
#' # simple usage ------------------------------------------
#'
#' calibweights1 <- ipf(
#' eusilc,
#' conP = list(conP1, conP2, eqIncome = conP3),
#' bound = NULL,
#' verbose = TRUE
#' )
#'
#' # compare personal weight with the calibweigth
#' calibweights1[, .(hid, pWeight, calibWeight)]
#'
#' # advanced usage ----------------------------------------
#'
#' # use an array of tolerances
#' epsH1 <- conH1
#' epsH1[1:4, ] <- 0.005
#' epsH1[5, ] <- 0.2
#'
#' # create an initial weight for the calibration
#' eusilc[, regSamp := .N, by = region]
#' eusilc[, regPop := sum(pWeight), by = region]
#' eusilc[, baseWeight := regPop/regSamp]
#'
# calibrate
#' calibweights2 <- ipf(
#' eusilc,
#' conP = list(conP1, conP2),
#' conH = list(conH1),
#' epsP = 1e-6,
#' epsH = list(epsH1),
#' bound = 4,
#' w = "baseWeight",
#' verbose = TRUE
#' )
#'
#' # show an adjusted version of conP and the original
#' attr(calibweights2, "conP_adj")
#' attr(calibweights2, "conP")
#' }
ipf <- function(
dat, hid = NULL, conP = NULL, conH = NULL, epsP = 1e-6, epsH = 1e-2,
verbose = FALSE, w = NULL, bound = 4, maxIter = 200, meanHH = TRUE,
allPthenH = TRUE, returnNA = TRUE, looseH = FALSE, numericalWeighting =
computeLinear, check_hh_vars = TRUE, conversion_messages = FALSE,
nameCalibWeight = "calibWeight", minMaxTrim = NULL) {
check_population_totals(conP, dat, "personal")
check_population_totals(conH, dat, "household")
check_eps(conP, epsP, type = "personal")
check_eps(conH, epsH, type = "household")
variableKeepingTheBaseWeight <- w
variableKeepingTheCalibWeight <- nameCalibWeight
if ("variableKeepingTheBaseWeight" %in% names(dat))
stop("The provided dataset must not have a column called",
" 'variableKeepingTheBaseWeight'")
if(!is.null(minMaxTrim)){
if(length(minMaxTrim)!=2)
stop("minMaxTrim must have exactly 2 elements, a minimum and a maximum.")
if(!is.numeric(minMaxTrim)){
stop("minMaxTrim must be a numeric vector of length two.")
}
if(minMaxTrim[2]<minMaxTrim[1]){
stop("minMaxTrim must have a minimum as a first element and a maximum as a second element.
But in the input the second element is smaller than the first.")
}
}
OriginalSortingVariable <- V1 <- epsvalue <-
f <- temporary_hvar <-
value <- wValue <- representativeHouseholdForCalibration <- ..hid <- NULL
dat_original <- dat
dat <- copy(dat)
## originalsorting is fucked up without this
dat[, OriginalSortingVariable := .I]
meanfun <- getMeanFun(meanHH)
# dat sollte ein data.table sein
# w ein Name eines Basisgewichts oder NULL
valueP <- paste0("valueP", seq_along(conP))
###fixed target value, should not be changed in iterations
valueH <- paste0("valueH", seq_along(conH))
###Housekeeping of the varNames used
usedVarNames <- c(valueP, valueH, "value",
"representativeHouseholdForCalibration", "wValue")
if (any(names(dat) %in% usedVarNames)) {
renameVars <- names(dat)[names(dat) %in% usedVarNames]
setnames(dat, renameVars, paste0(renameVars, "_safekeeping"))
}
### Treatment of HID, creating 0,1 var for being the first hh member
#delVars <- c()
if (is.null(hid)) {
#delVars <- c("hid")
hid <- "hid"
dat[, hid := as.factor(seq_len(nrow(dat)))]
dat[, representativeHouseholdForCalibration := 1]
} else {
if(!hid%in%colnames(dat)){
stop("dat does not contain column ",hid)
}
if(any(is.na(dat[[hid]]))){
stop("hid contains missing values")
}
if (!is.factor(dat[[hid]]))
data.table::set(dat, NULL, hid, as.factor(dat[[hid]]))
dat[, representativeHouseholdForCalibration :=
as.numeric(!duplicated(get(..hid)))]
}
## Names of the calibration variables for Person and household dimension
pColNames <- lapply(conP, function(x) names(dimnames(x)))
hColNames <- lapply(conH, function(x) names(dimnames(x)))
for (i in seq_along(conP)) {
current_colnames <- pColNames[[i]]
for (colname in current_colnames) {
if (!inherits(dat[[colname]], "factor")) {
if (conversion_messages)
message("converting column ", colname, " to factor")
set(
dat, j = colname,
value = factor(dat[[colname]],
levels = dimnames(conP[[i]])[[colname]])
)
}
else if (!identical(levels(dat[[colname]]),
dimnames(conP[[i]])[[colname]])) {
if (conversion_messages)
message("correct levels of column ", colname)
set(
dat, j = colname, value = factor(
dat[[colname]], levels = dimnames(conP[[i]])[[colname]])
)
}
}
combined_factors <- combine_factors(dat, conP[[i]])
set(dat, j = paste0("combined_factors_", i), value = combined_factors)
set(dat, j = paste0("valueP", i),
value = as.vector(conP[[i]][combined_factors]))
}
for (i in seq_along(conH)) {
colnames <- hColNames[[i]]
## make sure the columns mentioned in the contingency table are in fact
## factors
for (colname in colnames) {
if (!inherits(dat[[colname]], "factor")) {
if (conversion_messages)
message("converting column ", colname, " to factor")
set(
dat, j = colname, value = factor(
dat[[colname]], levels = dimnames(conH[[i]])[[colname]])
)
}
else if (!identical(levels(dat[[colname]]),
dimnames(conH[[i]])[[colname]])) {
if (conversion_messages)
message("correct levels of column ", colname)
set(
dat, j = colname, value = factor(
dat[[colname]], levels = dimnames(conH[[i]])[[colname]])
)
}
}
combined_factors <- combine_factors(dat, conH[[i]])
set(dat, j = paste0("combined_factors_h_", i), value = combined_factors)
set(dat, j = paste0("valueH", i),
value = as.vector(conH[[i]][combined_factors]))
}
if (is.null(variableKeepingTheBaseWeight)) {
if (!is.null(bound) && is.null(w))
stop("Bounds are only reasonable if base weights are provided")
set(dat, j = variableKeepingTheCalibWeight, value = 1)
} else {
set(dat, j = variableKeepingTheCalibWeight,
value = dat[[variableKeepingTheBaseWeight]])
}
if (check_hh_vars) {
## Check for non-unqiue values inside of a household for variabels used
## in Household constraints
for (hh in hColNames) {
for (h in hh) {
setnames(dat, h, "temporary_hvar")
if (dat[, length(unique(temporary_hvar)),
by = c(hid)][, any(V1 != 1)]) {
stop(paste(h, "has different values inside a household"))
}
setnames(dat, "temporary_hvar", h)
}
}
}
if (is.list(epsP)) {
for (i in seq_along(epsP)) {
if (is.array(epsP[[i]])) {
combined_factors <- dat[[paste0("combined_factors_", i)]]
set(dat, j = paste0("epsP_", i),
value = as.vector(epsP[[i]][combined_factors]))
} else {
set(dat, j = paste0("epsP_", i), value = epsP[[i]])
}
}
} else {
for (i in seq_along(conP)) {
set(dat, j = paste0("epsP_", i), value = epsP)
}
}
if (is.list(epsH)) {
for (i in seq_along(epsH)) {
if (is.array(epsH[[i]])) {
combined_factors <- dat[[paste0("combined_factors_h_", i)]]
set(dat, j = paste0("epsH_", i),
value = as.vector(epsH[[i]][combined_factors]))
} else {
set(dat, j = paste0("epsH_", i), value = epsH[[i]])
}
}
} else {
for (i in seq_along(conH)) {
set(dat, j = paste0("epsH_", i), value = epsH)
}
}
###Calib
error <- TRUE
calIter <- 1
while (error && calIter <= maxIter) {
error <- FALSE
if (allPthenH) {
### Person calib
for (i in seq_along(conP)) {
numericalWeightingTmp <- NULL
if (isTRUE(names(conP)[i] != "")) {
numericalWeightingTmp <- names(conP)[i]
}
error <- calibP(
i = i, dat = dat, error = error, valueP = valueP,
pColNames = pColNames, bound = bound, verbose = verbose,
calIter = calIter, numericalWeighting = numericalWeighting,
numericalWeightingVar = numericalWeightingTmp,
w = variableKeepingTheBaseWeight,
cw = variableKeepingTheCalibWeight, minMaxTrim = minMaxTrim)
}
## replace person weight with household average
set(dat, j = variableKeepingTheCalibWeight,
value = meanfun(dat[[variableKeepingTheCalibWeight]], dat[[hid]]))
### Household calib
for (i in seq_along(conH)) {
numericalWeightingTmp <- NULL
if (isTRUE(names(conH)[i] != "")) {
numericalWeightingTmp <- names(conH)[i]
}
error <- calibH(
i = i, dat = dat, error = error, valueH = valueH,
hColNames = hColNames, bound = bound, verbose = verbose,
calIter = calIter, looseH = looseH,
numericalWeighting = numericalWeighting,
numericalWeightingVar = numericalWeightingTmp,
w = variableKeepingTheBaseWeight,
cw = variableKeepingTheCalibWeight, minMaxTrim = minMaxTrim)
}
} else {
### Person calib
for (i in seq_along(conP)) {
numericalWeightingTmp <- NULL
if (isTRUE(names(conP)[i] != "")) {
numericalWeightingTmp <- names(conP)[i]
}
error <- calibP(
i = i, dat = dat, error = error, valueP = valueP,
pColNames = pColNames, bound = bound, verbose = verbose,
calIter = calIter, numericalWeighting = numericalWeighting,
numericalWeightingVar = numericalWeightingTmp,
w = variableKeepingTheBaseWeight,
cw = variableKeepingTheCalibWeight, minMaxTrim = minMaxTrim)
## replace person weight with household average
set(dat, j = variableKeepingTheCalibWeight,
value = meanfun(dat[[variableKeepingTheCalibWeight]], dat[[hid]]))
### Household calib
for (i in seq_along(conH)) {
numericalWeightingTmp <- NULL
if (isTRUE(names(conH)[i] != "")) {
numericalWeightingTmp <- numericalWeighting
}
error <- calibH(
i = i, dat = dat, error = error, valueH = valueH,
hColNames = hColNames, bound = bound, verbose = verbose,
calIter = calIter, numericalWeighting = numericalWeighting,
numericalWeightingVar = numericalWeightingTmp, looseH = looseH,
w = variableKeepingTheBaseWeight,
cw = variableKeepingTheCalibWeight, minMaxTrim = minMaxTrim)
}
}
}
if (verbose && !error) {
message("Iteration stopped after ", calIter, " steps")
} else if (maxIter == calIter) {
warning("Not converged in ", maxIter, " steps")
}
calIter <- calIter + 1
}
# Remove Help Variables
fVariableForCalibrationIPF <- NULL
dat[, fVariableForCalibrationIPF := NULL]
addWeightsAndAttributes(dat, conP, conH, epsP, epsH, dat_original, maxIter,
calIter, returnNA, variableKeepingTheCalibWeight,
verbose)
}