Skip to content

Commit

Permalink
this fixes #3
Browse files Browse the repository at this point in the history
  • Loading branch information
alexkowa committed Nov 22, 2019
1 parent e217a1d commit 87910d1
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 51 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ URL: https://github.com/statistikat/simPop
Depends:
R(>= 3.0.0),
lattice,
vcd,
data.table
vcd
Imports:
data.table,
laeken,
plyr,
MASS,
Expand All @@ -27,7 +27,7 @@ Imports:
doParallel,
foreach,
colorspace,
VIM,
VIM,matrixStats,
methods,party,EnvStats,fitdistrplus,ranger,wrswoR
Suggests:
haven,
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ importFrom(lattice,panel.bwplot)
importFrom(lattice,panel.points)
importFrom(lattice,panel.refline)
importFrom(lattice,panel.xyplot)
importFrom(matrixStats,colMeans2)
importFrom(matrixStats,colSds)
importFrom(plyr,revalue)
importFrom(ranger,ranger)
importFrom(stats,as.formula)
Expand All @@ -159,6 +161,7 @@ importFrom(stats,rexp)
importFrom(stats,rnorm)
importFrom(stats,runif)
importFrom(stats,sd)
importFrom(stats,terms)
importFrom(stats,uniroot)
importFrom(stats,var)
importFrom(stats,weighted.mean)
Expand Down
3 changes: 2 additions & 1 deletion R/0classes.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ setClassUnion('listOrNULL', c('list', 'NULL'))
#' @importFrom colorspace heat_hcl
#' @importFrom VIM hotdeck
#' @importFrom graphics par
#' @importFrom stats as.formula chisq.test coef cor cov dlogis formula lm mad quantile rexp na.omit
#' @importFrom stats as.formula chisq.test coef cor cov dlogis formula lm mad quantile rexp na.omit terms
#' @importFrom utils read.csv2 tail head
#' @importFrom stats median model.matrix optim plogis ppoints predict rnorm runif uniroot var weighted.mean glm poisson xtabs
# removed stats quantilerexp
Expand All @@ -32,6 +32,7 @@ setClassUnion('listOrNULL', c('list', 'NULL'))
#' @importFrom fitdistrplus fitdist
#' @importFrom ranger ranger
#' @importFrom RcppArmadillo armadillo_version
#' @importFrom matrixStats colSds colMeans2
NULL


Expand Down
20 changes: 11 additions & 9 deletions R/calibPop.R
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ calcFinalWeights <- function(data0, totals0, params) {
subsetList <- function(totals,split,x){

totals <- lapply(totals,function(dt){
dt.x <- dt[.(x),,on=c(split)]
dt.x <- dt[list(x),,on=c(split)]
dt.x[[split]]<-NULL
return(dt.x)
})
Expand Down Expand Up @@ -99,7 +99,7 @@ checkTables <- function(tabs,namesData,split=NULL,verbose=FALSE,namesTabs="pers"
makeFactor <- keepNames[keepNames!="Freq"]
z[,c(makeFactor):=lapply(.SD,factor),.SDcols=c(makeFactor)]

z[,..keepNames]
z[, keepNames, with=FALSE]

})

Expand Down Expand Up @@ -238,6 +238,8 @@ makeFactors <- function(totals,dat,split){
#' @param observe.times Only used if \code{memory=TRUE}. Number of times the new value of the objective function is saved. If \code{observe.times=0} values are not saved.
#' @param observe.break Only used if \code{memory=TRUE}. When objective value has been saved \code{observe.times}-times the coefficient of variation is calculated over saved values; if the coefficient of variation falls below \code{observe.break}
#' simmulated annealing terminates. This repeats for each new set of \code{observe.times} new values of the objecive function. Can help save run time if objective value does not improve much. Disable this termination by either setting \code{observe.times=0} or \code{observe.break=0}.
#' @param hhTables information on population margins for households
#' @param persTables information on population margins for persons
#' @return Returns an object of class \code{\linkS4class{simPopObj}} with an
#' updated population listed in slot 'pop'.
#' @author Bernhard Meindl, Johannes Gussenbauer and Matthias Templ
Expand Down Expand Up @@ -292,7 +294,7 @@ calibPop <- function(inp, split=NULL, splitUpper=NULL, temp = 1, epsP.factor = 0
warning("only first variable will be used to divide the population into strata")
}

hid_help <- doub <- new.weights <- temporaryhid <- x <- NULL
weight_choose <- hid_help <- doub <- new.weights <- temporaryhid <- x <- NULL
data <- popData(inp)
hid <- popObj(inp)@hhid
pid <- popObj(inp)@pid
Expand Down Expand Up @@ -380,8 +382,8 @@ calibPop <- function(inp, split=NULL, splitUpper=NULL, temp = 1, epsP.factor = 0

final_weights <- foreach(x=1:length(split.number), .options.snow=list(preschedule=TRUE)) %dopar% {
split.x <- split.number[x]
splitUpper.x <- data[.(split.x),,on=c(split)][[splitUpper]][1]
data0 <- data[.(splitUpper.x),,on=.(upazilaCode)]
splitUpper.x <- data[list(split.x),,on=c(split)][[splitUpper]][1]
data0 <- data[list(splitUpper.x),,on=c(split)]
totals0 <- subsetList(totals,split=split,x=split.x)
rm(inp)
simAnnealingDT(
Expand All @@ -395,8 +397,8 @@ calibPop <- function(inp, split=NULL, splitUpper=NULL, temp = 1, epsP.factor = 0
}else if ( !have_win ) {# linux/mac
final_weights <- mclapply(1:length(split.number), function(x) {
split.x <- split.number[x]
splitUpper.x <- data[.(split.x),,on=c(split)][[splitUpper]][1]
data0 <- data[.(splitUpper.x),,on=.(upazilaCode)]
splitUpper.x <- data[list(split.x),,on=c(split)][[splitUpper]][1]
data0 <- data[list(splitUpper.x),,on=c(split)]
totals0 <- subsetList(totals,split=split,x=split.x)
rm(inp)
simAnnealingDT(
Expand All @@ -410,8 +412,8 @@ calibPop <- function(inp, split=NULL, splitUpper=NULL, temp = 1, epsP.factor = 0
} else {
final_weights <- lapply(1:length(split.number), function(x) {
split.level <- split.number[x]
splitUpper.x <- data[.(split.level),,on=c(split)][[splitUpper]][1]
data0 <- data[.(splitUpper.x),,on=.(upazilaCode)]
splitUpper.x <- data[list(split.level),,on=c(split)][[splitUpper]][1]
data0 <- data[list(splitUpper.x),,on=c(split)]
totals0 <- subsetList(totals,split=split,x=split.level)
simAnnealingDT(
data0=data0,
Expand Down
26 changes: 13 additions & 13 deletions R/simAnnealingDT.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,17 @@ checkObjective <- function(totals0,epsH,epsP,epsMinN=0){

# update totals0 with population totals
updateTotals <- function(totals0,data0,hhid,numberPop="weight_choose"){
firstPersonInHousehold <- NULL
for(i in 1:length(totals0)){

byVars <- intersect(colnames(totals0[[i]]),colnames(data0))
if(grepl("pers",names(totals0[i]))){
totals0[[i]] <- merge(totals0[[i]][,mget(c(byVars,"Freq"))],
data0[,.(FreqPop=sum(get(numberPop))),by=c(byVars)],
data0[,list(FreqPop=sum(get(numberPop))),by=c(byVars)],
by=c(byVars))
}else{
totals0[[i]] <- merge(totals0[[i]][,mget(c(byVars,"Freq"))],
data0[firstPersonInHousehold==TRUE,.(FreqPop=sum(get(numberPop))),by=c(byVars)],
data0[firstPersonInHousehold==TRUE,list(FreqPop=sum(get(numberPop))),by=c(byVars)],
by=c(byVars))
}
}
Expand All @@ -48,7 +49,7 @@ updateTotals <- function(totals0,data0,hhid,numberPop="weight_choose"){
# calc objective function
# absolute sum of number of people/households in population - number of people/household in contingenca table
calcObjective <- function(totals0){

Freq <- FreqPop <- NULL
objective <- sapply(totals0,function(z){
z[,sum(abs(FreqPop-Freq))]
})
Expand All @@ -72,7 +73,7 @@ setRedraw <- function(objective,med_hh=1,fac=2/3){

# set redraw gap
setRedrawGap <- function(totals0,med_hh=1,scale.redraw=0.5){

Freq <- FreqPop <- NULL
med_hh_help <- rep(1,length(totals0))
med_hh_help[grepl("pers",names(totals0))] <- med_hh

Expand All @@ -88,7 +89,7 @@ setRedrawGap <- function(totals0,med_hh=1,scale.redraw=0.5){

# get probabilites for resampling
getProbabilities <- function(totals0,data0,select_add,select_remove,med_hh){

helpMergeIndex <- Freq <- FreqPop <- NULL
# cat("prep totals_diff\n")
totals_diff <- copy(totals0)
totals_diff_merged <- NULL
Expand Down Expand Up @@ -201,7 +202,7 @@ simAnnealingDT <- function(data0,totals0,params,sizefactor=2,
######################################
# initialize weights
data0[,weight_choose:=0]
data0[.(split.level),weight_choose:=1,on=c(split)]
data0[list(split.level),weight_choose:=1,on=c(split)]
init_index <- data0[weight_choose==1,which=TRUE]
init_weight <- rep(0L,max_n)
init_weight[init_index] <- 1
Expand Down Expand Up @@ -237,7 +238,7 @@ simAnnealingDT <- function(data0,totals0,params,sizefactor=2,
if ( checkObjective(totals0,epsH,epsP,epsMinN) ) {
setkeyv(data0,"sim_ID")
selectVars <- c(hhid,params[["pid"]],"weight_choose")
out <- data0[,..selectVars]
out <- data0[, selectVars, with = FALSE]
cat(paste0("Convergence successfull for ",split," ",split.level),"\n")
} else {

Expand Down Expand Up @@ -300,13 +301,12 @@ simAnnealingDT <- function(data0,totals0,params,sizefactor=2,
}

add_index <- add_hh%%nrow(data0)+1
data0[add_index,.N,by=htype]
####################################
## create new composition
init_weight_new <- copy(init_weight)

init_weight_new <- simPop:::updateVecC(init_weight_new,add_index=add_hh, remove_index=remove_hh, hhsize=size, hhid=id, sizefactor=size_all)
data0[ ,weight_choose_new:=simPop:::sumVec(init_weight_new,size_all)]
init_weight_new <- updateVecC(init_weight_new,add_index=add_hh, remove_index=remove_hh, hhsize=size, hhid=id, sizefactor=size_all)
data0[ ,weight_choose_new:=sumVec(init_weight_new,size_all)]

######################################
# calculate objective
Expand Down Expand Up @@ -368,7 +368,7 @@ simAnnealingDT <- function(data0,totals0,params,sizefactor=2,
if(do.observe){
observe.count <- observe.count +1
if(observe.count>=observe.times){
breakCond <- matrixStats::colSds(observe.obj)/matrixStats::colMeans2(observe.obj)
breakCond <- colSds(observe.obj)/colMeans2(observe.obj)
breakCond <- max(breakCond)
if(breakCond< observe.break){
break # if objective doesnt move anymore break up loop
Expand Down Expand Up @@ -427,7 +427,7 @@ simAnnealingDT <- function(data0,totals0,params,sizefactor=2,
if(do.observe){
observe.count <- observe.count +1
if(observe.count>=observe.times){
breakCond <- matrixStats::colSds(observe.obj)/matrixStats::colMeans2(observe.obj)
breakCond <- colSds(observe.obj)/colMeans2(observe.obj)
breakCond <- max(breakCond)
if(breakCond< observe.break){
break # if objective doesnt move anymore break up loop
Expand Down Expand Up @@ -468,7 +468,7 @@ simAnnealingDT <- function(data0,totals0,params,sizefactor=2,
}
setkeyv(data0,"sim_ID")
selectVars <- c(hhid,params[["pid"]],"weight_choose")
out <- data0[,..selectVars]
out <- data0[, selectVars, with = FALSE]
}

out[,c(split):=split.level]
Expand Down
35 changes: 18 additions & 17 deletions R/simRelation.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ simulateValues <- function(dataSample, dataPop, params) {
if ( !nrow(dataSample) ) {
return(character())
}
predNames <- NULL
meth <- params$method
cur.var <- params$cur.var
hasNewLevels <- params$hasNewLevels
Expand Down Expand Up @@ -110,9 +111,9 @@ simulateValues <- function(dataSample, dataPop, params) {
if ( length(ind) == 1 ) {
resample <- function(k, n, p) rep.int(1, n[k])
} else if ( length(ind) == 2 ) {
resample <- function(k, n, p) simPop:::spSample(n[k], c(1-p[k],p[k]))
resample <- function(k, n, p) spSample(n[k], c(1-p[k],p[k]))
} else {
resample <- function(k, n, p) simPop:::spSample(n[k], p[k,])
resample <- function(k, n, p) spSample(n[k], p[k,])
}
# generate realizations for each combination
if ( length(exclude) == 0 ) {
Expand All @@ -134,7 +135,7 @@ simulateValues <- function(dataSample, dataPop, params) {
# anyway) and replace the values of non-directly related
# in the third step
dataPopHID <- dataPop[[hid]]
sim <- simHead[.(dataPopHID),,on=c(hid)][[cur.var]]
sim <- simHead[list(dataPopHID),,on=c(hid)][[cur.var]]
rm(simHead)

# third step: simulate category of non-directly related
Expand All @@ -152,7 +153,7 @@ simulateValues <- function(dataSample, dataPop, params) {
hidSample <- dataSample[[hid]]
names(add) <- hidSample[indSampleHead]
add <- add[as.character(hidSample)]
iHead <- simPop:::getHeadName(cur.var)
iHead <- getHeadName(cur.var)
dataSample[, iHead] <- add
# limit sample and population data to non-directly related household members
dataSampleWork <- dataSample[indSampleNonDirect,]
Expand Down Expand Up @@ -382,7 +383,7 @@ simRelation <- function(simPopObj, relation = "relate", head = "head",
method=c("multinom", "ctree","cforest","ranger"),
by = "strata") {

x <- newAdditionalVarible <- NULL
V1 <- x <- newAdditionalVarible <- NULL

method <- match.arg(method)

Expand Down Expand Up @@ -411,12 +412,12 @@ simRelation <- function(simPopObj, relation = "relate", head = "head",
if(length(problematicHHsample)>0){
warning("Sample:There are households without a head, a random head will be asigned.")
setkeyv(data_sample,hid)
data_sample[.(problematicHHsample),c(relation):=makeOneRandom(get(relation),head), by=c(hid)]
data_sample[list(problematicHHsample),c(relation):=makeOneRandom(get(relation),head), by=c(hid)]
}
if(length(problematicHHpop)>0){
warning("Population:There are households without a head, a random head will be asigned.")
setkeyv(data_pop,hid)
data_pop[.(problematicHHpop),c(relation):=makeOneRandom(get(relation),head), by=c(hid)]
data_pop[list(problematicHHpop),c(relation):=makeOneRandom(get(relation),head), by=c(hid)]
}


Expand All @@ -441,7 +442,7 @@ simRelation <- function(simPopObj, relation = "relate", head = "head",
# }

nr_strata <- length(levels(data_sample[[curStrata]]))
pp <- simPop:::parallelParameters(nr_cpus=nr_cpus, nr_strata=nr_strata)
pp <- parallelParameters(nr_cpus=nr_cpus, nr_strata=nr_strata)

parallel <- pp$parallel
nr_cores <- pp$nr_cores
Expand Down Expand Up @@ -525,27 +526,27 @@ simRelation <- function(simPopObj, relation = "relate", head = "head",
}else{
curRegModel <- regModel
}
regInput <- simPop:::regressionInput(simPopObj, additional=additional[counter], regModel=curRegModel)
regInput <- regressionInput(simPopObj, additional=additional[counter], regModel=curRegModel)
# names of predictor variables
predNames <- setdiff(regInput[[1]]$predNames, c(dataS@hhsize, curStrata, relation))

# observations with missings are excluded from simulation
exclude <- simPop:::getExclude.data.table(data_sample[,c(additional,predNames),with=FALSE])
exclude <- getExclude.data.table(data_sample[,c(additional,predNames),with=FALSE])
if ( length(exclude) > 0 ) {
sampWork <- data_sample[-exclude,]
} else {
sampWork <- data_sample
}

# variables are coerced to factors
sampWork <- simPop:::checkFactor(sampWork, unique(c(curStrata, additional)))
data_pop <- simPop:::checkFactor(data_pop_o, unique(c(curStrata)))
sampWork <- checkFactor(sampWork, unique(c(curStrata, additional)))
data_pop <- checkFactor(data_pop_o, unique(c(curStrata)))
# components of multinomial model are specified
levelsResponse <- levels(sampWork[[i]])
# simulation of variables using a sequence of multinomial models
if ( method == "multinom" ) {
formula.cmd <- paste(i, "~", paste(predNames, collapse = " + "))
formula.cmd_nd <- paste(i, "~", paste(c(predNames,simPop:::getHeadName(i)), collapse = " + "))
formula.cmd_nd <- paste(i, "~", paste(c(predNames,getHeadName(i)), collapse = " + "))
formula.cmd <- paste0("suppressWarnings(multinom(", formula.cmd)
formula.cmd_nd <- paste0("suppressWarnings(multinom(", formula.cmd_nd)
if(!dataS@ispopulation){
Expand All @@ -562,7 +563,7 @@ simRelation <- function(simPopObj, relation = "relate", head = "head",
}else if ( method == "ctree" ) {
# simulation via recursive partitioning and regression trees
formula.cmd <- paste(i, "~", paste(predNames, collapse = " + "))
formula.cmd_nd <- paste(i, "~", paste(c(predNames,simPop:::getHeadName(i)), collapse = " + "))
formula.cmd_nd <- paste(i, "~", paste(c(predNames,getHeadName(i)), collapse = " + "))
formula.cmd <- paste("suppressWarnings(ctree(", formula.cmd)
formula.cmd_nd <- paste("suppressWarnings(ctree(", formula.cmd_nd)
if(!dataS@ispopulation){
Expand All @@ -578,7 +579,7 @@ simRelation <- function(simPopObj, relation = "relate", head = "head",
}else if ( method == "cforest" ) {
# simulation via random forest
formula.cmd <- paste(i, "~", paste(predNames, collapse = " + "))
formula.cmd_nd <- paste(i, "~", paste(c(predNames,simPop:::getHeadName(i)), collapse = " + "))
formula.cmd_nd <- paste(i, "~", paste(c(predNames,getHeadName(i)), collapse = " + "))
formula.cmd <- paste("suppressWarnings(cforest(", formula.cmd)
formula.cmd_nd <- paste("suppressWarnings(cforest(", formula.cmd_nd)
if(!dataS@ispopulation){
Expand All @@ -592,7 +593,7 @@ simRelation <- function(simPopObj, relation = "relate", head = "head",
}else if ( method == "ranger" ) {
# simulation via random forest
formula.cmd <- paste(i, "~", paste(predNames, collapse = " + "))
formula.cmd_nd <- paste(i, "~", paste(c(predNames,simPop:::getHeadName(i)), collapse = " + "))
formula.cmd_nd <- paste(i, "~", paste(c(predNames,getHeadName(i)), collapse = " + "))
formula.cmd <- paste("suppressWarnings(ranger(", formula.cmd)
formula.cmd_nd <- paste("suppressWarnings(ranger(", formula.cmd_nd)
if(!dataS@ispopulation){
Expand Down Expand Up @@ -662,7 +663,7 @@ simRelation <- function(simPopObj, relation = "relate", head = "head",

values <- lapply(levels(data_sample[[curStrata]]), function(x) {
simulateValues(
dataSample=data_sample[.(x),,on=c(curStrata)],
dataSample=data_sample[list(x),,on=c(curStrata)],
dataPop=data_pop[indStrata[[x]], c(predNames, simPopObj@pop@hhid, relation), with=FALSE], params
)
})
Expand Down

0 comments on commit 87910d1

Please sign in to comment.