Skip to content

Commit

Permalink
phylofactorization
Browse files Browse the repository at this point in the history
  • Loading branch information
reptalex committed Jul 7, 2018
1 parent 3ab245e commit e6837da
Show file tree
Hide file tree
Showing 13 changed files with 60 additions and 118 deletions.
1 change: 0 additions & 1 deletion DESCRIPTION
Expand Up @@ -22,7 +22,6 @@ Imports:
grDevices(>= 3.2.2),
caper (>= 0.5.2),
Biostrings (>= 2.38.4),
biglm (>= 0.9.1),
mgcv (>= 1.8-16),
ggplot2 (>= 2.2.1),
ggtree (>= 1.8.2),
Expand Down
53 changes: 15 additions & 38 deletions R/PhyloFactor.R
Expand Up @@ -9,8 +9,6 @@
#' @param contrast.fcn Contrast function. Default is an efficient version of \code{BalanceContrast}. Another built-in option is \code{\link{amalgamate}} - for amalgamation-based analyses of compositional data, set \code{transform.fcn=I} and \code{contrast.fcn=amalgamate}.
#' @param method Which default objective function to use either "glm", "max.var" or "gam".
#' @param nfactors Number of clades or factors to produce in phylofactorization. Default, NULL, will iterate phylofactorization until either dim(Data)[1]-1 factors, or until stop.fcn returns T
#' @param quiet Logical, default is \code{FALSE}, indicating whether or not to display standard warnings.
#' @param trust.me Logical, default \code{FALSE}, indicating whether or not to trust the input Data to be compositional with no zeros.
#' @param small.output Logical, indicating whether or not to trim output. If \code{TRUE}, output may not work with downstream summary and plotting wrappers.
#' @param stop.fcn Currently, accepts input of 'KS'. Coming soon: input your own function of the environment in phylofactor to determine when to stop.
#' @param stop.early Logical indicating if stop.fcn should be evaluated before (stop.early=T) or after (stop.early=F) choosing an edge maximizing the objective function.
Expand All @@ -19,7 +17,6 @@
#' @param ncores Number of cores for built-in parallelization of phylofactorization. Parallelizes the extraction of groups, amalgamation of data based on groups, regression, and calculation of objective function. Be warned - this can lead to R taking over a system's memory.
#' @param tolerance Tolerance for deviation of column sums of data from 1. if abs(colSums(Data)-1)>tolerance, a warning message will be displayed.
#' @param delta Numerical value for replacement of zeros. Default is 0.65, so zeros will be replaced with 0.65*min(Data[Data>0])
#' @param smallglm Logical allowing use of \code{bigglm} when \code{ncores} is not \code{NULL}. If \code{TRUE}, will use regular \code{glm()} at base of regression. If \code{FALSE}, will use slower but memory-efficient \code{bigglm}. Default is false.
#' @param choice.fcn Function for customized choice function. Must take as input the numeric vector of ilr coefficients \code{y}, the input meta-data/independent-variable \code{X}, and a logical \code{PF.output}. If \code{PF.output==F}, the output of \code{choice.fcn} must be a two-member list containing numerics \code{output$objective} and \code{output$stopStatistic}. Phylofactor will choose the edge which maximizes \code{output$objective} and a customzed input \code{stop.fcn} can be used with the \code{output$stopStatistics} to stop phylofactor internally.
#' @param cluster.depends Character parsed and evaluated by cluster to load all dependencies for custom choice.fcn. e.g. \code{cluster.depends <- 'library(bayesm)'}
#' @param ... optional input arguments for \code{\link{glm}} or, if \code{method=='gam'}, input for \code{nlme::gam}
Expand Down Expand Up @@ -315,7 +312,7 @@
#' #The optimal environment for this simulated organism is mu=-1
#' c('sigma'=sigma,'sigma.hat'=sigma.hat) #The standard deviation is ~0.9.

PhyloFactor <- function(Data,tree,X=NULL,frmla = Data~X,choice='var',transform.fcn=log,contrast.fcn=NULL,method='glm',nfactors=NULL,quiet=T,trust.me=F,small.output=F,stop.fcn=NULL,stop.early=NULL,KS.Pthreshold=0.01,alternative='greater',ncores=NULL,tolerance=1e-10,delta=0.65,smallglm=T,choice.fcn=NULL,cluster.depends='',...){
PhyloFactor <- function(Data,tree,X=NULL,frmla = Data~X,choice='var',transform.fcn=log,contrast.fcn=NULL,method='glm',nfactors=NULL,small.output=F,stop.fcn=NULL,stop.early=NULL,KS.Pthreshold=0.01,alternative='greater',ncores=NULL,tolerance=1e-10,delta=0.65,choice.fcn=NULL,cluster.depends='',...){


######################################################## Housekeeping #################################################################################
Expand All @@ -332,11 +329,9 @@ PhyloFactor <- function(Data,tree,X=NULL,frmla = Data~X,choice='var',transform.f
}
}
}
if (all(rownames(Data) %in% tree$tip.label)==F){stop('some rownames of Data are not found in tree')}
if (all(tree$tip.label %in% rownames(Data))==F){
if(!quiet){
warning('some tips in tree are not found in dataset - output PF$tree will contain a trimmed tree')
}
if (!all(rownames(Data) %in% tree$tip.label)){stop('some rownames of Data are not found in tree')}
if (!all(tree$tip.label %in% rownames(Data))){
warning('some tips in tree are not found in dataset - output PF$tree will contain a trimmed tree')
tree <- ape::drop.tip(tree,setdiff(tree$tip.label,rownames(Data)))}
if (!all(rownames(Data)==tree$tip.label)){
warning('rows of data are in different order of tree tip-labels - use output$data for downstream analysis, or set Data <- Data[output$tree$tip.label,]')
Expand Down Expand Up @@ -385,37 +380,19 @@ PhyloFactor <- function(Data,tree,X=NULL,frmla = Data~X,choice='var',transform.f

###################### Default treatment of Data #################################

if (all.equal(transform.fcn,log)==TRUE){
if (all.equal(transform.fcn,log)==T){
if (any(c(Data)<0)){
stop('For log-transformed data analysis, all entries of Data must be greater than or equal to 0')
}
if (!trust.me){
if (any(Data==0)){
if (delta==0.65){
if (!quiet){
warning('Data has zeros and will receive default modification of zeros. Zeros will be replaced with delta*min(Data[Data>0]), default delta=0.65')
}
}
rplc <- function(x,delta){
x[x==0]=min(x[x>0])*delta
return(x)
}

Data <- apply(Data,MARGIN=2,FUN=rplc,delta=delta)

if (any(Data==0)){
if (delta==0.65){
warning('Data has zeros and will receive default modification of zeros. Zeros will be replaced column wise with delta*min(x[x>0]), default delta=0.65')
}
rplc <- function(x,delta){
x[x==0]=min(x[x>0])*delta
return(x)
}
# if (any(abs(colSums(Data)-1)>tolerance)){
# if (!quiet){
# warning('Column Sums of Data are not sufficiently close to 1 - Data will be re-normalized by column sums')
# }
# Data <- t(clo(t(Data)))
#
# if (any(abs(colSums(Data)-1)>tolerance)){
# if (!quiet){
# warning('Attempt to divide Data by column sums did not bring column sums within "tolerance" of 1 - will proceed with factorization, but such numerical instability may affect the accuracy of the results')
# }
# }
# }
Data <- apply(Data,MARGIN=2,FUN=rplc,delta=delta)
}
}
#####################################################################################
Expand Down Expand Up @@ -569,8 +546,8 @@ PhyloFactor <- function(Data,tree,X=NULL,frmla = Data~X,choice='var',transform.f
}

############# Perform Regression on all of Groups, and implement choice function ##############
# PhyloReg <- PhyloRegression(TransformedData,X,frmla,Grps,contrast.fcn,choice,treeList,cl,totalvar,ix_cl,treetips,grpsizes,tree_map,quiet,nms,smallglm,choice.fcn)
PhyloReg <- PhyloRegression(TransformedData,X,frmla,Grps,contrast.fcn,choice,treeList,cl,totalvar,ix_cl,treetips,grpsizes,tree_map,quiet,nms,smallglm,choice.fcn=choice.fcn,method,...)
# PhyloReg <- PhyloRegression(TransformedData,X,frmla,Grps,contrast.fcn,choice,treeList,cl,totalvar,ix_cl,treetips,grpsizes,tree_map,nms,choice.fcn)
PhyloReg <- PhyloRegression(TransformedData,X,frmla,Grps,contrast.fcn,choice,treeList,cl,totalvar,ix_cl,treetips,grpsizes,tree_map,nms,choice.fcn=choice.fcn,method,...)
############################## EARLY STOP #####################################
###############################################################################

Expand Down
21 changes: 6 additions & 15 deletions R/PhyloRegression.R
Expand Up @@ -13,13 +13,11 @@
#' @param treetips Number of tips in treeList for quickly identifying whether nodes correspond to a root
#' @param grpsizes Number of nodes in each tree of treeList
#' @param tree_map Cumulative number of nodes in trees of treeList - allows rapid mapping of nodes in ix_cl to appropriate tree in treeList.
#' @param quiet Logical to supress warnings
#' @param nms rownames of TransformedData, allowing reliable mapping of rows of data to tree.
#' @param smallglm Logical. See \code{\link{PhyloFactor}}
#' @param choice.fcn optional customized choice function to choose 'best' edge; see \code{\link{PhyloFactor}}
#' @param method See \code{\link{PhyloFactor}}
#' @param ... optional input arguments for \code{\link{glm}}
PhyloRegression <- function(TransformedData,X,frmla,Grps=NULL,contrast.fcn=NULL,choice,treeList=NULL,cl,totalvar=NULL,ix_cl,treetips=NULL,grpsizes=NULL,tree_map=NULL,quiet=T,nms=NULL,smallglm=F,choice.fcn,method='glm',...){
PhyloRegression <- function(TransformedData,X,frmla,Grps=NULL,contrast.fcn=NULL,choice,treeList=NULL,cl,totalvar=NULL,ix_cl,treetips=NULL,grpsizes=NULL,tree_map=NULL,nms=NULL,choice.fcn,method='glm',...){
#cl - optional phyloCluster input for parallelization of regression across multiple groups.
D <- dim(TransformedData)[1]
xx=X
Expand All @@ -36,9 +34,8 @@ PhyloRegression <- function(TransformedData,X,frmla,Grps=NULL,contrast.fcn=NULL,
}
if (choice != 'custom'){
################ DEFAULT choice.fcn #########################
# GLMs <- apply(Y,1,FUN = phylofactor::pglm,xx=X,frmla=frmla,smallglm=T)
if (method != 'max.var'){
GLMs <- apply(Y,1,FUN = pglm,xx=X,frmla=frmla,smallglm=T,...)
GLMs <- apply(Y,1,FUN = pglm,xx=X,frmla=frmla,...)
stats <- matrix(sapply(GLMs,FUN=phylofactor::getStats),ncol=3,byrow=T)
#contains Pvalues, F statistics, and explained var
rownames(stats) <- 1:ngrps
Expand Down Expand Up @@ -80,20 +77,16 @@ PhyloRegression <- function(TransformedData,X,frmla,Grps=NULL,contrast.fcn=NULL,
winner <- sample(winner,1)
}
} else { ##### PARALLEL #####
Winners=parallel::clusterApply(cl,x=ix_cl,fun= function(x,tree_map,treeList,treetips,contrast.fcn,choice,method,smallglm,frmla,xx,choice.fcn,...) findWinner(x,tree_map=tree_map,treeList=treeList,treetips=treetips,contrast.fcn=contrast.fcn,choice=choice,method=method,smallglm=smallglm,frmla=frmla,xx=xx,choice.fcn=choice.fcn,...) ,tree_map=tree_map,treeList=treeList,treetips=treetips,contrast.fcn=contrast.fcn,choice=choice,method=method,smallglm=smallglm,frmla=frmla,xx=xx,choice.fcn=choice.fcn,...)
# Winners=parallel::clusterApply(cl,x=ix_cl,fun= function(x,tree_map,treeList,treetips,contrast.fcn,choice,smallglm,frmla,xx,choice.fcn) findWinner(x,tree_map=tree_map,treeList=treeList,treetips=treetips,contrast.fcn=contrast.fcn,choice=choice,smallglm=smallglm,frmla=frmla,xx=xx,choice.fcn=choice.fcn) ,tree_map=tree_map,treeList=treeList,treetips=treetips,contrast.fcn=contrast.fcn,choice=choice,smallglm=smallglm,frmla=frmla,xx=xx,choice.fcn=choice.fcn)

# Winners=lapply(ix_cl,FUN=function(x,tree_map,treeList,treetips,contrast.fcn,choice,smallglm,frmla,xx,choice.fcn) findWinner(nset=x,tree_map=tree_map,treeList=treeList,treetips=treetips,contrast.fcn=contrast.fcn,choice=choice,smallglm=smallglm,frmla=frmla,xx=xx,choice.fcn=choice.fcn) ,tree_map=tree_map,treeList=treeList,treetips=treetips,choice=choice,smallglm=smallglm,frmla=frmla,xx=xx,choice.fcn=choice.fcn)
# Recall: output from findWinner is $grp and then our objective function output: $objective, $Fstat, or $ExplainedVar, corresponding to choice='custom','F', and 'var', respectivley.

Winners=parallel::clusterApply(cl,x=ix_cl,fun= function(x,tree_map,treeList,treetips,contrast.fcn,choice,method,frmla,xx,choice.fcn,...) findWinner(x,tree_map=tree_map,treeList=treeList,treetips=treetips,contrast.fcn=contrast.fcn,choice=choice,method=method,frmla=frmla,xx=xx,choice.fcn=choice.fcn,...) ,tree_map=tree_map,treeList=treeList,treetips=treetips,contrast.fcn=contrast.fcn,choice=choice,method=method,frmla=frmla,xx=xx,choice.fcn=choice.fcn,...)

grps <- lapply(Winners,getElement,'grp')
Y <- lapply(grps,BalanceContrast,TransformedData=TransformedData)


####################################### DEFAULT REGRESSIONS #####################
if (choice != 'custom'){
if (method != 'max.var'){
gg <- lapply(Y,FUN = pglm,xx=X,frmla=frmla,smallglm=T,...)
gg <- lapply(Y,FUN = pglm,xx=X,frmla=frmla,...)
stats <- lapply(gg,getStats)
if (choice=='var'){
objective <- sapply(stats,function(x) x['ExplainedVar'])
Expand All @@ -113,10 +106,8 @@ PhyloRegression <- function(TransformedData,X,frmla,Grps=NULL,contrast.fcn=NULL,

winner=which(objective==max(objective))
if (length(winner)>1){
if (!quiet){
warning(paste('Objective function produced',toString(length(winner)),
warning(paste('Objective function produced',toString(length(winner)),
'identical groups. Will choose group at random.',sep=' '))
}
winner <- sample(winner,1)
}
}
Expand Down
12 changes: 4 additions & 8 deletions R/findWinner.R
Expand Up @@ -7,12 +7,11 @@
#' @param contrast.fcn See \code{\link{PhyloFactor}} or example functions \code{\link{BalanceContrast}}, \code{\link{amalgamate}}
#' @param choice string indicating how we choose the winner. Must be either \code{'var'}, \code{'F'}, or \code{'phyca'}
#' @param method See \code{\link{PhyloFactor}}
#' @param smallglm Logical - whether or not to use regular \code{glm}. if smallglm=F, will use \code{\link{bigglm}} from the \code{\link{biglm}} package.
#' @param frmla Formula for \code{\link{glm}}. See \code{\link{PhyloFactor}} for more details.
#' @param xx data frame containing non-ILR (\code{Data}) variables used in \code{frmla}
#' @param choice.fcn See \code{\link{PhyloFactor}}
#' @param ... optional input arguments to \code{\link{glm}}
findWinner <- function(nset,tree_map,treeList,treetips,contrast.fcn=NULL,choice,method='glm',smallglm=F,frmla=NULL,xx=NULL,choice.fcn=NULL,...){
findWinner <- function(nset,tree_map,treeList,treetips,contrast.fcn=NULL,choice,method='glm',frmla=NULL,xx=NULL,choice.fcn=NULL,...){


########### set-up and prime variables #############
Expand Down Expand Up @@ -100,11 +99,8 @@ findWinner <- function(nset,tree_map,treeList,treetips,contrast.fcn=NULL,choice,
#########################################################
args <- list('data'=dataset,'formula'=frmla,...)
############ Performing Regression ######################
if(smallglm){
gg=do.call(stats::glm,args)
} else {
gg=do.call(biglm::bigglm,args)
}
gg=do.call(stats::glm,args)

#########################################################

############# Update output if objective is larger #######
Expand Down Expand Up @@ -149,7 +145,7 @@ findWinner <- function(nset,tree_map,treeList,treetips,contrast.fcn=NULL,choice,


################## modify output glm for default choices #################
if (choice %in% c('var','F') & !smallglm & method!='max.var'){ #convert bigglm to glm
if (choice %in% c('var','F') & method!='max.var'){ #convert bigglm to glm
if (is.null(contrast.fcn)){
Y <- BalanceContrast(output$grp,TransformedData)
} else {
Expand Down
7 changes: 1 addition & 6 deletions R/pglm.R
Expand Up @@ -3,7 +3,6 @@
#' @param y response variable
#' @param xx independent variable
#' @param frmla Formula for dependence of y on x
#' @param smallglm Logical. See \code{\link{PhyloFactor}}
#' @param ... optional input arguments to \code{\link{glm}}
#' @return glm object
#' @examples
Expand All @@ -18,9 +17,5 @@ pglm <- function(y,xx,frmla,smallglm=T,...){
names(dataset) <- c('Data',names(xx))
# dataset <- stats::model.frame(frmla,data = dataset)
args <- list('data'=dataset,'formula'=frmla,...)
if(smallglm){
return(do.call(stats::glm,args))
} else {
return(do.call(biglm::bigglm,args))
}
return(do.call(stats::glm,args))
}

0 comments on commit e6837da

Please sign in to comment.