diff --git a/DESCRIPTION b/DESCRIPTION index 09ef051..716c9ba 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,13 +1,16 @@ Package: changepoint Type: Package Title: Methods for Changepoint Detection -Version: 2.2.5 -Date: 2022-11-08 +Version: 2.3 +Date: 2023-07-06 Authors@R: c(person("Rebecca", "Killick", role=c("aut","cre"),email="r.killick@lancs.ac.uk"), - person("Kaylea", "Haynes", role="aut"), - person("Idris", "Eckley", role=c("ths")), - person("Paul","Fearnhead",role=c("ctb","ths")), person("Robin", "Long", role="ctb"),person("Jamie","Lee",role="ctr")) + person("Kaylea", "Haynes", role="aut"), + person("Paul","Fearnhead",role=c("ctb","ths")), + person("Robin", "Long", role="ctb"), + person("Simon", "Taylor",role="ctb"), + person("Idris", "Eckley", role=c("ths")), + person("Jamie","Lee",role="ctr")) Maintainer: Rebecca Killick BugReports: https://github.com/rkillick/changepoint/issues URL: https://github.com/rkillick/changepoint/ @@ -16,14 +19,15 @@ Depends: R(>= 3.2), methods, stats, zoo(>= 0.9-1) Suggests: testthat, vdiffr License: GPL LazyData: true -Packaged: 2022-11-08 14:14:13 UTC; killick +Packaged: 2023-07-06 14:14:13 UTC; killick NeedsCompilation: yes Repository: CRAN Date/Publication: 2022-11-08 15:50:02 UTC Author: Rebecca Killick [aut, cre], Kaylea Haynes [aut], - Idris Eckley [ths], Paul Fearnhead [ctb, ths], Robin Long [ctb], + Simon Taylor [ctb], + Idris Eckley [ths], Jamie Lee [ctr] diff --git a/NEWS b/NEWS index 3d26237..a4222df 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,10 @@ +Version 2.3 +=========== +* Added Binomial cost function to cpt.meanvar(). Thanks go to Simon Taylor for contributing this new cost function. +* Changed default method in cpt.*() functions to PELT +* Fixed bug in SegNeigh algorithm where if the distribution used was Gamma the shape parameter was not passed to the lower functions. Thanks go to Simon Taylor for highlighting the bug. + + Version 2.2.5 ============= * Thanks to Toby Hocking for highlighting that when pen=0, BinSeg doesn't need to warn that ncpts=Q as this is by design. Warning for pen=0 removed. diff --git a/R/CROPS.R b/R/CROPS.R index e5f91c5..bf67843 100644 --- a/R/CROPS.R +++ b/R/CROPS.R @@ -1,17 +1,19 @@ -CROPS <- function(data, penalty="CROPS", pen.value, method="PELT", test.stat="Normal", class=TRUE, param.est=TRUE, minseglen, shape, func){ +CROPS <- function(data, penalty="CROPS", pen.value, method="PELT", test.stat="Normal", class=TRUE, param.est=TRUE, minseglen, shape, func, size){ if(method != "PELT"){stop('CROPS is a valid penalty choice only if method="PELT", please change your method or your penalty.')} mu <- mean(data) - sumstat=cbind(c(0,cumsum(coredata(data))),c(0,cumsum(coredata(data)^2)),cumsum(c(0,(coredata(data)-mu)^2))) - + switch(test.stat, "Normal" = {stat = "norm"}, "Exponential" = {stat = "exp"}, "Gamma" = {stat = "gamma"}, "Poisson" = {stat = "poisson"}, - {stop("Only Normal, Exponential, Gamma and Poisson are valid test statistics")} + "Binomial" = {stat = "binomial"}, + {stop("Only Normal, Exponential, Gamma, Poisson and Binomial are valid test statistics")} ) costfunc = paste0(func, ".", stat) + sumstat = SumStats(test.stat, data, size) + out = range_of_penalties(sumstat, cost=costfunc, min_pen=pen.value[1], max_pen=pen.value[2], minseglen=minseglen) if(func=="var"){ @@ -23,10 +25,10 @@ CROPS <- function(data, penalty="CROPS", pen.value, method="PELT", test.stat="No } if(class==TRUE){ - ans = class_input(data=data,cpttype=cpttype, method="PELT", test.stat=test.stat, penalty=penalty, pen.value=pen.value, minseglen=minseglen, param.estimates=param.est, out=out,shape=shape) + ans = class_input(data=data,cpttype=cpttype, method="PELT", test.stat=test.stat, penalty=penalty, pen.value=pen.value, minseglen=minseglen, param.estimates=param.est, out=out,shape=shape,size=size) if(func=="var"){ param.est(ans)=c(param.est(ans),mean=mu) } return(ans) }else{return(out)} -} \ No newline at end of file +} diff --git a/R/SegNeigh_one_func_minseglen.R b/R/SegNeigh_one_func_minseglen.R index d281986..5f5e5c1 100644 --- a/R/SegNeigh_one_func_minseglen.R +++ b/R/SegNeigh_one_func_minseglen.R @@ -4,14 +4,18 @@ dataToString <- function(data){ -SEGNEIGH <- function(data, pen.value, costfunc, Q, var=0, shape=1){ +SEGNEIGH <- function(data, pen.value, costfunc, Q, var=0, shape=1,size=NA){ #split the costfunction by . and then use the first two elements in eval(parse()) if(Q<=0){stop(paste('Q is the maximum number of changepoints so should be greater than 0'))} split=strsplit(costfunc, "\\.") - if(var!=0){ - text = c("segneigh",".", split[[1]][1],".", split[[1]][2],"(",dataToString(data),",",Q,",",pen.value,",","know.mean=",TRUE,",","mu=",var,")") + if(split[[1]][2] == "gamma"){ + text = c("segneigh",".", split[[1]][1],".", split[[1]][2],"(",dataToString(data),",",shape,",",Q,",",pen.value,")") + }else if(split[[1]][2] == "binomial"){ + text = c("segneigh",".", split[[1]][1],".", split[[1]][2],"(",dataToString(data),",",dataToString(size),",",Q,",",pen.value,")") + }else if(var!=0){ + text = c("segneigh",".", split[[1]][1],".", split[[1]][2],"(",dataToString(data),",",Q,",",pen.value,",","know.mean=",TRUE,",","mu=",var,")") }else{ text = c("segneigh",".", split[[1]][1],".", split[[1]][2],"(",dataToString(data),",",Q,",",pen.value,")") } @@ -21,4 +25,4 @@ SEGNEIGH <- function(data, pen.value, costfunc, Q, var=0, shape=1){ out = eval(parse(text=paste(text, collapse=""))) return(out) -} \ No newline at end of file +} diff --git a/R/binomial.R b/R/binomial.R new file mode 100644 index 0000000..188e135 --- /dev/null +++ b/R/binomial.R @@ -0,0 +1,273 @@ +cpt.meanvar.binom <- function(data,penalty="MBIC",pen.value=0,method="PELT",Q=5,class=TRUE, + param.estimates=TRUE,minseglen=2,size=NA){ + ##Checks already performed by cpt.meanvar: + # data contains no NAs + # method=="SegNeigh" & minseglen>2 ==> stop() + if(is.na(size)){ + stop("For Binomial cost functions the size argument must be specified.") + } + if(method=="AMOC"){ + return(single.meanvar.binomial(data, size, penalty, pen.value, class, + param.estimates,minseglen)) + }else if(method=="PELT" || method=="BinSeg" || method=="SegNeigh"){ + return(multiple.meanvar.binomial(data, size, mul.method=method,penalty, + pen.value,Q,class,param.estimates,minseglen)) + }else{ + stop("Invalid Method, must be AMOC, PELT, SegNeigh or BinSeg") + } +} + +single.meanvar.binomial <- function(data, size, penalty="MBIC",pen.value=0,class=TRUE, + param.estimates=TRUE,minseglen){ + ##checks on data & size + if(any(data<0) || any(size<0)) stop("data and size needs to be positive.") + if(any(data%%1!=0) || any(size%%1!=0)) stop("data and size needs to be integers.") + if(length(size)==1){ + SIZE <- data*0 + size ## repeat size to the dimension of data & inherit class + }else if(length(data)==length(size)){ + if(!is.numeric(size)) stop("Only numeric size allowed") + if(anyNA(size)){stop("Missing value: NA is not allowed in size.")} + SIZE <- size + }else{ + stop("Size needs to be either a single number or have the same dimensions as data.") + } + if(any(data > SIZE)) stop("Elements of data cannot be greater than the corresponding element of size.") + if(is.null(dim(data))){ + #single dataset + n <- length(data) + }else{ + n <- nrow(data) + } + if(n<4) stop("Data must have at least 4 observations to fit this changepoint model.") + if(n<(2*minseglen)) stop("Minimum segment length is too large to include a changepoint in this data") + + pen.value = penalty_decision(penalty, pen.value, n, diffparam=1, asymcheck="meanvar.binomial",method="AMOC") + if(is.null(dim(data))){ + #single dataset + tmp = single.meanvar.binomial.calc(coredata(data),coredata(SIZE),extrainf=TRUE,minseglen) + if(penalty=="MBIC"){ + tmp[3] = tmp[3] + log(tmp[1]) + log(n-tmp[1]+1) + } + ans = decision(tmp[1],tmp[2],tmp[3],penalty,n,diffparam=1,pen.value) + if(class){ + return(class_input(data, size=SIZE, method="AMOC", + penalty=penalty, pen.value=ans$pen, + minseglen=minseglen, param.estimates=param.estimates, + out = c(0,ans$cpt))) + }else{ + return(ans$cpt) + } + }else{ + #multiple datasets + tmp = single.meanvar.binomial.calc(data,SIZE,extrainf=TRUE,minseglen) + if(penalty=="MBIC"){ + tmp[,3] = tmp[,3] + log(tmp[,1]) + log(n-tmp[,1]+1) + } + ans = decision(tmp[,1],tmp[,2],tmp[,3],penalty,n,diffparam=1,pen.value) + if(class){ + rep = ncol(data) + out = list() + for(i in 1:rep){ + if(length(size)==1){ size_out<- size }else{ size_out<-size[,i] } + # the above looks repetitive from line 25 but 25 is generic and 71 is specific to multivariate + out[[i]] <- class_input(data[,i],size=size_out, + method="AMOC", penalty=penalty, + pen.value=ans$pen, minseglen=minseglen, + param.estimates=param.estimates, out = c(0,ans$cpt[i])) + } + return(out) + }else{ + return(ans$cpt) + } + } +} + + + +single.meanvar.binomial.calc <- function(data,size,extrainf=TRUE,minseglen){ + singledim <- function(data,size,extrainf=TRUE,minseglen){ + n = length(data) + y = c(0,cumsum(data)) + m = c(0,cumsum(size)) + if(y[n+1] == 0 || y[n+1]==m[n+1]){ + null = Inf + }else{ + null=-2*(y[n+1]*log(y[n+1])-m[n+1]*log(m[n+1]) + (m[n+1]-y[n+1])*log(m[n+1]-y[n+1])) + } + taustar = minseglen:(n-minseglen) + sx1 = y[taustar+1] + sm1 = m[taustar+1] + sx1_sm1 = sm1-sx1 + sx2 = y[n+1]-y[taustar+1] + sm2 = m[n+1]-m[taustar+1] + sx2_sm2 = sm2-sx2 + tmp = sx1*log(sx1) - sm1*log(sm1) + sx1_sm1*log(sx1_sm1) + tmp = tmp + sx2*log(sx2) - sm2*log(sm2) + sx2_sm2*log(sx2_sm2) + ##NOTE: + ## probMLE = c(rep(sx1/sm1,taustar[i]),rep(sx2/sm2,n-taustar[i])) + ## sum(dbinom(data,size,probMLE,log=TRUE)) == tmp + sum(lchoose(size,data)) + + tmp = -2*tmp #convert to deviance scale + if(anyNA(tmp)) tmp[is.na(tmp)] = Inf + tau=which(tmp==min(tmp,na.rm=T))[1] + taulike = tmp[tau] + tau = tau + minseglen - 1 ## correcting for the fact that we are starting at minseglen + if(extrainf){ + out <- c(tau,null,taulike) + names(out) = c("cpt","null","alt") + return(out) + }else{ + return(tau) + } + } + + if(is.null(dim(data))){ + cpt = singledim(data,size,extrainf,minseglen) + }else{ + rep = nrow(data) + n = ncol(data) + if(!extrainf){ + cpt=rep(0,rep) + for(i in 1:rep){ + cpt[i] <- singledim(data[i,], size[i,], extrainf, minseglen) + } + }else{ + cpt = matrix(0,ncol=3,nrow=rep) + for(i in 1:rep){ + cpt[i,] = singledim(data[i,], size[i,], extrainf, minseglen) + } + colnames(cpt) <- c("cpt","null","alt") + } + return(cpt) + } +} + +multiple.meanvar.binomial <- function(data, size, mul.method="PELT",penalty="MBIC", + pen.value=0,Q=5,class=TRUE,param.estimates=TRUE,minseglen){ + ##checks on data & size + if(length(size)==1){ + SIZE <- data*0 + size ## repeat size to the dimension of data & inherit class + }else if(length(data)==length(size)){ + if(!is.numeric(size)) stop("Only numeric size allowed") + if(anyNA(size)){stop("Missing value: NA is not allowed in the size as changepoint methods are only sensible for regularly spaced data.")} + SIZE <- size + }else{ + stop("Size needs to be either a single number or have the same dimensions of data.") + } + if(any(data<0) || any(size<0)) stop("Binomal test statistic requires postive data and size.") + if(any(data%%1!=0) || any(size%%1!=0)) stop("Binomal test statistic requires integer data and size.") + if(any(data > SIZE)) stop("Binomal test statistic requires data <= size.") + + costfunc = "meanvar.binomial" + if(penalty == "MBIC"){ + if(mul.method == "SegNeigh"){ + stop("MBIC penalty not implemented for SegNeigh method, please choose an alternative penalty") + } + costfunc = paste0(costfunc,".mbic") + } + + diffparam = 1 + if(is.null(dim(data))){ + n = length(data) + }else{ + n = ncol(data) + } + + if(n<(2*minseglen)) stop("Minimum segment length is too large to include a change in this data") + pen.value = penalty_decision(penalty, pen.value, n, diffparam = 1, asymcheck = costfunc, method = mul.method) + if(is.null(dim(data))){ + out = data_input(data=data,method=mul.method, pen.value=pen.value, costfunc=costfunc, + minseglen=minseglen, Q=Q, size=size) + if(class){ + return(class_input(data=data, size=SIZE, method=mul.method, + penalty=penalty, pen.value=pen.value, + minseglen=minseglen, param.estimates=param.estimates, + out = out, Q=Q)) + }else{ + return(out[[2]]) + } + }else{ + rep = nrow(data) + out = list() + for(i in 1:rep){ + out[[i]] = data_input(data=data[i,],method=mul.method, pen.value=pen.value, + costfunc=costfunc, minseglen=minseglen, Q=Q, size=size[i,]) + } + cpts = lapply(out,'[[',2) + if(class){ + ans = list() + for(i in 1:rep){ + if(length(size)==1){ size_out<- size }else{ size_out<-size[i,] } + ans[[i]] = class_input(data=data[i,],size=size_out, + method=mul.method, penalty=penalty, + pen.value=pen.value, minseglen=minseglen, + param.estimates=param.estimates, + out = out[[i]], Q=Q) + } + return(ans) + }else{ + return(cpts) + } + } +} + +segneigh.meanvar.binomial <- function(data, size=1, Q=5, pen=0){ + ##nb size is the same dimension of data + ##checks on data and size should have already been performed + n <- length(data) + if(n<4){stop('Data must have atleast 4 observations to fit a changepoint model.')} + if(Q>((n/2)+1)){stop(paste('Q is larger than the maximum number of segments',(n/2)+1))} + + all.seg=matrix(0,ncol=n,nrow=n) + if(length(size)==1){ SIZE <- data*0 + size }else{SIZE <- size} + for(i in 1:n){ + sumx=0 + sumsize = 0 + for(j in i:n){ + sumx=sumx+data[j] + sumsize = sumsize + SIZE[j] + if(sumx==0 || sumx==sumsize){ + all.seg[i,j]=-Inf + } + else{ + all.seg[i,j]=sumx*log(sumx) - sumsize*log(sumsize) + (sumsize-sumx)*log(sumsize-sumx) + } + } + } + like.Q=matrix(0,ncol=n,nrow=Q) + like.Q[1,]=all.seg[1,] + cp=matrix(NA,ncol=n,nrow=Q) + for(q in 2:Q){ + for(j in q:n){ + like=NULL + if((j-2-q)<0){v=q} + else{v=(q):(j-2)} + like=like.Q[q-1,v]+all.seg[v+1,j] + + like.Q[q,j]= max(like,na.rm=TRUE) + cp[q,j]=which(like==max(like,na.rm=TRUE))[1]+(q-1) + } + + } + cps.Q=matrix(NA,ncol=Q,nrow=Q) + for(q in 2:Q){ + cps.Q[q,1]=cp[q,n] + for(i in 1:(q-1)){ + cps.Q[q,(i+1)]=cp[(q-i),cps.Q[q,i]] + } + } + + op.cps=NULL + k=0:(Q-1) + + for(i in 1:length(pen)){ + criterion=-2*like.Q[,n]+k*pen[i] + + op.cps=c(op.cps,which(criterion==min(criterion,na.rm=T))-1) + } + if(op.cps==(Q-1)){warning('The number of segments identified is Q, it is advised to increase Q to make sure changepoints have not been missed.')} + if(op.cps==0){cpts=n} + else{cpts=c(sort(cps.Q[op.cps+1,][cps.Q[op.cps+1,]>0]),n)} + + return(list(cps=t(apply(cps.Q,1,sort,na.last=TRUE)),cpts=cpts,op.cpts=op.cps,pen=pen,like=criterion[op.cps+1],like.Q=like.Q[,n])) +} diff --git a/R/class_input.R b/R/class_input.R index db1dcda..afb33c2 100644 --- a/R/class_input.R +++ b/R/class_input.R @@ -1,4 +1,4 @@ -class_input <- function(data, cpttype, method, test.stat, penalty, pen.value, minseglen, param.estimates, out=list(), Q=NA, shape=NA){ +class_input <- function(data, cpttype, method, test.stat, penalty, pen.value, minseglen, param.estimates, out=list(), Q=NA, shape=NA,size=NA){ if(method=="BinSeg" || method=="SegNeigh" || penalty=="CROPS"){ ans=new("cpt.range") }else{ @@ -7,11 +7,13 @@ class_input <- function(data, cpttype, method, test.stat, penalty, pen.value, mi data.set(ans)=data;cpttype(ans)=cpttype;method(ans)=method; test.stat(ans)=test.stat;pen.type(ans)=penalty;pen.value(ans)=pen.value;minseglen(ans)=minseglen;ans@date=date(); if(penalty!="CROPS"){ # crops is only one that doesn't give a single set of cpts - cpts(ans)=out[[2]] + cpts(ans)=as.integer(out[[2]]) if(param.estimates==TRUE){ if(test.stat == "Gamma"){ - ans=param(ans, shape) + ans=param(ans, shape) + }else if(test.stat == "Binomial"){ + ans=param(ans, size) }else{ ans=param(ans) } @@ -31,7 +33,7 @@ class_input <- function(data, cpttype, method, test.stat, penalty, pen.value, mi if(method=="BinSeg"){ l=list() for(i in 1:(length(out$cps)/2)){ - l[[i]] = out$cps[1,1:i] + l[[i]] = out$cps[1,1:i] } m = t(sapply(l, '[', 1:max(sapply(l, length)))) @@ -46,6 +48,7 @@ class_input <- function(data, cpttype, method, test.stat, penalty, pen.value, mi cpts.full(ans) = m pen.value.full(ans) = out[[1]][1,] if(test.stat=="Gamma"){param.est(ans)$shape=shape} + if(test.stat == "Binomial"){param.est(ans)$size=size} } return(ans) diff --git a/R/cpt.R b/R/cpt.R index 7e485f6..4500b39 100644 --- a/R/cpt.R +++ b/R/cpt.R @@ -1,4 +1,4 @@ -cpt.mean=function(data,penalty="MBIC",pen.value=0,method="AMOC",Q=5,test.stat="Normal",class=TRUE,param.estimates=TRUE,minseglen=1){ +cpt.mean=function(data,penalty="MBIC",pen.value=0,method="PELT",Q=5,test.stat="Normal",class=TRUE,param.estimates=TRUE,minseglen=1){ checkData(data) if(method=="SegNeigh" & minseglen>1){stop("minseglen not yet implemented for SegNeigh method, use PELT instead.")} if(minseglen<1){minseglen=1;warning('Minimum segment length for a change in mean is 1, automatically changed to be 1.')} @@ -51,7 +51,7 @@ cpt.mean=function(data,penalty="MBIC",pen.value=0,method="AMOC",Q=5,test.stat="N } } -#cpt.reg=function(data,penalty="MBIC",pen.value=0,method="AMOC",Q=5,test.stat="Normal",class=TRUE,param.estimates=TRUE){ +#cpt.reg=function(data,penalty="MBIC",pen.value=0,method="PELT",Q=5,test.stat="Normal",class=TRUE,param.estimates=TRUE){ # if(test.stat !="Normal"){ stop("Invalid test statistic, must be Normal") } # if(method=="AMOC"){ # return(single.reg.norm(data,penalty,pen.value,class,param.estimates)) @@ -68,7 +68,7 @@ cpt.mean=function(data,penalty="MBIC",pen.value=0,method="AMOC",Q=5,test.stat="N # } #} -cpt.var=function(data,penalty="MBIC",pen.value=0,know.mean=FALSE, mu=NA,method="AMOC",Q=5,test.stat="Normal",class=TRUE,param.estimates=TRUE,minseglen=2){ +cpt.var=function(data,penalty="MBIC",pen.value=0,know.mean=FALSE, mu=NA,method="PELT",Q=5,test.stat="Normal",class=TRUE,param.estimates=TRUE,minseglen=2){ checkData(data) if(method=="SegNeigh" & minseglen>2){stop("minseglen not yet implemented for SegNeigh method, use PELT instead.")} if(minseglen<2){minseglen=2;warning('Minimum segment length for a change in variance is 2, automatically changed to be 2.')} @@ -123,11 +123,11 @@ cpt.var=function(data,penalty="MBIC",pen.value=0,know.mean=FALSE, mu=NA,method=" } } -cpt.meanvar=function(data,penalty="MBIC",pen.value=0,method="AMOC",Q=5,test.stat="Normal",class=TRUE,param.estimates=TRUE,shape=1,minseglen=2){ +cpt.meanvar=function(data,penalty="MBIC",pen.value=0,method="PELT",Q=5,test.stat="Normal",class=TRUE,param.estimates=TRUE,shape=1,size=NA,minseglen=2){ checkData(data) if(method=="SegNeigh" & minseglen>2){stop("minseglen not yet implemented for SegNeigh method, use PELT instead.")} if(minseglen<2){ - if(!(minseglen==1 & (test.stat=="Poisson"|test.stat=="Exponential"))){ + if(!(minseglen==1 & (test.stat=="Poisson"|test.stat=="Exponential"|test.stat=="Binomial"))){ minseglen=2;warning('Minimum segment length for a change in mean and variance is 2, automatically changed to be 2.')} } if(penalty == "CROPS"){ @@ -209,15 +209,19 @@ cpt.meanvar=function(data,penalty="MBIC",pen.value=0,method="AMOC",Q=5,test.stat stop("Invalid Method, must be AMOC, PELT, SegNeigh or BinSeg") } } + else if(test.stat=="Binomial"){ + return(cpt.meanvar.binom(data=data,penalty=penalty,pen.value=pen.value,method=method,Q=Q, + class=class,param.estimates=param.estimates,minseglen=minseglen,size=size)) + } else{ - stop("Invalid test statistic, must be Normal, Gamma, Exponential or Poisson") + stop("Invalid test statistic, must be Normal, Gamma, Exponential, Poisson or Binomial") } } checkData = function(data){ if(!is.numeric(data)){ stop("Only numeric data allowed") - } - if(anyNA(data)){stop("Missing value: NA is not allowed in the data as changepoint methods are only sensible for regularly spaced data.")} + } + if(anyNA(data)){stop("Missing value: NA is not allowed in the data. If suitable for the application, drop the NAs but then be careful with inference.")} } diff --git a/R/cpt.class.R b/R/cpt.class.R index 4023bcc..0edcc14 100644 --- a/R/cpt.class.R +++ b/R/cpt.class.R @@ -413,7 +413,7 @@ setClass("cpt",slots=list(data.set="ts", cpttype="character", method="character" # parameter functions setGeneric("param", function(object,...) standardGeneric("param")) - setMethod("param", "cpt", function(object,shape,...) { + setMethod("param", "cpt", function(object,shape,size,...) { param.mean=function(object){ cpts=c(0,object@cpts) #nseg=length(cpts)-1 @@ -447,6 +447,20 @@ setClass("cpt",slots=list(data.set="ts", cpttype="character", method="character" } return(tmpscale) } + param.prob=function(object,size){ + cpts=c(0,object@cpts) + data=data.set(object) + if(length(size)==1) size=rep(size,length(data)) + y=c(0,cumsum(data)) + m=c(0,cumsum(size)) + tmpprob=NULL + for(j in 1:nseg(object)){ + sx <- y[cpts[j+1]+1]-y[cpts[j]+1] + sm <- m[cpts[j+1]+1]-m[cpts[j]+1] + tmpprob[j]=sx/sm + } + return(tmpprob) + } param.trend=function(object){ cpts=c(0,object@cpts) seglen=seg.len(object) @@ -506,6 +520,9 @@ setClass("cpt",slots=list(data.set="ts", cpttype="character", method="character" else if(test.stat(object)=="Poisson"){ param.est(object)<-list(lambda=param.mean(object)) } + else if(test.stat(object)=="Binomial"){ + param.est(object)<-list(prob=param.prob(object,size=size),size=size) + } else{ stop("Unknown test statistic for a change in mean and variance") } @@ -543,7 +560,7 @@ setClass("cpt",slots=list(data.set="ts", cpttype="character", method="character" return(object) }) - setMethod("param", "cpt.range", function(object,ncpts=NA,shape,...) { + setMethod("param", "cpt.range", function(object,ncpts=NA,shape,size,...) { if(is.na(ncpts)){ cpts=object@cpts if(cpts[1]!=0){cpts=c(0,cpts)} # PELT derivatives don't include the 0 @@ -587,6 +604,21 @@ setClass("cpt",slots=list(data.set="ts", cpttype="character", method="character" } return(tmpscale) } + param.prob=function(object,size){ + nseg=length(cpts)-1 + cpts=c(0,object@cpts) + data=data.set(object) + if(length(size)==1) size=rep(size,length(data)) + y=c(0,cumsum(data)) + m=c(0,cumsum(size)) + tmpprob=NULL + for(j in 1:nseg){ + sx <- y[cpts[j+1]+1]-y[cpts[j]+1] + sm <- m[cpts[j+1]+1]-m[cpts[j]+1] + tmpprob[j]=sx/sm + } + return(tmpprob) + } param.trend=function(object,cpts){ seglen=cpts[-1]-cpts[-length(cpts)] data=data.set(object) @@ -646,6 +678,9 @@ setClass("cpt",slots=list(data.set="ts", cpttype="character", method="character" else if(test.stat(object)=="Poisson"){ param.est<-list(lambda=param.mean(object,cpts)) } + else if(test.stat(object)=="Binomial"){ + param.est(object)<-list(prob=param.prob(object,size=size),size=size) + } else{ stop("Unknown test statistic for a change in mean and variance") } @@ -782,7 +817,12 @@ setClass("cpt",slots=list(data.set="ts", cpttype="character", method="character" object=param(x) cat('done.\n') } - plot(data.set.ts(x),...) + if(test.stat(x)=="Binomial"){ + data = data.set.ts(x) + size = param.est(x)$size + plot(data/size,...) + } + else{plot(data.set.ts(x),...)} if(cpttype(x)=="variance" || cpttype(x)=="nonparametric (empirical_distribution)"){ abline(v=index(data.set.ts(x))[cpts(x)],col=cpt.col,lwd=cpt.width,lty=cpt.style) } @@ -801,6 +841,9 @@ setClass("cpt",slots=list(data.set="ts", cpttype="character", method="character" else if(test.stat(x)=="Poisson"){ means=param.est(x)$lambda } + else if(test.stat(x)=="Binomial"){ + means=param.est(x)$prob + } else{ stop('Invalid Changepoint test statistic') } @@ -836,7 +879,12 @@ setClass("cpt",slots=list(data.set="ts", cpttype="character", method="character" } return(invisible(NULL)) } - plot(data.set.ts(x),...) + if(test.stat(x)=="Binomial"){ + data = data.set.ts(x) + size = param.est(x)$size + plot(data/size,...) + } + else{plot(data.set.ts(x),...)} if(is.na(ncpts)){ if(pen.type(x)=="CROPS"){ stop('CROPS does not supply an optimal set of changepoints, set ncpts to the desired segmentation to plot or use diagnostic=TRUE to identify an appropriate number of changepoints') @@ -854,6 +902,9 @@ setClass("cpt",slots=list(data.set="ts", cpttype="character", method="character" if(test.stat(x)=="Gamma"){ param.est=param(x,ncpts,shape=param.est(x)$shape) } + else if(test.stat(x)=="Binomial"){ + param.est=param(x,ncpts,size=param.est(x)$size) + } else{ param.est=param(x,ncpts) } @@ -874,6 +925,9 @@ setClass("cpt",slots=list(data.set="ts", cpttype="character", method="character" else if(test.stat(x)=="Poisson"){ means=param.est(param.est)$lambda } + else if(test.stat(x)=="Binomial"){ + means=param.est(x)$prob + } else{ stop('Invalid Changepoint test statistic') } @@ -1076,6 +1130,29 @@ setClass("cpt",slots=list(data.set="ts", cpttype="character", method="character" } } } + else if(test.stat(object)=="Binomial"){ + if(cpttype(object)!="mean and variance"){ + stop("Unknown changepoint type for test.stat='Binomial', must be 'mean and variance'") + } + else{ + y=c(0,cumsum(data.set(object))) + m=c(0,cumsum(param.est(object)$size)) + cpts=c(0,object@cpts) + #nseg=length(cpts)-1 + tmplike=sum(lchoose(param.est(object)$size,data.set(object))) + for(j in 1:nseg(object)){ + sx <- y[cpts[j+1]+1]-y[cpts[j]+1] + sm <- m[cpts[j+1]+1]-m[cpts[j]+1] + seglike <- sx*log(sx) + (sm-sx)*log(sm-sx) - sm*log(sm) + tmplike <- tmplike + seglike + } + if(pen.type(object)=="MBIC"){ + like=c(-2*tmplike, -2*tmplike+(nseg(object)-2)*pen.value(object)+sum(log(seg.len(object)))) + }else{ + like=c(-2*tmplike,-2*tmplike+(nseg(object)-1)*pen.value(object)) + } + } + } else{stop("logLik is only valid for distributional assumptions, not CUSUM or CSS")} names(like)=c("-2*logLik","-2*Loglike+pen") return(like) @@ -1232,6 +1309,29 @@ setClass("cpt",slots=list(data.set="ts", cpttype="character", method="character" names(like)=c("-like","-likepen") } } + else if(test.stat(object)=="Binomial"){ + if(cpttype(object)!="mean and variance"){ + stop("Unknown changepoint type for test.stat='Binomial', must be 'mean and variance'") + } + else{ + y=c(0,cumsum(data.set(object))) + m=c(0,cumsum(param.est(object)$size)) + cpts=c(0,object@cpts) + #nseg=length(cpts)-1 + tmplike=sum(lchoose(param.est(object)$size,data.set(object))) + for(j in 1:nseg){ + sx <- y[cpts[j+1]+1]-y[cpts[j]+1] + sm <- m[cpts[j+1]+1]-m[cpts[j]+1] + seglike <- sx*log(sx) + (sm-sx)*log(sm-sx) - sm*log(sm) + tmplike <- tmplike + seglike + } + if(pen.type(object)=="MBIC"){ + like=c(-2*tmplike, -2*tmplike+(nseg(object)-2)*pen.value(object)+sum(log(seg.len(object)))) + }else{ + like=c(-2*tmplike,-2*tmplike+(nseg(object)-1)*pen.value(object)) + } + } + } else{stop("logLik is only valid for distributional assumptions, not CUSUM or CSS")} return(like) }) @@ -1287,4 +1387,4 @@ setClass("cpt",slots=list(data.set="ts", cpttype="character", method="character" stats::acf(data[(cpts[i]+1):cpts[i+1]],main=paste("Series part:",(cpts[i]+1),"-",cpts[i+1]),...) } }) - \ No newline at end of file + diff --git a/R/data_input.R b/R/data_input.R index 66ae4fc..ac92738 100644 --- a/R/data_input.R +++ b/R/data_input.R @@ -1,10 +1,10 @@ -data_input <- function(data, method, pen.value, costfunc, minseglen, Q, var=0, shape=1){ +data_input <- function(data, method, pen.value, costfunc, minseglen, Q, var=0, shape=1,size=NA){ if(var !=0){ - mu<-var + sumstat = SumStats(test.stat=costfunc, data=data, size=size, mu=var) }else{ - mu <- mean(data) + sumstat = SumStats(test.stat=costfunc, data=data, size=size) } - sumstat=cbind(c(0,cumsum(coredata(data))),c(0,cumsum(coredata(data)^2)),cumsum(c(0,(coredata(data)-mu)^2))) + if(method=="PELT"){ #out=PELT.meanvar.norm(coredata(data),pen.value) out=PELT(sumstat,pen=pen.value,cost_func = costfunc,minseglen=minseglen, shape=shape) ## K NEW ## @@ -20,10 +20,10 @@ data_input <- function(data, method, pen.value, costfunc, minseglen, Q, var=0, s } else if(method=="SegNeigh"){ #out=segneigh.meanvar.norm(coredata(data),Q,pen.value) - out=SEGNEIGH(data=data, pen.value=pen.value, Q=Q, costfunc=costfunc, var=var, shape=shape) + out=SEGNEIGH(data=data, pen.value=pen.value, Q=Q, costfunc=costfunc, var=var, shape=shape,size=size) # if(out$op.cpts==0){cpts=n} # else{cpts=c(sort(out$cps[out$op.cpts+1,][out$cps[out$op.cpts+1,]>0]),n)} } return(out) -} \ No newline at end of file +} diff --git a/R/exp.R b/R/exp.R index f0e82b9..87fc785 100644 --- a/R/exp.R +++ b/R/exp.R @@ -56,7 +56,7 @@ single.meanvar.exp<-function(data,penalty="MBIC",pen.value=0,class=TRUE,param.es n=ncol(data) } if(n<4){stop('Data must have atleast 4 observations to fit a changepoint model.')} - if(n<(2*minseglen)){stop('Minimum segment legnth is too large to include a change in this data')} + if(n<(2*minseglen)){stop('Minimum segment length is too large to include a change in this data')} penalty_decision(penalty, pen.value, n, diffparam=1, asymcheck="meanvar.exp", method="AMOC") if(is.null(dim(data))==TRUE || length(dim(data)) == 1){ @@ -274,7 +274,7 @@ multiple.meanvar.exp=function(data,mul.method="PELT",penalty="MBIC",pen.value=0, else{ n=ncol(data) } - if(n<(2*minseglen)){stop('Minimum segment legnth is too large to include a change in this data')} + if(n<(2*minseglen)){stop('Minimum segment length is too large to include a change in this data')} pen.value = penalty_decision(penalty, pen.value, n, diffparam=1, asymcheck=costfunc, method=mul.method) diff --git a/R/gamma.R b/R/gamma.R index 52d86fd..ca9fb5c 100644 --- a/R/gamma.R +++ b/R/gamma.R @@ -59,7 +59,7 @@ single.meanvar.gamma<-function(data,shape=1,penalty="MBIC",pen.value=0,class=TRU n=ncol(data) } if(n<4){stop('Data must have atleast 4 observations to fit a changepoint model.')} - if(n<(2*minseglen)){stop('Minimum segment legnth is too large to include a change in this data')} + if(n<(2*minseglen)){stop('Minimum segment length is too large to include a change in this data')} pen.value = penalty_decision(penalty, pen.value, n, diffparam=1, asymcheck="meanvar.gamma", method="AMOC") if(is.null(dim(data))==TRUE || length(dim(data)) == 1){ @@ -267,7 +267,7 @@ multiple.meanvar.gamma=function(data,shape=1,mul.method="PELT",penalty="MBIC",pe else{ n=ncol(data) } - if(n<(2*minseglen)){stop('Minimum segment legnth is too large to include a change in this data')} + if(n<(2*minseglen)){stop('Minimum segment length is too large to include a change in this data')} pen.value = penalty_decision(penalty, pen.value, n, diffparam, asymcheck = costfunc, method=mul.method) diff --git a/R/multiple.nonparametric.R b/R/multiple.nonparametric.R index 13d8834..b513e5d 100644 --- a/R/multiple.nonparametric.R +++ b/R/multiple.nonparametric.R @@ -123,7 +123,7 @@ multiple.var.css=function(data,mul.method="BinSeg",penalty="MBIC",pen.value=0,Q= else{ n=ncol(data) } - if(n<(2*minseglen)){stop('Minimum segment legnth is too large to include a change in this data')} + if(n<(2*minseglen)){stop('Minimum segment length is too large to include a change in this data')} pen.value = penalty_decision(penalty, pen.value, n, diffparam, asymcheck = costfunc, method=mul.method) if(is.null(dim(data))==TRUE || length(dim(data)) == 1){ @@ -316,7 +316,7 @@ multiple.mean.cusum=function(data,mul.method="BinSeg",penalty="Asymptotic",pen.v else{ n=ncol(data) } - if(n<(2*minseglen)){stop('Minimum segment legnth is too large to include a change in this data')} + if(n<(2*minseglen)){stop('Minimum segment length is too large to include a change in this data')} pen.value = penalty_decision(penalty, pen.value, n, diffparam, asymcheck = costfunc, method=mul.method) if(is.null(dim(data))==TRUE || length(dim(data)) == 1){ diff --git a/R/multiple.norm.R b/R/multiple.norm.R index 2f96697..090f506 100644 --- a/R/multiple.norm.R +++ b/R/multiple.norm.R @@ -502,7 +502,7 @@ multiple.var.norm=function(data,mul.method="PELT",penalty="MBIC",pen.value=0,Q=5 n=ncol(data) } if(n<4){stop('Data must have atleast 4 observations to fit a changepoint model.')} - if(n<(2*minseglen)){stop('Minimum segment legnth is too large to include a change in this data')} + if(n<(2*minseglen)){stop('Minimum segment length is too large to include a change in this data')} pen.value = penalty_decision(penalty, pen.value, n, diffparam, asymcheck=costfunc, method=mul.method) @@ -568,7 +568,7 @@ multiple.mean.norm=function(data,mul.method="PELT",penalty="MBIC",pen.value=0,Q= else{ n=ncol(data) } - if(n<(2*minseglen)){stop('Minimum segment legnth is too large to include a change in this data')} + if(n<(2*minseglen)){stop('Minimum segment length is too large to include a change in this data')} pen.value = penalty_decision(penalty, pen.value, n, diffparam, asymcheck = costfunc, method=mul.method) @@ -621,7 +621,7 @@ multiple.meanvar.norm=function(data,mul.method="PELT",penalty="MBIC",pen.value=0 else{ n=ncol(data) } - if(n<(2*minseglen)){stop('Minimum segment legnth is too large to include a change in this data')} + if(n<(2*minseglen)){stop('Minimum segment length is too large to include a change in this data')} pen.value = penalty_decision(penalty, pen.value, n, diffparam, asymcheck = costfunc, method=mul.method) diff --git a/R/poisson.R b/R/poisson.R index 0b84f80..92aaa5f 100644 --- a/R/poisson.R +++ b/R/poisson.R @@ -64,7 +64,7 @@ single.meanvar.poisson<-function(data,penalty="MBIC",pen.value=0,class=TRUE,para n=ncol(data) } if(n<4){stop('Data must have atleast 4 observations to fit a changepoint model.')} - if(n<(2*minseglen)){stop('Minimum segment legnth is too large to include a change in this data')} + if(n<(2*minseglen)){stop('Minimum segment length is too large to include a change in this data')} pen.value = penalty_decision(penalty, pen.value, n, diffparam=1, asymcheck="meanvar.poisson", method="AMOC") if(is.null(dim(data))==TRUE || length(dim(data)) == 1){ @@ -175,7 +175,8 @@ multiple.meanvar.poisson=function(data,mul.method="PELT",penalty="MBIC",pen.valu else{ n=ncol(data) } - if(n<(2*minseglen)){stop('Minimum segment legnth is too large to include a change in this data')} + if(n<4){stop('Data must have atleast 4 observations to fit a changepoint model.')} + if(n<(2*minseglen)){stop('Minimum segment length is too large to include a change in this data')} pen.value = penalty_decision(penalty, pen.value, n, diffparam=1, asymcheck=costfunc, method=mul.method) if(is.null(dim(data))==TRUE || length(dim(data)) == 1){ diff --git a/R/single.nonparametric.R b/R/single.nonparametric.R index a8fc560..3e52806 100644 --- a/R/single.nonparametric.R +++ b/R/single.nonparametric.R @@ -57,7 +57,7 @@ single.var.css<-function(data,penalty="MBIC",pen.value=0,class=TRUE,param.estima n=ncol(data) } if(n<4){stop('Data must have atleast 4 observations to fit a changepoint model.')} - if(n<(2*minseglen)){stop('Minimum segment legnth is too large to include a change in this data')} + if(n<(2*minseglen)){stop('Minimum segment length is too large to include a change in this data')} pen.value = penalty_decision(penalty, pen.value, n, diffparam, asymcheck = "var.css", method="AMOC") if(is.null(dim(data))==TRUE || length(dim(data)) == 1){ @@ -164,7 +164,7 @@ single.mean.cusum<-function(data,penalty="Asymptotic",pen.value=0.05,class=TRUE, n=ncol(data) } if(n<2){stop('Data must have atleast 2 observations to fit a changepoint model.')} - if(n<(2*minseglen)){stop('Minimum segment legnth is too large to include a change in this data')} + if(n<(2*minseglen)){stop('Minimum segment length is too large to include a change in this data')} pen.value = penalty_decision(penalty, pen.value, n, diffparam=1, asymcheck="mean.cusum", method="AMOC") if(is.null(dim(data))==TRUE || length(dim(data)) == 1){ diff --git a/R/single.norm.R b/R/single.norm.R index ff13331..bcffbca 100644 --- a/R/single.norm.R +++ b/R/single.norm.R @@ -56,7 +56,7 @@ single.mean.norm<-function(data,penalty="MBIC",pen.value=0,class=TRUE,param.esti n=ncol(data) } if(n<2){stop('Data must have atleast 2 observations to fit a changepoint model.')} - if(n<(2*minseglen)){stop('Minimum segment legnth is too large to include a change in this data')} + if(n<(2*minseglen)){stop('Minimum segment length is too large to include a change in this data')} pen.value = penalty_decision(penalty, pen.value, n, diffparam=1, asymcheck="mean.norm", method="AMOC") if(is.null(dim(data))==TRUE || length(dim(data)) == 1){ # single dataset @@ -143,7 +143,7 @@ single.var.norm<-function(data,penalty="MBIC",pen.value=0,know.mean=FALSE,mu=NA, n=ncol(data) } if(n<4){stop('Data must have atleast 4 observations to fit a changepoint model.')} - if(n<(2*minseglen)){stop('Minimum segment legnth is too large to include a change in this data')} + if(n<(2*minseglen)){stop('Minimum segment length is too large to include a change in this data')} pen.value = penalty_decision(penalty, pen.value, n, diffparam=1, asymcheck="var.norm", method="AMOC") @@ -274,7 +274,7 @@ single.meanvar.norm<-function(data,penalty="MBIC",pen.value=0,class=TRUE,param.e n=ncol(data) } if(n<4){stop('Data must have atleast 4 observations to fit a changepoint model.')} - if(n<(2*minseglen)){stop('Minimum segment legnth is too large to include a change in this data')} + if(n<(2*minseglen)){stop('Minimum segment length is too large to include a change in this data')} pen.value = penalty_decision(penalty, pen.value, n, diffparam=1, asymcheck="meanvar.norm", method="AMOC") diff --git a/R/sumstat.R b/R/sumstat.R new file mode 100644 index 0000000..9347d21 --- /dev/null +++ b/R/sumstat.R @@ -0,0 +1,29 @@ +SumStats <- function(test.stat, data, size, mu=NA){ + #output: matrix nrow = length(data) & ncol = 3 + if(grepl("binomial",tolower(test.stat))){ + #Col 1: sum of data + #Col 2: sum of size + #Col 3: BLANK (set to zero) + y = c(0, cumsum(coredata(data))) + if(length(data) == length(size)){ + m = c(0, cumsum(coredata(size))) + }else if(length(size)==1){ + m = (0:length(data))*coredata(size) + }else{ + stop("Dimenstion of size is not as expected.") + } + blank = rep(0,length(data)+1) + sumstat <- cbind(y,m,blank) + }else{ + ##Required for Normal, Exponential, Gamma and Poisson + #col 1: sum of data + #col 2: sum of squared data + #col 3: sum of squared deviations from the sample mean (overall?) + if(is.na(mu)) mu = mean(data) + y = c(0,cumsum(coredata(data))) + y2 = c(0, cumsum(coredata(data)^2)) + rss = c(0,cumsum((coredata(data)-mu)^2)) + sumstat <- cbind(y,y2,rss) + } + return(sumstat) +} \ No newline at end of file diff --git a/R/zzz.R b/R/zzz.R index 4c914f8..a17ec5c 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -3,5 +3,7 @@ f <- read.dcf(file.path(libname, pkgname, "DESCRIPTION"), c("Version", "Date")) packageStartupMessage('Successfully loaded changepoint package version ', - f[1,1],'\n See NEWS for details of changes.') -} \ No newline at end of file + f[1,1], + '\n Note that the default method in the cpt.* functions has changed from AMOC to PELT', + '\n See NEWS for details of changes.') +} diff --git a/man/changepoint-package.Rd b/man/changepoint-package.Rd index ffc07fb..5c031c5 100644 --- a/man/changepoint-package.Rd +++ b/man/changepoint-package.Rd @@ -12,15 +12,15 @@ Implements various mainstream and specialised changepoint methods for finding si \tabular{ll}{ Package: \tab changepoint\cr Type: \tab Package\cr -Version: \tab 2.2.5 \cr -Date: \tab 2022-11-08\cr +Version: \tab 2.3 \cr +Date: \tab 2023-07-06\cr License: \tab GPL\cr LazyLoad: \tab yes\cr } } \author{ -Rebecca Killick , Kaylea Haynes with contributions from Idris A. Eckley , Paul Fearnhead . +Rebecca Killick , Kaylea Haynes with contributions from Paul Fearnhead and Simon Taylor . Maintainer: Rebecca Killick } diff --git a/man/class_input.Rd b/man/class_input.Rd index 05558e6..85760ec 100644 --- a/man/class_input.Rd +++ b/man/class_input.Rd @@ -12,7 +12,7 @@ WARNING: No checks on arguments are performed! } \usage{ class_input(data, cpttype, method, test.stat, penalty, pen.value, minseglen, -param.estimates, out=list(), Q=NA, shape=NA) +param.estimates, out=list(), Q=NA, shape=NA, size=NA) } \arguments{ \item{data}{ @@ -47,6 +47,9 @@ param.estimates, out=list(), Q=NA, shape=NA) } \item{shape}{ Value of the assumed known shape parameter required when test.stat="Gamma". +} + \item{size}{ + Value of the assumed known size parameter required when test.stat="Binomial". This is the number of trials for each observation in data. Can be a single value (assumed the same over time), a vector the length of data, or a matrix the same dimensions as data. } } \details{ diff --git a/man/cpt.mean.Rd b/man/cpt.mean.Rd index 134e151..1e17700 100644 --- a/man/cpt.mean.Rd +++ b/man/cpt.mean.Rd @@ -7,7 +7,7 @@ Identifying Changes in Mean Calculates the optimal positioning and (potentially) number of changepoints for data using the user specified method. } \usage{ -cpt.mean(data,penalty="MBIC",pen.value=0,method="AMOC",Q=5,test.stat="Normal",class=TRUE, +cpt.mean(data,penalty="MBIC",pen.value=0,method="PELT",Q=5,test.stat="Normal",class=TRUE, param.estimates=TRUE,minseglen=1) } \arguments{ diff --git a/man/cpt.meanvar.Rd b/man/cpt.meanvar.Rd index 41c40dd..28eb9f9 100644 --- a/man/cpt.meanvar.Rd +++ b/man/cpt.meanvar.Rd @@ -7,8 +7,8 @@ Identifying Changes in Mean and Variance Calculates the optimal positioning and (potentially) number of changepoints for data using the user specified method. } \usage{ -cpt.meanvar(data,penalty="MBIC",pen.value=0,method="AMOC",Q=5,test.stat="Normal", -class=TRUE,param.estimates=TRUE,shape=1,minseglen=2) +cpt.meanvar(data,penalty="MBIC",pen.value=0,method="PELT",Q=5,test.stat="Normal", +class=TRUE,param.estimates=TRUE,shape=1,size=NA,minseglen=2) } \arguments{ \item{data}{ @@ -27,7 +27,7 @@ class=TRUE,param.estimates=TRUE,shape=1,minseglen=2) The maximum number of changepoints to search for using the "BinSeg" method. The maximum number of segments (number of changepoints + 1) to search for using the "SegNeigh" method. } \item{test.stat}{ - The assumed test statistic / distribution of the data. Currently only "Normal", "Gamma", "Exponential" and "Poisson" are supported. + The assumed test statistic / distribution of the data. Currently only "Normal", "Gamma", "Exponential", "Poisson" and "Binomial" are supported. } \item{class}{ Logical. If TRUE then an object of class \code{cpt} is returned. @@ -37,6 +37,9 @@ class=TRUE,param.estimates=TRUE,shape=1,minseglen=2) } \item{shape}{ Value of the assumed known shape parameter required when test.stat="Gamma". +} + \item{size}{ + Value of the assumed known size parameter required when test.stat="Binomial". This is the number of trials for each observation in data. Can be a single value (assumed the same over time), a vector the length of data, or a matrix the same dimensions as data. } \item{minseglen}{ Positive integer giving the minimum segment length (no. of observations between changes), default is the minimum allowed by theory. @@ -76,6 +79,8 @@ Change in Exponential model: Chen, J. and Gupta, A. K. (2000) \emph{Parametric s Change in Poisson model: Chen, J. and Gupta, A. K. (2000) \emph{Parametric statistical change point analysis}, Birkhauser +Change in Binomial model: Chen, J. and Gupta, A. K. (2000) \emph{Parametric statistical change point analysis}, Birkhauser + PELT Algorithm: Killick R, Fearnhead P, Eckley IA (2012) Optimal detection of changepoints with a linear computational cost, \emph{JASA} \bold{107(500)}, 1590--1598 CROPS: Haynes K, Eckley IA, Fearnhead P (2014) Efficient penalty search for multiple changepoint problems (in submission), arXiv:1412.3617 diff --git a/man/cpt.var.Rd b/man/cpt.var.Rd index 59008f6..4c446b5 100644 --- a/man/cpt.var.Rd +++ b/man/cpt.var.Rd @@ -7,7 +7,7 @@ Identifying Changes in Variance Calculates the optimal positioning and (potentially) number of changepoints for data using the user specified method. } \usage{ -cpt.var(data,penalty="MBIC",pen.value=0,know.mean=FALSE,mu=NA,method="AMOC",Q=5, +cpt.var(data,penalty="MBIC",pen.value=0,know.mean=FALSE,mu=NA,method="PELT",Q=5, test.stat="Normal",class=TRUE,param.estimates=TRUE,minseglen=2) } \arguments{ diff --git a/man/param-methods.Rd b/man/param-methods.Rd index 3826116..416b544 100644 --- a/man/param-methods.Rd +++ b/man/param-methods.Rd @@ -12,7 +12,7 @@ \describe{ \item{\code{signature(object = "cpt")}}{ - Replaces the \code{param.value} slot in object with the parameter estimates that are appropriate for the changepoint type (\code{cpttype} slot). If the Gamma test statistic is used then the shape parameter is required as a variable. + Replaces the \code{param.value} slot in object with the parameter estimates that are appropriate for the changepoint type (\code{cpttype} slot). If the Gamma test statistic is used then the shape parameter is required as an argument. If the Binomial test statistic is used then the size parameter is required as an argument } \item{\code{signature(object = "cpt.range")}}{ Returns a new (blank) cpt.range object where the \code{param.value} slot contains the parameter estimates for a segmentation with the \code{ncpts} specified number of changepoints. This is also tailored to the changepoint type (\code{cpttype} slot) of the original object. If the Gamma test statistic is used then the shape parameter is required as a variable. An example of use is given in the \link{cpt.mean} help file. diff --git a/src/BinSeg_one_func_minseglen.c b/src/BinSeg_one_func_minseglen.c index edab9e0..2f8930e 100644 --- a/src/BinSeg_one_func_minseglen.c +++ b/src/BinSeg_one_func_minseglen.c @@ -39,12 +39,14 @@ void binseg(char** cost_func, //Descibe the cost function used i.e. norm.mean.co double mll_meanvar_exp(double, double, double, int, double); double mll_meanvar_gamma(double, double, double, int, double); double mll_meanvar_poisson(double, double, double, int, double); + double mll_meanvar_binomial(double, double, double, int, double); double mbic_var(double, double, double, int, double); double mbic_mean(double, double, double, int, double); double mbic_meanvar(double, double, double, int, double); double mbic_meanvar_exp(double, double, double, int, double); double mbic_meanvar_gamma(double, double, double, int, double); double mbic_meanvar_poisson(double, double, double, int, double); + double mbic_meanvar_binomial(double, double, double, int, double); double (*costfunction)(double, double, double, int, double); if (strcmp(*cost_func,"var.norm")==0){ @@ -65,7 +67,10 @@ void binseg(char** cost_func, //Descibe the cost function used i.e. norm.mean.co else if (strcmp(*cost_func,"meanvar.poisson")==0){ costfunction = &mll_meanvar_poisson; } - else if (strcmp(*cost_func,"mean.norm.mbic")==0){ + else if (strcmp(*cost_func,"meanvar.binomial")==0){ + costfunction = &mll_meanvar_binomial; + } +else if (strcmp(*cost_func,"mean.norm.mbic")==0){ costfunction = &mbic_mean; } else if (strcmp(*cost_func,"var.norm.mbic")==0){ @@ -83,7 +88,10 @@ void binseg(char** cost_func, //Descibe the cost function used i.e. norm.mean.co else if (strcmp(*cost_func,"meanvar.poisson.mbic")==0){ costfunction = &mbic_meanvar_poisson; } - +else if (strcmp(*cost_func,"meanvar.binomial.mbic")==0){ + costfunction = &mbic_meanvar_binomial; + } + void max_which(double*, int, double*, int*); void order_vec(int[], int); diff --git a/src/PELT_one_func_minseglen.c b/src/PELT_one_func_minseglen.c index da39530..bdfdeab 100644 --- a/src/PELT_one_func_minseglen.c +++ b/src/PELT_one_func_minseglen.c @@ -52,12 +52,14 @@ void PELTC(char** cost_func, double mll_meanvar_exp(double, double, double, int, double); double mll_meanvar_gamma(double, double, double, int, double); double mll_meanvar_poisson(double, double, double, int, double); + double mll_meanvar_binomial(double, double, double, int, double); double mbic_var(double, double, double, int, double); double mbic_mean(double, double, double, int, double); double mbic_meanvar(double, double, double, int, double); double mbic_meanvar_exp(double, double, double, int, double); double mbic_meanvar_gamma(double, double, double, int, double); double mbic_meanvar_poisson(double, double, double, int, double); + double mbic_meanvar_binomial(double, double, double, int, double); if (strcmp(*cost_func,"var.norm")==0){ costfunction = &mll_var; @@ -77,6 +79,9 @@ void PELTC(char** cost_func, else if (strcmp(*cost_func,"meanvar.poisson")==0){ costfunction = &mll_meanvar_poisson; } + else if (strcmp(*cost_func,"meanvar.binomial")==0){ + costfunction = &mll_meanvar_binomial; + } else if (strcmp(*cost_func,"mean.norm.mbic")==0){ costfunction = &mbic_mean; } @@ -95,7 +100,10 @@ void PELTC(char** cost_func, else if (strcmp(*cost_func,"meanvar.poisson.mbic")==0){ costfunction = &mbic_meanvar_poisson; } - + else if (strcmp(*cost_func,"meanvar.binomial.mbic")==0){ + costfunction = &mbic_meanvar_binomial; + } + // int *lastchangecpts; // lastchangecpts = (int *)calloc(*n+1,sizeof(int)); diff --git a/src/cost_general_functions.c b/src/cost_general_functions.c index 8d4baba..d5e9ef5 100644 --- a/src/cost_general_functions.c +++ b/src/cost_general_functions.c @@ -43,6 +43,14 @@ double mll_meanvar_poisson(double x, double x2, double x3, int n, double shape){ else{return(2*x*(log(n)-log(x)));} } +double mll_meanvar_binomial(double x, double x2, double x3, int n, double shape){ + //x = sum of data + //x2 = sum of size + if(x==0){return(0);} + if(x==x2){return(0);} + else{return(2*x2*log(x2)-2*x*log(x)-2*(x2-x)*log(x2-x));} +} + double mbic_var(double x, double x2, double x3, int n, double shape){ if(x3<=0){x3=0.00000000001;} return(n*(log(2*M_PI)+log(x3/n)+1)+log(n)); /* M_PI is in Rmath.h */ @@ -72,6 +80,13 @@ double mbic_meanvar_poisson(double x, double x2, double x3, int n, double shape) else{return(2*x*(log(n)-log(x))+log(n));} } +double mbic_meanvar_binomial(double x, double x2, double x3, int n, double shape){ + //x = sum of data + //x2 = sum of size + if(x==0){return(0);} + if(x==x2){return(0);} + else{return(2*x2*log(x2)-2*x*log(x)-2*(x2-x)*log(x2-x) + log(n));} +} void max_which(double *array,int n,double *maxout,int *whichout){ // Function to find maximum of an array with n elements that is put in max diff --git a/tests/testthat/test-cptmean.R b/tests/testthat/test-cptmean.R index 306c310..e469d7e 100644 --- a/tests/testthat/test-cptmean.R +++ b/tests/testthat/test-cptmean.R @@ -152,7 +152,7 @@ checkCROPS <- function(){ }else{ - expect_that(cpt.mean(data=data[[d]], method=methods[m],penalty=penalties[p], pen.value=cropspenval[[cr]], test.stat=testStats[[ts]], class=cl, param.estimates=pe), throws_error('Only Normal, Exponential, Gamma and Poisson are valid test statistics')) + expect_that(cpt.mean(data=data[[d]], method=methods[m],penalty=penalties[p], pen.value=cropspenval[[cr]], test.stat=testStats[[ts]], class=cl, param.estimates=pe), throws_error('Only Normal, Exponential, Gamma, Poisson and Binomial are valid test statistics')) } } }) @@ -216,7 +216,7 @@ checkOtherPenalties <- function(methodLog){ for(d in 1:length(data)){ if(is.element(NA, data[[d]])){ test_that(paste0("Test #",t," :data=",d), { - expect_that(cpt.mean(data=data[[d]]),throws_error("Missing value: NA is not allowed in the data as changepoint methods are only sensible for regularly spaced data.")) + expect_that(cpt.mean(data=data[[d]]),throws_error("Missing value: NA is not allowed in the data. If suitable for the application, drop the NAs but then be careful with inference.")) t = t + 1 }) diff --git a/tests/testthat/test-cptmeanvar.R b/tests/testthat/test-cptmeanvar.R index ed99578..0cd55d3 100644 --- a/tests/testthat/test-cptmeanvar.R +++ b/tests/testthat/test-cptmeanvar.R @@ -190,7 +190,7 @@ checkCROPS <- function(){ }else{ - expect_that(cpt.meanvar(data=data[[d]], method=methods[m],penalty=penalties[p], pen.value=cropspenval[[cr]], test.stat=testStats[[ts]], class=cl, param.estimates=pe), throws_error('Only Normal, Exponential, Gamma and Poisson are valid test statistics')) + expect_that(cpt.meanvar(data=data[[d]], method=methods[m],penalty=penalties[p], pen.value=cropspenval[[cr]], test.stat=testStats[[ts]], class=cl, param.estimates=pe), throws_error('Only Normal, Exponential, Gamma, Poisson and Binomial are valid test statistics')) } } t=t+1 @@ -208,7 +208,7 @@ for(d in 1:length(data)){ if(is.element(NA, data[[d]])){ test_that(paste0("Test #",t," :data=",d,"penalty=",penalties[p],", method=",methods[m],",class=",cl,", param=",pe,", test.stat=",testStats[ts]), { - expect_that(cpt.meanvar(data=data[[d]]),throws_error('Missing value: NA is not allowed in the data as changepoint methods are only sensible for regularly spaced data.')) + expect_that(cpt.meanvar(data=data[[d]]),throws_error('Missing value: NA is not allowed in the data. If suitable for the application, drop the NAs but then be careful with inference.')) #not user friendly error : Error in if (teststat >= pen.value) { : # missing value where TRUE/FALSE needed # In addition: Warning message: @@ -221,7 +221,7 @@ for(d in 1:length(data)){ }else if(length(data[[d]]) <= 2){ test_that(paste0("Test #",t," :data=",d,"penalty=",penalties[p],", method=",methods[m],",class=",cl,", param=",pe,", test.stat=",testStats[ts]), { - expect_that(cpt.meanvar(data=data[[d]]),throws_error("Data must have atleast")) + expect_that(cpt.meanvar(data=data[[d]], method=methods[m], test.stat=testStats[ts]),throws_error("Data must have atleast")) t = t + 1 }) diff --git a/tests/testthat/test-cptvar.R b/tests/testthat/test-cptvar.R index 4b9f044..58f3c56 100644 --- a/tests/testthat/test-cptvar.R +++ b/tests/testthat/test-cptvar.R @@ -193,7 +193,7 @@ checkCROPS <- function(){ }else{ - expect_that(cpt.var(data=data[[d]], method=methods[m],penalty=penalties[p], pen.value=cropspenval[[cr]], test.stat=testStats[[ts]], class=cl, param.estimates=pe), throws_error('Only Normal, Exponential, Gamma and Poisson are valid test statistics')) + expect_that(cpt.var(data=data[[d]], method=methods[m],penalty=penalties[p], pen.value=cropspenval[[cr]], test.stat=testStats[[ts]], class=cl, param.estimates=pe), throws_error('Only Normal, Exponential, Gamma, Poisson and Binomial are valid test statistics')) } } }) @@ -209,7 +209,7 @@ for(d in 1:length(data)){ if(is.element(NA, data[[d]])){ test_that(paste0("Test #",t," :data=",d,", penalty=",penalties[p],", method=",methods[m],",class=",cl,", param=",pe,", test.stat=",testStats[ts]), { - expect_that(cpt.var(data=data[[d]]),throws_error('Missing value: NA is not allowed in the data as changepoint methods are only sensible for regularly spaced data.')) + expect_that(cpt.var(data=data[[d]]),throws_error('Missing value: NA is not allowed in the data. If suitable for the application, drop the NAs but then be careful with inference.')) #not user friendly error : Error in if (teststat >= pen.value) { : # missing value where TRUE/FALSE needed # In addition: Warning message: diff --git a/tests/testthat/test-examples.R b/tests/testthat/test-examples.R index ef29b67..68168fd 100644 --- a/tests/testthat/test-examples.R +++ b/tests/testthat/test-examples.R @@ -5,19 +5,19 @@ context("man file example tests") set.seed(1) x=c(rnorm(100,0,1),rnorm(100,0,10)) ansvar=cpt.var(x) -test_that('var1',expect_identical(cpts(ansvar),100)) +test_that('var1',expect_identical(cpts(ansvar),as.integer(100))) # change in mean set.seed(1) y=c(rnorm(100,0,1),rnorm(100,5,1)) ansmean=cpt.mean(y) -test_that('mean1',expect_identical(cpts(ansmean),100)) +test_that('mean1',expect_identical(cpts(ansmean),as.integer(100))) # change in mean and variance set.seed(1) z=c(rnorm(100,0,1),rnorm(100,2,10)) ansmeanvar=cpt.meanvar(z) -test_that('meanvar1',expect_identical(cpts(ansmeanvar),100)) +test_that('meanvar1',expect_identical(cpts(ansmeanvar),as.integer(100))) @@ -31,9 +31,9 @@ set.seed(1) x=c(rnorm(100,0,1),rnorm(100,10,1)) test_that('mean2',expect_equivalent(cpt.mean(x,penalty="SIC",method="AMOC",class=FALSE),c(100,1))) ans=cpt.mean(x,penalty="Asymptotic",pen.value=0.01,method="AMOC") -test_that('mean3',expect_identical(cpts(ans),100)) +test_that('mean3',expect_identical(cpts(ans),as.integer(100))) ans=cpt.mean(x,penalty="Manual",pen.value=0.8,method="AMOC",test.stat="CUSUM") -test_that('mean4',expect_equivalent(cpts(ans),101)) +test_that('mean4',expect_equivalent(cpts(ans),as.integer(101))) # Example of multiple changes in mean at 50,100,150 in simulated normal data set.seed(1)