* Added Binomial cost function to cpt.meanvar(). Thanks go to Simon T…

…aylor for contributing this new cost function. * Changed default method in cpt.*() functions to PELT * Fixed bug in SegNeigh algorithm where if the distribution used was Gamma the shape parameter was not passed to the lower functions. Thanks go to Simon Taylor for highlighting the bug.
rkillick · Jul 6, 2023 · 32d2e6f · 32d2e6f
1 parent 5253a05
commit 32d2e6f
Show file tree

Hide file tree

Showing 31 changed files with 551 additions and 83 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,13 +1,16 @@
 Package: changepoint
 Type: Package
 Title: Methods for Changepoint Detection
-Version: 2.2.5
-Date: 2022-11-08
+Version: 2.3
+Date: 2023-07-06
 Authors@R: c(person("Rebecca", "Killick",
         role=c("aut","cre"),email="r.killick@lancs.ac.uk"),
-				person("Kaylea", "Haynes", role="aut"),        
-				person("Idris", "Eckley", role=c("ths")),
-        person("Paul","Fearnhead",role=c("ctb","ths")), person("Robin", "Long", role="ctb"),person("Jamie","Lee",role="ctr"))
+	person("Kaylea", "Haynes", role="aut"),        
+        person("Paul","Fearnhead",role=c("ctb","ths")), 
+	person("Robin", "Long", role="ctb"),
+	person("Simon", "Taylor",role="ctb"),
+	person("Idris", "Eckley", role=c("ths")),
+	person("Jamie","Lee",role="ctr"))
 Maintainer: Rebecca Killick <r.killick@lancs.ac.uk>
 BugReports: https://github.com/rkillick/changepoint/issues
 URL: https://github.com/rkillick/changepoint/
@@ -16,14 +19,15 @@ Depends: R(>= 3.2), methods, stats, zoo(>= 0.9-1)
 Suggests: testthat, vdiffr
 License: GPL
 LazyData: true
-Packaged: 2022-11-08 14:14:13 UTC; killick
+Packaged: 2023-07-06 14:14:13 UTC; killick
 NeedsCompilation: yes
 Repository: CRAN
 Date/Publication: 2022-11-08 15:50:02 UTC
 Author: Rebecca Killick [aut, cre],
   Kaylea Haynes [aut],
-  Idris Eckley [ths],
   Paul Fearnhead [ctb, ths],
   Robin Long [ctb],
+  Simon Taylor [ctb],
+  Idris Eckley [ths],
   Jamie Lee [ctr]
 
diff --git a/NEWS b/NEWS
@@ -1,3 +1,10 @@
+Version 2.3
+===========
+* Added Binomial cost function to cpt.meanvar().  Thanks go to Simon Taylor for contributing this new cost function.
+* Changed default method in cpt.*() functions to PELT
+* Fixed bug in SegNeigh algorithm where if the distribution used was Gamma the shape parameter was not passed to the lower functions.  Thanks go to Simon Taylor for highlighting the bug.
+
+
 Version 2.2.5
 =============
 * Thanks to Toby Hocking for highlighting that when pen=0, BinSeg doesn't need to warn that ncpts=Q as this is by design. Warning for pen=0 removed.

diff --git a/R/CROPS.R b/R/CROPS.R
@@ -1,17 +1,19 @@
-CROPS <- function(data, penalty="CROPS", pen.value, method="PELT", test.stat="Normal", class=TRUE, param.est=TRUE, minseglen, shape, func){
+CROPS <- function(data, penalty="CROPS", pen.value, method="PELT", test.stat="Normal", class=TRUE, param.est=TRUE, minseglen, shape, func, size){
   if(method != "PELT"){stop('CROPS is a valid penalty choice only if method="PELT", please change your method or your penalty.')}
   mu <- mean(data)
-  sumstat=cbind(c(0,cumsum(coredata(data))),c(0,cumsum(coredata(data)^2)),cumsum(c(0,(coredata(data)-mu)^2)))
-
+
   switch(test.stat,
     "Normal" = {stat = "norm"},
     "Exponential" = {stat = "exp"},
     "Gamma" = {stat = "gamma"},
     "Poisson" = {stat = "poisson"},
-    {stop("Only Normal, Exponential, Gamma and Poisson are valid test statistics")}
+    "Binomial" = {stat = "binomial"},
+    {stop("Only Normal, Exponential, Gamma, Poisson and Binomial are valid test statistics")}
   )
   costfunc = paste0(func, ".", stat)
 
+  sumstat = SumStats(test.stat, data, size)
+
   out = range_of_penalties(sumstat, cost=costfunc, min_pen=pen.value[1], max_pen=pen.value[2], minseglen=minseglen)
 
   if(func=="var"){
@@ -23,10 +25,10 @@ CROPS <- function(data, penalty="CROPS", pen.value, method="PELT", test.stat="No
   }
 
   if(class==TRUE){
-      ans = class_input(data=data,cpttype=cpttype, method="PELT", test.stat=test.stat, penalty=penalty, pen.value=pen.value, minseglen=minseglen, param.estimates=param.est, out=out,shape=shape)
+      ans = class_input(data=data,cpttype=cpttype, method="PELT", test.stat=test.stat, penalty=penalty, pen.value=pen.value, minseglen=minseglen, param.estimates=param.est, out=out,shape=shape,size=size)
       if(func=="var"){
         param.est(ans)=c(param.est(ans),mean=mu)
       }
     return(ans)
   }else{return(out)}
-}
+}
diff --git a/R/SegNeigh_one_func_minseglen.R b/R/SegNeigh_one_func_minseglen.R
@@ -4,14 +4,18 @@ dataToString <- function(data){
 
 
 
-SEGNEIGH <- function(data, pen.value, costfunc, Q, var=0, shape=1){
+SEGNEIGH <- function(data, pen.value, costfunc, Q, var=0, shape=1,size=NA){
 
   #split the costfunction by . and then use the first two elements in eval(parse())
   if(Q<=0){stop(paste('Q is the maximum number of changepoints so should be greater than 0'))}
 
   split=strsplit(costfunc, "\\.")
-  if(var!=0){  
-  text = c("segneigh",".", split[[1]][1],".", split[[1]][2],"(",dataToString(data),",",Q,",",pen.value,",","know.mean=",TRUE,",","mu=",var,")")
+  if(split[[1]][2] == "gamma"){
+    text = c("segneigh",".", split[[1]][1],".", split[[1]][2],"(",dataToString(data),",",shape,",",Q,",",pen.value,")")
+  }else if(split[[1]][2] == "binomial"){
+    text = c("segneigh",".", split[[1]][1],".", split[[1]][2],"(",dataToString(data),",",dataToString(size),",",Q,",",pen.value,")")
+  }else if(var!=0){  
+    text = c("segneigh",".", split[[1]][1],".", split[[1]][2],"(",dataToString(data),",",Q,",",pen.value,",","know.mean=",TRUE,",","mu=",var,")")
   }else{
     text = c("segneigh",".", split[[1]][1],".", split[[1]][2],"(",dataToString(data),",",Q,",",pen.value,")")    
   }
@@ -21,4 +25,4 @@ SEGNEIGH <- function(data, pen.value, costfunc, Q, var=0, shape=1){
   out = eval(parse(text=paste(text, collapse="")))
 
   return(out)
-}
+}
diff --git a/R/binomial.R b/R/binomial.R
@@ -0,0 +1,273 @@
+cpt.meanvar.binom <- function(data,penalty="MBIC",pen.value=0,method="PELT",Q=5,class=TRUE,
+                  param.estimates=TRUE,minseglen=2,size=NA){
+  ##Checks already performed by cpt.meanvar:
+  #   data contains no NAs
+  #   method=="SegNeigh" & minseglen>2 ==> stop()
+  if(is.na(size)){
+    stop("For Binomial cost functions the size argument must be specified.")
+  }
+  if(method=="AMOC"){
+    return(single.meanvar.binomial(data, size, penalty, pen.value, class,
+                                   param.estimates,minseglen))
+  }else if(method=="PELT" || method=="BinSeg" || method=="SegNeigh"){
+    return(multiple.meanvar.binomial(data, size, mul.method=method,penalty,
+                                     pen.value,Q,class,param.estimates,minseglen))
+  }else{
+    stop("Invalid Method, must be AMOC, PELT, SegNeigh or BinSeg")
+  }
+}
+
+single.meanvar.binomial <- function(data, size, penalty="MBIC",pen.value=0,class=TRUE,
+                        param.estimates=TRUE,minseglen){
+  ##checks on data & size
+  if(any(data<0) || any(size<0)) stop("data and size needs to be positive.")
+  if(any(data%%1!=0) || any(size%%1!=0)) stop("data and size needs to be integers.")
+  if(length(size)==1){
+    SIZE <- data*0 + size ## repeat size to the dimension of data & inherit class
+  }else if(length(data)==length(size)){
+    if(!is.numeric(size)) stop("Only numeric size allowed")
+    if(anyNA(size)){stop("Missing value: NA is not allowed in size.")}
+    SIZE <- size
+  }else{
+    stop("Size needs to be either a single number or have the same dimensions as data.")  
+  }
+  if(any(data > SIZE)) stop("Elements of data cannot be greater than the corresponding element of size.")
+  if(is.null(dim(data))){
+    #single dataset
+    n <- length(data)
+  }else{
+    n <- nrow(data)
+  }
+  if(n<4) stop("Data must have at least 4 observations to fit this changepoint model.")
+  if(n<(2*minseglen)) stop("Minimum segment length is too large to include a changepoint in this data")
+
+  pen.value = penalty_decision(penalty, pen.value, n, diffparam=1, asymcheck="meanvar.binomial",method="AMOC")
+  if(is.null(dim(data))){
+    #single dataset
+    tmp = single.meanvar.binomial.calc(coredata(data),coredata(SIZE),extrainf=TRUE,minseglen)
+    if(penalty=="MBIC"){
+      tmp[3] = tmp[3] + log(tmp[1]) + log(n-tmp[1]+1)
+    }
+    ans = decision(tmp[1],tmp[2],tmp[3],penalty,n,diffparam=1,pen.value)
+    if(class){
+      return(class_input(data, size=SIZE, method="AMOC",
+                               penalty=penalty, pen.value=ans$pen,
+                               minseglen=minseglen, param.estimates=param.estimates,
+                               out = c(0,ans$cpt)))
+    }else{
+      return(ans$cpt)
+    }
+  }else{
+    #multiple datasets
+    tmp = single.meanvar.binomial.calc(data,SIZE,extrainf=TRUE,minseglen)
+    if(penalty=="MBIC"){
+      tmp[,3] = tmp[,3] + log(tmp[,1]) + log(n-tmp[,1]+1)
+    }
+    ans = decision(tmp[,1],tmp[,2],tmp[,3],penalty,n,diffparam=1,pen.value)
+    if(class){
+      rep = ncol(data)
+      out = list()
+      for(i in 1:rep){
+        if(length(size)==1){ size_out<- size }else{ size_out<-size[,i] } 
+        # the above looks repetitive from line 25 but 25 is generic and 71 is specific to multivariate
+        out[[i]] <- class_input(data[,i],size=size_out,
+                      method="AMOC", penalty=penalty, 
+                      pen.value=ans$pen, minseglen=minseglen, 
+                      param.estimates=param.estimates, out = c(0,ans$cpt[i]))
+      }
+      return(out)
+    }else{
+      return(ans$cpt)
+    }
+  }
+}
+
+
+
+single.meanvar.binomial.calc <- function(data,size,extrainf=TRUE,minseglen){
+  singledim <- function(data,size,extrainf=TRUE,minseglen){
+    n = length(data)
+    y = c(0,cumsum(data))
+    m = c(0,cumsum(size))
+    if(y[n+1] == 0 || y[n+1]==m[n+1]){
+      null = Inf
+    }else{
+      null=-2*(y[n+1]*log(y[n+1])-m[n+1]*log(m[n+1]) + (m[n+1]-y[n+1])*log(m[n+1]-y[n+1]))
+    }
+    taustar = minseglen:(n-minseglen)
+    sx1 = y[taustar+1]
+    sm1 = m[taustar+1]
+    sx1_sm1 = sm1-sx1
+    sx2 = y[n+1]-y[taustar+1]
+    sm2 = m[n+1]-m[taustar+1]
+    sx2_sm2 = sm2-sx2
+    tmp = sx1*log(sx1) - sm1*log(sm1) + sx1_sm1*log(sx1_sm1)
+    tmp = tmp + sx2*log(sx2) - sm2*log(sm2) + sx2_sm2*log(sx2_sm2)
+    ##NOTE: 
+    ## probMLE = c(rep(sx1/sm1,taustar[i]),rep(sx2/sm2,n-taustar[i])) 
+    ## sum(dbinom(data,size,probMLE,log=TRUE)) == tmp + sum(lchoose(size,data))
+
+    tmp = -2*tmp  #convert to deviance scale
+    if(anyNA(tmp)) tmp[is.na(tmp)] = Inf
+    tau=which(tmp==min(tmp,na.rm=T))[1]
+    taulike = tmp[tau]
+    tau = tau + minseglen - 1 ## correcting for the fact that we are starting at minseglen
+    if(extrainf){
+      out <- c(tau,null,taulike)
+      names(out) = c("cpt","null","alt")
+      return(out)
+    }else{
+      return(tau)
+    }
+  }
+
+  if(is.null(dim(data))){
+    cpt = singledim(data,size,extrainf,minseglen)
+  }else{
+    rep = nrow(data) 
+    n = ncol(data)   
+    if(!extrainf){
+      cpt=rep(0,rep)
+      for(i in 1:rep){
+        cpt[i] <- singledim(data[i,], size[i,], extrainf, minseglen)
+      }
+    }else{
+      cpt = matrix(0,ncol=3,nrow=rep)
+      for(i in 1:rep){
+        cpt[i,] = singledim(data[i,], size[i,], extrainf, minseglen)
+      }
+      colnames(cpt) <- c("cpt","null","alt")
+    }
+    return(cpt)
+  }
+}
+
+multiple.meanvar.binomial <- function(data, size, mul.method="PELT",penalty="MBIC",
+                          pen.value=0,Q=5,class=TRUE,param.estimates=TRUE,minseglen){
+  ##checks on data & size
+  if(length(size)==1){
+    SIZE <- data*0 + size ## repeat size to the dimension of data & inherit class
+  }else if(length(data)==length(size)){
+    if(!is.numeric(size)) stop("Only numeric size allowed")
+    if(anyNA(size)){stop("Missing value: NA is not allowed in the size as changepoint methods are only sensible for regularly spaced data.")}
+    SIZE <- size
+  }else{
+    stop("Size needs to be either a single number or have the same dimensions of data.")  
+  }
+  if(any(data<0) || any(size<0)) stop("Binomal test statistic requires postive data and size.")
+  if(any(data%%1!=0) || any(size%%1!=0)) stop("Binomal test statistic requires integer data and size.")
+  if(any(data > SIZE)) stop("Binomal test statistic requires data <= size.")
+
+  costfunc = "meanvar.binomial"
+  if(penalty == "MBIC"){
+    if(mul.method == "SegNeigh"){
+      stop("MBIC penalty not implemented for SegNeigh method, please choose an alternative penalty")
+    }
+    costfunc = paste0(costfunc,".mbic")
+  }
+
+  diffparam = 1
+  if(is.null(dim(data))){
+    n = length(data)
+  }else{
+    n = ncol(data)
+  }
+
+  if(n<(2*minseglen)) stop("Minimum segment length is too large to include a change in this data")
+  pen.value = penalty_decision(penalty, pen.value, n, diffparam = 1, asymcheck = costfunc, method = mul.method)  
+  if(is.null(dim(data))){
+    out = data_input(data=data,method=mul.method, pen.value=pen.value, costfunc=costfunc,
+                     minseglen=minseglen, Q=Q, size=size)
+    if(class){
+      return(class_input(data=data, size=SIZE, method=mul.method,
+                               penalty=penalty, pen.value=pen.value,
+                               minseglen=minseglen, param.estimates=param.estimates,
+                               out = out, Q=Q))
+    }else{
+      return(out[[2]])
+    }
+  }else{
+    rep = nrow(data)
+    out = list()
+    for(i in 1:rep){
+      out[[i]] = data_input(data=data[i,],method=mul.method, pen.value=pen.value,
+                            costfunc=costfunc, minseglen=minseglen, Q=Q, size=size[i,])
+    }
+    cpts = lapply(out,'[[',2)
+    if(class){
+      ans = list()
+      for(i in 1:rep){
+        if(length(size)==1){ size_out<- size }else{ size_out<-size[i,] }
+        ans[[i]] = class_input(data=data[i,],size=size_out,
+                                     method=mul.method, penalty=penalty, 
+                                     pen.value=pen.value, minseglen=minseglen, 
+                                     param.estimates=param.estimates, 
+                                     out = out[[i]], Q=Q)
+      }
+      return(ans)
+    }else{
+      return(cpts)
+    }
+  }
+}
+
+segneigh.meanvar.binomial <- function(data, size=1, Q=5, pen=0){
+  ##nb size is the same dimension of data
+  ##checks on data and size should have already been performed
+  n <- length(data)
+  if(n<4){stop('Data must have atleast 4 observations to fit a changepoint model.')}
+  if(Q>((n/2)+1)){stop(paste('Q is larger than the maximum number of segments',(n/2)+1))}
+
+  all.seg=matrix(0,ncol=n,nrow=n)
+  if(length(size)==1){ SIZE <- data*0 + size }else{SIZE <- size}
+  for(i in 1:n){
+    sumx=0
+    sumsize = 0
+    for(j in i:n){
+      sumx=sumx+data[j]
+      sumsize = sumsize + SIZE[j]
+      if(sumx==0 || sumx==sumsize){
+        all.seg[i,j]=-Inf
+      }
+      else{
+        all.seg[i,j]=sumx*log(sumx) - sumsize*log(sumsize) + (sumsize-sumx)*log(sumsize-sumx)
+      }
+    }
+  }
+  like.Q=matrix(0,ncol=n,nrow=Q)
+  like.Q[1,]=all.seg[1,]
+  cp=matrix(NA,ncol=n,nrow=Q)
+  for(q in 2:Q){
+    for(j in q:n){
+      like=NULL
+      if((j-2-q)<0){v=q}
+      else{v=(q):(j-2)}
+      like=like.Q[q-1,v]+all.seg[v+1,j]
+
+      like.Q[q,j]= max(like,na.rm=TRUE)
+      cp[q,j]=which(like==max(like,na.rm=TRUE))[1]+(q-1)
+    }
+
+  }
+  cps.Q=matrix(NA,ncol=Q,nrow=Q)
+  for(q in 2:Q){
+    cps.Q[q,1]=cp[q,n]
+    for(i in 1:(q-1)){
+      cps.Q[q,(i+1)]=cp[(q-i),cps.Q[q,i]]
+    }
+  }
+
+  op.cps=NULL
+  k=0:(Q-1)
+
+  for(i in 1:length(pen)){
+    criterion=-2*like.Q[,n]+k*pen[i]
+
+    op.cps=c(op.cps,which(criterion==min(criterion,na.rm=T))-1)
+  }
+  if(op.cps==(Q-1)){warning('The number of segments identified is Q, it is advised to increase Q to make sure changepoints have not been missed.')}
+  if(op.cps==0){cpts=n}
+  else{cpts=c(sort(cps.Q[op.cps+1,][cps.Q[op.cps+1,]>0]),n)}
+
+  return(list(cps=t(apply(cps.Q,1,sort,na.last=TRUE)),cpts=cpts,op.cpts=op.cps,pen=pen,like=criterion[op.cps+1],like.Q=like.Q[,n]))
+}