Skip to content

Commit

Permalink
Merge 9a5ac8c into df1956b
Browse files Browse the repository at this point in the history
  • Loading branch information
pmcharrison committed Jan 29, 2019
2 parents df1956b + 9a5ac8c commit e4ed7c3
Show file tree
Hide file tree
Showing 26 changed files with 262 additions and 163 deletions.
9 changes: 5 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
Package: tst
Title: Timestamped Suffix Trees (TSTs)
Version: 0.1.1
Version: 0.2.0
Authors@R: person("Peter", "Harrison", email = "pmc.harrison@gmail.com", role = c("aut", "cre"))
Description: Timestamped suffix trees (TSTs) are suffix trees that store timestamps for each state transition in the training sequence(s). Given a sequence of consecutive symbols, the TST can efficiently return the timepoints when this sequence occurred in the training data, the symbols that followed this sequence in the training data, and the timepoints when these continuation symbols were observed.
Depends: R (>= 3.4.2)
Depends: R (>= 3.4.0)
License: MIT + file LICENSE
Imports:
methods
methods,
tibble
Encoding: UTF-8
LazyData: true
ByteCompile: true
Expand All @@ -15,6 +16,6 @@ Suggests:
shiny (>= 1.1.0),
testthat (>= 2.0.0),
covr (>= 3.2.0)
RoxygenNote: 6.1.0
RoxygenNote: 6.1.1
URL: https://github.com/pmcharrison/tst
BugReports: https://github.com/pmcharrison/tst/issues
10 changes: 5 additions & 5 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
# Generated by roxygen2: do not edit by hand

S3method(plot,tree)
S3method(print,tree)
S3method(plot,tst)
S3method(print,tst)
export(add_seq)
export(count_ngram)
export(demo_suffix_tree)
export(get_active_order)
export(get_num_children)
export(is.tree)
export(last_location)
export(is.tst)
export(new_tree)
export(num_observed)
export(reset_active_nodes)
export(when_context)
export(when_continuation)
export(when_continuation_terminal)
export(when_continuations)
export(when_ngram)
importFrom(graphics,"plot")
importFrom(methods,"is")
importFrom(tibble,tibble)
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# tst 0.2.0

* The trees now distinguish between the time of an event and its ordinal position.
31 changes: 18 additions & 13 deletions R/add-seq.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@
#' In the latter case, only the tree's active nodes are updated.
#' @param tree Suffix tree as produced by \code{new_tree}.
#' @param seq Sequence to add, should be a numeric or character vector.
#' @param time Either \code{NULL} or a numeric vector of timepoints
#' of the same length as \code{seq}, which should be in ascending order.
#' If \code{NULL}, then defaults to a sequence in steps of 1
#' beginning from the number of events already in the tree
#' (so, for an empty tree: 0, 1, 2, 3, ...).
#' @param save Boolean; whether or not to save the sequence in the tree
#' (if \code{save = FALSE}, only the active nodes are updated).
#' @param when Boolean; ignored if \code{save = FALSE}; provides locations
#' for each element of \code{seq}. If \code{NULL} (default), locations
#' are determined as a numeric sequence beginning at the last location
#' entered into the tree, and increasing by 1 each time.
#' @param reset_active_nodes By default, the tree's active nodes are reset
#' before adding the new sequence (\code{reset_active_nodes = TRUE}).
#' Otherwise the new sequence is treated as a continuation of the previous
Expand All @@ -30,19 +31,22 @@
#' t <- new_tree()
#' add_seq(t, sample(5, 5, replace = TRUE))
#' plot(t)
add_seq <- function(tree, seq, save = TRUE, when = NULL,
add_seq <- function(tree,
seq,
save = TRUE,
time = NULL,
reset_active_nodes = TRUE,
terminate = TRUE,
visual = FALSE) {
# Inputs ####
stopifnot(is(tree, "tree"), is.atomic(seq), is.scalar.logical(save),
is.null.or(when, is.numeric), is.scalar.logical(reset_active_nodes),
stopifnot(is.tst(tree), is.atomic(seq), is.scalar.logical(save),
is.null.or(time, is.numeric), is.scalar.logical(reset_active_nodes),
is.scalar.logical(terminate), is.scalar.logical(visual))
if (save) {
if (is.null(when)) {
when <- seq(from = tree$when + 1L,
if (is.null(time)) {
time <- seq(from = tree$num_observed,
length.out = length(seq))
} else stopifnot(identical(length(when), length(seq)))
} else stopifnot(identical(length(time), length(seq)))
}

# Prep ####
Expand All @@ -56,14 +60,15 @@ add_seq <- function(tree, seq, save = TRUE, when = NULL,
value <- as.character(seq[i])
if (identical(value, tree$terminal))
stop("sequence cannot contain terminal character: ", tree$terminal)
add_symbol(tree, value = value, save = save, when = when[i])
add_symbol(tree, value = value, save = save, time = time[i])
if (visual) plot(tree, wait = i < length(seq), print = TRUE)
}

# Terminal ####
if (save && terminate) add_symbol(tree, value = tree$terminal,
if (save && terminate) add_symbol(tree,
value = tree$terminal,
save = TRUE,
when = when[i],
time = time[i],
terminal = TRUE)
tree
}
7 changes: 4 additions & 3 deletions R/add-symbol.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#' - Start at the highest-order context.
#' - If the symbol is novel, increment count, move to the next-lowest-order node, and repeat.
#' - If the symbol is not novel, increment count and stop.
add_symbol <- function(tree, value, save, when, terminal = FALSE) {
add_symbol <- function(tree, value, save, time, terminal = FALSE) {
stopifnot(length(tree$active_nodes) > 0L)
tree$active_order <- NA
exclude_update <- FALSE
Expand All @@ -14,14 +14,15 @@ add_symbol <- function(tree, value, save, when, terminal = FALSE) {
res <- take_path(node = tree$active_nodes[[j]],
value = value,
save = save,
when = when,
time = time,
pos = tree$num_observed + 1L,
terminal = terminal,
exclude_update = exclude_update)
if (is.na(tree$active_order) && !is(res, "empty_node")) tree$active_order <- j
if (!attr(res, "novel_symbol")) exclude_update <- TRUE
tree$active_nodes[[j]] <- res
}
if (save) tree$when <- when
if (save) tree$num_observed <- tree$num_observed + 1L
trim_active_nodes(tree, save = save)
add_root_to_active_nodes(tree)
}
2 changes: 2 additions & 0 deletions R/get-num-children.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#' excluding terminals if requested.
#' @param tree A suffix tree as produced by \code{new_tree}.
#' @param order Amount of context (e.g. 0 means no context).
#' @param exclude_terminals (Logical scalar)
#' Whether to exclude terminal symbols from this count.
#' @return The number of children of the active node with order as specified,
#' possibly including terminal symbols.
#' @export
Expand Down
4 changes: 2 additions & 2 deletions R/ngram.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ when_ngram <- function(tree, ngram) {
node <- get_root(tree)
for (val in ngram)
node <- take_path(node = node, value = val, save = FALSE)
if (is(node, "empty_node")) numeric() else as.numeric(unlist(node$log_0))
when_visited(node, update_excluded = FALSE)
}

#' Count n-gram occurrences
Expand All @@ -22,5 +22,5 @@ when_ngram <- function(tree, ngram) {
#' @return Integer n-gram count.
#' @export
count_ngram <- function(tree, ngram) {
length(when_ngram(tree, ngram))
nrow(when_ngram(tree, ngram))
}
8 changes: 5 additions & 3 deletions R/node.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
new_node <- function(value, when, terminal = FALSE) {
new_node <- function(value, pos, time, terminal = FALSE) {
self <- new.env()
self$value <- value
self$log_0 <- list(when) # without update exclusion
self$log_1 <- list(when) # with update exclusion
self$log_0 <- list(pos = list(pos), # without update exclusion
time = list(time))
self$log_1 <- list(pos = list(pos),
time = list(time)) # with update exclusion
self$children <- new.env()
class(self) <- c(if (terminal) "terminal",
"node")
Expand Down
2 changes: 1 addition & 1 deletion R/plot.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#' @export
plot.tree <- function(x, wait = FALSE, print = FALSE, shiny = FALSE,
plot.tst <- function(x, wait = FALSE, print = FALSE, shiny = FALSE,
update_excluded = FALSE, ...) {
if (!requireNamespace("DiagrammeR", quietly = TRUE)) {
stop("Package \"DiagrammeR\" needed for this function to work. Please install it.",
Expand Down
37 changes: 27 additions & 10 deletions R/take-path.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,23 @@
#' @param value Name of the (possibly non-existent) target node.
#' Should be a scalar character.
#' @param save Whether to save the journey into the tree.
#' @param when Ignored if save is FALSE; provides the timepoint that should
#' be saved into the tree.
#' @param time (Numeric scalar) Ignored if \code{save} is \code{FALSE};
#' provides the timepoint that should be saved into the tree.
#' @param pos (Integerish scalar) Ignored if \code{save} is \code{FALSE};
#' corresponds to the 1-indexed position of the event in the training data
#' (across all sequences).
#' @param terminal (Logical scalar) Whether or not the symbol is a terminal symbol.
#' @param exclude_update (Logical scalar) Whether or not updates should be excluded
#' for this event.
#' @return New node reached by taking this path (NA if no valid node found)
take_path <- function(node, value, save = FALSE, when = NULL, terminal = FALSE,
take_path <- function(node, value, save = FALSE,
time = NULL,
pos = NULL,
terminal = FALSE,
exclude_update = FALSE) {
if (is(node, "empty_node")) return(node)
if (save && is.null(when))
stop("if <save> is TRUE then <when> cannot be NULL")
if (save && (is.null(time) || is.null(pos)))
stop("if <save> is TRUE then <time> and <pos> cannot be NULL")
key <- as.character(value)
stopifnot(!is.na(key), is.scalar(key))
novel_symbol <- NULL
Expand All @@ -22,19 +31,27 @@ take_path <- function(node, value, save = FALSE, when = NULL, terminal = FALSE,
if (exclude_update)
stop("update exclusion should never apply to novel symbols")
if (save)
node$children[[key]] <- new_node(value, when, terminal = terminal) else
node$children[[key]] <- new_node(value,
time = time,
pos = pos,
terminal = terminal) else
EMPTY_NODE
} else {
novel_symbol <- FALSE
child <- node$children[[key]]
if (save) update_logs(child, when, exclude_update = exclude_update)
if (save) update_logs(child,
time = time,
pos = pos,
exclude_update = exclude_update)
child
}
attr(res, "novel_symbol") <- novel_symbol
res
}

update_logs <- function(node, when, exclude_update) {
node$log_0[[length(node$log_0) + 1L]] <- when
if (!exclude_update) node$log_1[[length(node$log_1) + 1L]] <- when
update_logs <- function(node, time, pos, exclude_update) {
for (log in c("log_0", if (!exclude_update) "log_1")) {
node[[log]]$time[[length(node[[log]]$time) + 1L]] <- time
node[[log]]$pos[[length(node[[log]]$pos) + 1L]] <- pos
}
}
49 changes: 26 additions & 23 deletions R/tree.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,45 +17,50 @@ new_tree <- function(order_bound = NULL, terminal = "$") {

# Main ####
x <- new.env()
x$root <- new_node(as.integer(NA), -Inf)
x$root <- new_node(as.integer(NA), pos = 0L, time = 0)
x$order_bound <- order_bound
x$active_nodes <- list() # ordered from smallest context to greatest context
x$active_order <- 0L
x$when <- 0L
x$num_observed <- 0L
x$terminal <- "$"
class(x) <- "tree"
class(x) <- "tst"
reset_active_nodes(x)
x
}

#' Is it a TST?
#'
#' Checks whether an object is of class "tst".
#'
#' @param x Object to check.
#'
#' @export
is.tree <- function(x) {
is(x, "tree")
is.tst <- function(x) {
is(x, "tst")
}

#' @export
print.tree <- function(x, ...) {
print.tst <- function(x, ...) {
order_bound <- if (is.null(x$order_bound)) "none" else x$order_bound
cat("A suffix tree with ", length(as.list(x$root$children)),
" observed symbols (including terminals)\n",
cat("A temporal suffix tree\n",
" - number of stored symbols (inc. terminals) = ", num_observed(x), "\n",
" - order bound = ", order_bound, "\n",
" - active order = ", x$active_order, "\n",
" - last symbol location = ", x$when, "\n",
sep = "")
}

get_root <- function(tree) {
stopifnot(is.tree(tree))
stopifnot(is.tst(tree))
tree$root
}

get_order_bound <- function(tree) {
stopifnot(is.tree(tree))
stopifnot(is.tst(tree))
tree$order_bound
}

get_active_nodes <- function(tree) {
stopifnot(is.tree(tree))
stopifnot(is.tst(tree))
tree$active_nodes
}

Expand All @@ -70,7 +75,7 @@ get_active_nodes <- function(tree) {
#' @export
#' @param tree Suffix tree, as produced by \code{new_tree()}.
get_active_order <- function(tree) {
stopifnot(is.tree(tree))
stopifnot(is.tst(tree))
tree$active_order
}

Expand All @@ -81,25 +86,23 @@ get_active_order <- function(tree) {
#' @param tree Suffix tree, as produced by \code{new_tree()}.
#' @export
reset_active_nodes <- function(tree) {
stopifnot(is.tree(tree))
stopifnot(is.tst(tree))
tree$active_nodes <- list(tree$root)
tree$active_order <- 0L
}

add_root_to_active_nodes <- function(tree) {
stopifnot(is.tree(tree))
stopifnot(is.tst(tree))
tree$active_nodes <- c(tree$root, tree$active_nodes)
}

#' Last location
#' Number of observed symbols
#'
#' Get the last stored location in the tree.
#' This is typically 0 (if no sequences have yet been stored in the tree)
#' or alternatively the last location in the last sequence entered into
#' the tree.
#' Returns the number of symbols that have been entered into the tree,
#' including repetitions.
#' @param tree Suffix tree, as produced by \code{new_tree()}.
#' @export
last_location <- function(tree) {
stopifnot(is.tree(tree))
tree$when
num_observed <- function(tree) {
stopifnot(is.tst(tree))
tree$num_observed
}
5 changes: 5 additions & 0 deletions R/tst-package.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#' @keywords internal
"_PACKAGE"

#' @importFrom tibble tibble
NULL

0 comments on commit e4ed7c3

Please sign in to comment.