Skip to content

Commit

Permalink
Added exp_sd parameter to register() and preprocess_data(), added new…
Browse files Browse the repository at this point in the history
… calc_variance() function, and updated calc_loglik() to use sigma_squared in every timepoint in the sum
  • Loading branch information
ruthkr committed Aug 23, 2023
1 parent d55c6f1 commit c3a5b7e
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 10 deletions.
5 changes: 4 additions & 1 deletion R/process_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#' \item{Scales data via \code{\link{scale_data}}.}
#'
#' @noRd
preprocess_data <- function(input, reference, query, scaling_method = c("none", "z-score", "min-max")) {
preprocess_data <- function(input, reference, query, exp_sd = NA, scaling_method = c("none", "z-score", "min-max")) {
# Suppress "no visible binding for global variable" note
gene_id <- NULL
accession <- NULL
Expand Down Expand Up @@ -39,6 +39,9 @@ preprocess_data <- function(input, reference, query, scaling_method = c("none",
# Scale data
scaled_data <- scale_data(mean_data, all_data, scaling_method)

# Calculate expression variance
scaled_data <- calc_variance(scaled_data, exp_sd)

return(scaled_data)
}

Expand Down
11 changes: 9 additions & 2 deletions R/register.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#' @param optimise_registration_parameters Whether to optimise registration parameters. By default, \code{TRUE}.
#' @param optimisation_method Optimisation method to use. Either \code{"nm"} for Nelder-Mead (default), \code{"lbfgsb"} for L-BFGS-B, or \code{"sa"} for Simulated Annealing.
#' @param optimisation_config Optional list with arguments to override the default optimisation configuration.
#' @param exp_sd Optional experimental standard deviation on the expression replicates.
#'
#' @return This function returns a list of data frames, containing:
#'
Expand Down Expand Up @@ -43,7 +44,8 @@ register <- function(input,
overlapping_percent = 0.5,
optimise_registration_parameters = TRUE,
optimisation_method = c("nm", "lbfgsb", "sa"),
optimisation_config = NULL) {
optimisation_config = NULL,
exp_sd = NA) {
# Suppress "no visible binding for global variable" note
gene_id <- NULL
accession <- NULL
Expand Down Expand Up @@ -71,7 +73,7 @@ register <- function(input,
)

# Preprocess data
all_data <- preprocess_data(input, reference, query, scaling_method)
all_data <- preprocess_data(input, reference, query, exp_sd, scaling_method)
gene_id_list <- unique(all_data$gene_id)
cli::cli_alert_info("Will process {length(gene_id_list)} gene{?s}.")

Expand Down Expand Up @@ -160,10 +162,15 @@ register <- function(input,
all_data_reg <- data.table::rbindlist(lapply(results, function(x) x$data_reg))
model_comparison <- data.table::rbindlist(lapply(results, function(x) x$model_comparison))

# Drop variance columns
all_data[, c("var") := NULL]
all_data_reg[, c("var") := NULL]

# Left join registered time points
setnames(all_data_reg, "timepoint", "timepoint_reg")
all_data[, timepoint_id := order(timepoint, expression_value), by = .(gene_id, accession, replicate)]
all_data_reg[, timepoint_id := order(timepoint_reg, expression_value), by = .(gene_id, accession, replicate)]

all_data <- merge(
all_data,
all_data_reg,
Expand Down
44 changes: 38 additions & 6 deletions R/utils-calcs.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,22 @@ calc_BIC <- function(logL, num_params, num_obs) {
#'
#' @noRd
calc_loglik <- function(model, data) {
y <- data$expression_value
n <- length(y)
preds <- stats::predict(model, newdata = data)
dist_squared <- sum((preds - y)^2)
sigma_squared <- sum((y - mean(y))^2, na.rm = TRUE) / (n - 1)
# Suppress "no visible binding for global variable" note
expression_value <- NULL
pred_expression_value <- NULL

data <- data.table::copy(data)

# Predict expressions
data[, pred_expression_value := stats::predict(model, newdata = data)]

# Calculate logLik
loglik <- -dist_squared / (2 * sigma_squared)
data[, dist_squared := (pred_expression_value - expression_value)^2]

dist_squared <- data$dist_squared
sigma_squared <- data$var

loglik <- -sum(dist_squared / (2 * sigma_squared))

return(loglik)
}
Expand All @@ -39,3 +47,27 @@ fit_spline_model <- function(data, x = "timepoint", num_spline_params = 4, degre

return(fit_object)
}

#' Calculate variance for the observed expression data
#'
#' @noRd
calc_variance <- function(all_data, exp_sd = NA) {
# Suppress "no visible binding for global variable" note
gene_id <- NULL
accession <- NULL
expression_value <- NULL
var <- NULL

if (any(is.na(exp_sd))) {
# TODO: notify user
# Calculate expression variance for replicates
# all_data[, var := sd(expression_value)^2, by = .(gene_id, accession, timepoint)]

# Calculate global expression variance
all_data[, var := (diff(range(expression_value)) / 10)^2, by = .(gene_id, accession)]
} else {
all_data[, var := exp_sd^2]
}

return(all_data)
}
5 changes: 4 additions & 1 deletion man/register.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit c3a5b7e

Please sign in to comment.