From 0ef9301bfd9bc100d2f8876b14bb2a4969ffe34a Mon Sep 17 00:00:00 2001 From: richelbilderbeek Date: Sat, 7 Jul 2018 19:38:24 +0200 Subject: [PATCH] Add tutorial for #9 --- .gitignore | 1 + R/bbt_run.R | 30 ++-- man/bbt_run.Rd | 36 ++-- vignettes/tutorial.R | 157 +++++++++++++++++ vignettes/tutorial.Rmd | 376 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 577 insertions(+), 23 deletions(-) create mode 100644 vignettes/tutorial.R create mode 100644 vignettes/tutorial.Rmd diff --git a/.gitignore b/.gitignore index 8adb7c6..e481c2a 100644 --- a/.gitignore +++ b/.gitignore @@ -42,3 +42,4 @@ vignettes/article_cache/ vignettes/article_files/ anthus_aco.log my_beast2_input.xml +vignettes/tutorial_cache/ diff --git a/R/bbt_run.R b/R/bbt_run.R index 3e5b507..db6f813 100644 --- a/R/bbt_run.R +++ b/R/bbt_run.R @@ -14,25 +14,35 @@ #' @param posterior_crown_age the posterior's crown age. Use NA to let #' BEAST2 estimate this parameter. Use a positive value to fix the #' crown age to that value -#' @param beast2_input_filename name of the BEAST2 configuration file +#' @param beast2_input_filename path of the BEAST2 configuration file. +#' By default, this file is put in a temporary folder with a random +#' filename, as the user needs not read it: it is used as input of BEAST2. +#' Specifying a \code{beast2_input_filename} allows to store that file +#' in a more permanently stored location. #' @param rng_seed the random number generator seed. Must be either #' \code{NA} or a positive non-zero value. An RNG seed of \code{NA} #' results in BEAST2 picking a random seed. #' @param beast2_output_log_filename name of the log file created by BEAST2, -#' containing the parameter estimates in time. By default, this temporary -#' file is put a default temporary folder and will be cleaned up by the -#' operating system. This file is a temporary file, as its content +#' containing the parameter estimates in time. By default, this +#' file is put a temporary folder with a random +#' filename, as the user needs not read it: its content #' is parsed and returned by this function. +#' Specifying a \code{beast2_output_log_filename} allows to store that file +#' in a more permanently stored location. #' @param beast2_output_trees_filenames name of the one or more trees -#' files created by BEAST2, one per alignment. By default, these temporary -#' files are put a default temporary folder and will be cleaned up by the -#' operating system. These files are temporary files, as their content +#' files created by BEAST2, one per alignment. By default, these +#' files are put a temporary folder with a random +#' filename, as the user needs not read it: their content #' is parsed and returned by this function. +#' Specifying \code{beast2_output_trees_filenames} allows to store these +#' one or more files in a more permanently stored location. #' @param beast2_output_state_filename name of the final state file created -#' by BEAST2, containing the operator acceptances. By default, this temporary -#' file is put a default temporary folder and will be cleaned up by the -#' operating system. This file is a temporary file, as its content +#' by BEAST2, containing the operator acceptances. By default, this +#' file is put a temporary folder with a random +#' filename, as the user needs not read it: its content #' is parsed and returned by this function. +#' Specifying a \code{beast2_output_state_filename} allows to store that file +#' in a more permanently stored location. #' @param beast2_jar_path path to the BEAST2 jar file #' @param verbose set to TRUE for more output #' @param cleanup set to FALSE to keep all temporary files diff --git a/man/bbt_run.Rd b/man/bbt_run.Rd index 6e417a8..66074b0 100644 --- a/man/bbt_run.Rd +++ b/man/bbt_run.Rd @@ -41,29 +41,39 @@ see \link[beautier]{create_mcmc}} BEAST2 estimate this parameter. Use a positive value to fix the crown age to that value} -\item{beast2_input_filename}{name of the BEAST2 configuration file} +\item{beast2_input_filename}{path of the BEAST2 configuration file. +By default, this file is put in a temporary folder with a random +filename, as the user needs not read it: it is used as input of BEAST2. +Specifying a \code{beast2_input_filename} allows to store that file +in a more permanently stored location.} \item{rng_seed}{the random number generator seed. Must be either \code{NA} or a positive non-zero value. An RNG seed of \code{NA} results in BEAST2 picking a random seed.} \item{beast2_output_log_filename}{name of the log file created by BEAST2, -containing the parameter estimates in time. By default, this temporary -file is put a default temporary folder and will be cleaned up by the -operating system. This file is a temporary file, as its content -is parsed and returned by this function.} +containing the parameter estimates in time. By default, this +file is put a temporary folder with a random +filename, as the user needs not read it: its content +is parsed and returned by this function. +Specifying a \code{beast2_output_log_filename} allows to store that file +in a more permanently stored location.} \item{beast2_output_trees_filenames}{name of the one or more trees -files created by BEAST2, one per alignment. By default, these temporary -files are put a default temporary folder and will be cleaned up by the -operating system. These files are temporary files, as their content -is parsed and returned by this function.} +files created by BEAST2, one per alignment. By default, these +files are put a temporary folder with a random +filename, as the user needs not read it: their content +is parsed and returned by this function. +Specifying \code{beast2_output_trees_filenames} allows to store these +one or more files in a more permanently stored location.} \item{beast2_output_state_filename}{name of the final state file created -by BEAST2, containing the operator acceptances. By default, this temporary -file is put a default temporary folder and will be cleaned up by the -operating system. This file is a temporary file, as its content -is parsed and returned by this function.} +by BEAST2, containing the operator acceptances. By default, this +file is put a temporary folder with a random +filename, as the user needs not read it: its content +is parsed and returned by this function. +Specifying a \code{beast2_output_state_filename} allows to store that file +in a more permanently stored location.} \item{beast2_jar_path}{path to the BEAST2 jar file} diff --git a/vignettes/tutorial.R b/vignettes/tutorial.R new file mode 100644 index 0000000..63a42ff --- /dev/null +++ b/vignettes/tutorial.R @@ -0,0 +1,157 @@ +## ----setup, include = FALSE---------------------------------------------- +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) + +## ------------------------------------------------------------------------ +library(babette) + +## ------------------------------------------------------------------------ +fasta_filename <- get_babette_path("anthus_aco.fas") +testit::assert(file.exists(fasta_filename)) + +## ------------------------------------------------------------------------ +mcmc <- create_mcmc(chain_length = 2000, store_every = 1000) + +## ----cache=TRUE---------------------------------------------------------- +out <- bbt_run( + fasta_filenames = fasta_filename, + mcmc = mcmc +) + +## ------------------------------------------------------------------------ +site_model <- create_site_model_jc69() +site_model <- create_jc69_site_model() + +## ----cache=TRUE---------------------------------------------------------- +out <- bbt_run( + fasta_filenames = fasta_filename, + site_models = site_model, + mcmc = mcmc +) + +## ------------------------------------------------------------------------ +clock_model <- create_clock_model_strict() +clock_model <- create_strict_clock_model() + +## ----cache=TRUE---------------------------------------------------------- +out <- bbt_run( + fasta_filenames = fasta_filename, + clock_models = clock_model, + mcmc = mcmc +) + +## ------------------------------------------------------------------------ +tree_prior <- create_tree_prior_yule() +tree_prior <- create_yule_tree_prior() + +## ----cache=TRUE---------------------------------------------------------- +out <- bbt_run( + fasta_filenames = fasta_filename, + tree_priors = tree_prior, + mcmc = mcmc +) + +## ------------------------------------------------------------------------ +mrca_prior <- create_mrca_prior( + alignment_id = get_alignment_id(fasta_filename = fasta_filename), + taxa_names = get_taxa_names(filename = fasta_filename)[1:2], + is_monophyletic = TRUE +) + +## ------------------------------------------------------------------------ +mrca_distr <- create_normal_distr( + mean = create_mean_param(value = 15.0), + sigma = create_sigma_param(value = 1.0) +) + +## ------------------------------------------------------------------------ +mrca_prior <- create_mrca_prior( + alignment_id = get_alignment_id(fasta_filename = fasta_filename), + taxa_names = get_taxa_names(filename = fasta_filename), + mrca_distr = mrca_distr +) + +## ----cache=TRUE---------------------------------------------------------- +out <- bbt_run( + fasta_filenames = fasta_filename, + mrca_priors = mrca_prior, + mcmc = mcmc +) + +## ----cache=TRUE---------------------------------------------------------- +# Prefer to use the 'create_mrca_prior' version below +out <- bbt_run( + fasta_filenames = fasta_filename, + posterior_crown_age = 15, + mcmc = mcmc +) + +## ----cache=TRUE---------------------------------------------------------- +# Prefer this over using 'posterior_crown_age' +out <- bbt_run( + fasta_filenames = fasta_filename, + mrca_priors = create_mrca_prior( + alignment_id = get_alignment_id(fasta_filename = fasta_filename), + taxa_names = get_taxa_names(filename = fasta_filename), + mrca_distr = create_normal_distr( + mean = create_mean_param(value = 15.0), + sigma = create_sigma_param(value = 0.0001) + ) + ), + mcmc = mcmc +) + +## ------------------------------------------------------------------------ +if (1 == 2) { + beast2_input_filename <- "beast_input.xml" + beast2_output_log_filename <- "beast_ouput.log" + beast2_output_trees_filenames <- "beast_output.trees" + beast2_output_state_filename <- "beast_state.xml.state" + all_files <- c( + beast2_input_filename, + beast2_output_log_filename, + beast2_output_trees_filenames, + beast2_output_state_filename + ) + + out <- bbt_run( + fasta_filenames = fasta_filename, + mcmc = mcmc, + beast2_input_filename = beast2_input_filename, + beast2_output_log_filename = beast2_output_log_filename, + beast2_output_trees_filenames = beast2_output_trees_filenames, + beast2_output_state_filename = beast2_output_state_filename + ) + testit::assert(all(file.exists(all_files))) + file.remove(all_files) +} + +## ----cache=TRUE---------------------------------------------------------- +out <- bbt_run( + fasta_filenames = fasta_filename, + mcmc = mcmc, + rng_seed = 314 +) + +## ------------------------------------------------------------------------ +beast2_jar_path <- beastier::get_default_beast2_jar_path() +print(beast2_jar_path) + +## ------------------------------------------------------------------------ +if (file.exists(beast2_jar_path)) { + out <- bbt_run( + fasta_filenames = fasta_filename, + mcmc = mcmc, + beast2_jar_path = beast2_jar_path + ) +} + +## ----cache=TRUE---------------------------------------------------------- +out <- bbt_run( + fasta_filenames = fasta_filename, + mcmc = mcmc, + verbose = TRUE +) + diff --git a/vignettes/tutorial.Rmd b/vignettes/tutorial.Rmd new file mode 100644 index 0000000..7775fa9 --- /dev/null +++ b/vignettes/tutorial.Rmd @@ -0,0 +1,376 @@ +--- +title: "babette Tutorial" +author: "Richel J.C. Bilderbeek" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{babette Tutorial} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +This vignette is a tutorial how to use `babette` and its most +important `bbt_run` function. + +First, load `babette`: + +```{r} +library(babette) +``` + +The main function of `babette` is `bbt_run`. Here is part of its help: + +``` +Do a full run: create a BEAST2 configuration file (like BEAUti 2), run BEAST2, parse results (like Tracer) + +Usage + +bbt_run( + fasta_filenames, + site_models, + clock_models, + tree_priors, + mrca_priors, + posterior_crown_age, + beast2_output_log_filename, + beast2_output_trees_filenames, + beast2_output_state_filename, + verbose, + cleanup +) +``` + +Simplifying this to all arguments that do not have a default: + +``` +bbt_run( + fasta_filenames +) +``` + +## `fasta_filenames` + +`fasta_filenames` is the `bbt_run` argument to specify the one or more +FASTA files to work on. `babette` is bundled with some FASTA files, +so obtaining a path to a FASTA file is easy: + +```{r} +fasta_filename <- get_babette_path("anthus_aco.fas") +testit::assert(file.exists(fasta_filename)) +``` + +With `fasta_filename` available, we have the minimal +requirements to call `bbt_run` like this: + +``` +out <- bbt_run(fasta_filenames = fasta_filename) +``` + +Note that this code is not ran, as it would take too long. +The reason this would take too long, is that +the MCMC run that will be executed is set to one million states by default. +To specify the MCMC options and shorten this run, +the `mcmc` argument is used. + +## `mcmc` + +`mcmc` is the `bbt_run` argument to specify the MCMC run options: + +```{r} +mcmc <- create_mcmc(chain_length = 2000, store_every = 1000) +``` + +With these MCMC options, we can now call `bbt_run` in way that +it will finish fast: + +```{r cache=TRUE} +out <- bbt_run( + fasta_filenames = fasta_filename, + mcmc = mcmc +) +``` + +The return value, `out` contains the results of the MCMC run. +For this tutorial, visualizing `out` is ignored, as the 'Demo' vignette +discusses this. Instead, we will work through the other `bbt_run` parameters. + +## `site_models` + +`site_models` is the `bbt_run` argument to select one or more site models. +As this tutorial works on a DNA alignment, such a site model can also +be called a nucleotide substitution model. + +Picking a site model is easy: just type: + +``` +create_site_model_ +``` + +This will trigger autocomplete to show all site models. + +The simplest site model is the Jukes-Cantor DNA substutition model. +To use this model in `babette`, do: + +```{r} +site_model <- create_site_model_jc69() +site_model <- create_jc69_site_model() +``` + +Using this site model: + +```{r cache=TRUE} +out <- bbt_run( + fasta_filenames = fasta_filename, + site_models = site_model, + mcmc = mcmc +) +``` + +## `clock_models` + +`clock_models` is the `bbt_run` argument to select one or more clock models. + +Picking a clock model is easy: just type: + +``` +create_clock_model_ +``` + +This will trigger autocomplete to show all clock models. + +The simplest site model is the strict clock model. +To use this model in `babette`, do: + +```{r} +clock_model <- create_clock_model_strict() +clock_model <- create_strict_clock_model() +``` + +Using this clock model: + +```{r cache=TRUE} +out <- bbt_run( + fasta_filenames = fasta_filename, + clock_models = clock_model, + mcmc = mcmc +) +``` + +## `tree_priors` + +`tree_priors` is the `bbt_run` argument to select one or more tree priors. + +Picking a tree prior is easy: just type: + +``` +create_tree_prior_ +``` + +This will trigger autocomplete to show all tree priors. + +The simplest tree prior is the Yule (pure-birth) tree prior. +To use this model in `babette`, do: + +```{r} +tree_prior <- create_tree_prior_yule() +tree_prior <- create_yule_tree_prior() +``` + +Using this tree prior: + +```{r cache=TRUE} +out <- bbt_run( + fasta_filenames = fasta_filename, + tree_priors = tree_prior, + mcmc = mcmc +) +``` + +## `mrca_priors` + +`mrca_priors` is the `bbt_run` argument to add one or more Most +Recent Common Ancestor (hence, MRCA) priors. +With such a prior, it can be specified which taxa have a shared +common ancestor and when it existed. + +Here is how to specify that the first two taxa in a FASTA file +are sister species: + +```{r} +mrca_prior <- create_mrca_prior( + alignment_id = get_alignment_id(fasta_filename = fasta_filename), + taxa_names = get_taxa_names(filename = fasta_filename)[1:2], + is_monophyletic = TRUE +) +``` + +To specify when the MRCA of all taxa was present, we'll first +create a prior distribution of the crown age, after wich we can +use that distribution. + +To assume the crown age to follow a normal distribution, +with a mean of 15.0 (time units), with a standard deviation of 1.0, +use `create_normal_distr`: + +```{r} +mrca_distr <- create_normal_distr( + mean = create_mean_param(value = 15.0), + sigma = create_sigma_param(value = 1.0) +) +``` + +To use that distribution in our MRCA prior: + +```{r} +mrca_prior <- create_mrca_prior( + alignment_id = get_alignment_id(fasta_filename = fasta_filename), + taxa_names = get_taxa_names(filename = fasta_filename), + mrca_distr = mrca_distr +) +``` + +Using such an MRCA prior: + +```{r cache=TRUE} +out <- bbt_run( + fasta_filenames = fasta_filename, + mrca_priors = mrca_prior, + mcmc = mcmc +) +``` + + +## `posterior_crown_age` + +`posterior_crown_age` is the `bbt_run` argument to specify a fixed crown age: + +```{r cache=TRUE} +# Prefer to use the 'create_mrca_prior' version below +out <- bbt_run( + fasta_filenames = fasta_filename, + posterior_crown_age = 15, + mcmc = mcmc +) +``` + +Prefer to use an MRCA prior with a sharp distribution around the desired crown +age! It is known such an MRCA prior gives better results: + +```{r cache=TRUE} +# Prefer this over using 'posterior_crown_age' +out <- bbt_run( + fasta_filenames = fasta_filename, + mrca_priors = create_mrca_prior( + alignment_id = get_alignment_id(fasta_filename = fasta_filename), + taxa_names = get_taxa_names(filename = fasta_filename), + mrca_distr = create_normal_distr( + mean = create_mean_param(value = 15.0), + sigma = create_sigma_param(value = 0.0001) + ) + ), + mcmc = mcmc +) +``` + +## Temporary filenames + +These are the `bbt_run` temporary filenames: + + * `beast2_input_filename`: the BEAST2 input file + * `beast2_output_log_filename`: the log file created by BEAST2 + * `beast2_output_trees_filenames`: the .tree file(s) created by BEAST2 + * `beast2_output_state_filename`: the `.xml.state` file created by BEAST2 + +The purpose of `babette` is to remove the explicit interfacing with BEAST2, +thus users needs not specify this. By default, these files have random names +in a temporary folder. + +To store these files in the present working directory, specify their names: + +```{r} +if (1 == 2) { + beast2_input_filename <- "beast_input.xml" + beast2_output_log_filename <- "beast_ouput.log" + beast2_output_trees_filenames <- "beast_output.trees" + beast2_output_state_filename <- "beast_state.xml.state" + all_files <- c( + beast2_input_filename, + beast2_output_log_filename, + beast2_output_trees_filenames, + beast2_output_state_filename + ) + + out <- bbt_run( + fasta_filenames = fasta_filename, + mcmc = mcmc, + beast2_input_filename = beast2_input_filename, + beast2_output_log_filename = beast2_output_log_filename, + beast2_output_trees_filenames = beast2_output_trees_filenames, + beast2_output_state_filename = beast2_output_state_filename + ) + testit::assert(all(file.exists(all_files))) + file.remove(all_files) +} +``` + +## `rng_seed` + +`rng_seed` is the `bbt_run` argument to specify a random number generator seed. + +```{r cache=TRUE} +out <- bbt_run( + fasta_filenames = fasta_filename, + mcmc = mcmc, + rng_seed = 314 +) +``` + +## `beast2_jar_path` + +`beast2_jar_path` is the `bbt_run` argument to specify the path +of the BEAST2 `.jar` file. When calling `install_beast2` with default +arguments, BEAST2 is installed to this location. + +This is the default path to the BEAST2 jar file: + +```{r} +beast2_jar_path <- beastier::get_default_beast2_jar_path() +print(beast2_jar_path) +``` + +Run `babette` if and only if BEAST2 is installed at that default location: + +```{r} +if (file.exists(beast2_jar_path)) { + out <- bbt_run( + fasta_filenames = fasta_filename, + mcmc = mcmc, + beast2_jar_path = beast2_jar_path + ) +} +``` + +## `verbose` + +`verbose` is the `bbt_run` argument to specify whether more verbose +output is desired. Set this to TRUE for more output: + +```{r cache=TRUE} +out <- bbt_run( + fasta_filenames = fasta_filename, + mcmc = mcmc, + verbose = TRUE +) +``` + +## `cleanup` + +set to FALSE to keep all temporary files +