precompiling v5 as well

ropensci · Jan 7, 2020 · 9d65ff8 · 9d65ff8
1 parent 76b7437
commit 9d65ff8
Show file tree

Hide file tree

Showing 7 changed files with 134 additions and 49 deletions.
diff --git a/vignettes/1_rodents.Rmd b/vignettes/1_rodents.Rmd
@@ -1,6 +1,6 @@
 ---
 title: "1. Build a database of all rodents"
-date: "2020-01-03"
+date: "2020-01-07"
 output: rmarkdown::html_vignette
 ---
 
@@ -35,6 +35,15 @@ db_create(min_length = 100, max_length = 1000)
 
 ```r
 library(restez)
+#> -------------
+#> restez v1.0.2
+#> -------------
+#> Remember to restez_path_set() and, then, restez_connect()
+#> 
+#> Attaching package: 'restez'
+#> The following object is masked _by_ '.GlobalEnv':
+#> 
+#>     record
 restez_path_set(rodents_path)
 restez_connect()
 #> Remember to run `restez_disconnect()`

diff --git a/vignettes/2_search_and_fetch.Rmd b/vignettes/2_search_and_fetch.Rmd
@@ -1,6 +1,6 @@
 ---
 title: "2. How to search for and fetch sequences"
-date: "2020-01-03"
+date: "2020-01-07"
 output: rmarkdown::html_vignette
 ---
 
@@ -62,7 +62,7 @@ system.time(expr = {
   coi_sequences <- gb_fasta_get(id = accessions)
   })
 #>    user  system elapsed 
-#>   0.466   0.944   1.425
+#>   0.475   0.967   1.748
 # time via Entrez
 system.time(expr = {
   coi_sequences_p1 <- rentrez::entrez_fetch(db = 'nucleotide',
@@ -82,12 +82,12 @@ system.time(expr = {
                                             rettype = 'fasta')
   })
 #>    user  system elapsed 
-#>   0.099   0.019   6.940
+#>   0.136   0.037   7.207
 # always disconnect
 restez_disconnect()
 ```
 <!-- Below is no longer relevant now that the size of sequences in the db has been limited.
-##Missing
+## Missing
 
 A user should know that if an ID cannot be found in the local database no error or warning is raised. This is why it can be good practice to test whether all the provided IDs are in the returned named vector. In this example, we can see that not all the accession IDs that were provided are in the returned `coi_sequences`. Why is that?
 

diff --git a/vignettes/3_parsing.Rmd b/vignettes/3_parsing.Rmd
@@ -172,59 +172,52 @@ restez_path_set(rodents_path)
 restez_connect()
 #> Remember to run `restez_disconnect()`
 (rand_id <- sample(suppressWarnings(list_db_ids()), 1))
-#> [1] "AB006613"
+#> [1] "AB008118"
 record <- gb_record_get(rand_id)
 (gb_extract(record = record, what = 'features'))
 #> [[1]]
 #> [[1]]$type
 #> [1] "source"
 #> 
 #> [[1]]$location
-#> [1] "1..930"
+#> [1] "1..714"
 #> 
 #> [[1]]$organism
-#> [1] "Rattus norvegicus"
+#> [1] "Mus musculus"
 #> 
 #> [[1]]$mol_type
-#> [1] "mRNA"
+#> [1] "genomic DNA"
 #> 
 #> [[1]]$strain
-#> [1] "Sprague-Dawley"
+#> [1] "129"
 #> 
 #> [[1]]$db_xref
-#> [1] "taxon:10116"
-#> 
-#> [[1]]$sex
-#> [1] "male"
-#> 
-#> [[1]]$tissue_type
-#> [1] "brown adipose"
-#> 
-#> [[1]]$dev_stage
-#> [1] "8-week-old"
+#> [1] "taxon:10090"
 #> 
 #> 
 #> [[2]]
 #> [[2]]$type
-#> [1] "CDS"
+#> [1] "gene"
 #> 
 #> [[2]]$location
-#> [1] "1..930"
+#> [1] "complement(380..510)"
+#> 
+#> [[2]]$gene
+#> [1] "Limk-2"
 #> 
-#> [[2]]$note
-#> [1] "UCP-2"
 #> 
-#> [[2]]$codon_start
-#> [1] "1"
+#> [[3]]
+#> [[3]]$type
+#> [1] "exon"
 #> 
-#> [[2]]$product
-#> [1] "uncoupling protein-2"
+#> [[3]]$location
+#> [1] "complement(380..510)"
 #> 
-#> [[2]]$protein_id
-#> [1] "BAA23383.1"
+#> [[3]]$gene
+#> [1] "Limk-2"
 #> 
-#> [[2]]$translation
-#> [1] "MVGFKATDVPPTATVKFLGAGTAACIADLITFPLDTAKVRLQIQGESQGLARTAASAQYRGVLGTILTMVRTEGPRSLYNGLVAGLQRQMSFASVRIGLYDSVKQFYTKGSEHAGIGSRLLAGSTTGALAVAVAQPTDVVKVRFQAQARAGGGRRYQSTVEAYKTIAREEGIRGLWKGTSPNVARNAIVNCTELVTYDLIKDTLLKANLMTDDLPCHFTSAFGAGFCTTVIASPVDVVKTRYMNSALGQYHSAGHCALTMLRKEGPRAFYKGFMPSFLRLGSWNVVMFVTYEQLKRALMAAYESREAPF"
+#> [[3]]$note
+#> [1] "1a"
 restez_disconnect()
 ```
 

diff --git a/vignettes/4_phylotar.Rmd b/vignettes/4_phylotar.Rmd
@@ -46,18 +46,18 @@ setup(wd = wd, txid = txid, ncbi_dr = ncbi_dr, mxsql = 500)
 # run just the first two stages for this demonstration
 taxise_run(wd)
 #> --------------------------------------------
-#> Starting stage TAXISE: [2020-01-03 14:49:31]
+#> Starting stage TAXISE: [2020-01-07 09:26:04]
 #> --------------------------------------------
 #> Searching taxonomic IDs ...
 #> Downloading taxonomic records ...
 #> . [1-28]
 #> Generating taxonomic dictionary ...
 #> ---------------------------------------------
-#> Completed stage TAXISE: [2020-01-03 14:49:34]
+#> Completed stage TAXISE: [2020-01-07 09:26:08]
 #> ---------------------------------------------
 download_run(wd)
 #> ----------------------------------------------
-#> Starting stage DOWNLOAD: [2020-01-03 14:49:34]
+#> Starting stage DOWNLOAD: [2020-01-07 09:26:08]
 #> ----------------------------------------------
 #> Identifying suitable clades ...
 #> Identified [1] suitable clades.
@@ -68,7 +68,7 @@ download_run(wd)
 #> . . Getting [100 sqs] from restez database...
 #> Successfully retrieved [100 sqs] in total.
 #> -----------------------------------------------
-#> Completed stage DOWNLOAD: [2020-01-03 14:49:39]
+#> Completed stage DOWNLOAD: [2020-01-07 09:26:12]
 #> -----------------------------------------------
 ```
 

diff --git a/vignettes/5_tips_and_tricks.Rmd b/vignettes/5_tips_and_tricks.Rmd
@@ -3,17 +3,16 @@ title: "5. Tips and Tricks"
 output: rmarkdown::html_vignette
 ---
 
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = TRUE)
-```
+
 
 ## Multiple restez paths
 
 It is not advisable to download the entire GenBank database to your machine. Equally, it is best to limit the size of a database. Databases that are too large will be slow to query and are more likely to cause memory issues. For example, you may actually make a query that demands more memory than is available on your machine. One solution is to instead set multiple `restez` paths on your machine.
 
 You can either set up a path for different domains. Or you could download for a single set of domains and then create a database from the same downloaded files using the `alt_restez_path` argument. Do also make use of `restez_path_unset` to disconnect and unset the `restez` path.
 
-```{r, eval=FALSE}
+
+```r
 # a larger database from the same download files in rodents_path
 db_create(alt_restez_path = rodents_path, max_length = 2000)
 ```
@@ -23,11 +22,9 @@ db_create(alt_restez_path = rodents_path, max_length = 2000)
 
 Always ensure you disconnect after connecting to a `restez` path. Not doing so may lead to some strange database errors such as 'seg faults' or you may even be prevented from connecting to a database again until you restart R. In scripts you should always place `restez_disconnect()` at the end of the script or when you have stopped making queries. If you are making queries from your own custom function you should use `on.exit`. This allows you to run 'clean up' code whenever a function exits, even if it errors.
 
-```{r pathset, include=FALSE}
-pkgwd <- sub(pattern = 'vignettes', replacement = '' , x = getwd())
-rodents_path <- file.path(pkgwd, 'rodents')
-```
-```{r on-exit}
+
+
+```r
 suppressMessages(library(restez))
 random_definition <- function() {
   suppressMessages(restez_connect())
@@ -40,10 +37,22 @@ random_definition <- function() {
 }
 restez_path_set(rodents_path)
 (definition <- random_definition())
+```
+
+```
+##                                                                                              KU614570 
+## "Mus musculus clone PD151104P3E10 immunoglobulin heavy chain variable region (Igh) mRNA, partial cds"
+```
+
+```r
 # not connected outside of function!
 (restez_ready())
 ```
 
+```
+## [1] FALSE
+```
+
 ## Which domain?
 
 The `db_download` function lists the various possible GenBank domains that can be downloaded. You can work out which GenBank domain a sequence belongs to by its three letter code towards the end of its locus. For example, the top of the record for this sequence indicates it is in the rodent domain.
@@ -57,8 +66,8 @@ VERSION     LT548182.1
 
 ## Database performance and behaviour
 
-The `restez` package database is built with [`MonetDBlite`](https://github.com/hannesmuehleisen/MonetDBLite-R). If you
-encounter any errors that include the phrase "Server says", then an issue is
+The `restez` package database is built with [`MonetDBLite`](https://github.com/hannesmuehleisen/MonetDBLite-R).
+If you encounter any errors that include the phrase "Server says", then an issue is
 likely to have occurred within the database. Please raise such issues with
 [GitHub](https://github.com/ropensci/restez/issues). But keep the following
 factors in mind:

diff --git a/vignettes/5_tips_and_tricks.Rmd.orig b/vignettes/5_tips_and_tricks.Rmd.orig
@@ -0,0 +1,73 @@
+---
+title: "5. Tips and Tricks"
+output: rmarkdown::html_vignette
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Multiple restez paths
+
+It is not advisable to download the entire GenBank database to your machine. Equally, it is best to limit the size of a database. Databases that are too large will be slow to query and are more likely to cause memory issues. For example, you may actually make a query that demands more memory than is available on your machine. One solution is to instead set multiple `restez` paths on your machine.
+
+You can either set up a path for different domains. Or you could download for a single set of domains and then create a database from the same downloaded files using the `alt_restez_path` argument. Do also make use of `restez_path_unset` to disconnect and unset the `restez` path.
+
+```{r, eval=FALSE}
+# a larger database from the same download files in rodents_path
+db_create(alt_restez_path = rodents_path, max_length = 2000)
+```
+
+
+## Connecting and disconnecting
+
+Always ensure you disconnect after connecting to a `restez` path. Not doing so may lead to some strange database errors such as 'seg faults' or you may even be prevented from connecting to a database again until you restart R. In scripts you should always place `restez_disconnect()` at the end of the script or when you have stopped making queries. If you are making queries from your own custom function you should use `on.exit`. This allows you to run 'clean up' code whenever a function exits, even if it errors.
+
+```{r pathset, include=FALSE}
+pkgwd <- sub(pattern = 'vignettes', replacement = '' , x = getwd())
+rodents_path <- file.path(pkgwd, 'rodents')
+```
+```{r on-exit}
+suppressMessages(library(restez))
+random_definition <- function() {
+  suppressMessages(restez_connect())
+  on.exit(restez_disconnect())
+  if (restez_ready()) {
+    # deliberate mistake
+    id <- sample(list_db_ids(n = NULL), 1)[[1]]
+    return(gb_definition_get(id))
+  }
+}
+restez_path_set(rodents_path)
+(definition <- random_definition())
+# not connected outside of function!
+(restez_ready())
+```
+
+## Which domain?
+
+The `db_download` function lists the various possible GenBank domains that can be downloaded. You can work out which GenBank domain a sequence belongs to by its three letter code towards the end of its locus. For example, the top of the record for this sequence indicates it is in the rodent domain.
+
+```
+LOCUS       LT548182                 456 bp    DNA     linear   ROD 23-NOV-2016
+DEFINITION  TPA_inf: Cavia porcellus GLNH gene for globin H.
+ACCESSION   LT548182
+VERSION     LT548182.1
+```
+
+## Database performance and behaviour
+
+The `restez` package database is built with [`MonetDBLite`](https://github.com/hannesmuehleisen/MonetDBLite-R).
+If you encounter any errors that include the phrase "Server says", then an issue is
+likely to have occurred within the database. Please raise such issues with
+[GitHub](https://github.com/ropensci/restez/issues). But keep the following
+factors in mind:
+
+* Is your request from the database likely to return an object too large for
+your computer's RAM? If the size of database is 5GB then it is likely that
+a request pulling all of the sequence data and information into an R session
+will be around 5GB as well.
+* Are you building and storing the database on a separate USB drive? It has
+been noted that database behaviour can be unusual on separate USB drives. When
+reporting an issue, please provide information about your USB drive's format, size and USB
+connections.
diff --git a/vignettes/precompile.R b/vignettes/precompile.R
@@ -1,4 +1,4 @@
-# First four vignettes require downloaded rodent db, must be precompiled:
+# First five vignettes require downloaded rodent db, must be precompiled:
 
 # gen rodent db
 if (!dir.exists(file.path('rodents'))) {
@@ -7,7 +7,8 @@ if (!dir.exists(file.path('rodents'))) {
 
 # precompile
 library(knitr)
-vgnts <- c('1_rodents.Rmd', '2_search_and_fetch.Rmd', '3_parsing.Rmd', '4_phylotaR.Rmd')
+vgnts <- c('1_rodents.Rmd', '2_search_and_fetch.Rmd', '3_parsing.Rmd',
+           '4_phylotaR.Rmd', '5_tips_and_tricks.Rmd')
 for (vgnt in vgnts) {
   knit(paste0("vignettes/", vgnt, ".orig"), paste0("vignettes/", vgnt))
 }