Merge pull request #1641 from rstudio/add-photolysis-dataset

Add the `photolysis` dataset
rstudio · Apr 27, 2024 · 620f94a · 620f94a
2 parents 88d9440 + 317461b
commit 620f94a
Show file tree

Hide file tree

Showing 33 changed files with 488 additions and 144 deletions.
diff --git a/R/datasets.R b/R/datasets.R
@@ -707,7 +707,6 @@
 #'
 "illness"
 
-
 #' Reaction rates for gas-phase atmospheric reactions of organic compounds
 #'
 #' @description
@@ -723,18 +722,18 @@
 #' chlorine (Cl) atoms.
 #'
 #' This compilation of rate constant (*k*) data contains the values for rate
-#' constants at 298 K (in units of cm^3 molecules^–1 s^–1) as well as parameters
-#' that allow for the calculation of rate constants at different temperatures
-#' (the temperature dependence parameters: `A`, `B`, and `n`). Uncertainty
-#' values/factors and temperature limits are also provided here where
-#' information is available.
+#' constants at 298 K (in units of `cm^3 molecules^-1 s^-1`) as well as
+#' parameters that allow for the calculation of rate constants at different
+#' temperatures (the temperature dependence parameters: `A`, `B`, and `n`).
+#' Uncertainty values/factors and temperature limits are also provided here
+#' where information is available.
 #'
 #' @format A tibble with 1,683 rows and 39 variables:
 #' \describe{
 #' \item{cmpd_name}{The name of the primary compound undergoing
 #' reaction with OH, ozone, NO3, or Cl.}
 #' \item{cmpd_mwt}{The molecular weight of the compound in units of g/mol.}
-#' \item{cmpd_formula}{The atomic formula of the compound.}
+#' \item{cmpd_formula}{The chemical formula of the compound.}
 #' \item{cmpd_type}{The category of compounds that the `cmpd_name` falls
 #' under.}
 #' \item{cmpd_smiles}{The SMILES (simplified molecular-input line-entry system)
@@ -744,51 +743,51 @@
 #' \item{cmpd_inchikey}{The InChIKey, which is a hashed InChI value, has a fixed
 #' length of 27 characters. These values can be used to more easily perform
 #' database searches of chemical compounds.}
-#' \item{oh_k298}{Rate constant at 298 K for OH reactions.}
-#' \item{oh_uncert}{Uncertainty as a percentage for certain OH reactions.}
-#' \item{oh_u_fac}{Uncertainty as a plus/minus difference for certain OH
+#' \item{OH_k298}{Rate constant at 298 K for OH reactions.}
+#' \item{OH_uncert}{Uncertainty as a percentage for certain OH reactions.}
+#' \item{OH_u_fac}{Uncertainty as a plus/minus difference for certain OH
 #' reactions.}
-#' \item{oh_a, oh_b, oh_n}{Extended temperature dependence parameters for
+#' \item{OH_a, OH_b, OH_n}{Extended temperature dependence parameters for
 #' bimolecular OH reactions, to be used in the Arrhenius expression:
 #' `k(T)=A exp(-B/T) (T/300)^n`. In that, `A` is expressed as
 #' cm^3 molecules^-1 s^-1, `B` is in units of K, and `n` is dimensionless. Any
 #' `NA` values indicate that data is not available.}
-#' \item{oh_t_low, oh_t_high}{The low and high temperature boundaries (in units
-#' of K) for which the `oh_a`, `oh_b`, and `oh_n` parameters are valid.}
-#' \item{o3_k298}{Rate constant at 298 K for ozone reactions.}
-#' \item{o3_uncert}{Uncertainty as a percentage for certain ozone reactions.}
-#' \item{o3_u_fac}{Uncertainty as a plus/minus difference for certain ozone
+#' \item{OH_t_low, OH_t_high}{The low and high temperature boundaries (in units
+#' of K) for which the `OH_a`, `OH_b`, and `OH_n` parameters are valid.}
+#' \item{O3_k298}{Rate constant at 298 K for ozone reactions.}
+#' \item{O3_uncert}{Uncertainty as a percentage for certain ozone reactions.}
+#' \item{O3_u_fac}{Uncertainty as a plus/minus difference for certain ozone
 #' reactions.}
-#' \item{o3_a, o3_b, o3_n}{Extended temperature dependence parameters for
+#' \item{O3_a, O3_b, O3_n}{Extended temperature dependence parameters for
 #' bimolecular ozone reactions, to be used in the Arrhenius expression:
 #' `k(T)=A exp(-B/T) (T/300)^n`. In that, `A` is expressed as
 #' cm^3 molecules^-1 s^-1, `B` is in units of K, and `n` is dimensionless. Any
 #' `NA` values indicate that data is not available.}
-#' \item{o3_t_low, o3_t_high}{The low and high temperature boundaries (in units
-#' of K) for which the `o3_a`, `o3_b`, and `o3_n` parameters are valid.}
-#' \item{no3_k298}{Rate constant at 298 K for NO3 reactions.}
-#' \item{no3_uncert}{Uncertainty as a percentage for certain NO3 reactions.}
-#' \item{no3_u_fac}{Uncertainty as a plus/minus difference for certain NO3
+#' \item{O3_t_low, O3_t_high}{The low and high temperature boundaries (in units
+#' of K) for which the `O3_a`, `O3_b`, and `O3_n` parameters are valid.}
+#' \item{NO3_k298}{Rate constant at 298 K for NO3 reactions.}
+#' \item{NO3_uncert}{Uncertainty as a percentage for certain NO3 reactions.}
+#' \item{NO3_u_fac}{Uncertainty as a plus/minus difference for certain NO3
 #' reactions.}
-#' \item{no3_a, no3_b, no3_n}{Extended temperature dependence parameters for
+#' \item{NO3_a, NO3_b, NO3_n}{Extended temperature dependence parameters for
 #' bimolecular NO3 reactions, to be used in the Arrhenius expression:
 #' `k(T)=A exp(-B/T) (T/300)^n`. In that, `A` is expressed as
 #' cm^3 molecules^-1 s^-1, `B` is in units of K, and `n` is dimensionless. Any
 #' `NA` values indicate that data is not available.}
-#' \item{no3_t_low, no3_t_high}{The low and high temperature boundaries (in
-#' units of K) for which the `no3_a`, `no3_b`, and `no3_n` parameters are
+#' \item{NO3_t_low, NO3_t_high}{The low and high temperature boundaries (in
+#' units of K) for which the `NO3_a`, `NO3_b`, and `NO3_n` parameters are
 #' valid.}
-#' \item{cl_k298}{Rate constant at 298 K for Cl reactions.}
-#' \item{cl_uncert}{Uncertainty as a percentage for certain Cl reactions.}
-#' \item{cl_u_fac}{Uncertainty as a plus/minus difference for certain Cl
+#' \item{Cl_k298}{Rate constant at 298 K for Cl reactions.}
+#' \item{Cl_uncert}{Uncertainty as a percentage for certain Cl reactions.}
+#' \item{Cl_u_fac}{Uncertainty as a plus/minus difference for certain Cl
 #' reactions.}
-#' \item{cl_a, cl_b, cl_n}{Extended temperature dependence parameters for
+#' \item{Cl_a, Cl_b, Cl_n}{Extended temperature dependence parameters for
 #' bimolecular Cl reactions, to be used in the Arrhenius expression:
 #' `k(T)=A exp(-B/T) (T/300)^n`. In that, `A` is expressed as
 #' cm^3 molecules^-1 s^-1, `B` is in units of K, and `n` is dimensionless. Any
 #' `NA` values indicate that data is not available.}
-#' \item{cl_t_low, cl_t_high}{The low and high temperature boundaries (in units
-#' of K) for which the `cl_a`, `cl_b`, and `cl_n` parameters are valid.}
+#' \item{Cl_t_low, Cl_t_high}{The low and high temperature boundaries (in units
+#' of K) for which the `Cl_a`, `Cl_b`, and `Cl_n` parameters are valid.}
 #' }
 #'
 #' @section Examples:
@@ -808,10 +807,69 @@
 #' }}
 #'
 #' @section Dataset Introduced:
-#' *in development*
+#' *In Development*
 #'
 "reactions"
 
+#' Data on photolysis rates for gas-phase organic compounds
+#'
+#' @description
+#'
+#' The `photolysis` dataset contains numerical values for describing the
+#' photolytic degradation pathways of 25 compounds of relevance in atmospheric
+#' chemistry. Many volatile organic compounds (VOCs) are emitted in substantial
+#' quantities from both biogenic and anthropogenic sources, and they can have a
+#' major influence on the chemistry of the lower atmosphere. A portion of these
+#' can be transformed into other VOCs via the energy provided from light.
+#'
+#' In order to realistically predict the composition of the atmosphere and how
+#' it evolves over time, we need accurate estimates of photolysis rates. The
+#' data provided here in `photolysis` allows for computations of photolysis
+#' rates (*J*, having units of `s^-1`) as a function of the solar zenith angle
+#' (SZA). Having such values is essential when deploying atmospheric chemistry
+#' models.
+#'
+#' @format A tibble with 34 rows and 10 variables:
+#' \describe{
+#' \item{cmpd_name}{The name of the primary compound undergoing photolysis.}
+#' \item{cmpd_formula}{The chemical formula of the compound.}
+#' \item{products}{A product pathway for the photolysis of the compound.}
+#' \item{type}{The type of organic compound undergoing photolysis.}
+#' \item{l, m, n}{The parameter values given in the `l`, `m`, and `n` columns
+#' can be used to calculate the photolysis rate (*J*) as a function of the
+#' solar zenith angle (*X*, in radians) through the expression:
+#' `J = l * cos(X)^m * exp(-n * sec(X))`.}
+#' \item{quantum_yield}{In the context of photolysis reactions, this is the
+#' efficiency of a given photolytic reaction. In other words, it's the number of
+#' product molecules formed over the number of photons absorbed.}
+#' \item{wavelength_nm, sigma_298_cm2}{The `wavelength_nm` and `sigma_298_cm2`
+#' columns provide photoabsorption data for the compound undergoing photolysis.
+#' The values in `wavelength_nm` provide the wavelength of light in nanometer
+#' units; the `sigma_298_cm2` values are paired with the `wavelength_nm` values
+#' and they are in units of `cm^2 molecule^-1`.}
+#' }
+#'
+#' @section Examples:
+#'
+#' Here is a glimpse at the data available in `photolysis`.
+#'
+#' ```{r}
+#' dplyr::glimpse(photolysis)
+#' ```
+#'
+#' @family datasets
+#' @section Dataset ID and Badge:
+#' DATA-12
+#'
+#' \if{html}{\out{
+#' `r data_get_image_tag(file = "dataset_photolysis.png")`
+#' }}
+#'
+#' @section Dataset Introduced:
+#' *In Development*
+#'
+"photolysis"
+
 #' An ADSL-flavored clinical trial toy dataset
 #'
 #' @description
@@ -871,7 +929,7 @@
 #'
 #' @family datasets
 #' @section Dataset ID and Badge:
-#' DATA-12
+#' DATA-13
 #'
 #' \if{html}{\out{
 #' `r data_get_image_tag(file = "dataset_rx_adsl.png")`
@@ -944,7 +1002,7 @@
 #'
 #' @family datasets
 #' @section Dataset ID and Badge:
-#' DATA-13
+#' DATA-14
 #'
 #' \if{html}{\out{
 #' `r data_get_image_tag(file = "dataset_rx_addv.png")`

diff --git a/R/format_data.R b/R/format_data.R
@@ -8687,29 +8687,76 @@ fmt_units <- function(
 #'   dplyr::filter(grepl("^1-", cmpd_name)) |>
 #'   dplyr::select(cmpd_name, cmpd_formula, ends_with("k298")) |>
 #'   gt() |>
-#'   sub_missing() |>
+#'   tab_header(title = "Gas-phase reactions of selected terminal alkenes") |>
+#'   tab_spanner(
+#'     label = "Reaction Rate Constant at 298 K",
+#'     columns = ends_with("k298")
+#'   ) |>
 #'   fmt_chem(columns = cmpd_formula) |>
 #'   fmt_scientific() |>
+#'   sub_missing() |>
 #'   cols_label(
 #'     cmpd_name = "Alkene",
 #'     cmpd_formula = "Formula",
-#'     oh_k298 = "OH",
-#'     o3_k298 = "{{%O3%}}",
-#'     no3_k298 = "{{%NO3%}}",
-#'     cl_k298 = "Cl"
+#'     OH_k298 = "OH",
+#'     O3_k298 = "{{%O3%}}",
+#'     NO3_k298 = "{{%NO3%}}",
+#'     Cl_k298 = "Cl"
 #'   ) |>
-#'   tab_spanner(
-#'     label = "Reaction Rate Constant at 298 K",
-#'     columns = ends_with("k298")
-#'   ) |>
-#'   tab_header(title = "Gas-phase reactions of selected terminal alkenes") |>
 #'   opt_align_table_header(align = "left")
 #' ```
 #'
 #' \if{html}{\out{
 #' `r man_get_image_tag(file = "man_fmt_chem_1.png")`
 #' }}
 #'
+#' Taking just a few rows from the [`photolysis`] dataset, let's create a
+#' new **gt** table. The `cmpd_formula` and `products` columns both contain
+#' text in chemistry notation (the first has compounds, and the second column
+#' has the products of photolysis reactions). These columns will be formatted
+#' by the `fmt_chem()` function. The compound formulas will be merged with the
+#' compound names via the [cols_merge()] function.
+#'
+#' ```r
+#' photolysis |>
+#'   dplyr::filter(cmpd_name %in% c(
+#'     "hydrogen peroxide", "nitrous acid",
+#'     "nitric acid", "acetaldehyde",
+#'     "methyl peroxide", "methyl nitrate",
+#'     "ethyl nitrate", "isopropyl nitrate"
+#'   )) |>
+#'   dplyr::select(-c(l, m, n, quantum_yield, type)) |>
+#'   gt() |>
+#'   tab_header(title = "Photolysis pathways of selected VOCs") |>
+#'   fmt_chem(columns = c(cmpd_formula, products)) |>
+#'   cols_nanoplot(
+#'     columns = sigma_298_cm2,
+#'     columns_x_vals = wavelength_nm,
+#'     expand_x = c(200, 400),
+#'     new_col_name = "cross_section",
+#'     new_col_label = "Absorption Cross Section",
+#'     options = nanoplot_options(
+#'       show_data_points = FALSE,
+#'       data_line_stroke_width = 4,
+#'       data_line_stroke_color = "black",
+#'       show_data_area = FALSE
+#'     )
+#'   ) |>
+#'   cols_merge(
+#'     columns = c(cmpd_name, cmpd_formula),
+#'     pattern = "{1}, {2}"
+#'   ) |>
+#'   cols_label(
+#'     cmpd_name = "Compound",
+#'     products = "Products"
+#'   ) |>
+#'   opt_align_table_header(align = "left")
+#' ```
+#'
+#' \if{html}{\out{
+#' `r man_get_image_tag(file = "man_fmt_chem_2.png")`
+#' }}
+#'
 #' @family data formatting functions
 #' @section Function ID:
 #' 3-19

diff --git a/R/utils_units.R b/R/utils_units.R
@@ -37,6 +37,7 @@ define_units <- function(units_notation, is_chemical_formula = FALSE) {
 
     chem_text <- gsub(".*(\\%.*\\%).*", "\\1", input)
     chem_input <- gsub("^%|%$", "", chem_text)
+    chem_input <- gsub("\\(\\^([^\\)])", "( ^\\1", chem_input)
 
     # Replace single bonds
     for (i in seq(1, 10)) {

diff --git a/data-raw/11-reactions.R b/data-raw/11-reactions.R
@@ -80,24 +80,22 @@ reactions <-
   ) %>%
   dplyr::mutate(across(starts_with("feat"), ~ tidyr::replace_na(.x, 0)))
 
-colnames(reactions) <- tolower(colnames(reactions))
-
 reactions <-
   reactions %>%
-  dplyr::mutate(oh_uncert = case_when(
-    !is.na(oh_unc) ~ as.numeric(sub("%", "", oh_unc)) / 100,
+  dplyr::mutate(OH_uncert = case_when(
+    !is.na(OH_unc) ~ as.numeric(sub("%", "", OH_unc)) / 100,
     TRUE ~ NA_real_
   )) %>%
-  dplyr::mutate(o3_uncert = case_when(
-    !is.na(o3_unc) ~ as.numeric(sub("%", "", o3_unc)) / 100,
+  dplyr::mutate(O3_uncert = case_when(
+    !is.na(O3_unc) ~ as.numeric(sub("%", "", O3_unc)) / 100,
     TRUE ~ NA_real_
   )) %>%
-  dplyr::mutate(no3_uncert = case_when(
-    !is.na(no3_unc) ~ as.numeric(sub("%", "", no3_unc)) / 100,
+  dplyr::mutate(NO3_uncert = case_when(
+    !is.na(NO3_unc) ~ as.numeric(sub("%", "", NO3_unc)) / 100,
     TRUE ~ NA_real_
   )) %>%
-  dplyr::mutate(cl_uncert = case_when(
-    !is.na(cl_unc) ~ as.numeric(sub("%", "", cl_unc)) / 100,
+  dplyr::mutate(Cl_uncert = case_when(
+    !is.na(Cl_unc) ~ as.numeric(sub("%", "", Cl_unc)) / 100,
     TRUE ~ NA_real_
   )) %>%
   dplyr::select(-ends_with("unc"))
@@ -195,12 +193,12 @@ compound_types <-
 
 reactions <-
   reactions %>%
-  dplyr::inner_join(compound_types) %>%
+  dplyr::inner_join(compound_types, by = c("cmpd_type" = "cmpd_type")) %>%
   dplyr::relocate(cmpd_desc, .after = cmpd_struct_fml) %>%
-  dplyr::relocate(oh_uncert, .before = oh_u_fac) %>%
-  dplyr::relocate(o3_uncert, .before = o3_u_fac) %>%
-  dplyr::relocate(no3_uncert, .before = no3_u_fac) %>%
-  dplyr::relocate(cl_uncert, .before = cl_u_fac) %>%
+  dplyr::relocate(OH_uncert, .before = OH_u_fac) %>%
+  dplyr::relocate(O3_uncert, .before = O3_u_fac) %>%
+  dplyr::relocate(NO3_uncert, .before = NO3_u_fac) %>%
+  dplyr::relocate(Cl_uncert, .before = Cl_u_fac) %>%
   dplyr::select(-cmpd_type, -cmpd_no, -cmpd_struct_fml, -starts_with("feat")) %>%
   dplyr::rename(cmpd_name = cmpd_primary_name) %>%
   dplyr::rename(cmpd_formula = cmpd_atomic_fml) %>%

diff --git a/data-raw/12-photolysis.R b/data-raw/12-photolysis.R
@@ -0,0 +1,20 @@
+library(tidyverse)
+
+photolysis <-
+  readr::read_csv(
+    file = "data-raw/photolysis.csv",
+    col_types =
+      cols(
+        cmpd_name = col_character(),
+        cmpd_formula = col_character(),
+        products = col_character(),
+        type = col_character(),
+        l = col_double(),
+        m = col_double(),
+        n = col_double(),
+        quantum_yield = col_double(),
+        wavelength_nm = col_character(),
+        sigma_298_cm2 = col_character()
+      )
+  )
+
diff --git a/data-raw/12-gt_clinical.R → data-raw/13-gt_clinical.R b/data-raw/12-gt_clinical.R → data-raw/13-gt_clinical.R