# Notebook 01 - Quality Control

Sam Welch  
November 25, 2025

Load libraries and data.

In [None]:
suppressMessages(
  {
    # Timestamp of this run. print() writes to standard output rather than
    # signalling a message, so it is still shown despite suppressMessages().
    print(Sys.time())
    # Attach the packages used below; the wrapper hides their startup
    # messages. Attach order is kept as-is because it determines masking.
    library(STOPeData)
    library(DT)
    library(dplyr)
    library(eDataDRF)
    library(targets)
    library(here)
    library(glue)

    # Anchor project-relative paths on README.md (presumably here::i_am(),
    # since here is attached above - TODO confirm), then point targets at
    # the `_targets` store resolved through here().
    i_am("README.md")
    tar_config_set(store = here("_targets"))
  }
)


[1] "2025-12-30 13:44:23 CET"

## Data Quality Overview

``` r

#' Build the markdown overview of data quality issues
#'
#' Interpolates the counts held in `qc_report$summary` into a fixed markdown
#' template: the dataset totals first, then one bullet per issue category.
#'
#' @param qc_report Output from check_data_quality(); its `summary` element
#'   supplies every count referenced by the template.
#' @return A character string (as produced by glue()) with the
#'   markdown-formatted summary.
format_quality_summary <- function(qc_report) {
  # The placeholders refer to the summary as `s`, so that exact name has to
  # exist in this function's environment when glue() evaluates the braces.
  summary_template <- "
Dataset contains **{s$total_rows}** rows from **{s$total_references}** references across **{s$total_sites}** sites.

**Issues identified:**

- **Measurements**: {s$n_rows_missing_measurements} rows ({s$n_refs_missing_measurements} references) missing all measurement data (value, LOQ, and LOD)
- **Sample size**: {s$n_rows_missing_n} rows ({s$n_refs_missing_n} references) missing or zero sample size
- **Methods**: {s$n_rows_missing_methods} rows ({s$n_refs_missing_methods} references) missing method information
- **Uncertainty**: {s$n_rows_missing_uncertainty} rows ({s$n_refs_missing_uncertainty} references) missing uncertainty type
- **Site data**: {s$n_sites_missing_data} sites ({s$n_refs_missing_sites} references) missing location/geographic data
- **Biota data**: {s$n_rows_missing_biota} rows ({s$n_refs_missing_biota} references) missing species/tissue information
"

  s <- qc_report$summary
  glue(summary_template)
}

# Write the summary to the chunk output.
# NOTE(review): `literature_qc` is not created in the visible cells;
# presumably it is loaded from the targets store - confirm.
cat(format_quality_summary(literature_qc))
```

## Missing Data

### Missing Measurements

Rows where measured value, LOQ, and LOD are all missing.

Known issues:

-   2025PelkonenEnvironmentalImpactOf does not report LOD/LOQ values in the main paper or the ESI.

In [None]:

if (nrow(literature_qc$missing_measurements) == 0) {
  # Nothing to tabulate: print a short note instead of an empty table.
  cat("*No missing measurements found.*")
} else {
  # Paged table (10 rows per page) with display-friendly headers.
  # The data frame is passed as-is; no columns are dropped beforehand.
  datatable(
    literature_qc$missing_measurements,
    caption = "References with missing measurement data",
    colnames = c("Source", "Timestamp", "Rows"),
    options = list(pageLength = 10)
  )
}


### Missing Sample Size

Rows where MEASURED_N is missing or zero.

In [None]:

if (nrow(literature_qc$missing_n) == 0) {
  # No affected references: print a short note instead of an empty table.
  cat("*No missing sample sizes found.*")
} else {
  # Paged table (10 rows per page) with display-friendly headers.
  # The data frame is passed as-is; no columns are dropped beforehand.
  datatable(
    literature_qc$missing_n,
    caption = "References with missing sample size",
    colnames = c("Source", "Timestamp", "Rows"),
    options = list(pageLength = 10)
  )
}


### Missing Methods

Rows missing any of: analytical protocol, extraction protocol, fractionation protocol, or sampling protocol.

In [None]:


if (nrow(literature_qc$missing_methods) > 0) {
  # Table of references lacking one or more protocol fields.
  display_df <- literature_qc$missing_methods

  # Paged table (10 rows per page) with display-friendly headers.
  # The last argument must not be followed by a comma: a trailing comma
  # passes an extra empty argument to datatable().
  datatable(
    display_df,
    options = list(pageLength = 10),
    caption = "References with missing method information",
    colnames = c("Source", "Timestamp", "Rows", "Missing Protocols")
  )
} else {
  # No affected references: print a short note instead of an empty table.
  cat("*No missing methods found.*")
}


### Missing Uncertainty

Rows where UNCERTAINTY_TYPE or UNCERTAINTY\_\*\_STANDARD are missing.

In [None]:

if (nrow(literature_qc$missing_uncertainty) == 0) {
  # No affected references: print a short note instead of an empty table.
  cat("*No uncertainty issues found.*")
} else {
  # Table of references with uncertainty problems, held in `display_df`.
  display_df <- literature_qc$missing_uncertainty

  # Paged table (10 rows per page) with display-friendly headers.
  datatable(
    display_df,
    colnames = c("Source", "Timestamp", "Rows", "Uncertainty Issues"),
    caption = "References with missing or problematic uncertainty",
    options = list(pageLength = 10)
  )
}


### Missing Site Data

Sites missing coordinates, geographic features, country, or ocean information.

In [None]:

if (nrow(literature_qc$missing_sites) == 0) {
  # No affected sites: print a short note instead of an empty table.
  cat("*No missing site data found.*")
} else {
  # Paged table (10 rows per page); headers are given in column order.
  datatable(
    literature_qc$missing_sites,
    colnames = c(
      "Source", "Timestamp", "Rows", "SITE_CODE", "Missing Variables"
    ),
    caption = "Sites with missing geographic data",
    options = list(pageLength = 10)
  )
}


### Missing Biota Data

Biota samples missing species, tissue, lifestage, gender, or species group.

In [None]:

if (nrow(literature_qc$missing_biota) == 0) {
  # No affected references: print a short note instead of an empty table.
  cat("*No missing biota data found.*")
} else {
  # Paged table (10 rows per page); headers are given in column order.
  # The data frame is passed as-is; no columns are dropped beforehand.
  datatable(
    literature_qc$missing_biota,
    colnames = c(
      "Source", "Timestamp", "Rows", "Missing Variables", "SAMPLE_ID"
    ),
    caption = "References with missing biota information",
    options = list(pageLength = 10)
  )
}
