# Notebook 01 - Quality Control

Sam Welch  
November 25, 2025

In [None]:

# Notebook setup: attach packages and point `targets` at the project store.
# Wrapped in suppressMessages() so package start-up messages do not clutter the
# rendered output; the print() below is ordinary output, not a message, so the
# timestamp is still shown.
suppressMessages(
  {
    # Timestamp of this run/render.
    print(Sys.time())
    library(STOPeData)
    library(DT)
    library(dplyr)
    library(eDataDRF)
    library(targets)
    library(here)
    library(glue)

    # Establish the project root for here() using README.md as the anchor.
    # NOTE(review): here::i_am() is documented as taking the path of the
    # current script/report relative to the root - confirm README.md is the
    # intended anchor.
    i_am("README.md")
    # Use the `_targets` directory under the project root as the data store
    # that tar_read() reads from.
    tar_config_set(store = here("_targets"))
  }
)


In [None]:
# Pull the pipeline outputs this notebook reports on out of the targets store.
literature_merged_data <- tar_read(load_literature_pqt)
literature_reference_data <- tar_read(reference_data)
literature_campaign_data <- tar_read(campaign_data)
literature_sites_data <- tar_read(sites_data)
literature_qc <- tar_read(data_quality_report)
wgs84_geo <- tar_read(wgs84_geography)

# Species vocabulary, plus a named vector built from it:
# names are SPECIES_NAME, values are SPECIES_COMMON_NAME.
species_names <- species_names_vocabulary()
species_lookup <- setNames(
  species_names[["SPECIES_COMMON_NAME"]],
  species_names[["SPECIES_NAME"]]
)


## Data Quality Overview

``` r

#' Generate a text summary of data quality issues
#'
#' Builds a markdown block from the counts in `qc_report$summary`: a headline
#' with the total rows, references, and sites, followed by one bullet per
#' issue category. Count nouns agree with their count, so a count of exactly
#' 1 renders as "1 reference" / "1 row" / "1 site" rather than the plural.
#'
#' @param qc_report Output from check_data_quality(); only its `summary`
#'   element is read.
#' @return A character string with markdown-formatted summary
format_quality_summary <- function(qc_report) {
  s <- qc_report$summary

  # Pick the noun form for a count: singular only when the count is exactly 1.
  # Anything else (0, >1, NA, NULL) gets the plural; the number itself is still
  # interpolated by glue() below, so its rendering is unchanged.
  noun <- function(n, singular) {
    if (isTRUE(n == 1)) singular else paste0(singular, "s")
  }
  rows <- function(n) noun(n, "row")
  refs <- function(n) noun(n, "reference")
  sites <- function(n) noun(n, "site")

  # glue() evaluates the {expressions} in this function's environment, so the
  # local helpers above are visible inside the template.
  glue(
    "
Dataset contains **{s$total_rows}** {rows(s$total_rows)} from **{s$total_references}** {refs(s$total_references)} across **{s$total_sites}** {sites(s$total_sites)}.

**Issues identified:**

- **Measurements**: {s$n_rows_missing_measurements} {rows(s$n_rows_missing_measurements)} ({s$n_refs_missing_measurements} {refs(s$n_refs_missing_measurements)}) missing all measurement data (value, LOQ, and LOD)
- **Sample size**: {s$n_rows_missing_n} {rows(s$n_rows_missing_n)} ({s$n_refs_missing_n} {refs(s$n_refs_missing_n)}) missing or zero sample size
- **Methods**: {s$n_rows_missing_methods} {rows(s$n_rows_missing_methods)} ({s$n_refs_missing_methods} {refs(s$n_refs_missing_methods)}) missing method information
- **Uncertainty**: {s$n_rows_missing_uncertainty} {rows(s$n_rows_missing_uncertainty)} ({s$n_refs_missing_uncertainty} {refs(s$n_refs_missing_uncertainty)}) missing uncertainty type
- **Site data**: {s$n_sites_missing_data} {sites(s$n_sites_missing_data)} ({s$n_refs_missing_sites} {refs(s$n_refs_missing_sites)}) missing location/geographic data
- **Biota data**: {s$n_rows_missing_biota} {rows(s$n_rows_missing_biota)} ({s$n_refs_missing_biota} {refs(s$n_refs_missing_biota)}) missing species/tissue information
"
  )
}

# Write the formatted summary for the loaded QC report to the chunk output.
cat(format_quality_summary(literature_qc))
```

Dataset contains **48972** rows from **32** references across **5142** sites.

**Issues identified:**

-   **Measurements**: 52 rows (2 references) missing all measurement data (value, LOQ, and LOD)
-   **Sample size**: 49 rows (1 references) missing or zero sample size
-   **Methods**: 49 rows (1 references) missing method information
-   **Uncertainty**: 48842 rows (18 references) missing uncertainty type
-   **Site data**: 4 sites (2 references) missing location/geographic data
-   **Biota data**: 1 rows (1 references) missing species/tissue information

## Missing Data

### Missing Measurements

Rows where measured value, LOQ, and LOD are all missing.

Known issues:

-   2025PelkonenEnvironmentalImpactOf does not report LOD/LOQ values in main paper or ESI.

In [None]:

# Show the missing-measurement records as an interactive table (without the
# sample_ids column), or print a short note when there are none.
if (nrow(literature_qc$missing_measurements) == 0) {
  cat("*No missing measurements found.*")
} else {
  literature_qc$missing_measurements |>
    select(-sample_ids) |>
    datatable(
      caption = "References with missing measurement data",
      options = list(pageLength = 10)
    )
}


### Missing Sample Size

Rows where MEASURED_N is missing or zero.

In [None]:

# Show the missing-sample-size records as an interactive table (without the
# sample_ids column), or print a short note when there are none.
if (nrow(literature_qc$missing_n) == 0) {
  cat("*No missing sample sizes found.*")
} else {
  literature_qc$missing_n |>
    select(-sample_ids) |>
    datatable(
      caption = "References with missing sample size",
      options = list(pageLength = 10)
    )
}


### Missing Methods

Rows missing any of: analytical protocol, extraction protocol, fractionation protocol, or sampling protocol.

In [None]:


# Turn the four logical missing_* protocol flags into display labels
# ("Missing" where the flag is TRUE, NA otherwise) and show them per
# reference; print a short note when there is nothing to report.
if (nrow(literature_qc$missing_methods) == 0) {
  cat("*No missing methods found.*")
} else {
  display_df <- literature_qc$missing_methods |>
    mutate(
      analytical_issue = if_else(
        missing_analytical, "Missing", NA_character_
      ),
      extraction_issue = if_else(
        missing_extraction, "Missing", NA_character_
      ),
      fractionation_issue = if_else(
        missing_fractionation, "Missing", NA_character_
      ),
      sampling_issue = if_else(
        missing_sampling, "Missing", NA_character_
      )
    ) |>
    select(
      REFERENCE_ID,
      n_rows,
      analytical_issue,
      extraction_issue,
      fractionation_issue,
      sampling_issue
    )

  datatable(
    display_df,
    caption = "References with missing method information",
    options = list(pageLength = 10)
  )
}


### Missing Uncertainty

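Rows where UNCERTAINTY_TYPE or UNCERTAINTY\_\*\_STANDARD are missing or otherwise problematic (type not reported or not relevant, a bound of zero, or a bound inconsistent with the average).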

In [None]:

# Collapse the logical uncertainty flags into one label per column and show
# them per reference; print a short note when there is nothing to report.
if (nrow(literature_qc$missing_uncertainty) == 0) {
  cat("*No uncertainty issues found.*")
} else {
  display_df <- literature_qc$missing_uncertainty |>
    mutate(
      # In each case_when() the first flag that is TRUE supplies the label;
      # rows where no flag is TRUE are left as NA (case_when's default).
      uncertainty_type_issue = case_when(
        type_missing ~ "Missing",
        type_not_reported ~ "Not reported",
        type_not_relevant ~ "Not relevant"
      ),
      upper_bound_issue = case_when(
        upper_missing ~ "Missing",
        upper_zero ~ "Zero",
        upper_below_value ~ "< Average"
      ),
      lower_bound_issue = case_when(
        lower_missing ~ "Missing",
        lower_zero ~ "Zero",
        lower_above_value ~ "> Average"
      )
    ) |>
    select(
      REFERENCE_ID,
      n_rows,
      uncertainty_type_issue,
      upper_bound_issue,
      lower_bound_issue
    )

  datatable(
    display_df,
    caption = "References with missing or problematic uncertainty",
    options = list(pageLength = 10)
  )
}


### Missing Site Data

Sites missing coordinates, geographic features, country, or ocean information.

In [None]:

# Show the missing-site records as an interactive table, or print a short
# note when there are none.
if (nrow(literature_qc$missing_sites) == 0) {
  cat("*No missing site data found.*")
} else {
  literature_qc$missing_sites |>
    datatable(
      caption = "Sites with missing geographic data",
      options = list(pageLength = 10)
    )
}


### Missing Biota Data

Biota samples missing species, tissue, lifestage, gender, or species group.

In [None]:

# Show the missing-biota records as an interactive table (without the
# sample_ids column), or print a short note when there are none.
if (nrow(literature_qc$missing_biota) == 0) {
  cat("*No missing biota data found.*")
} else {
  literature_qc$missing_biota |>
    select(-sample_ids) |>
    datatable(
      caption = "References with missing biota information",
      options = list(pageLength = 10)
    )
}
