# Notebook 01 - Quality Control

Sam Welch  
November 25, 2025

In [None]:
library(STOPeData)
library(DT)
library(dplyr)



Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Attaching package: 'eDataDRF'

The following objects are masked from 'package:STOPeData':

    abbreviate_string, altitude_units_vocabulary,
    analytical_protocols_vocabulary, areas_vocabulary,
    coordinate_systems_vocabulary, countries_vocabulary,
    dummy_parameters_vocabulary, environ_compartments_sub_vocabulary,
    environ_compartments_vocabulary, extraction_protocols_vocabulary,
    fractionation_protocols_vocabulary, gender_vocabulary,
    generate_protocol_id, geographic_features_sub_vocabulary,
    geographic_features_vocabulary, get_dataset_display_name,
    initialise_biota_tibble, initialise_campaign_tibble,
    initialise_compartments_tibble, initialise_CREED_data_tibble,
    initialise_CREED_scores_tibble, initialise_measurements_tibble,
    initialise_methods_tibble, initialise_parameters_tibble,
    initialise_references_tibble, initialise_samples_tibble,
    initialise_sites_tibble, lifestage_vocabulary,
    measured_categories_vocabulary, measured_flags_vocabulary,
    measured_types_vocabular

here() starts at C:/Users/SAW/Local Documents/EXPECT AEP/EXPECT_AEP_R_Project

In [None]:
# Anchor the {here} project root using README.md, so that later here::here()
# paths resolve against the project rather than the current working directory.
# NOTE(review): the original intent was "load in any data we need from the
# targets workflow", but no data is loaded in this visible step -- objects such
# as `literature_qc` are presumably loaded elsewhere; confirm.
here::i_am("README.md")


here() starts at C:/Users/SAW/Local Documents/EXPECT AEP/EXPECT_AEP_R_Project

## Data Quality Overview

In [None]:

#' Build a markdown overview of data quality problems
#'
#' Interpolates the counts held in `qc_report$summary` into a fixed markdown
#' template: dataset totals first, followed by one bullet per issue category.
#'
#' @param qc_report Output from check_data_quality(); only its `summary`
#'   element is read here.
#' @return A character string (as produced by glue::glue()) containing the
#'   markdown-formatted summary
format_quality_summary <- function(qc_report) {
  # The template placeholders refer to the summary list as `s`, so the local
  # binding must keep that name.
  s <- qc_report$summary

  summary_template <- "
Dataset contains **{s$total_rows}** rows from **{s$total_references}** references across **{s$total_sites}** sites.

**Issues identified:**

- **Measurements**: {s$n_rows_missing_measurements} rows ({s$n_refs_missing_measurements} references) missing all measurement data (value, LOQ, and LOD)
- **Sample size**: {s$n_rows_missing_n} rows ({s$n_refs_missing_n} references) missing or zero sample size
- **Methods**: {s$n_rows_missing_methods} rows ({s$n_refs_missing_methods} references) missing method information
- **Uncertainty**: {s$n_rows_missing_uncertainty} rows ({s$n_refs_missing_uncertainty} references) missing uncertainty type
- **Site data**: {s$n_sites_missing_data} sites ({s$n_refs_missing_sites} references) missing location/geographic data
- **Biota data**: {s$n_rows_missing_biota} rows ({s$n_refs_missing_biota} references) missing species/tissue information
"

  # glue() resolves the {s$...} placeholders in this function's environment.
  glue::glue(summary_template)
}

# Print the markdown summary built from the literature QC report.
# NOTE(review): `literature_qc` is not created in the visible code; presumably
# it is loaded from the targets workflow beforehand -- confirm.
cat(format_quality_summary(literature_qc))


Dataset contains **48972** rows from **32** references across **5142** sites.

**Issues identified:**

- **Measurements**: 52 rows (2 references) missing all measurement data (value, LOQ, and LOD)
- **Sample size**: 49 rows (1 references) missing or zero sample size
- **Methods**: 49 rows (1 references) missing method information
- **Uncertainty**: 48842 rows (18 references) missing uncertainty type
- **Site data**: 4 sites (2 references) missing location/geographic data
- **Biota data**: 1 rows (1 references) missing species/tissue information

> **Missing Data**
>
> ### Missing Measurements
>
> Rows where measured value, LOQ, and LOD are all missing.
>
> Known issues:
>
> -   2025PelkonenEnvironmentalImpactOf does not report LOD/LOQ values in main paper or ESI.
>
> ``` r
>
> # Print a short note when nothing was flagged; otherwise render the flagged
> # references as a DT table, dropping the sample_ids column before display.
> if (nrow(literature_qc$missing_measurements) == 0) {
>   cat("*No missing measurements found.*")
> } else {
>   literature_qc$missing_measurements |>
>     dplyr::select(-sample_ids) |>
>     DT::datatable(
>       options = list(pageLength = 10),
>       caption = "References with missing measurement data"
>     )
> }
> ```
>
> <div class="datatables html-widget html-fill-item" id="htmlwidget-fd29a1f19d0310898eb2" style="width:100%;height:auto;"></div>
> <script type="application/json" data-for="htmlwidget-fd29a1f19d0310898eb2">{"x":{"filter":"none","vertical":false,"caption":"<caption>References with missing measurement data<\/caption>","data":[["1","2"],["","2025PelkonenEnvironmentalImpactOf"],[49,3]],"container":"<table class=\"display\">\n  <thead>\n    <tr>\n      <th> <\/th>\n      <th>REFERENCE_ID<\/th>\n      <th>n_rows<\/th>\n    <\/tr>\n  <\/thead>\n<\/table>","options":{"pageLength":10,"columnDefs":[{"className":"dt-right","targets":2},{"orderable":false,"targets":0},{"name":" ","targets":0},{"name":"REFERENCE_ID","targets":1},{"name":"n_rows","targets":2}],"order":[],"autoWidth":false,"orderClasses":false},"selection":{"mode":"multiple","selected":null,"target":"row","selectable":null}},"evals":[],"jsHooks":[]}</script>
>
> ### Missing Sample Size
>
> Rows where MEASURED_N is missing or zero.
>
> ``` r
>
> # Print a short note when nothing was flagged; otherwise render the flagged
> # references as a DT table, dropping the sample_ids column before display.
> if (nrow(literature_qc$missing_n) == 0) {
>   cat("*No missing sample sizes found.*")
> } else {
>   literature_qc$missing_n |>
>     dplyr::select(-sample_ids) |>
>     DT::datatable(
>       options = list(pageLength = 10),
>       caption = "References with missing sample size"
>     )
> }
> ```
>
> <div class="datatables html-widget html-fill-item" id="htmlwidget-d9705cafc9ad44405d9a" style="width:100%;height:auto;"></div>
> <script type="application/json" data-for="htmlwidget-d9705cafc9ad44405d9a">{"x":{"filter":"none","vertical":false,"caption":"<caption>References with missing sample size<\/caption>","data":[["1"],[""],[49]],"container":"<table class=\"display\">\n  <thead>\n    <tr>\n      <th> <\/th>\n      <th>REFERENCE_ID<\/th>\n      <th>n_rows<\/th>\n    <\/tr>\n  <\/thead>\n<\/table>","options":{"pageLength":10,"columnDefs":[{"className":"dt-right","targets":2},{"orderable":false,"targets":0},{"name":" ","targets":0},{"name":"REFERENCE_ID","targets":1},{"name":"n_rows","targets":2}],"order":[],"autoWidth":false,"orderClasses":false},"selection":{"mode":"multiple","selected":null,"target":"row","selectable":null}},"evals":[],"jsHooks":[]}</script>
>
> ### Missing Methods
>
> Rows missing any of: analytical protocol, extraction protocol, fractionation protocol, or sampling protocol.
>
> ``` r
>
>
> # Print a short note when nothing was flagged; otherwise turn each missing_*
> # flag into a "Missing" label (NA when the flag is not TRUE) and show the
> # labelled columns per reference.
> if (nrow(literature_qc$missing_methods) == 0) {
>   cat("*No missing methods found.*")
> } else {
>   display_df <- literature_qc$missing_methods |>
>     dplyr::mutate(
>       analytical_issue = dplyr::if_else(
>         missing_analytical, "Missing", NA_character_
>       ),
>       extraction_issue = dplyr::if_else(
>         missing_extraction, "Missing", NA_character_
>       ),
>       fractionation_issue = dplyr::if_else(
>         missing_fractionation, "Missing", NA_character_
>       ),
>       sampling_issue = dplyr::if_else(
>         missing_sampling, "Missing", NA_character_
>       )
>     ) |>
>     dplyr::select(
>       REFERENCE_ID,
>       n_rows,
>       analytical_issue,
>       extraction_issue,
>       fractionation_issue,
>       sampling_issue
>     )
>
>   display_df |>
>     DT::datatable(
>       options = list(pageLength = 10),
>       caption = "References with missing method information"
>     )
> }
> ```
>
> <div class="datatables html-widget html-fill-item" id="htmlwidget-21d983b383c257fc9987" style="width:100%;height:auto;"></div>
> <script type="application/json" data-for="htmlwidget-21d983b383c257fc9987">{"x":{"filter":"none","vertical":false,"caption":"<caption>References with missing method information<\/caption>","data":[["1"],[""],[49],["Missing"],["Missing"],["Missing"],["Missing"]],"container":"<table class=\"display\">\n  <thead>\n    <tr>\n      <th> <\/th>\n      <th>REFERENCE_ID<\/th>\n      <th>n_rows<\/th>\n      <th>analytical_issue<\/th>\n      <th>extraction_issue<\/th>\n      <th>fractionation_issue<\/th>\n      <th>sampling_issue<\/th>\n    <\/tr>\n  <\/thead>\n<\/table>","options":{"pageLength":10,"columnDefs":[{"className":"dt-right","targets":2},{"orderable":false,"targets":0},{"name":" ","targets":0},{"name":"REFERENCE_ID","targets":1},{"name":"n_rows","targets":2},{"name":"analytical_issue","targets":3},{"name":"extraction_issue","targets":4},{"name":"fractionation_issue","targets":5},{"name":"sampling_issue","targets":6}],"order":[],"autoWidth":false,"orderClasses":false},"selection":{"mode":"multiple","selected":null,"target":"row","selectable":null}},"evals":[],"jsHooks":[]}</script>
>
> ### Missing Uncertainty
>
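> Rows where UNCERTAINTY_TYPE or UNCERTAINTY\_\*\_STANDARD are missing or problematic: type not reported or not relevant, a bound of zero, or an upper bound below (or a lower bound above) the reported average value.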
>
> ``` r
>
> # Print a short note when nothing was flagged; otherwise collapse the
> # per-reference flags into one label per column and show them in a table.
> if (nrow(literature_qc$missing_uncertainty) == 0) {
>   cat("*No uncertainty issues found.*")
> } else {
>   display_df <- literature_qc$missing_uncertainty |>
>     dplyr::mutate(
>       # case_when() takes the first condition that is TRUE, so "Missing"
>       # takes precedence over the other labels in each column.
>       uncertainty_type_issue = dplyr::case_when(
>         type_missing ~ "Missing",
>         type_not_reported ~ "Not reported",
>         type_not_relevant ~ "Not relevant",
>         .default = NA_character_
>       ),
>       upper_bound_issue = dplyr::case_when(
>         upper_missing ~ "Missing",
>         upper_zero ~ "Zero",
>         upper_below_value ~ "< Average",
>         .default = NA_character_
>       ),
>       lower_bound_issue = dplyr::case_when(
>         lower_missing ~ "Missing",
>         lower_zero ~ "Zero",
>         lower_above_value ~ "> Average",
>         .default = NA_character_
>       )
>     ) |>
>     dplyr::select(
>       REFERENCE_ID,
>       n_rows,
>       uncertainty_type_issue,
>       upper_bound_issue,
>       lower_bound_issue
>     )
>
>   display_df |>
>     DT::datatable(
>       options = list(pageLength = 10),
>       caption = "References with missing or problematic uncertainty"
>     )
> }
> ```
>
> <div class="datatables html-widget html-fill-item" id="htmlwidget-dc03f011351ef1f30bcf" style="width:100%;height:auto;"></div>
> <script type="application/json" data-for="htmlwidget-dc03f011351ef1f30bcf">{"x":{"filter":"none","vertical":false,"caption":"<caption>References with missing or problematic uncertainty<\/caption>","data":[["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18"],["","2000JulshamnTraceElementLevels","2002HellandTransportAndSedimentation","2006ZaukeHeavyMetalsIn","2011CaiContentAndDistribution","2018ErvikToxicAndEssential","2019SchaanningBenthicCommunityStatus","2019SimonsenLabilityOfToxic","2019SndergaardGreenSeaUrchins","2021LehmannKonEffectsOfBiotransport","2022KohlerDistributionPatternOf","2022ParoliniTrendsAndPotential","2022PedersenImpactsOfClimate","2024HoffBiogeochemicalImpactOf","2024IglikowskaTraceMetalsIn","2025PelkonenEnvironmentalImpactOf","UnknownReference","Vannmilj√∏Copper2010-2025"],[49,18,48,11,25,8,6,140,9,1,25,41,11,42,20,3,1,48384],["Missing","Not reported","Not relevant","Missing","Not relevant","Not relevant","Not relevant","Not reported","Not reported",null,"Not relevant","Not reported","Not reported","Not relevant","Not relevant","Not reported",null,"Not reported"],["Missing","Missing","Missing",null,"Missing","Missing","Missing","Missing","Missing",null,"Missing","Missing","Missing","Missing","Missing","Missing","Missing","Missing"],["Missing","Missing","Missing",null,"Missing","Missing","Missing","Missing","Missing","Zero","Missing","Missing","Missing","Missing","Missing","Missing","Missing","Missing"]],"container":"<table class=\"display\">\n  <thead>\n    <tr>\n      <th> <\/th>\n      <th>REFERENCE_ID<\/th>\n      <th>n_rows<\/th>\n      <th>uncertainty_type_issue<\/th>\n      <th>upper_bound_issue<\/th>\n      <th>lower_bound_issue<\/th>\n    <\/tr>\n  <\/thead>\n<\/table>","options":{"pageLength":10,"columnDefs":[{"className":"dt-right","targets":2},{"orderable":false,"targets":0},{"name":" 
","targets":0},{"name":"REFERENCE_ID","targets":1},{"name":"n_rows","targets":2},{"name":"uncertainty_type_issue","targets":3},{"name":"upper_bound_issue","targets":4},{"name":"lower_bound_issue","targets":5}],"order":[],"autoWidth":false,"orderClasses":false},"selection":{"mode":"multiple","selected":null,"target":"row","selectable":null}},"evals":[],"jsHooks":[]}</script>
>
> ### Missing Site Data
>
> Sites missing coordinates, site name, geographic feature or sub-feature, country, or ocean information.
>
> ``` r
>
> # Print a short note when no sites were flagged; otherwise render the flagged
> # sites as a DT table.
> if (nrow(literature_qc$missing_sites) == 0) {
>   cat("*No missing site data found.*")
> } else {
>   literature_qc$missing_sites |>
>     DT::datatable(
>       options = list(pageLength = 10),
>       caption = "Sites with missing geographic data"
>     )
> }
> ```
>
> <div class="datatables html-widget html-fill-item" id="htmlwidget-55404dee5516c6fd22b8" style="width:100%;height:auto;"></div>
> <script type="application/json" data-for="htmlwidget-55404dee5516c6fd22b8">{"x":{"filter":"none","vertical":false,"caption":"<caption>Sites with missing geographic data<\/caption>","data":[["1","2","3","4"],["","2017RasmussenEffectsOfIndustrial","2017RasmussenEffectsOfIndustrial","2017RasmussenEffectsOfIndustrial"],["","seafoodprocessing_Atlantic_salmon","seafoodprocessing_Greenland_halibut","seafoodprocessing_Greenland_prawns"],[49,1,1,1],[true,true,true,true],[true,true,true,true],[true,true,true,true],[true,true,true,true],[true,true,true,true],[true,true,true,true]],"container":"<table class=\"display\">\n  <thead>\n    <tr>\n      <th> <\/th>\n      <th>REFERENCE_ID<\/th>\n      <th>SITE_CODE<\/th>\n      <th>n_rows<\/th>\n      <th>missing_coords<\/th>\n      <th>missing_name<\/th>\n      <th>missing_feature<\/th>\n      <th>missing_feature_sub<\/th>\n      <th>missing_country<\/th>\n      <th>missing_ocean<\/th>\n    <\/tr>\n  <\/thead>\n<\/table>","options":{"pageLength":10,"columnDefs":[{"className":"dt-right","targets":3},{"orderable":false,"targets":0},{"name":" ","targets":0},{"name":"REFERENCE_ID","targets":1},{"name":"SITE_CODE","targets":2},{"name":"n_rows","targets":3},{"name":"missing_coords","targets":4},{"name":"missing_name","targets":5},{"name":"missing_feature","targets":6},{"name":"missing_feature_sub","targets":7},{"name":"missing_country","targets":8},{"name":"missing_ocean","targets":9}],"order":[],"autoWidth":false,"orderClasses":false},"selection":{"mode":"multiple","selected":null,"target":"row","selectable":null}},"evals":[],"jsHooks":[]}</script>
>
> ### Missing Biota Data
>
> Biota samples missing species, tissue, lifestage, gender, or species group.
>
> ``` r
>
> # Print a short note when nothing was flagged; otherwise render the flagged
> # references as a DT table, dropping the sample_ids column before display.
> if (nrow(literature_qc$missing_biota) == 0) {
>   cat("*No missing biota data found.*")
> } else {
>   literature_qc$missing_biota |>
>     dplyr::select(-sample_ids) |>
>     DT::datatable(
>       options = list(pageLength = 10),
>       caption = "References with missing biota information"
>     )
> }
> ```
>
> <div class="datatables html-widget html-fill-item" id="htmlwidget-8940ccb3a098ef8d28ad" style="width:100%;height:auto;"></div>
> <script type="application/json" data-for="htmlwidget-8940ccb3a098ef8d28ad">{"x":{"filter":"none","vertical":false,"caption":"<caption>References with missing biota information<\/caption>","data":[["1"],["2018ErvikToxicAndEssential"],[1],[true],[true],[true],[true],[true]],"container":"<table class=\"display\">\n  <thead>\n    <tr>\n      <th> <\/th>\n      <th>REFERENCE_ID<\/th>\n      <th>n_rows<\/th>\n      <th>missing_species<\/th>\n      <th>missing_tissue<\/th>\n      <th>missing_lifestage<\/th>\n      <th>missing_gender<\/th>\n      <th>missing_group<\/th>\n    <\/tr>\n  <\/thead>\n<\/table>","options":{"pageLength":10,"columnDefs":[{"className":"dt-right","targets":2},{"orderable":false,"targets":0},{"name":" ","targets":0},{"name":"REFERENCE_ID","targets":1},{"name":"n_rows","targets":2},{"name":"missing_species","targets":3},{"name":"missing_tissue","targets":4},{"name":"missing_lifestage","targets":5},{"name":"missing_gender","targets":6},{"name":"missing_group","targets":7}],"order":[],"autoWidth":false,"orderClasses":false},"selection":{"mode":"multiple","selected":null,"target":"row","selectable":null}},"evals":[],"jsHooks":[]}</script>