-
Notifications
You must be signed in to change notification settings - Fork 11
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Redundant columns? #78
Comments
|
If you have a normalized collection of tables you can always reconstruct a master table from the pieces. See A normalized database avoids mistakes that will surely come from trying to update mulitple tables in the exact same way. Even a typo will make the merge brake -- resulting in a missing value somewhere. library(tidyverse)
library(allodb)
redundant <- c(
"dependent_variable_biomass_component",
"allometry_specificity",
"dbh_min_cm",
"dbh_max_cm"
)
sitespecies_normalized <- sitespecies[ , setdiff(names(sitespecies), redundant)]
names(sitespecies_normalized)
#> [1] "site" "family" "species"
#> [4] "species_code" "life_form" "equation_group"
#> [7] "equation_id" "equation_taxa" "notes_on_species"
#> [10] "wsg_id" "wsg_specificity"
all <- full_join(equations, sitespecies_normalized, by = "equation_id")
names(all)
#> [1] "ref_id"
#> [2] "equation_allometry"
#> [3] "equation_id"
#> [4] "equation_form"
#> [5] "dependent_variable_biomass_component"
#> [6] "independent_variable"
#> [7] "allometry_specificity"
#> [8] "geographic_area"
#> [9] "dbh_min_cm"
#> [10] "dbh_max_cm"
#> [11] "sample_size"
#> [12] "dbh_units_original"
#> [13] "biomass_units_original"
#> [14] "allometry_development_method"
#> [15] "regression_model"
#> [16] "other_equations_tested"
#> [17] "log_biomass"
#> [18] "bias_corrected"
#> [19] "bias_correction_factor"
#> [20] "notes_fitting_model"
#> [21] "original_data_availability"
#> [22] "warning"
#> [23] "site"
#> [24] "family"
#> [25] "species"
#> [26] "species_code"
#> [27] "life_form"
#> [28] "equation_group"
#> [29] "equation_taxa"
#> [30] "notes_on_species"
#> [31] "wsg_id"
#> [32] "wsg_specificity"
# My approach is to build a wrapper that reconstructs a master table from
# the normalized tables (with no redundant columns)
master <- function() {
eqn_site_sp <- full_join(equations, sitespecies_normalized, by = "equation_id")
full_join(eqn_site_sp, sites_info, by = "site")
}
glimpse(master())
#> Observations: 755
#> Variables: 43
#> $ ref_id <chr> "jenkins_2004_cdod", "jen...
#> $ equation_allometry <chr> "10^(1.1891+1.419*(log10(...
#> $ equation_id <chr> "2060ea", "2060ea", "a4d8...
#> $ equation_form <chr> "10^(a+b*(log10(dbh^c)))"...
#> $ dependent_variable_biomass_component <chr> "Total aboveground biomas...
#> $ independent_variable <chr> "DBH", "DBH", "DBH", "DBH...
#> $ allometry_specificity <chr> "Species", "Species", "Sp...
#> $ geographic_area <chr> "Ohio, USA", "Ohio, USA",...
#> $ dbh_min_cm <chr> "0.21", "0.21", "0.19", "...
#> $ dbh_max_cm <chr> "5.73", "5.73", "3.86", "...
#> $ sample_size <chr> NA, NA, NA, NA, NA, NA, N...
#> $ dbh_units_original <chr> "cm", "cm", "cm", "cm", "...
#> $ biomass_units_original <chr> "g", "g", "g", "g", "g", ...
#> $ allometry_development_method <chr> "harvest", "harvest", "ha...
#> $ regression_model <chr> NA, NA, NA, NA, NA, NA, N...
#> $ other_equations_tested <chr> NA, NA, NA, NA, NA, NA, N...
#> $ log_biomass <chr> NA, NA, NA, NA, NA, NA, N...
#> $ bias_corrected <chr> "1", "1", "1", "1", "1", ...
#> $ bias_correction_factor <chr> "1.056", "1.056", "1.016"...
#> $ notes_fitting_model <chr> NA, NA, NA, NA, NA, NA, N...
#> $ original_data_availability <chr> NA, NA, NA, NA, NA, NA, N...
#> $ warning <chr> NA, NA, NA, NA, NA, NA, N...
#> $ site <chr> "Lilly Dicky", "Tyson", "...
#> $ family <chr> "Sapindaceae", "Sapindace...
#> $ species <chr> "Acer rubrum", "Acer rubr...
#> $ species_code <chr> "316", "acerub", "318", "...
#> $ life_form <chr> "Tree", "Tree", "Tree", "...
#> $ equation_group <chr> "Expert", "Expert", "Expe...
#> $ equation_taxa <chr> "Acer rubrum", "Acer rubr...
#> $ notes_on_species <chr> NA, NA, NA, NA, NA, NA, N...
#> $ wsg_id <chr> NA, NA, NA, NA, NA, NA, N...
#> $ wsg_specificity <chr> NA, NA, NA, NA, NA, NA, N...
#> $ id <chr> NA, NA, NA, NA, NA, NA, N...
#> $ Site <chr> NA, NA, NA, NA, NA, NA, N...
#> $ lat <chr> NA, NA, NA, NA, NA, NA, N...
#> $ long <chr> NA, NA, NA, NA, NA, NA, N...
#> $ UTM_Zone <chr> NA, NA, NA, NA, NA, NA, N...
#> $ UTM_X <chr> NA, NA, NA, NA, NA, NA, N...
#> $ UTM_Y <chr> NA, NA, NA, NA, NA, NA, N...
#> $ intertropical <chr> NA, NA, NA, NA, NA, NA, N...
#> $ size.ha <chr> NA, NA, NA, NA, NA, NA, N...
#> $ E <chr> NA, NA, NA, NA, NA, NA, N...
#> $ wsg.site.name <chr> NA, NA, NA, NA, NA, NA, N... Created on 2019-03-15 by the reprex package (v0.2.1) |
Let me show a toy example for clarity library(tidyverse)
eqn <- tribble(
~id, ~eqn, ~component,
01, "a + b", "whole",
02, "a + c", "part",
)
site <- tribble(
~id, ~site,
01, "scbi",
01, "bci",
02, "luquillo",
)
master <- function() full_join(eqn, site, by = "id")
eqn
#> # A tibble: 2 x 3
#> id eqn component
#> <dbl> <chr> <chr>
#> 1 1 a + b whole
#> 2 2 a + c part
site
#> # A tibble: 3 x 2
#> id site
#> <dbl> <chr>
#> 1 1 scbi
#> 2 1 bci
#> 3 2 luquillo
master()
#> # A tibble: 3 x 4
#> id eqn component site
#> <dbl> <chr> <chr> <chr>
#> 1 1 a + b whole scbi
#> 2 1 a + b whole bci
#> 3 2 a + c part luquillo Created on 2019-03-15 by the reprex package (v0.2.1) |
Allometric equation category that can be selected by the user. "Generic" equations are the current best equations or equation groups that can be applied to any site in a broad geographic area (e.g., pantropical, continental) and to any taxa at that site. "Expert" equations are the current best equations for specific taxa at a specific site, as identified by a botanist or forest ecologist. "Other" equations are potentially relevant (i.e., may be selected as best under different criteria) or were previously classified as best available. @maurolepore I get your point....it is nice to have the taxa specificity in both tables but we also want to keep the best practice as you recommend. |
Most users will need only the warpper, say For example, here are a bunch of tables from the SQL-database of ForestGEO. Most of them users don't need to worry about because they can use "joint" tables such as ViewFullTable. library(tidyverse)
fgeo.data::data_dictionary %>%
as_tibble() %>%
pull(table) %>%
unique()
#> [1] "Census" "CensusQuadrat"
#> [3] "Coordinates" "Country"
#> [5] "CurrentObsolete" "DataCollection"
#> [7] "DBH" "DBHAttributes"
#> [9] "Family" "Features"
#> [11] "FeatureTypes" "Genus"
#> [13] "Log" "Measurement"
#> [15] "MeasurementAttributes" "MeasurementType"
#> [17] "Personnel" "PersonnelRole"
#> [19] "Quadrat" "Reference"
#> [21] "RemeasAttribs" "Remeasurement"
#> [23] "RoleReference" "Site"
#> [25] "Species" "SpeciesInventory"
#> [27] "Specimen" "Stem"
#> [29] "SubSpecies" "Tree"
#> [31] "TreeAttributes" "TreeTaxChange"
#> [33] "TSMAttributes" "ViewFullTable"
#> [35] "ViewTaxonomy" Created on 2019-03-15 by the reprex package (v0.2.1) |
Here is an example of the problem I am trying to show. The column library(tidyverse)
allodb::sitespecies_metadata %>%
filter(Field == "allometry_specificity") %>%
pull(Description)
#> [1] "Refers to the specific taxonomic level for which the biomass equation was developed (species, genus, family or plant group)"
allodb::equations_metadata %>%
filter(Field == "allometry_specificity") %>%
pull(Description)
#> [1] "Specific taxonomic level for which the equation was developed (species, genus, family or plant group)" . library(tidyverse)
library(allodb)
shared_cols <- c(
"equation_id",
"dependent_variable_biomass_component",
"allometry_specificity",
"dbh_min_cm",
"dbh_max_cm"
)
bad_id <- "390a5a"
equations %>%
filter(equation_id == bad_id) %>%
select(shared_cols)
#> # A tibble: 1 x 5
#> equation_id dependent_variable_b~ allometry_specif~ dbh_min_cm dbh_max_cm
#> <chr> <chr> <chr> <chr> <chr>
#> 1 390a5a Total aboveground bi~ Species 0.3 1
sitespecies %>%
filter(equation_id == bad_id) %>%
select(shared_cols)
#> # A tibble: 1 x 5
#> equation_id dependent_variable_b~ allometry_specif~ dbh_min_cm dbh_max_cm
#> <chr> <chr> <chr> <chr> <chr>
#> 1 390a5a Total aboveground bi~ Genus 0.3 1 . Where the two tables missmatch, library(tidyverse)
library(allodb)
shared_cols <- c(
"equation_id",
"dependent_variable_biomass_component",
"allometry_specificity",
"dbh_min_cm",
"dbh_max_cm"
)
bad_id <- "390a5a"
equations %>%
full_join(sitespecies) %>%
filter(equation_id == bad_id) %>%
# filter(is.na(dbh_units_original)) %>%
select(equation_id, allometry_specificity, matches("unit"))
#> Joining, by = c("equation_id", "dependent_variable_biomass_component", "allometry_specificity", "dbh_min_cm", "dbh_max_cm")
#> # A tibble: 2 x 4
#> equation_id allometry_specificity dbh_units_origin~ biomass_units_origin~
#> <chr> <chr> <chr> <chr>
#> 1 390a5a Species cm g
#> 2 390a5a Genus <NA> <NA> . |
There was an error in the eq table. I just fixed it. If that's the only shared column betwen |
Thanks! It's practially impossible to not make mistakes if the data is duplicated.
Here are all the duplicated columns, the only one that should be in the two tables is the key, i.e. intersect(
names(allodb::equations),
names(allodb::sitespecies)
)
#> [1] "equation_id"
#> [2] "dependent_variable_biomass_component"
#> [3] "allometry_specificity"
#> [4] "dbh_min_cm"
#> [5] "dbh_max_cm" |
I eliminated the redundant columns form the sitespecies tables. |
Awesome, thanks! I just finished an exploration of the problems. It should now be irrelevant, but I have it ready so I'll share it so the problem you have just solved becomes clearer. I'll update allodb and use this analysis to check it's now all good. library(tidyverse)
#> Warning: package 'purrr' was built under R version 3.5.3
library(allodb)
pick_different <- function(matches) {
equations %>%
full_join(sitespecies, by = "equation_id") %>%
select(equation_id, matches(matches)) %>%
mutate(is_different = !map2_lgl(.[[2]], .[[3]], identical)) %>%
filter(is_different) %>%
unique()
}
intersect(names(equations), names(sitespecies))
#> [1] "equation_id"
#> [2] "dependent_variable_biomass_component"
#> [3] "allometry_specificity"
#> [4] "dbh_min_cm"
#> [5] "dbh_max_cm"
# Good
pick_different("allometry_specificity")
#> # A tibble: 0 x 4
#> # ... with 4 variables: equation_id <chr>, allometry_specificity.x <chr>,
#> # allometry_specificity.y <chr>, is_different <lgl>
# Bad
pick_different("dependent_variable_biomass_component")
#> # A tibble: 3 x 4
#> equation_id dependent_variable_biom~ dependent_variable_bio~ is_different
#> <chr> <chr> <chr> <lgl>
#> 1 333c34 Stem (wood and bark) Stem and branches (liv~ TRUE
#> 2 e9d686 Stem (wood and bark) Stem and branches (liv~ TRUE
#> 3 8eca60 Whole tree (above stump) Stem and branches (liv~ TRUE
# Bad
pick_different("dbh_min_cm")
#> # A tibble: 1 x 4
#> equation_id dbh_min_cm.x dbh_min_cm.y is_different
#> <chr> <chr> <chr> <lgl>
#> 1 122ba6 25 2.5 TRUE
# Bad
pick_different("dbh_max_cm")
#> # A tibble: 1 x 4
#> equation_id dbh_max_cm.x dbh_max_cm.y is_different
#> <chr> <chr> <chr> <lgl>
#> 1 122ba6 550 55 TRUE Created on 2019-03-18 by the reprex package (v0.2.1) |
# Awesome!
intersect(names(allodb::equations), names(allodb::sitespecies))
#> [1] "equation_id" Created on 2019-03-18 by the reprex package (v0.2.1) |
@gonzalezeb,
Some columns seem redundant -- they appear in more than one table. Can you confirm the redundance and try eliminate it? Keeping the duplicated columns in sync is prone to errors that result in missing values inserted unexpectedly when two tables are joint together.
For example, here I expected the only shared column to be
equation_id
:This example shows that merging by
equation_id
(only) results in duplicated columns:Created on 2019-03-15 by the reprex package (v0.2.1)
The text was updated successfully, but these errors were encountered: