<a href="https://colab.research.google.com/github/samsoe/mpg_notebooks/blob/master/yvp_vegetation_cover_WRANGLE_2020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*R Notebook*

# README

* [Copy of YVP - Vegetation Cover Data - Wrangle](https://colab.research.google.com/drive/1xsVJiE5Nl5SIFqjdAFvq58hcntVrn4Fw?usp=sharing)
* [Readme fixed grid plot vegetation data](https://docs.google.com/document/d/16-Aq8u9Rudd78fSzfjvpCXyQgE-BstC-d2PjYfmLtcw/edit?usp=sharing)

# Load Tools

In [None]:
# Package and library installation
# Install whichever required packages are missing, then attach every one of them.
packages_needed <- c("tidyverse", "gsheet", "lubridate", "knitr") # comma delimited vector of package names
packages_installed <- packages_needed %in% rownames(installed.packages())

if (any(!packages_installed)) {
  install.packages(packages_needed[!packages_installed])
}
# Loop over the names themselves: `1:length(x)` yields c(1, 0) on an empty
# vector, and `T` can be reassigned, so use a direct loop and `TRUE`.
for (pkg in packages_needed) {
  library(pkg, character.only = TRUE)
}

# Source

## vegetation cover

In [None]:
# Vegetation cover source file: 2020-10-22_yvp_vegetation_cover_SOURCE.csv
# Shareable link: https://drive.google.com/file/d/1PpCpyViLjC1_jNKLT4S7KWbduuwvQ23R/view?usp=sharing
# URL for the same Drive file id, in the form handed to read.csv() below
veg_src <- "https://drive.google.com/uc?id=1PpCpyViLjC1_jNKLT4S7KWbduuwvQ23R"

In [None]:
# Read the raw vegetation cover CSV from the URL in veg_src
df_veg_initial <- read.csv(file = veg_src)

In [None]:
# Show column names, types, and leading values of the raw vegetation cover data
glimpse(df_veg_initial)

Rows: 9,150
Columns: 7
$ plot_num     [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
$ plot_code    [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N…
$ date         [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ subplot      [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,…
$ species_code [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", "COLL…
$ cover_pct    [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2, 15, …
$ comments     [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …


## survey metadata

In [None]:
# Survey metadata source file: 2020-10-22_yvp_survey_metadata_SOURCE.csv
# Shareable link: https://drive.google.com/file/d/19I0quIj8ALzP91VkxxIR-D1PWgXRZ_90/view?usp=sharing
# URL for the same Drive file id, in the form handed to read_csv() below
src_meta <- "https://drive.google.com/uc?id=19I0quIj8ALzP91VkxxIR-D1PWgXRZ_90"

In [None]:
# Read the survey metadata CSV from the URL in src_meta (readr guesses column types)
df_meta_full <- read_csv(file = src_meta)


[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
cols(
  Plot = [31mcol_character()[39m,
  Date = [31mcol_character()[39m,
  Initials = [31mcol_character()[39m,
  `GPS Lat 1` = [32mcol_double()[39m,
  `GPS Long 1` = [32mcol_double()[39m,
  `GPS Lat 2` = [32mcol_double()[39m,
  `GPS Long 2` = [32mcol_double()[39m,
  Azimuth = [32mcol_double()[39m,
  `comments 2020` = [31mcol_character()[39m,
  `Comments 2019` = [31mcol_character()[39m,
  `Comments 2018` = [31mcol_character()[39m,
  `Comments 2017` = [31mcol_character()[39m
)




In [None]:
# Convert the character Date column to Date values with lubridate::mdy()
df_meta_full <- df_meta_full %>%
  mutate(Date = mdy(Date))

In [None]:
# Keep only the survey records dated in 2020 and the two columns used later:
# Plot (join key) and Date (survey date).
# `Date` is referenced as a column of the piped data rather than through
# `df_meta_full$Date`, so the filter stays aligned with whatever is piped in.
df_meta <- df_meta_full %>%
  filter(year(Date) == 2020) %>%
  select(Plot, Date)
glimpse(df_meta)

Rows: 58
Columns: 2
$ Plot [3m[90m<chr>[39m[23m "NA294", "NB294", "NC294", "N324", "N321", "NA292", "NB292", "14…
$ Date [3m[90m<date>[39m[23m 2020-05-09, 2020-05-09, 2020-05-09, 2020-05-09, 2020-05-13, 202…


## vegetation metadata

In [None]:
# Install bigrquery only when it is not already available, then attach it.
# An unconditional install.packages() reinstalls the package on every run.
if (!requireNamespace("bigrquery", quietly = TRUE)) {
  install.packages("bigrquery")
}
library(bigrquery)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [None]:
# Name the project that bq_test_project() should return
Sys.setenv(BIGQUERY_TEST_PROJECT = "mpg-data-warehouse")
# BigQuery API key: authenticate with the JSON key file at this path
bq_auth(path = "/content/mpg-data-warehouse-api_key-master.json")
# Project id passed to the query call below
billing <- bq_test_project()

In [None]:
# SQL text for the species metadata lookup: selects the species key and the
# plant code columns from the warehouse species metadata table.
sql_metaveg <- 
  "
  SELECT key_plant_species, key_plant_code
  FROM `mpg-data-warehouse.vegetation_species_metadata.vegetation_species_metadata`
  "

In [None]:
# Run the species metadata query, download the result, and keep it as a data frame
bq_metaveg <- bq_project_query(billing, sql_metaveg)
tb_metaveg <- bq_table_download(bq_metaveg)
df_metaveg <- as.data.frame(tb_metaveg)
glimpse(df_metaveg)

Rows: 754
Columns: 2
$ key_plant_species [3m[90m<int>[39m[23m 360, 13, 26, 53, 738, 75, 76, 746, 83, 88, 86, 87, …
$ key_plant_code    [3m[90m<chr>[39m[23m "NV", "AGRSCA", "ANDGER", "ARIPUR", "BOUCUR", "BOUG…


# Wrangle

## Structure columns

### Plot Code Transformation
The plot code used in the source data is a complex string. It is needed to provide a unique key to each survey location, but because it is a string it is difficult to sort or filter plots. Further, the plot codes used here will be difficult to associate with the extensive grid point metadata stored elsewhere in the MPG Data Warehouse. 

Solution: paste the separate identifiers from the plot code into separate fields, but retain the original character string for internal use.

#### plot_code

In [None]:
# Report the storage type of the plot_code column
typeof(df_veg_initial[["plot_code"]])

#### plot_loc

In [None]:
# Build plot_loc: "N" where plot_code contains the letter "N", NA otherwise.
# Only the listed columns are kept, in this order (date and comments are dropped).
df_loc <- df_veg_initial %>%
  transmute(
    plot_code,
    plot_loc = ifelse(str_detect(plot_code, "N"), "N", NA),
    plot_num,
    subplot,
    species_code,
    cover_pct
  )
glimpse(df_loc)

Rows: 9,150
Columns: 6
$ plot_code    [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N…
$ plot_loc     [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_num     [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
$ subplot      [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,…
$ species_code [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", "COLL…
$ cover_pct    [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2, 15, …


#### plot_rep

In [None]:
# Build plot_rep from the first of "A", "B", "C" (checked in that order) found
# in plot_code; rows whose code contains none of them get NA.
df_rep <- df_loc %>%
  mutate(
    plot_rep = case_when(
      str_detect(plot_code, "A") ~ "A",
      str_detect(plot_code, "B") ~ "B",
      str_detect(plot_code, "C") ~ "C"
    )
  ) %>%
  select(
    plot_code, plot_loc, plot_rep, plot_num,
    subplot, species_code, cover_pct
  )
glimpse(df_rep)

Rows: 9,150
Columns: 7
$ plot_code    [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N…
$ plot_loc     [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_rep     [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ plot_num     [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
$ subplot      [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,…
$ species_code [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", "COLL…
$ cover_pct    [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2, 15, …


#### grid_point

In [None]:
# Rename plot_num to grid_point; the values themselves are untouched
df_grid <- rename(df_rep, grid_point = plot_num)
glimpse(df_grid)

Rows: 9,150
Columns: 7
$ plot_code    [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N…
$ plot_loc     [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_rep     [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ grid_point   [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
$ subplot      [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,…
$ species_code [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", "COLL…
$ cover_pct    [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2, 15, …


### date

In [None]:
# Add a Plot column holding plot_code from its 5th character onward
# (e.g. "YVP N7" -> "N7") so df_grid can be joined to the survey metadata
df_join <- mutate(df_grid, Plot = str_sub(plot_code, start = 5))
glimpse(df_join)

Rows: 9,150
Columns: 8
$ plot_code    [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N…
$ plot_loc     [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_rep     [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ grid_point   [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
$ subplot      [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,…
$ species_code [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", "COLL…
$ cover_pct    [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2, 15, …
$ Plot         [3m[90m<chr>[39m[23m "N7", "N7", "N7", "N7", "N7", "N7", "N7", "N7", "N7", "N…


In [None]:
# date:ISO
# For the 2020 data the survey date comes from the survey metadata: join on
# Plot, then keep the analysis columns with Date renamed to date (Plot is dropped).
df_date <- df_join %>%
  left_join(df_meta, by = "Plot") %>%
  select(
    plot_code, plot_loc, plot_rep, grid_point,
    date = Date,
    subplot, species_code, cover_pct
  )
glimpse(df_date)

Rows: 9,150
Columns: 8
$ plot_code    [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N…
$ plot_loc     [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_rep     [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ grid_point   [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
$ date         [3m[90m<date>[39m[23m 2020-05-31, 2020-05-31, 2020-05-31, 2020-05-31, 2020-05…
$ subplot      [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,…
$ species_code [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", "COLL…
$ cover_pct    [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2, 15, …


### subplot

In [None]:
# Report the storage type of the subplot column
typeof(df_date[["subplot"]])

### key_plant_code

In [None]:
# Rename species_code to key_plant_code, the column name used in df_metaveg
df_code <- rename(df_date, key_plant_code = species_code)
glimpse(df_code)

Rows: 9,150
Columns: 8
$ plot_code      [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP…
$ plot_loc       [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N",…
$ plot_rep       [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ grid_point     [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
$ date           [3m[90m<date>[39m[23m 2020-05-31, 2020-05-31, 2020-05-31, 2020-05-31, 2020-…
$ subplot        [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ key_plant_code [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", "CO…
$ cover_pct      [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2, 15…


## Identify Double Counted Species
In a few instances, a plant species is counted twice in the same survey subplot. This could inflate the cover reported for that species. In these cases, the desired end product is to have just one row for each. Because there is no way to know which value of the two is correct, we will average the two values, treating them empirically as independent, legitimate cover estimates. 

In [None]:
# Per-column summary of the structured data (ranges, quartiles, lengths)
summary(df_code)

  plot_code           plot_loc           plot_rep           grid_point   
 Length:9150        Length:9150        Length:9150        Min.   :  7.0  
 Class :character   Class :character   Class :character   1st Qu.: 64.0  
 Mode  :character   Mode  :character   Mode  :character   Median :209.0  
                                                          Mean   :250.1  
                                                          3rd Qu.:395.0  
                                                          Max.   :571.0  
      date               subplot       key_plant_code       cover_pct     
 Min.   :2020-05-09   Min.   : 1.000   Length:9150        Min.   : 0.000  
 1st Qu.:2020-05-22   1st Qu.: 3.000   Class :character   1st Qu.: 1.000  
 Median :2020-06-05   Median : 6.000   Mode  :character   Median : 1.000  
 Mean   :2020-06-03   Mean   : 5.518                      Mean   : 4.219  
 3rd Qu.:2020-06-17   3rd Qu.: 8.000                      3rd Qu.: 3.000  
 Max.   :2020-07-01   Max.   :10

No NA values in any of the numeric variables

In [None]:
# Confirm that the survey year extracted from date is stored as an integer
typeof(as.integer(year(df_code[["date"]])))

In [None]:
# Find instances where a plant species is counted twice in the same year-plot-subplot combination
# The year is computed from the piped `date` column rather than `df_code$date`,
# so the grouping always matches the rows flowing through the pipe.
# `.groups = "drop"` returns an ungrouped result explicitly.
df_code %>%
  group_by(year = as.integer(year(date)), plot_code, subplot, key_plant_code) %>%
  summarize(counted = n(), .groups = "drop") %>%
  filter(counted > 1) %>%
  arrange(year, plot_code, subplot, desc(counted)) %>%
  print(n = Inf)

`summarise()` regrouping output by 'year', 'plot_code', 'subplot' (override with `.groups` argument)



[90m# A tibble: 12 x 5[39m
    year plot_code subplot key_plant_code counted
   [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m       [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m 1[39m  [4m2[24m020 YVP 144        10 COLLIN               2
[90m 2[39m  [4m2[24m020 YVP 181         9 ARANUT               2
[90m 3[39m  [4m2[24m020 YVP 184         1 PLEMAC               2
[90m 4[39m  [4m2[24m020 YVP 185         1 PRUVIR               2
[90m 5[39m  [4m2[24m020 YVP 22         10 VERVER               2
[90m 6[39m  [4m2[24m020 YVP 479         1 RANREP               2
[90m 7[39m  [4m2[24m020 YVP 62          4 VERVER               2
[90m 8[39m  [4m2[24m020 YVP N324        7 EUPESU               2
[90m 9[39m  [4m2[24m020 YVP N348       10 CYNOFF               2
[90m10[39m  [4m2[24m020 YVP N501        6 EPIBRA               2
[90m11[39m  [4m2[24m020 YVP N7          1 ALYALY               2
[90m12[39m  [4

## Resolve double counted species
Twelve species were counted twice in the same subplot. The dataframe has 9150 records now; after resolving the double counts it should have 9138. 

In [None]:
# Collapse repeated plot/date/subplot/species rows into one row each,
# taking the mean of their cover_pct values.
df_resolve <- df_code %>%
  group_by(
    plot_code, plot_loc, plot_rep, grid_point,
    date, subplot, key_plant_code
  ) %>%
  summarise(cover_pct = mean(cover_pct)) %>%
  ungroup()
glimpse(df_resolve)

`summarise()` regrouping output by 'plot_code', 'plot_loc', 'plot_rep', 'grid_point', 'date', 'subplot' (override with `.groups` argument)



Rows: 9,138
Columns: 8
$ plot_code      [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP…
$ plot_loc       [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ plot_rep       [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ grid_point     [3m[90m<int>[39m[23m 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10…
$ date           [3m[90m<date>[39m[23m 2020-06-27, 2020-06-27, 2020-06-27, 2020-06-27, 2020-…
$ subplot        [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, …
$ key_plant_code [3m[90m<chr>[39m[23m "ALLCER", "ANTE_SP", "BOEC_SP", "BROTEC", "CREINT", "E…
$ cover_pct      [3m[90m<dbl>[39m[23m 1, 1, 1, 0, 0, 2, 10, 30, 5, 1, 0, 1, 2, 1, 20, 2, 0, …


And the result shows the expected number of records. Diagnostics were run on `cover_pct` and they are not shown here, but the quantitative data behaved as expected. Here is the diagnostic code for future reference:

> Histograms show that distributions of cover_pct did not change
```
df_code %>% ggplot(aes(x = cover_pct)) + geom_histogram()
df_resolve %>% ggplot(aes(x = cover_pct)) + geom_histogram()
```
> A correlation of before and after averaging show that cover_pct lines up, and the very few slight deviations produced by averaging are visible
```
one <- df_code %>% group_by(grid_point) %>% summarize(m = mean(cover_pct)) 
two <- df_resolve %>% group_by(grid_point) %>% summarize(m = mean(cover_pct)) 
plot(one$m, two$m)
```

In [None]:
# Summaries before and after averaging the double counts, shown one after the other
print("original data")
df_code %>% summary()
print("transformed data")
df_resolve %>% summary()

[1] "original data"


  plot_code           plot_loc           plot_rep           grid_point   
 Length:9150        Length:9150        Length:9150        Min.   :  7.0  
 Class :character   Class :character   Class :character   1st Qu.: 64.0  
 Mode  :character   Mode  :character   Mode  :character   Median :209.0  
                                                          Mean   :250.1  
                                                          3rd Qu.:395.0  
                                                          Max.   :571.0  
      date               subplot       key_plant_code       cover_pct     
 Min.   :2020-05-09   Min.   : 1.000   Length:9150        Min.   : 0.000  
 1st Qu.:2020-05-22   1st Qu.: 3.000   Class :character   1st Qu.: 1.000  
 Median :2020-06-05   Median : 6.000   Mode  :character   Median : 1.000  
 Mean   :2020-06-03   Mean   : 5.518                      Mean   : 4.219  
 3rd Qu.:2020-06-17   3rd Qu.: 8.000                      3rd Qu.: 3.000  
 Max.   :2020-07-01   Max.   :10

[1] "transformed data"


  plot_code           plot_loc           plot_rep           grid_point   
 Length:9138        Length:9138        Length:9138        Min.   :  7.0  
 Class :character   Class :character   Class :character   1st Qu.: 64.0  
 Mode  :character   Mode  :character   Mode  :character   Median :209.0  
                                                          Mean   :250.1  
                                                          3rd Qu.:395.0  
                                                          Max.   :571.0  
      date               subplot       key_plant_code       cover_pct     
 Min.   :2020-05-09   Min.   : 1.000   Length:9138        Min.   : 0.000  
 1st Qu.:2020-05-22   1st Qu.: 3.000   Class :character   1st Qu.: 1.000  
 Median :2020-06-05   Median : 6.000   Mode  :character   Median : 1.000  
 Mean   :2020-06-03   Mean   : 5.518                      Mean   : 4.223  
 3rd Qu.:2020-06-17   3rd Qu.: 8.000                      3rd Qu.: 3.000  
 Max.   :2020-07-01   Max.   :10

The mean of cover_pct changed slightly, as expected

In [None]:
# rescan for double observations
# An empty result means every year-plot-subplot-species combination is now unique.
# The year is computed from the piped `date` column rather than `df_resolve$date`,
# and `.groups = "drop"` returns an ungrouped result explicitly.
df_resolve %>%
  group_by(year = as.integer(year(date)), plot_code, subplot, key_plant_code) %>%
  summarize(counted = n(), .groups = "drop") %>%
  filter(counted > 1) %>%
  arrange(year, plot_code, subplot, desc(counted)) %>%
  print(n = Inf)

`summarise()` regrouping output by 'year', 'plot_code', 'subplot' (override with `.groups` argument)



[90m# A tibble: 0 x 5[39m
[90m# … with 5 variables: year [3m[90m<int>[90m[23m, plot_code [3m[90m<chr>[90m[23m, subplot [3m[90m<int>[90m[23m,[39m
[90m#   key_plant_code [3m[90m<chr>[90m[23m, counted [3m[90m<int>[90m[23m[39m


## Correct errors in species codes
The species codes used in the source data contain numerous errors, and they also in some cases represent old taxonomy where species names have been revised. This can cause all sorts of problems, like artificially creating new species or making it impossible to join with available species metadata. Several steps must be accomplished here:

1. Trim leading or trailing spaces from the code (this was done in Excel before source CSV files were created)
2. Read in master list of species metadata and query YVP species codes to identify which ones don't align
3. Align the species codes, identify the ones that are wrong and correct them
4. Import the numeric key from the species metadata so that future alignments are easier and errors are less common

In [None]:
# Show the species metadata lookup (numeric key and plant code)
glimpse(df_metaveg)

Rows: 754
Columns: 2
$ key_plant_species [3m[90m<int>[39m[23m 360, 13, 26, 53, 738, 75, 76, 746, 83, 88, 86, 87, …
$ key_plant_code    [3m[90m<chr>[39m[23m "NV", "AGRSCA", "ANDGER", "ARIPUR", "BOUCUR", "BOUG…


### Align species codes and identify mistakes


In [None]:
# List the species codes that have no match in the species metadata,
# with the number of rows carrying each one
df_resolve %>%
  anti_join(df_metaveg, by = "key_plant_code") %>%
  group_by(key_plant_code) %>%
  summarise(n_incorrect = n()) %>%
  kable(format = "pandoc")

`summarise()` ungrouping output (override with `.groups` argument)





key_plant_code    n_incorrect
---------------  ------------
ARAB_SP                     1
FESC_SP                     1

In [None]:
# Update key_plant_codes based on comments 2020-11-03
# ARAB_SP = UNK_FORB
# FESC_SP = UNK_GRAM
# Codes are compared for exact equality: str_detect() matches substrings, so it
# would also rewrite any longer code that merely contains one of these strings.
# All other codes (including NA) pass through unchanged.
# NOTE(review): recoding into an existing code could create a second row for the
# same plot/subplot/species after double counts were resolved; confirm with a rescan.
df_species <- df_resolve %>%
  mutate(
    key_plant_code = case_when(
      key_plant_code == "ARAB_SP" ~ "UNK_FORB",
      key_plant_code == "FESC_SP" ~ "UNK_GRAM",
      TRUE ~ key_plant_code
    )
  )
glimpse(df_species)

Rows: 9,138
Columns: 8
$ plot_code      [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP…
$ plot_loc       [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ plot_rep       [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ grid_point     [3m[90m<int>[39m[23m 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10…
$ date           [3m[90m<date>[39m[23m 2020-06-27, 2020-06-27, 2020-06-27, 2020-06-27, 2020-…
$ subplot        [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, …
$ key_plant_code [3m[90m<chr>[39m[23m "ALLCER", "ANTE_SP", "BOEC_SP", "BROTEC", "CREINT", "E…
$ cover_pct      [3m[90m<dbl>[39m[23m 1, 1, 1, 0, 0, 2, 10, 30, 5, 1, 0, 1, 2, 1, 20, 2, 0, …


In [None]:
# Rescan for incorrect species codes
# An empty table means every code now has a match in the species metadata
df_species %>%
  anti_join(df_metaveg, by = "key_plant_code") %>%
  group_by(key_plant_code) %>%
  distinct(key_plant_code) %>%
  arrange(key_plant_code)

key_plant_code
<chr>


## Incorporate serial key for species codes

In [None]:
# Attach the numeric species key from the species metadata by matching on
# key_plant_code, and place it just before the code in the final column order
df_final <- df_species %>%
  left_join(df_metaveg, by = "key_plant_code") %>%
  select(
    plot_code, plot_loc, plot_rep, grid_point, date, subplot,
    key_plant_species, key_plant_code, cover_pct
  )
glimpse(df_final)

Rows: 9,138
Columns: 9
$ plot_code         [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10", "…
$ plot_loc          [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ plot_rep          [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ grid_point        [3m[90m<int>[39m[23m 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,…
$ date              [3m[90m<date>[39m[23m 2020-06-27, 2020-06-27, 2020-06-27, 2020-06-27, 20…
$ subplot           [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ key_plant_species [3m[90m<int>[39m[23m 16, 37, 780, 82, 163, 230, 232, 233, 250, 274, 84, …
$ key_plant_code    [3m[90m<chr>[39m[23m "ALLCER", "ANTE_SP", "BOEC_SP", "BROTEC", "CREINT",…
$ cover_pct         [3m[90m<dbl>[39m[23m 1, 1, 1, 0, 0, 2, 10, 30, 5, 1, 0, 1, 2, 1, 20, 2, …


In [None]:
# Per-column summary of the final data frame
df_final %>% summary()

  plot_code           plot_loc           plot_rep           grid_point   
 Length:9138        Length:9138        Length:9138        Min.   :  7.0  
 Class :character   Class :character   Class :character   1st Qu.: 64.0  
 Mode  :character   Mode  :character   Mode  :character   Median :209.0  
                                                          Mean   :250.1  
                                                          3rd Qu.:395.0  
                                                          Max.   :571.0  
      date               subplot       key_plant_species key_plant_code    
 Min.   :2020-05-09   Min.   : 1.000   Min.   :  3.0     Length:9138       
 1st Qu.:2020-05-22   1st Qu.: 3.000   1st Qu.:153.0     Class :character  
 Median :2020-06-05   Median : 6.000   Median :286.0     Mode  :character  
 Mean   :2020-06-03   Mean   : 5.518   Mean   :280.7                       
 3rd Qu.:2020-06-17   3rd Qu.: 8.000   3rd Qu.:411.0                       
 Max.   :2020-07-01   Max.

# Output

## Export Wrangled DataFrame to CSV 
Export the full data set so that we can push it to the BQ database




In [None]:
# Output 2020-11-03 ES
# Write df_final to a CSV in the working directory, unless a file with that
# name is already listed there; either way, report what happened.
filename_final <- "yvp_vegetation_cover_WRANGLE-2020.csv"

if (!(filename_final %in% list.files(getwd()))) {
  write.csv(df_final, filename_final, row.names = FALSE)
  cat(filename_final, "written to working directory \n", "working directory:", getwd(), "\n")
} else {
  cat("file already exists in working directory:", filename_final, "\n", "working directory:", getwd(), "\n")
}

yvp_vegetation_cover_WRANGLE-2020.csv written to working directory 
 working directory: /content 


## Push to BigQuery

"yvp_vegetation_cover_FINAL.csv" uploaded manually to BigQuery

## Export field datasheet version
Field datasheets need to have a complete, cumulative species list for each plot recorded in a table, with the cover_pct column set to 0. This allows field techs to change the 0 to some number if the species is found. The date column is blank so that field techs can fill in the appropriate date. Do not include columns that are needed for data analysis, like plot_loc, plot_rep, plot_num, and species_key. 

**Schema for field data sheet**

* plot_num (helps for sorting and finding plots)
* plot_code
* date
* subplot
* species_code
* cover_pct

In [None]:
# Build the field datasheet: one row per distinct plot / subplot / species,
# with an empty date column after plot_code and cover_pct preset to 0.
# distinct() on the four columns replaces the group_by() + distinct() + select()
# sequence and returns an ungrouped table, so no grouping leaks into later steps.
# NOTE(review): `yvp_vegetation_cover_FINAL` is not created in the visible part of
# this notebook; presumably it is loaded elsewhere with plot_num and
# species_code columns. Confirm before running.
field_datasheet <- yvp_vegetation_cover_FINAL %>%
  distinct(plot_num, plot_code, subplot, species_code) %>%
  add_column(date = NA, .after = "plot_code") %>%
  add_column(cover_pct = 0) %>%
  arrange(plot_num, plot_code, subplot, species_code)
glimpse(field_datasheet)

Rows: 8,861
Columns: 6
Groups: plot_num, plot_code, subplot [580]
$ plot_num     [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
$ plot_code    [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N…
$ date         [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ subplot      [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,…
$ species_code [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", "COLL…
$ cover_pct    [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…


In [None]:
# Write the field datasheet to a CSV in the working directory, unless a file
# with that name is already listed there; either way, report what happened.
# The messages name this file (they previously printed filename_final).
filename_field_datasheet <- "yvp_vegetation_cover_field_datasheet_FINAL.csv"

if (filename_field_datasheet %in% list.files(getwd())) {
  cat("file already exists in working directory:", filename_field_datasheet, "\n", "working directory:", getwd(), "\n")
} else {
  # row.names = FALSE keeps R's row index out of the sheet, as in the main CSV export
  write.csv(field_datasheet, filename_field_datasheet, row.names = FALSE)
  cat(filename_field_datasheet, "written to working directory \n", "working directory:", getwd(), "\n")
}

yvp_vegetation_cover_FINAL.csv written to working directory 
 working directory: /content 
