<a href="https://colab.research.google.com/github/samsoe/mpg_notebooks/blob/master/yvp_vegetation_cover_WRANGLE_2020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*R Notebook*

# README

* [Copy of YVP - Vegetation Cover Data - Wrangle](https://colab.research.google.com/drive/1xsVJiE5Nl5SIFqjdAFvq58hcntVrn4Fw?usp=sharing)
* [Readme fixed grid plot vegetation data](https://docs.google.com/document/d/16-Aq8u9Rudd78fSzfjvpCXyQgE-BstC-d2PjYfmLtcw/edit?usp=sharing)

# Load Tools

In [None]:
# Package and library installation
packages_needed = c("tidyverse", "gsheet", "lubridate") # comma delimited vector of package names
packages_installed = packages_needed %in% rownames(installed.packages())

if (any(! packages_installed))
  install.packages(packages_needed[! packages_installed])
for (i in 1:length(packages_needed)) {
  library(packages_needed[i], character.only = T)
}

# Source

### vegetation cover

In [None]:
# 2020-10-22_yvp_vegetation_cover_SOURCE.csv
# https://drive.google.com/file/d/1PpCpyViLjC1_jNKLT4S7KWbduuwvQ23R/view?usp=sharing
veg_src = 'https://drive.google.com/uc?id=1PpCpyViLjC1_jNKLT4S7KWbduuwvQ23R'

In [None]:
df_veg_initial <- read.csv(veg_src)

In [None]:
df_veg_initial %>% glimpse()

Rows: 9,150
Columns: 7
$ plot_num     [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
$ plot_code    [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N…
$ date         [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ subplot      [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,…
$ species_code [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", "COLL…
$ cover_pct    [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2, 15, …
$ comments     [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …


## survey metadata

In [None]:
# 2020-10-22_yvp_survey_metadata_SOURCE.csv
# https://drive.google.com/file/d/19I0quIj8ALzP91VkxxIR-D1PWgXRZ_90/view?usp=sharing
src_meta <- 'https://drive.google.com/uc?id=19I0quIj8ALzP91VkxxIR-D1PWgXRZ_90'

In [None]:
df_meta_full <- read_csv(src_meta)


[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
cols(
  Plot = [31mcol_character()[39m,
  Date = [31mcol_character()[39m,
  Initials = [31mcol_character()[39m,
  `GPS Lat 1` = [32mcol_double()[39m,
  `GPS Long 1` = [32mcol_double()[39m,
  `GPS Lat 2` = [32mcol_double()[39m,
  `GPS Long 2` = [32mcol_double()[39m,
  Azimuth = [32mcol_double()[39m,
  `comments 2020` = [31mcol_character()[39m,
  `Comments 2019` = [31mcol_character()[39m,
  `Comments 2018` = [31mcol_character()[39m,
  `Comments 2017` = [31mcol_character()[39m
)




In [None]:
# cast to date datatype
df_meta_full$Date <- mdy(df_meta_full$Date)

In [None]:
df_meta <- df_meta_full %>%
  filter(year(df_meta_full$Date) == 2020) %>%
  select(Plot, Date) %>%
  glimpse()

Rows: 58
Columns: 2
$ Plot [3m[90m<chr>[39m[23m "NA294", "NB294", "NC294", "N324", "N321", "NA292", "NB292", "14…
$ Date [3m[90m<date>[39m[23m 2020-05-09, 2020-05-09, 2020-05-09, 2020-05-09, 2020-05-13, 202…


## vegetation metadata

In [None]:
install.packages("bigrquery")
library(bigrquery)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘bit’, ‘bit64’, ‘gargle’, ‘rapidjsonr’




In [None]:
# BigQuery API Key
bq_auth(path = "/content/mpg-data-warehouse-api_key-master.json")
Sys.setenv(BIGQUERY_TEST_PROJECT = "mpg-data-warehouse")
billing <- bq_test_project()

In [None]:
sql_metaveg <- 
  "
  SELECT key_plant_species, key_plant_code
  FROM `mpg-data-warehouse.vegetation_species_metadata.vegetation_species_metadata`
  "

In [None]:
bq_metaveg <- bq_project_query(billing, sql_metaveg)
tb_metaveg <- bq_table_download(bq_metaveg)
df_metaveg <- 
  as.data.frame(tb_metaveg) %>% glimpse()

Rows: 754
Columns: 2
$ key_plant_species [3m[90m<int>[39m[23m 360, 13, 26, 53, 738, 75, 76, 746, 83, 88, 86, 87, …
$ key_plant_code    [3m[90m<chr>[39m[23m "NV", "AGRSCA", "ANDGER", "ARIPUR", "BOUCUR", "BOUG…


# Wrangle

## Structure columns

### Plot Code Transformation
The plot code used in the source data is a complex string. It is needed to provide a unique key to each survey location, but because it is a string it is difficult to sort or filter plots. Further, the plot codes used here will be difficult to associate with the extensive grid point metadata stored elsewhere in the MPG Data Warehouse. 

Solution: paste the separate identifers from the plot code into separate fields, but retain the original character string for internal use.

#### plot_code

In [None]:
typeof(df_veg_initial$plot_code)

#### plot_ loc

In [None]:
# detect "N" in 'plot_code' and write to new column 'plot_loc'
# df <- 
df_loc <- df_veg_initial %>%
  mutate(plot_loc = ifelse(str_detect(plot_code, "N"), "N", NA)) %>% 
  select(plot_code, plot_loc, plot_num, subplot, species_code, cover_pct) %>% glimpse()

Rows: 9,150
Columns: 6
$ plot_code    [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N…
$ plot_loc     [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_num     [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
$ subplot      [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,…
$ species_code [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", "COLL…
$ cover_pct    [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2, 15, …


#### plot_rep

In [None]:
# detect "A", "B", "C" characters in plot_code and if present write to 'plot_rep'
df_rep <- df_loc %>%
  mutate(plot_rep = case_when(str_detect(plot_code, "A")~"A",
                              str_detect(plot_code, "B")~"B",
                              str_detect(plot_code, "C")~"C")) %>%
         select(plot_code, plot_loc, plot_rep, plot_num, subplot, species_code, cover_pct) %>% glimpse()

Rows: 9,150
Columns: 7
$ plot_code    [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N…
$ plot_loc     [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_rep     [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ plot_num     [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
$ subplot      [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,…
$ species_code [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", "COLL…
$ cover_pct    [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2, 15, …


#### grid_point

In [None]:
# update variable name to grid_point
df_grid <- df_rep %>% 
  rename(grid_point = plot_num) %>% glimpse()

Rows: 9,150
Columns: 7
$ plot_code    [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N…
$ plot_loc     [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_rep     [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ grid_point   [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
$ subplot      [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,…
$ species_code [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", "COLL…
$ cover_pct    [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2, 15, …


### date

In [None]:
# create Plot in df_grid_point to enable join
df_join <- df_grid %>% 
  mutate(Plot = str_sub(plot_code, 5)) %>% glimpse()

Rows: 9,150
Columns: 8
$ plot_code    [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N…
$ plot_loc     [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_rep     [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ grid_point   [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
$ subplot      [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,…
$ species_code [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", "COLL…
$ cover_pct    [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2, 15, …
$ Plot         [3m[90m<chr>[39m[23m "N7", "N7", "N7", "N7", "N7", "N7", "N7", "N7", "N7", "N…


In [None]:
# date:ISO
# With 2020 data, create via join with survey_metadata
df_date <- df_join %>% 
  left_join(df_meta, by = c("Plot" = "Plot")) %>%
  rename(date = Date) %>%
  select(plot_code, plot_loc, plot_rep, grid_point, date, subplot, species_code, cover_pct) %>% glimpse()

Rows: 9,150
Columns: 8
$ plot_code    [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N…
$ plot_loc     [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_rep     [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ grid_point   [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
$ date         [3m[90m<date>[39m[23m 2020-05-31, 2020-05-31, 2020-05-31, 2020-05-31, 2020-05…
$ subplot      [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,…
$ species_code [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", "COLL…
$ cover_pct    [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2, 15, …


### subplot

In [None]:
typeof(df_date$subplot)

### key_plant_species

This will be imported from the plant species metadata table, and we can use it to join and correct species codes in the future

In [None]:
df_date %>% glimpse()

Rows: 9,150
Columns: 8
$ plot_code    [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N…
$ plot_loc     [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_rep     [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ grid_point   [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
$ date         [3m[90m<date>[39m[23m 2020-05-31, 2020-05-31, 2020-05-31, 2020-05-31, 2020-05…
$ subplot      [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,…
$ species_code [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", "COLL…
$ cover_pct    [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2, 15, …


In [None]:
df_metaveg %>% glimpse()

Rows: 754
Columns: 2
$ key_plant_species [3m[90m<int>[39m[23m 360, 13, 26, 53, 738, 75, 76, 746, 83, 88, 86, 87, …
$ key_plant_code    [3m[90m<chr>[39m[23m "NV", "AGRSCA", "ANDGER", "ARIPUR", "BOUCUR", "BOUG…


In [None]:
df_species <- df_date %>%
  left_join(df_metaveg, by = c("species_code" = "key_plant_code")) %>% 
  select(plot_code, plot_loc, plot_rep, grid_point, date, subplot, key_plant_species, species_code, cover_pct) %>% glimpse()

Rows: 9,150
Columns: 9
$ plot_code         [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "…
$ plot_loc          [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_rep          [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ grid_point        [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
$ date              [3m[90m<date>[39m[23m 2020-05-31, 2020-05-31, 2020-05-31, 2020-05-31, 20…
$ subplot           [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ key_plant_species [3m[90m<int>[39m[23m 5, 20, 82, 90, 113, 153, 187, 202, 233, 266, 286, 3…
$ species_code      [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", …
$ cover_pct         [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2,…


In [None]:
summary(df_species)

  plot_code           plot_loc           plot_rep           grid_point   
 Length:9150        Length:9150        Length:9150        Min.   :  7.0  
 Class :character   Class :character   Class :character   1st Qu.: 64.0  
 Mode  :character   Mode  :character   Mode  :character   Median :209.0  
                                                          Mean   :250.1  
                                                          3rd Qu.:395.0  
                                                          Max.   :571.0  
                                                                         
      date               subplot       key_plant_species species_code      
 Min.   :2020-05-09   Min.   : 1.000   Min.   :  3.0     Length:9150       
 1st Qu.:2020-05-22   1st Qu.: 3.000   1st Qu.:153.0     Class :character  
 Median :2020-06-05   Median : 6.000   Median :286.0     Mode  :character  
 Mean   :2020-06-03   Mean   : 5.518   Mean   :280.7                       
 3rd Qu.:2020-06-17   3rd Qu

In [None]:
# inspect NA key_plant_species
df_species %>%
  filter(is.na(key_plant_species))

plot_code,plot_loc,plot_rep,grid_point,date,subplot,key_plant_species,species_code,cover_pct
<chr>,<chr>,<chr>,<int>,<date>,<int>,<int>,<chr>,<int>
YVP NB294,N,B,294,2020-05-09,8,,ARAB_SP,1
YVP 468,,,468,2020-07-01,3,,FESC_SP,2


### key_plant_code

In [None]:
# rename variable from species_code to key_plant_code
df_code <- df_species %>%
  rename(key_plant_code = species_code) %>% glimpse()

Rows: 9,150
Columns: 9
$ plot_code         [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "…
$ plot_loc          [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_rep          [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ grid_point        [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
$ date              [3m[90m<date>[39m[23m 2020-05-31, 2020-05-31, 2020-05-31, 2020-05-31, 20…
$ subplot           [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ key_plant_species [3m[90m<int>[39m[23m 5, 20, 82, 90, 113, 153, 187, 202, 233, 266, 286, 3…
$ key_plant_code    [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", …
$ cover_pct         [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2,…


## Identify Double Counted Species
In a few instances, a plant species is counted twice in the same survey subplot. This could inflate the cover reported for that species. In these cases, the desired end product is to have just one row for each. When the reported percent cover differs between repeated entries, we cannot tell which one is correct. We used the following algorithm to process these repeated or double counts:

* If the cover_pct values are equal, simply delete one of the rows
* If the cover_pct values are not equal, delete one of the rows and change cover_pct for the remaining one to NA

In [None]:
df_code %>% glimpse()

Rows: 9,150
Columns: 9
$ plot_code         [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "…
$ plot_loc          [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_rep          [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ grid_point        [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
$ date              [3m[90m<date>[39m[23m 2020-05-31, 2020-05-31, 2020-05-31, 2020-05-31, 20…
$ subplot           [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ key_plant_species [3m[90m<int>[39m[23m 5, 20, 82, 90, 113, 153, 187, 202, 233, 266, 286, 3…
$ key_plant_code    [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", …
$ cover_pct         [3m[90m<int>[39m[23m 10, 2, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2,…


In [None]:
typeof(as.integer(year(df_code$date)))

In [None]:
# Find instances where a plant species is counted twice in the same year-plot-subplot combination
dbl_counts <- df_code %>%
  group_by(year = as.integer(year(df_code$date)), plot_code, subplot, key_plant_code) %>%
  summarize(counted = n()) %>% 
  ungroup() %>%
  arrange(year, plot_code, subplot, desc(counted)) %>%
  filter(counted > 1) %>%
  print(n=Inf)

`summarise()` regrouping output by 'year', 'plot_code', 'subplot' (override with `.groups` argument)



[90m# A tibble: 12 x 5[39m
    year plot_code subplot key_plant_code counted
   [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m       [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m 1[39m  [4m2[24m020 YVP 144        10 COLLIN               2
[90m 2[39m  [4m2[24m020 YVP 181         9 ARANUT               2
[90m 3[39m  [4m2[24m020 YVP 184         1 PLEMAC               2
[90m 4[39m  [4m2[24m020 YVP 185         1 PRUVIR               2
[90m 5[39m  [4m2[24m020 YVP 22         10 VERVER               2
[90m 6[39m  [4m2[24m020 YVP 479         1 RANREP               2
[90m 7[39m  [4m2[24m020 YVP 62          4 VERVER               2
[90m 8[39m  [4m2[24m020 YVP N324        7 EUPESU               2
[90m 9[39m  [4m2[24m020 YVP N348       10 CYNOFF               2
[90m10[39m  [4m2[24m020 YVP N501        6 EPIBRA               2
[90m11[39m  [4m2[24m020 YVP N7          1 ALYALY               2
[90m12[39m  [4

In [None]:
view_doubles  <- dbl_counts %>%
  left_join(df_code %>% mutate(year = as.integer(year(df_code$date))))

Joining, by = c("year", "plot_code", "subplot", "key_plant_code")



In [None]:
str(view_doubles)

tibble [24 × 11] (S3: tbl_df/tbl/data.frame)
 $ year             : int [1:24] 2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
 $ plot_code        : chr [1:24] "YVP 144" "YVP 144" "YVP 181" "YVP 181" ...
 $ subplot          : int [1:24] 10 10 9 9 1 1 1 1 10 10 ...
 $ key_plant_code   : chr [1:24] "COLLIN" "COLLIN" "ARANUT" "ARANUT" ...
 $ counted          : int [1:24] 2 2 2 2 2 2 2 2 2 2 ...
 $ plot_loc         : chr [1:24] NA NA NA NA ...
 $ plot_rep         : chr [1:24] NA NA NA NA ...
 $ grid_point       : int [1:24] 144 144 181 181 184 184 185 185 22 22 ...
 $ date             : Date[1:24], format: "2020-05-27" "2020-05-27" ...
 $ key_plant_species: int [1:24] 153 153 46 46 404 404 433 433 562 562 ...
 $ cover_pct        : int [1:24] 1 1 0 1 2 0 1 0 0 0 ...


In [None]:
view_doubles %>%
  distinct(date, plot_code, key_plant_code, subplot) %>%
  arrange(date, plot_code, key_plant_code)

plot_code,subplot,key_plant_code,date
<chr>,<int>,<chr>,<date>
YVP N324,7,EUPESU,2020-05-09
YVP NB294,8,LUPSER,2020-05-09
YVP 184,1,PLEMAC,2020-05-16
YVP 185,1,PRUVIR,2020-05-16
YVP 144,10,COLLIN,2020-05-27
YVP 181,9,ARANUT,2020-05-31
YVP 62,4,VERVER,2020-05-31
YVP N7,1,ALYALY,2020-05-31
YVP N501,6,EPIBRA,2020-06-17
YVP 22,10,VERVER,2020-06-27


In [None]:
view_doubles %>%
  distinct(date, plot_code, subplot, plot_loc) %>%
  arrange(date, plot_code)

plot_code,subplot,plot_loc,date
<chr>,<int>,<chr>,<date>
YVP N324,7,N,2020-05-09
YVP NB294,8,N,2020-05-09
YVP 184,1,,2020-05-16
YVP 185,1,,2020-05-16
YVP 144,10,,2020-05-27
YVP 181,9,,2020-05-31
YVP 62,4,,2020-05-31
YVP N7,1,N,2020-05-31
YVP N501,6,N,2020-06-17
YVP 22,10,,2020-06-27


### Resolve double counts

* If the 'cover_pct' values are equal, delete one of the rows
* If the 'cover_pct' values are not equal, delete one of the rows and change cover_pct for the remaining one to NA

In [None]:
str(view_doubles)

tibble [24 × 11] (S3: tbl_df/tbl/data.frame)
 $ year             : int [1:24] 2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
 $ plot_code        : chr [1:24] "YVP 144" "YVP 144" "YVP 181" "YVP 181" ...
 $ subplot          : int [1:24] 10 10 9 9 1 1 1 1 10 10 ...
 $ key_plant_code   : chr [1:24] "COLLIN" "COLLIN" "ARANUT" "ARANUT" ...
 $ counted          : int [1:24] 2 2 2 2 2 2 2 2 2 2 ...
 $ plot_loc         : chr [1:24] NA NA NA NA ...
 $ plot_rep         : chr [1:24] NA NA NA NA ...
 $ grid_point       : int [1:24] 144 144 181 181 184 184 185 185 22 22 ...
 $ date             : Date[1:24], format: "2020-05-27" "2020-05-27" ...
 $ key_plant_species: int [1:24] 153 153 46 46 404 404 433 433 562 562 ...
 $ cover_pct        : int [1:24] 1 1 0 1 2 0 1 0 0 0 ...


In [None]:
distinct_doubles <- view_doubles %>%
  distinct(date, plot_code, subplot, key_plant_code) %>%
  arrange(date, plot_code)

In [None]:
str(distinct_doubles)

tibble [12 × 4] (S3: tbl_df/tbl/data.frame)
 $ plot_code     : chr [1:12] "YVP N324" "YVP NB294" "YVP 184" "YVP 185" ...
 $ subplot       : int [1:12] 7 8 1 1 10 9 4 1 6 10 ...
 $ key_plant_code: chr [1:12] "EUPESU" "LUPSER" "PLEMAC" "PRUVIR" ...
 $ date          : Date[1:12], format: "2020-05-09" "2020-05-09" ...


In [None]:
nrow(distinct_doubles)

In [None]:
for (row in 1:nrow(distinct_doubles)) {
  dbl_ref <- distinct_doubles[row, ]
  
  # date, plot_code, species_code, subplot
  selected_rows <- filter(df_code, date == dbl_ref$date &
                        plot_code == dbl_ref$plot_code &
                        key_plant_code == dbl_ref$key_plant_code &
                        subplot == dbl_ref$subplot)
                        
  # identify indicies of duplicate observationos in original dataframe
  selected_indices <- which(df_code$date == dbl_ref$date &
                        df_code$plot_code == dbl_ref$plot_code &
                        df_code$key_plant_code == dbl_ref$key_plant_code &
                        df_code$subplot == dbl_ref$subplot)

  # Display for Review
  print(selected_rows)

  # compare "cover_pct" observations for equality
  if(var(selected_rows$cover_pct) == 0) {
    print("EQUAL")
    # drop duplicate observation
    df_code <- df_code[-c(last(selected_indices)), ]    
  } else if (var(selected_rows$cover_pct) != 0) {
    print("NOT EQUAL")
    # set first row "cover_pct" to NA
    df_code[c(first(selected_indices)), ]$cover_pct = NA

    # drop duplicate observation
    df_post_drop <- df_code[-c(last(selected_indices)), ]
  }
}

  plot_code plot_loc plot_rep grid_point       date subplot key_plant_species
1  YVP N324        N     <NA>        324 2020-05-09       7               230
2  YVP N324        N     <NA>        324 2020-05-09       7               230
  key_plant_code cover_pct
1         EUPESU         0
2         EUPESU         1
[1] "NOT EQUAL"
  plot_code plot_loc plot_rep grid_point       date subplot key_plant_species
1 YVP NB294        N        B        294 2020-05-09       8               320
2 YVP NB294        N        B        294 2020-05-09       8               320
  key_plant_code cover_pct
1         LUPSER         3
2         LUPSER         1
[1] "NOT EQUAL"
  plot_code plot_loc plot_rep grid_point       date subplot key_plant_species
1   YVP 184     <NA>     <NA>        184 2020-05-16       1               404
2   YVP 184     <NA>     <NA>        184 2020-05-16       1               404
  key_plant_code cover_pct
1         PLEMAC         2
2         PLEMAC         0
[1] "NOT EQUAL"
  plot_

In [None]:
str(df_post_drop)

'data.frame':	9146 obs. of  9 variables:
 $ plot_code        : chr  "YVP N7" "YVP N7" "YVP N7" "YVP N7" ...
 $ plot_loc         : chr  "N" "N" "N" "N" ...
 $ plot_rep         : chr  NA NA NA NA ...
 $ grid_point       : int  7 7 7 7 7 7 7 7 7 7 ...
 $ date             : Date, format: "2020-05-31" "2020-05-31" ...
 $ subplot          : int  1 1 1 1 1 1 1 1 1 1 ...
 $ key_plant_species: int  5 20 82 90 113 153 187 202 233 266 ...
 $ key_plant_code   : chr  "ACHMIL" "ALYALY" "BROTEC" "CAMMIC" ...
 $ cover_pct        : int  10 NA 10 1 1 1 1 1 5 3 ...


In [None]:
# rescan for double observations
# Find instances where a plant species is counted twice in the same year-plot-subplot combination
dbl_recount <- df_post_drop %>%
  group_by(year = as.integer(year(df_post_drop$date)), plot_code, subplot, key_plant_code) %>%
  summarize(counted = n()) %>% 
  ungroup() %>%
  arrange(year, plot_code, subplot, desc(counted)) %>%
  filter(counted > 1) %>%
  print(n=Inf)

`summarise()` regrouping output by 'year', 'plot_code', 'subplot' (override with `.groups` argument)



[90m# A tibble: 8 x 5[39m
   year plot_code subplot key_plant_code counted
  [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m       [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m1[39m  [4m2[24m020 YVP 181         9 ARANUT               2
[90m2[39m  [4m2[24m020 YVP 184         1 PLEMAC               2
[90m3[39m  [4m2[24m020 YVP 185         1 PRUVIR               2
[90m4[39m  [4m2[24m020 YVP 479         1 RANREP               2
[90m5[39m  [4m2[24m020 YVP 62          4 VERVER               2
[90m6[39m  [4m2[24m020 YVP N324        7 EUPESU               2
[90m7[39m  [4m2[24m020 YVP N7          1 ALYALY               2
[90m8[39m  [4m2[24m020 YVP NB294       8 LUPSER               2


In [None]:
# display previously duplicated plots for review
for (row in 1:nrow(distinct_doubles)) {
  dbl_ref <- distinct_doubles[row, ]

  # date, plot_code, species_code, subplot
  selected_rows <- filter(df_code, date == dbl_ref$date &
                        plot_code == dbl_ref$plot_code &
                        key_plant_code == dbl_ref$key_plant_code &
                        subplot == dbl_ref$subplot)
  print(selected_rows[,c(1,5,6,7,8)])
}

  plot_code       date subplot key_plant_species key_plant_code
1  YVP N324 2020-05-09       7               230         EUPESU
2  YVP N324 2020-05-09       7               230         EUPESU
  plot_code       date subplot key_plant_species key_plant_code
1 YVP NB294 2020-05-09       8               320         LUPSER
2 YVP NB294 2020-05-09       8               320         LUPSER
  plot_code       date subplot key_plant_species key_plant_code
1   YVP 184 2020-05-16       1               404         PLEMAC
2   YVP 184 2020-05-16       1               404         PLEMAC
  plot_code       date subplot key_plant_species key_plant_code
1   YVP 185 2020-05-16       1               433         PRUVIR
2   YVP 185 2020-05-16       1               433         PRUVIR
  plot_code       date subplot key_plant_species key_plant_code
1   YVP 144 2020-05-27      10               153         COLLIN
  plot_code       date subplot key_plant_species key_plant_code
1   YVP 181 2020-05-31       9          

## Correct errors in species codes
The species codes used in the source data contain numerous errors, and they also in some cases represent old taxonomy where species names have been revised. This can cause all sorts of problems, like artificially creating new species or making it impossible to join with available species metadata. Several steps must be accomplished here:

1. Trim leading or trailing spaces from the code (this was done in excel before source CSV files were created)
2. Read in master list of species metadata and query YVP species codes to identify which ones don't align
3. Align the species codes, identify the ones that are wrong and correct them
4. Import the numeric key from the species metadata so that future aligments are easier and errors are less common

### Read in master list of species metadata and codes


In [None]:
# 2020-04-27_MPGR_plant_species_list
spp = gsheet2tbl("https://docs.google.com/spreadsheets/d/1wPen7yeimXtY4qK5Nj4JPvlgHYamoogR0YJekaF7i9Y") %>% 
as_tibble() %>% glimpse()

Rows: 754
Columns: 9
$ key_PlantSpecies [3m[90m<dbl>[39m[23m 1, 2, 3, 743, 5, 661, 8, 694, 9, 10, 11, 744, 677, 1…
$ key_PlantCode    [3m[90m<chr>[39m[23m "ABIGRA", "ABILAS", "ACEGLA", "ACEPLA", "ACHMIL", "A…
$ NameScientific   [3m[90m<chr>[39m[23m "Abies grandis", "Abies lasiocarpa", "Acer glabrum",…
$ NameSynonym      [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ NameCommon       [3m[90m<chr>[39m[23m "grand fir", "subalpine fir", "Rocky Mountain maple"…
$ NameFamily       [3m[90m<chr>[39m[23m "Pinaceae", "Pinaceae", "Aceraceae", "Aceraceae", "A…
$ NativeStatus     [3m[90m<chr>[39m[23m "native", "native", "native", "nonnative", "native",…
$ LifeCycle        [3m[90m<chr>[39m[23m "perennial", "perennial", "perennial", "perennial", …
$ LifeForm         [3m[90m<chr>[39m[23m "tree", "tree", "shrub", "tree", "forb", "forb", "fo…


### Align species codes and identify mistakes


In [None]:
# Align the species codes 
# Produce df of codes that don't match the master list
collisions_species_codes = 
df_code %>% 
anti_join(spp, by = c("key_plant_code" = "key_PlantCode")) %>% 
group_by(key_plant_code) %>% 
distinct(key_plant_code) %>% 
arrange(key_plant_code) %>% 
print(n = Inf)

[90m# A tibble: 2 x 1[39m
[90m# Groups:   key_plant_code [2][39m
  key_plant_code
  [3m[90m<chr>[39m[23m         
[90m1[39m ARAB_SP       
[90m2[39m FESC_SP       


### Create file that associates errors with corrections

In [None]:
# Produce file `collisions_species_codes` for work in spreadsheet outside of this environment
# The file will save to the `content` folder in the drive tree
# BL downloaded the file to his desktop to produce a new naming key file
filename = "collisions_species_codes.csv"
if (filename %in% list.files(getwd())) {
  cat("file already exists in working directory: ", filename, "\n", "working directory: ", getwd(), "\n")
} else {
  write.csv(collisions_species_codes, filename)
  cat(filename, " written to working directory \n", "working directory: ", getwd(), "\n")
}


collisions_species_codes.csv  written to working directory 
 working directory:  /content 


In [None]:
# Import csv file with the updated codes 
# This file was produced by visually aligning the codes with a file that Rebecca Durham provided
code_corrections <- read.csv(file = "https://drive.google.com/uc?id=1D0j3U4Or2PviFS02F3rxTRXr1SpGOB0a",
  colClasses = c("character", "character")) %>% 
glimpse()

Rows: 60
Columns: 2
$ plantcode_incorrect [3m[90m<chr>[39m[23m "AGOS SP", "ALOP SP", "ANDOCCUAL", "ARNCOR?", "AR…
$ plantcode_corrected [3m[90m<chr>[39m[23m "AGOS_SP", "ALOP_SP", "ANDOCC", "ARNCOR", "ARTDRA…


### Cascade changes through dataset


In [None]:
# Create new df to hold corrected information
# Change species_code to character variable to avoid problems with levels later
yvp_veg_cover_correct = df_code %>% mutate(key_plant_code = as.character(key_plant_code)) %>% glimpse()

Rows: 9,147
Columns: 9
$ plot_code         [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "…
$ plot_loc          [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_rep          [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ grid_point        [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
$ date              [3m[90m<date>[39m[23m 2020-05-31, 2020-05-31, 2020-05-31, 2020-05-31, 20…
$ subplot           [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ key_plant_species [3m[90m<int>[39m[23m 5, 20, 82, 90, 113, 153, 187, 202, 233, 266, 286, 3…
$ key_plant_code    [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", …
$ cover_pct         [3m[90m<int>[39m[23m 10, NA, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2…


In [None]:
# Loop operation used to update each instance of an incorrect code
# Embed logic control to prevent errors if this loop is run on a df with corrected codes
# Variable to track loop cycles
cycles = 0

for (i in 1:length(code_corrections[, 1])) {
  index = which(yvp_veg_cover_correct$species_code == code_corrections$plantcode_incorrect[i])

  if (length(index != 0)) {
    cat("number of incorrect code entries: ", length(index), "\n")
    cat("incorrect code: ", code_corrections$plantcode_incorrect[i], "\n")
    yvp_veg_cover_correct[index, ]$species_code = code_corrections$plantcode_corrected[i]
    print(yvp_veg_cover_correct[index, c(1,5,6,7,8)])
    cycles = cycles + length(index)
    cat("\n")
  } else {
    cat("no incorrect code entries were found \n")
  }

  cat("number of corrections made (cumulative): ", cycles, "\n\n\n")

}

no incorrect code entries were found 
number of corrections made (cumulative):  0 


no incorrect code entries were found 
number of corrections made (cumulative):  0 


no incorrect code entries were found 
number of corrections made (cumulative):  0 


no incorrect code entries were found 
number of corrections made (cumulative):  0 


no incorrect code entries were found 
number of corrections made (cumulative):  0 


no incorrect code entries were found 
number of corrections made (cumulative):  0 


no incorrect code entries were found 
number of corrections made (cumulative):  0 


no incorrect code entries were found 
number of corrections made (cumulative):  0 


no incorrect code entries were found 
number of corrections made (cumulative):  0 


no incorrect code entries were found 
number of corrections made (cumulative):  0 


no incorrect code entries were found 
number of corrections made (cumulative):  0 


no incorrect code entries were found 
number of corrections made 

In [None]:
# Rescan for incorrect species codes
yvp_veg_cover_correct %>% 
anti_join(spp, by = c("key_plant_code" = "key_PlantCode")) %>% 
group_by(key_plant_code) %>% distinct(key_plant_code) %>% arrange(key_plant_code)

key_plant_code
<chr>


In [None]:
# Update key_plant_codes based on comments 2020-11-03
# ARAB_SP = UNK_FORB
# FESC_SP = UNK_GRAM
yvp_veg_cover_correct <- yvp_veg_cover_correct %>%
  mutate(key_plant_code = ifelse(str_detect(key_plant_code, "ARAB_SP"), "UNK_FORB", key_plant_code)) %>%
  mutate(key_plant_code = ifelse(str_detect(key_plant_code, "FESC_SP"), "UNK_GRAM", key_plant_code)) %>% glimpse()

Rows: 9,147
Columns: 9
$ plot_code         [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "…
$ plot_loc          [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_rep          [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ grid_point        [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
$ date              [3m[90m<date>[39m[23m 2020-05-31, 2020-05-31, 2020-05-31, 2020-05-31, 20…
$ subplot           [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ key_plant_species [3m[90m<int>[39m[23m 5, 20, 82, 90, 113, 153, 187, 202, 233, 266, 286, 3…
$ key_plant_code    [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", …
$ cover_pct         [3m[90m<int>[39m[23m 10, NA, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2…


In [None]:
# Rescan for incorrect species codes
yvp_veg_cover_correct %>% 
anti_join(spp, by = c("key_plant_code" = "key_PlantCode")) %>% 
group_by(key_plant_code) %>% distinct(key_plant_code) %>% arrange(key_plant_code)

key_plant_code
<chr>


In [None]:
spp %>% glimpse()

Rows: 754
Columns: 9
$ key_PlantSpecies [3m[90m<dbl>[39m[23m 1, 2, 3, 743, 5, 661, 8, 694, 9, 10, 11, 744, 677, 1…
$ key_PlantCode    [3m[90m<chr>[39m[23m "ABIGRA", "ABILAS", "ACEGLA", "ACEPLA", "ACHMIL", "A…
$ NameScientific   [3m[90m<chr>[39m[23m "Abies grandis", "Abies lasiocarpa", "Acer glabrum",…
$ NameSynonym      [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ NameCommon       [3m[90m<chr>[39m[23m "grand fir", "subalpine fir", "Rocky Mountain maple"…
$ NameFamily       [3m[90m<chr>[39m[23m "Pinaceae", "Pinaceae", "Aceraceae", "Aceraceae", "A…
$ NativeStatus     [3m[90m<chr>[39m[23m "native", "native", "native", "nonnative", "native",…
$ LifeCycle        [3m[90m<chr>[39m[23m "perennial", "perennial", "perennial", "perennial", …
$ LifeForm         [3m[90m<chr>[39m[23m "tree", "tree", "shrub", "tree", "forb", "forb", "fo…


In [None]:
# Incorporate serial key for species codes
yvp_vegetation_cover_FINAL = 
yvp_veg_cover_correct %>% 
  left_join(spp %>% select(key_PlantSpecies, key_PlantCode), by = c("key_plant_code" = "key_PlantCode")) %>% 
  # select(c(1,2,3,4,5,6,9,7,8)) %>% 
  select(plot_code, plot_loc, plot_rep, grid_point, date, subplot, key_PlantSpecies,
         key_plant_code, cover_pct) %>%
  rename(key_plant_species = key_PlantSpecies) %>% 
  glimpse()

Rows: 9,147
Columns: 9
$ plot_code         [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "…
$ plot_loc          [3m[90m<chr>[39m[23m "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "…
$ plot_rep          [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ grid_point        [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
$ date              [3m[90m<date>[39m[23m 2020-05-31, 2020-05-31, 2020-05-31, 2020-05-31, 20…
$ subplot           [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ key_plant_species [3m[90m<dbl>[39m[23m 5, 20, 82, 90, 113, 153, 187, 202, 233, 266, 286, 3…
$ key_plant_code    [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", …
$ cover_pct         [3m[90m<int>[39m[23m 10, NA, 10, 1, 1, 1, 1, 1, 5, 3, 15, 0, 1, 10, 4, 2…


In [None]:
summary(yvp_vegetation_cover_FINAL)

  plot_code           plot_loc           plot_rep           grid_point   
 Length:9147        Length:9147        Length:9147        Min.   :  7.0  
 Class :character   Class :character   Class :character   1st Qu.: 64.0  
 Mode  :character   Mode  :character   Mode  :character   Median :209.0  
                                                          Mean   :250.1  
                                                          3rd Qu.:395.0  
                                                          Max.   :571.0  
                                                                         
      date               subplot       key_plant_species key_plant_code    
 Min.   :2020-05-09   Min.   : 1.000   Min.   :  3.0     Length:9147       
 1st Qu.:2020-05-22   1st Qu.: 3.000   1st Qu.:153.0     Class :character  
 Median :2020-06-05   Median : 6.000   Median :286.0     Mode  :character  
 Mean   :2020-06-03   Mean   : 5.517   Mean   :280.8                       
 3rd Qu.:2020-06-17   3rd Qu

In [None]:
# show cover_pct NA's
yvp_vegetation_cover_FINAL %>%
  filter(is.na(cover_pct))

plot_code,plot_loc,plot_rep,grid_point,date,subplot,key_plant_species,key_plant_code,cover_pct
<chr>,<chr>,<chr>,<int>,<date>,<int>,<dbl>,<chr>,<int>
YVP N7,N,,7,2020-05-31,1,20,ALYALY,
YVP 62,,,62,2020-05-31,4,562,VERVER,
YVP 181,,,181,2020-05-31,9,46,ARANUT,
YVP 184,,,184,2020-05-16,1,404,PLEMAC,
YVP 185,,,185,2020-05-16,1,433,PRUVIR,
YVP NB294,N,B,294,2020-05-09,8,320,LUPSER,
YVP N324,N,,324,2020-05-09,7,230,EUPESU,
YVP N348,N,,348,2020-07-01,10,167,CYNOFF,
YVP 479,,,479,2020-06-28,1,446,RANREP,


# Output

## Export Wrangled DataFrame to CSV 
Export the full data set so that we can push it to the BQ database




In [None]:
# Output 2020-11-03 ES
filename_final = "yvp_vegetation_cover_WRANGLE-2020.csv"

if (filename_final %in% list.files(getwd())) {
  cat("file already exists in working directory:", filename_final, "\n", "working directory:", getwd(), "\n")
} else {
  write.csv(yvp_vegetation_cover_FINAL, filename_final)
  cat(filename_final, "written to working directory \n", "working directory:", getwd(), "\n")
}

yvp_vegetation_cover_WRANGLE-2020.csv written to working directory 
 working directory: /content 


## Push to BigQuery

"yvp_vegetation_cover_FINAL.csv" uploaded manually to BigQuery

## Export field datasheet version
Field datasheets need to have a complete, cumulative species list for each plot recorded in a table, with the cover_pct column set to 0. This allows field techs to change the 0 to some number if the species is found. The date column is blank so that field techs can fill in the appropriate date. Do not include columns that are needed for data analysis, like plot_loc, plot_rep, plot_num, and species_key. 

**Schema for field data sheet**

* plot_num (helps for sorting and finding plots)
* plot_code
* date
* species_code
* cover_pct

In [None]:
field_datasheet = 
yvp_vegetation_cover_FINAL %>% 
select(plot_num, plot_code, subplot, species_code) %>% 
group_by(plot_num, plot_code, subplot) %>% 
distinct(species_code) %>% 
select(-species_code, species_code) %>% 
add_column(date = NA, .after = "plot_code") %>% 
add_column(cover_pct = 0) %>% 
arrange(plot_num, plot_code, subplot, species_code) %>% 
glimpse()

Rows: 8,861
Columns: 6
Groups: plot_num, plot_code, subplot [580]
$ plot_num     [3m[90m<int>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
$ plot_code    [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N…
$ date         [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ subplot      [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,…
$ species_code [3m[90m<chr>[39m[23m "ACHMIL", "ALYALY", "BROTEC", "CAMMIC", "CARE_SP", "COLL…
$ cover_pct    [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…


In [None]:
filename_field_datasheet = "yvp_vegetation_cover_field_datasheet_FINAL.csv"

if (filename_field_datasheet %in% list.files(getwd())) {
  cat("file already exists in working directory:", filename_final, "\n", "working directory:", getwd(), "\n")
} else {
  write.csv(field_datasheet, filename_field_datasheet)
  cat(filename_final, "written to working directory \n", "working directory:", getwd(), "\n")
}

yvp_vegetation_cover_FINAL.csv written to working directory 
 working directory: /content 
