<a href="https://colab.research.google.com/github/samsoe/mpg_notebooks/blob/master/YVP_Ground_Cover_Data_Wrangle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*R Notebook*

# README

* Readme fixed plot vegetation data - [Ground Cover Data](https://docs.google.com/document/d/16-Aq8u9Rudd78fSzfjvpCXyQgE-BstC-d2PjYfmLtcw/edit#heading=h.a2v9q2quarai)

# Load Tools

In [1]:
library(tidyverse)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



# Source

In [None]:
# '2020-04-28_yvp_ground_cover.csv'
src = 'https://drive.google.com/uc?id=1JXlFaVP1UxuKmnHju8r6T-8ssrPYO9dV'

In [None]:
df <- read.csv(file = src)

In [None]:
head(df, n=2)

Unnamed: 0_level_0,plot_code,date,subplot,groundcover_type,groundcover_pct
Unnamed: 0_level_1,<fct>,<fct>,<int>,<fct>,<int>
1,YVP 10,2017-06-09,1,BG,1
2,YVP 10,2017-06-09,1,BV,80


# Wrangle

## Structure Columns

### plot_code

In [None]:
# convert to string
df$plot_code <- as.character(df$plot_code)

### plot_loc

In [None]:
# detect "N" in 'plot_code' and write to new column 'plot_loc'
df <- df %>%
  mutate(plot_loc = ifelse(str_detect(plot_code, "N"), "N", NA))

In [None]:
# reorder columns
df <- df[, c(1,6,2,3,4,5)]

### plot_rep

In [None]:
# detect "A", "B", "C" characters in plot_code and if present write to 'plot_rep'
df <- df %>%
  mutate(plot_rep = case_when(str_detect(plot_code, "A")~"A",
                              str_detect(plot_code, "B")~"B",
                              str_detect(plot_code, "C")~"C"))

In [None]:
# reorder columns
df <- df[,c(1,2,7,3,4,5,6)]

### plot_num

In [None]:
# use digital values from 'plot_code' and to populate 'plot_num'
df <- df %>%
  mutate(plot_num = str_extract(plot_code, "[:digit:].*"),
         plot_num = as.integer(plot_num))

In [None]:
# reorder columns
df <- df[,c(1,2,3,8,4,5,6,7)]

### date

In [None]:
# convert to date
df$date <- as.Date(df$date)

### subplot

In [None]:
typeof(df$subplot)

### groundcover_type

In [None]:
# convert to string
df$groundcover_type <- as.character(df$groundcover_type)

### groundcover_pct

In [None]:
typeof(df$groundcover_pct)

In [None]:
str(df)

'data.frame':	13917 obs. of  8 variables:
 $ plot_code       : chr  "YVP 10" "YVP 10" "YVP 10" "YVP 10" ...
 $ plot_loc        : chr  NA NA NA NA ...
 $ plot_rep        : chr  NA NA NA NA ...
 $ plot_num        : int  10 10 10 10 10 10 10 10 10 10 ...
 $ date            : Date, format: "2017-06-09" "2017-06-09" ...
 $ subplot         : int  1 1 1 1 1 1 1 1 2 2 ...
 $ groundcover_type: chr  "BG" "BV" "L" "WD" ...
 $ groundcover_pct : int  1 80 10 0 5 4 1 0 2 70 ...


## Review 'groundcover_type' levels

Make sure that all levels of `groundcover_type` are present in each subplot of the data sheet (this is important for data collection)

In [None]:
head(df)

Unnamed: 0_level_0,plot_code,plot_loc,plot_rep,plot_num,date,subplot,groundcover_type,groundcover_pct
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<date>,<int>,<chr>,<int>
1,YVP 10,,,10,2017-06-09,1,BG,1
2,YVP 10,,,10,2017-06-09,1,BV,80
3,YVP 10,,,10,2017-06-09,1,L,10
4,YVP 10,,,10,2017-06-09,1,WD,0
5,YVP 10,,,10,2017-06-09,1,M,5
6,YVP 10,,,10,2017-06-09,1,LIC,4


In [None]:
# display distinct 'groundcover_type' for reference
df %>%
  distinct(groundcover_type) %>%
  arrange(groundcover_type)

groundcover_type
<chr>
BG
BV
G
L
LIC
M
R
WD


In [None]:
groundcover_counts <- df %>%
  group_by(year = as.numeric(substring(date,0,4)), plot_code, subplot) %>%
  arrange(year, plot_code, subplot, groundcover_type) %>%
  summarize(groundcover_type_counted = n())

In [None]:
head(groundcover_counts)

year,plot_code,subplot,groundcover_type_counted
<dbl>,<chr>,<int>,<int>
2017,YVP 10,1,8
2017,YVP 10,2,8
2017,YVP 10,3,8
2017,YVP 10,4,8
2017,YVP 10,5,8
2017,YVP 10,6,8


In [None]:
unique(groundcover_counts$groundcover_type_counted)

In [None]:
groundcover_counts %>%
  filter(groundcover_type_counted != 8) %>%
  arrange(groundcover_type_counted, year)

year,plot_code,subplot,groundcover_type_counted
<dbl>,<chr>,<int>,<int>
2017,YVP 355,1,7
2017,YVP 481,1,7
2018,YVP N278,5,7
2018,YVP N57,8,7
2019,YVP 481,1,7
2018,YVP N278,6,9
2018,YVP N57,2,9


# Remove Duplicate Rows

### Remove Row

In [None]:
# example of one extra 'groundcover_type'
df %>%
  filter(as.numeric(substring(date,0,4)) == 2018,
          plot_code == 'YVP N278',
          subplot == 6) %>%
  arrange(groundcover_type)

plot_code,plot_loc,plot_rep,plot_num,date,subplot,groundcover_type,groundcover_pct
<chr>,<chr>,<chr>,<int>,<date>,<int>,<chr>,<int>
YVP N278,N,,278,2018-05-28,6,BG,1
YVP N278,N,,278,2018-05-28,6,BV,20
YVP N278,N,,278,2018-05-28,6,G,4
YVP N278,N,,278,2018-05-28,6,L,25
YVP N278,N,,278,2018-05-28,6,LIC,10
YVP N278,N,,278,2018-05-28,6,M,40
YVP N278,N,,278,2018-05-28,6,R,0
YVP N278,N,,278,2018-05-28,6,WD,0


In [None]:
# remove row YVP N278	N	NA	278	2018-05-28	6	M	15
df <- df[!((as.numeric(substring(df$date,0,4)) == 2018 &
          df$plot_code == 'YVP N278' &
          df$subplot == 6 &
          df$groundcover_type == 'M' &
          df$groundcover_pct == 15)), ]

### Remove Row

In [None]:
# example of one extra 'groundcover_type'
# 2018	YVP N57	2
df %>%
  filter(as.numeric(substring(date,0,4)) == 2018,
          plot_code == 'YVP N57',
          subplot == 2) %>%
  arrange(groundcover_type)

plot_code,plot_loc,plot_rep,plot_num,date,subplot,groundcover_type,groundcover_pct
<chr>,<chr>,<chr>,<int>,<date>,<int>,<chr>,<int>
YVP N57,N,,57,2018-06-08,2,BG,3
YVP N57,N,,57,2018-06-08,2,BV,10
YVP N57,N,,57,2018-06-08,2,G,0
YVP N57,N,,57,2018-06-08,2,L,40
YVP N57,N,,57,2018-06-08,2,LIC,3
YVP N57,N,,57,2018-06-08,2,M,35
YVP N57,N,,57,2018-06-08,2,R,0
YVP N57,N,,57,2018-06-08,2,WD,10


In [None]:
# remove row YVP N57	N	NA	57	2018-06-08	2	M	5
df <- df[!((as.numeric(substring(df$date,0,4)) == 2018 &
     df$plot_code == 'YVP N57' &
     df$subplot == 2 &
     df$groundcover_type == 'M' &
     df$groundcover_pct == 5)), ]

In [None]:
str(df)

'data.frame':	13915 obs. of  8 variables:
 $ plot_code       : chr  "YVP 10" "YVP 10" "YVP 10" "YVP 10" ...
 $ plot_loc        : chr  NA NA NA NA ...
 $ plot_rep        : chr  NA NA NA NA ...
 $ plot_num        : int  10 10 10 10 10 10 10 10 10 10 ...
 $ date            : Date, format: "2017-06-09" "2017-06-09" ...
 $ subplot         : int  1 1 1 1 1 1 1 1 2 2 ...
 $ groundcover_type: chr  "BG" "BV" "L" "WD" ...
 $ groundcover_pct : int  1 80 10 0 5 4 1 0 2 70 ...


# Add Rows

Set missing 'groundcover_type' to NA

In [None]:
groundcover_counts %>%
  filter(groundcover_type_counted != 8) %>%
  arrange(groundcover_type_counted, year)

year,plot_code,subplot,groundcover_type_counted
<dbl>,<chr>,<int>,<int>
2017,YVP 355,1,7
2017,YVP 481,1,7
2018,YVP N278,5,7
2018,YVP N57,8,7
2019,YVP 481,1,7
2018,YVP N278,6,9
2018,YVP N57,2,9


5 "plot_code" "subplot" groups missing "groundcover_type" entries

In [None]:
# types we are looking for
# display distinct 'groundcover_type' for reference
df %>%
  distinct(groundcover_type) %>%
  arrange(groundcover_type)

groundcover_type
<chr>
BG
BV
G
L
LIC
M
R
WD


### 2017 YVP 355 1

In [None]:
# display
df %>%
  filter(as.numeric(substring(date,0,4)) == 2017,
          plot_code == 'YVP 355',
          subplot == 1) %>%
  arrange(groundcover_type)

# for all groundcover_type_counted find missing category and create row with groundcover_pct = NA
# BG in this case

plot_code,plot_loc,plot_rep,plot_num,date,subplot,groundcover_type,groundcover_pct
<chr>,<chr>,<chr>,<dbl>,<date>,<dbl>,<chr>,<int>
YVP 355,,,355,2017-06-02,1,BG,
YVP 355,,,355,2017-06-02,1,BV,75.0
YVP 355,,,355,2017-06-02,1,G,1.0
YVP 355,,,355,2017-06-02,1,L,10.0
YVP 355,,,355,2017-06-02,1,LIC,10.0
YVP 355,,,355,2017-06-02,1,M,5.0
YVP 355,,,355,2017-06-02,1,R,0.0
YVP 355,,,355,2017-06-02,1,WD,0.0


In [None]:
# add row with BG 'groundcover_type', set to NA
df <- df %>%
  add_row(plot_code = "YVP 355",
          plot_loc = NA,
          plot_rep = NA,
          plot_num = 355,
          date = as.Date('2017-06-02'),
          subplot = 1,
          groundcover_type = "BG",
          groundcover_pct = NA)

### 2017	YVP 481	1

In [None]:
# display
df %>%
  filter(as.numeric(substring(date,0,4)) == 2017,
          plot_code == 'YVP 481',
          subplot == 1) %>%
  arrange(groundcover_type)

plot_code,plot_loc,plot_rep,plot_num,date,subplot,groundcover_type,groundcover_pct
<chr>,<chr>,<chr>,<dbl>,<date>,<dbl>,<chr>,<int>
YVP 481,,,481,2017-07-06,1,BG,
YVP 481,,,481,2017-07-06,1,BV,30.0
YVP 481,,,481,2017-07-06,1,G,0.0
YVP 481,,,481,2017-07-06,1,L,70.0
YVP 481,,,481,2017-07-06,1,LIC,0.0
YVP 481,,,481,2017-07-06,1,M,0.0
YVP 481,,,481,2017-07-06,1,R,0.0
YVP 481,,,481,2017-07-06,1,WD,0.0


In [None]:
# add row with BG 'groundcover_type', set to NA
df <- df %>%
  add_row(plot_code = "YVP 481",
          plot_loc = NA,
          plot_rep = NA,
          plot_num = 481,
          date = as.Date('2017-07-06'),
          subplot = 1,
          groundcover_type = "BG",
          groundcover_pct = NA)

### 2018	YVP N278	5

In [None]:
# display
df %>%
  filter(as.numeric(substring(date,0,4)) == 2018,
          plot_code == 'YVP N278',
          subplot == 5) %>%
  arrange(groundcover_type)

plot_code,plot_loc,plot_rep,plot_num,date,subplot,groundcover_type,groundcover_pct
<chr>,<chr>,<chr>,<dbl>,<date>,<dbl>,<chr>,<int>
YVP N278,N,,278,2018-05-28,5,BG,4.0
YVP N278,N,,278,2018-05-28,5,BV,15.0
YVP N278,N,,278,2018-05-28,5,G,5.0
YVP N278,N,,278,2018-05-28,5,L,45.0
YVP N278,N,,278,2018-05-28,5,LIC,15.0
YVP N278,N,,278,2018-05-28,5,M,
YVP N278,N,,278,2018-05-28,5,R,0.0
YVP N278,N,,278,2018-05-28,5,WD,0.0


In [None]:
# Missing "M" 'groundcover_type' add a row and set it to NA
df <- df %>%
  add_row(plot_code = "YVP N278",
          plot_loc = as.character('N'),
          plot_rep = NA,
          plot_num = 278,
          date = as.Date('2018-05-28'),
          subplot = 5,
          groundcover_type = "M",
          groundcover_pct = NA)

### 2018	YVP N57	8

In [None]:
# display
df %>%
  filter(as.numeric(substring(date,0,4)) == 2018,
          plot_code == 'YVP N57',
          subplot == 8) %>%
  arrange(groundcover_type)

plot_code,plot_loc,plot_rep,plot_num,date,subplot,groundcover_type,groundcover_pct
<chr>,<chr>,<chr>,<dbl>,<date>,<dbl>,<chr>,<int>
YVP N57,N,,57,2018-06-08,8,BG,30.0
YVP N57,N,,57,2018-06-08,8,BV,5.0
YVP N57,N,,57,2018-06-08,8,G,40.0
YVP N57,N,,57,2018-06-08,8,L,10.0
YVP N57,N,,57,2018-06-08,8,LIC,5.0
YVP N57,N,,57,2018-06-08,8,M,
YVP N57,N,,57,2018-06-08,8,R,5.0
YVP N57,N,,57,2018-06-08,8,WD,5.0


In [None]:
# Missing "M" 'groundcover_type' add a row and set it to NA
df <- df %>%
  add_row(plot_code = "YVP N57",
          plot_loc = as.character('N'),
          plot_rep = NA,
          plot_num = 57,
          date = as.Date('2018-06-08'),
          subplot = 8,
          groundcover_type = "M",
          groundcover_pct = NA)

### 2019	YVP 481	1

In [None]:
# display
df %>%
  filter(as.numeric(substring(date,0,4)) == 2019,
          plot_code == 'YVP 481',
          subplot == 1) %>%
  arrange(groundcover_type)

plot_code,plot_loc,plot_rep,plot_num,date,subplot,groundcover_type,groundcover_pct
<chr>,<chr>,<chr>,<dbl>,<date>,<dbl>,<chr>,<int>
YVP 481,,,481,2019-07-15,1,BG,
YVP 481,,,481,2019-07-15,1,BV,30.0
YVP 481,,,481,2019-07-15,1,G,0.0
YVP 481,,,481,2019-07-15,1,L,65.0
YVP 481,,,481,2019-07-15,1,LIC,0.0
YVP 481,,,481,2019-07-15,1,M,3.0
YVP 481,,,481,2019-07-15,1,R,0.0
YVP 481,,,481,2019-07-15,1,WD,3.0


In [None]:
# Missing "BG" 'groundcover_type' add a row and set it to NA
df <- df %>%
  add_row(plot_code = "YVP 481",
          plot_loc = NA,
          plot_rep = NA,
          plot_num = 481,
          date = as.Date('2019-07-15'),
          subplot = 1,
          groundcover_type = "BG",
          groundcover_pct = NA)

# Output

## All data for upload to BQ

In [None]:
summary(df)

  plot_code           plot_loc           plot_rep            plot_num    
 Length:13920       Length:13920       Length:13920       Min.   :  7.0  
 Class :character   Class :character   Class :character   1st Qu.:110.0  
 Mode  :character   Mode  :character   Mode  :character   Median :212.5  
                                                          Mean   :254.3  
                                                          3rd Qu.:395.0  
                                                          Max.   :571.0  
                                                                         
      date               subplot     groundcover_type   groundcover_pct
 Min.   :2017-05-08   Min.   : 1.0   Length:13920       Min.   : 0.00  
 1st Qu.:2017-06-06   1st Qu.: 3.0   Class :character   1st Qu.: 0.00  
 Median :2018-06-17   Median : 5.5   Mode  :character   Median : 5.00  
 Mean   :2018-06-08   Mean   : 5.5                      Mean   :12.65  
 3rd Qu.:2019-05-27   3rd Qu.: 8.0              

In [None]:
filename_final = "yvp_ground_cover_FINAL.csv"

if (filename_final %in% list.files(getwd())) {
  cat("file already exists in working directory:", filename_final, "\n", "working directory:", getwd(), "\n")
} else {
  write.csv(df, filename_final)
  cat(filename_final, "written to working directory \n", "working directory:", getwd(), "\n")
}

yvp_ground_cover_FINAL.csv written to working directory 
 working directory: /content 


## Export field datasheet version
Field datasheets need to have all levels of `groundcover_type` represented for each subplot, with `groundcover_pct` set to zero for each row. This allows field techs to change the 0 to some number depending on their observations. The date column is blank so that field techs can fill in the appropriate date. Do not include columns that are needed for data analysis, like plot_loc, plot_rep, plot_num, and species_key. 

**Schema for field data sheet**

* plot_num (helps for sorting and finding plots)
* plot_code
* date
* subplot
* groundcover_type
* groundcover_pct

In [None]:
field_datasheet = 
df %>% 
select(plot_num, plot_code, subplot, groundcover_type) %>% 
group_by(plot_num, plot_code, subplot) %>% 
distinct(groundcover_type) %>% 
select(-groundcover_type, groundcover_type) %>% 
add_column(date = NA, .after = "plot_code") %>% 
add_column(groundcover_pct = 0) %>% 
arrange(plot_num, plot_code, subplot, groundcover_type) %>% 
glimpse()

Rows: 4,640
Columns: 6
Groups: plot_num, plot_code, subplot [580]
$ plot_num         [3m[90m<dbl>[39m[23m 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7…
$ plot_code        [3m[90m<chr>[39m[23m "YVP N7", "YVP N7", "YVP N7", "YVP N7", "YVP N7", "Y…
$ date             [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ subplot          [3m[90m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3…
$ groundcover_type [3m[90m<chr>[39m[23m "BG", "BV", "G", "L", "LIC", "M", "R", "WD", "BG", "…
$ groundcover_pct  [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…


In [None]:
filename_field_datasheet = "yvp_ground_cover_field_datasheet_FINAL.csv"

if (filename_field_datasheet %in% list.files(getwd())) {
  cat("file already exists in working directory:", filename_final, "\n", "working directory:", getwd(), "\n")
} else {
  write.csv(field_datasheet, filename_field_datasheet)
  cat(filename_final, "written to working directory \n", "working directory:", getwd(), "\n")
}

yvp_ground_cover_FINAL.csv written to working directory 
 working directory: /content 
