<a href="https://colab.research.google.com/github/samsoe/mpg_notebooks/blob/master/YVP_Vegetation_Cover_Data_Wrangle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*R Notebook*

# README

* [Readme fixed grid plot vegetation data](https://docs.google.com/document/d/16-Aq8u9Rudd78fSzfjvpCXyQgE-BstC-d2PjYfmLtcw/edit?usp=sharing)

# Load Tools

In [1]:
library(tidyverse)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.0     [32m✔[39m [34mpurrr  [39m 0.3.3
[32m✔[39m [34mtibble [39m 3.0.0     [32m✔[39m [34mdplyr  [39m 0.8.5
[32m✔[39m [34mtidyr  [39m 1.0.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



# Source

In [0]:
# 2020-04-28_yvp_vegetation_cover
src = 'https://drive.google.com/uc?id=1pemnlKIlfAQw2JSMN7yDlYMG5QhUW-NP'

In [0]:
df <- read.csv(file = src)

In [4]:
head(df, n=2)

Unnamed: 0_level_0,plot_code,date,subplot,species_code,cover_pct
Unnamed: 0_level_1,<fct>,<fct>,<int>,<fct>,<int>
1,YVP 10,2017-06-09,1,BOESPP,1
2,YVP 10,2017-06-09,1,CREINT,1


# Wrangle

## Structure columns

### plot_code

In [0]:
# convert to string
df$plot_code <- as.character(df$plot_code)

### plot_ loc

In [0]:
# detect "N" in 'plot_code' and write to new column 'plot_loc'
df <- df %>%
  mutate(plot_loc = ifelse(str_detect(plot_code, "N"), "N", NA))

In [0]:
# # strip "N" from 'plot_code' if present
# df$plot_code <- str_remove(df$plot_code, "N")

In [0]:
# reorder columns
df <- df[,c(1,6,2,3,4,5)]

### plot_rep

In [0]:
# detect "A", "B", "C" characters in plot_code and if present write to 'plot_rep'
df <- df %>%
  mutate(plot_rep = case_when(str_detect(plot_code, "A")~"A",
                              str_detect(plot_code, "B")~"B",
                              str_detect(plot_code, "C")~"C"))

In [0]:
# # strip "A", "B", "C" from plot_code
# df$plot_code <- str_remove(df$plot_code, "[ABC]")

In [0]:
# reorder columns
df <- df[,c(1,2,7,3,4,5,6)]

### plot_num

In [0]:
# use digital values from 'plot_code' and to populate 'plot_num'
df <- df %>%
  mutate(plot_num = str_extract(plot_code, "[:digit:].*"))

In [0]:
# reorder columns
df <- df[,c(1,2,3,8,4,5,6,7)]

### date

In [0]:
# convert to date
df$date <- as.Date(df$date)

### subplot

In [0]:
# convert to integer
df$subplot <- as.integer(df$subplot)

### species_key

This will be imported from the plant species metadata table, and we can use it to join and correct species codes in the future


In [0]:
# set to NA for now
df$species_key <- NA

In [0]:
# convert to string
df$species_key <- as.character(df$species_key)

In [0]:
# reorder columns
df <- df[,c(1,2,3,4,5,6,9,7,8)]

### species_code

In [0]:
# convert to string
df$species_code <- as.character(df$species_code)

In [0]:
head(df)

Unnamed: 0_level_0,plot_code,plot_loc,plot_rep,plot_num,date,subplot,species_key,species_code,cover_pct
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<date>,<int>,<chr>,<chr>,<int>
1,YVP 10,,,10,2017-06-09,1,,BOESPP,1
2,YVP 10,,,10,2017-06-09,1,,CREINT,1
3,YVP 10,,,10,2017-06-09,1,,EUPESU,5
4,YVP 10,,,10,2017-06-09,1,,FESCAM,25
5,YVP 10,,,10,2017-06-09,1,,FESIDA,25
6,YVP 10,,,10,2017-06-09,1,,GEUTRI,10


## Identify Double Counting

In [0]:
# Find instances where a plant species is counted twice in the same year-plot-subplot combination
dbl_counts <- df %>%
  group_by(year = as.numeric(substring(date,0,4)), plot_code, subplot, species_code) %>%
  summarize(counted = n()) %>% 
  ungroup() %>%
  arrange(year, plot_code, subplot, desc(counted)) %>%
  filter(counted > 1) %>%
  print(n=Inf)

[90m# A tibble: 46 x 5[39m
    year plot_code subplot species_code counted
   [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m       [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m          [3m[90m<int>[39m[23m
[90m 1[39m  [4m2[24m017 YVP 144         2 VERVER             2
[90m 2[39m  [4m2[24m017 YVP 180         7 FRIPUD             2
[90m 3[39m  [4m2[24m017 YVP 203         4 COLLIN             2
[90m 4[39m  [4m2[24m017 YVP 355        10 PSESPI             2
[90m 5[39m  [4m2[24m017 YVP 44          9 ORTTEN             2
[90m 6[39m  [4m2[24m017 YVP N111        2 DRAVER             2
[90m 7[39m  [4m2[24m017 YVP NB294       8 MICGRA             2
[90m 8[39m  [4m2[24m018 YVP 112         9 ALYALY             2
[90m 9[39m  [4m2[24m018 YVP 12          4 HOLUMB             2
[90m10[39m  [4m2[24m018 YVP 144        10 ACHMIL             2
[90m11[39m  [4m2[24m018 YVP 184         4 HOLUMB             2
[90m12[39m  [4m2[24m018 YVP 185        

In [0]:
view_doubles  <- dbl_counts %>%
  left_join(df %>% mutate(year = as.numeric(substring(date,0,4))))

Joining, by = c("year", "plot_code", "subplot", "species_code")



In [0]:
head(view_doubles, n=46)

year,plot_code,subplot,species_code,counted,plot_loc,plot_rep,plot_num,date,species_key,cover_pct
<dbl>,<chr>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<date>,<chr>,<int>
2017,YVP 144,2,VERVER,2,,,144,2017-05-30,,3
2017,YVP 144,2,VERVER,2,,,144,2017-05-30,,4
2017,YVP 180,7,FRIPUD,2,,,180,2017-05-31,,1
2017,YVP 180,7,FRIPUD,2,,,180,2017-05-31,,1
2017,YVP 203,4,COLLIN,2,,,203,2017-05-18,,10
2017,YVP 203,4,COLLIN,2,,,203,2017-05-18,,1
2017,YVP 355,10,PSESPI,2,,,355,2017-06-02,,20
2017,YVP 355,10,PSESPI,2,,,355,2017-06-02,,2
2017,YVP 44,9,ORTTEN,2,,,44,2017-06-06,,4
2017,YVP 44,9,ORTTEN,2,,,44,2017-06-06,,1


In [0]:
tail(view_doubles, n=46)

year,plot_code,subplot,species_code,counted,plot_loc,plot_rep,plot_num,date,species_key,cover_pct
<dbl>,<chr>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<date>,<chr>,<int>
2018,YVP 412,9,MICGRA,2,,,412,2018-07-02,,1
2018,YVP 412,9,MICGRA,2,,,412,2018-07-02,,2
2018,YVP 56,1,HOLUMB,2,,,56,2018-06-11,,1
2018,YVP 56,1,HOLUMB,2,,,56,2018-06-11,,1
2018,YVP 62,3,KOEMAC,2,,,62,2018-07-02,,35
2018,YVP 62,3,KOEMAC,2,,,62,2018-07-02,,1
2018,YVP 62,4,HOLUMB,2,,,62,2018-07-02,,1
2018,YVP 62,4,HOLUMB,2,,,62,2018-07-02,,1
2018,YVP 62,8,ORTTEN,2,,,62,2018-07-02,,1
2018,YVP 62,8,ORTTEN,2,,,62,2018-07-02,,1


### Export View Doubles

In [0]:
write.csv(view_doubles, 'view_doubles.csv')

In [0]:
# example where cover_pct value is different
# 2017	YVP 144	2	VERVER
df %>%
  filter(as.numeric(substring(date,0,4)) == 2017,
          plot_code == 'YVP 144',
          subplot == '2',
          species_code == 'VERVER')

plot_code,plot_loc,plot_rep,plot_num,date,subplot,species_key,species_code,cover_pct
<chr>,<chr>,<chr>,<chr>,<date>,<int>,<chr>,<chr>,<int>
YVP 144,,,144,2017-05-30,2,,VERVER,3
YVP 144,,,144,2017-05-30,2,,VERVER,4


In [0]:
# example where cover_pct value is the same
# 2017	YVP 180	7	FRIPUD
df %>%
  filter(as.numeric(substring(date,0,4)) == 2017,
          plot_code == 'YVP 180',
          subplot == '7',
          species_code == 'FRIPUD')

plot_code,plot_loc,plot_rep,plot_num,date,subplot,species_key,species_code,cover_pct
<chr>,<chr>,<chr>,<chr>,<date>,<int>,<chr>,<chr>,<int>
YVP 180,,,180,2017-05-31,7,,FRIPUD,1
YVP 180,,,180,2017-05-31,7,,FRIPUD,1


In [0]:
# example where cover_pct value is different
# 2019	YVP NC294	6	ERIPUM
df %>%
  filter(as.numeric(substring(date,0,4)) == 2019,
          plot_code == 'YVP 294',
          subplot == '6',
          species_code == 'ERIPUM')

plot_code,plot_loc,plot_rep,plot_num,date,subplot,species_key,species_code,cover_pct
<chr>,<chr>,<chr>,<chr>,<date>,<int>,<chr>,<chr>,<int>
YVP 294,N,C,294,2019-05-09,6,,ERIPUM,1
YVP 294,N,C,294,2019-05-09,6,,ERIPUM,2
YVP 294,N,B,294,2019-05-13,6,,ERIPUM,0


Display all rows contained in matching 

## Address Double Counting

In [0]:
# ?