<a href="https://colab.research.google.com/github/samsoe/mpg_notebooks/blob/master/YVP_Vegetation_Cover_Data_Wrangle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*R Notebook*

# README

* [Readme fixed grid plot vegetation data](https://docs.google.com/document/d/16-Aq8u9Rudd78fSzfjvpCXyQgE-BstC-d2PjYfmLtcw/edit?usp=sharing)

# Load Tools

In [0]:
library(tidyverse)

# Source

In [0]:
# Source file: 2020-04-28_yvp_vegetation_cover (Google Drive direct-download link)
src <- "https://drive.google.com/uc?id=1pemnlKIlfAQw2JSMN7yDlYMG5QhUW-NP"

In [0]:
# read the vegetation cover table from the location stored in `src`
df <- read.csv(src)

In [368]:
# preview the first two rows of the freshly imported table
head(df, n=2)

Unnamed: 0_level_0,plot_code,date,subplot,species_code,cover_pct
Unnamed: 0_level_1,<fct>,<fct>,<int>,<fct>,<int>
1,YVP 10,2017-06-09,1,BOESPP,1
2,YVP 10,2017-06-09,1,CREINT,1


# Wrangle

## Structure columns

### plot_code

In [0]:
# store 'plot_code' as a character vector
df[["plot_code"]] <- as.character(df[["plot_code"]])

### plot_loc

In [0]:
# Flag plots whose 'plot_code' contains "N" in a new column 'plot_loc'
# ("N" when present, NA otherwise).
# if_else() with NA_character_ keeps the column character even when no
# plot code contains "N"; ifelse(..., "N", NA) would return a logical
# all-NA column in that case.
df <- df %>%
  mutate(plot_loc = if_else(str_detect(plot_code, "N"), "N", NA_character_))

In [0]:
# reorder columns: place 'plot_loc' directly after 'plot_code'.
# Selecting by name (rather than by position) keeps this cell safe to re-run:
# positional indices would reshuffle the columns again on every execution.
df <- df %>%
  select(plot_code, plot_loc, everything())

### plot_rep

In [0]:
# Derive 'plot_rep' from 'plot_code': the first of "A", "B", "C" (checked in
# that order) found in the code is recorded; codes with none of them get NA.
df <- mutate(
  df,
  plot_rep = case_when(
    str_detect(plot_code, "A") ~ "A",
    str_detect(plot_code, "B") ~ "B",
    str_detect(plot_code, "C") ~ "C",
    TRUE ~ NA_character_
  )
)

In [0]:
# reorder columns: place 'plot_rep' directly after 'plot_loc'.
# Selecting by name (rather than by position) keeps this cell safe to re-run:
# positional indices would reshuffle the columns again on every execution.
df <- df %>%
  select(plot_code, plot_loc, plot_rep, everything())

### plot_num

In [0]:
# populate 'plot_num' with the part of 'plot_code' that starts at its first
# digit and runs to the end of the string (kept as character)
df <- mutate(df, plot_num = str_extract(plot_code, "[:digit:].*"))

In [0]:
# reorder columns: place 'plot_num' directly after 'plot_rep'.
# Selecting by name (rather than by position) keeps this cell safe to re-run:
# positional indices would reshuffle the columns again on every execution.
df <- df %>%
  select(plot_code, plot_loc, plot_rep, plot_num, everything())

### date

In [0]:
# store 'date' as a Date vector
df[["date"]] <- as.Date(df[["date"]])

### subplot

In [0]:
# store 'subplot' as an integer vector
df[["subplot"]] <- as.integer(df[["subplot"]])

### species_key

This column will be imported from the plant species metadata table; in the future we can use it to join against that table and correct species codes.


In [0]:
# placeholder column: every row gets NA until the species key is available
df[["species_key"]] <- NA

In [0]:
# store 'species_key' as a character vector
df[["species_key"]] <- as.character(df[["species_key"]])

In [0]:
# reorder columns: place 'species_key' directly after 'subplot'
# (i.e. just before 'species_code').
# Selecting by name (rather than by position) keeps this cell safe to re-run:
# positional indices would reshuffle the columns again on every execution.
df <- df %>%
  select(plot_code, plot_loc, plot_rep, plot_num, date, subplot, species_key,
         everything())

### species_code

In [0]:
# store 'species_code' as a character vector
df[["species_code"]] <- as.character(df[["species_code"]])

In [382]:
# preview the first two rows after the column restructuring
head(df, n=2)

Unnamed: 0_level_0,plot_code,plot_loc,plot_rep,plot_num,date,subplot,species_key,species_code,cover_pct
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<date>,<int>,<chr>,<chr>,<int>
1,YVP 10,,,10,2017-06-09,1,,BOESPP,1
2,YVP 10,,,10,2017-06-09,1,,CREINT,1


## Identify Double Counting

In [383]:
# show column types and row count before looking for double counts
str(df)

'data.frame':	21728 obs. of  9 variables:
 $ plot_code   : chr  "YVP 10" "YVP 10" "YVP 10" "YVP 10" ...
 $ plot_loc    : chr  NA NA NA NA ...
 $ plot_rep    : chr  NA NA NA NA ...
 $ plot_num    : chr  "10" "10" "10" "10" ...
 $ date        : Date, format: "2017-06-09" "2017-06-09" ...
 $ subplot     : int  1 1 1 1 1 1 1 1 1 1 ...
 $ species_key : chr  NA NA NA NA ...
 $ species_code: chr  "BOESPP" "CREINT" "EUPESU" "FESCAM" ...
 $ cover_pct   : int  1 1 5 25 25 10 1 1 5 1 ...


In [384]:
# preview the leading rows of the structured table
head(df)

Unnamed: 0_level_0,plot_code,plot_loc,plot_rep,plot_num,date,subplot,species_key,species_code,cover_pct
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<date>,<int>,<chr>,<chr>,<int>
1,YVP 10,,,10,2017-06-09,1,,BOESPP,1
2,YVP 10,,,10,2017-06-09,1,,CREINT,1
3,YVP 10,,,10,2017-06-09,1,,EUPESU,5
4,YVP 10,,,10,2017-06-09,1,,FESCAM,25
5,YVP 10,,,10,2017-06-09,1,,FESIDA,25
6,YVP 10,,,10,2017-06-09,1,,GEUTRI,10


In [385]:
# Flag every plant species recorded more than once within the same
# year-plot-subplot combination; `counted` is the number of records per key.
dbl_counts <- df %>%
  mutate(year = as.numeric(format(date, "%Y"))) %>%
  group_by(year, plot_code, subplot, species_code) %>%
  summarize(counted = n()) %>%
  ungroup() %>%
  filter(counted > 1) %>%
  arrange(year, plot_code, subplot, desc(counted)) %>%
  print(n = Inf)

[90m# A tibble: 46 x 5[39m
    year plot_code subplot species_code counted
   [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m       [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m          [3m[90m<int>[39m[23m
[90m 1[39m  [4m2[24m017 YVP 144         2 VERVER             2
[90m 2[39m  [4m2[24m017 YVP 180         7 FRIPUD             2
[90m 3[39m  [4m2[24m017 YVP 203         4 COLLIN             2
[90m 4[39m  [4m2[24m017 YVP 355        10 PSESPI             2
[90m 5[39m  [4m2[24m017 YVP 44          9 ORTTEN             2
[90m 6[39m  [4m2[24m017 YVP N111        2 DRAVER             2
[90m 7[39m  [4m2[24m017 YVP NB294       8 MICGRA             2
[90m 8[39m  [4m2[24m018 YVP 112         9 ALYALY             2
[90m 9[39m  [4m2[24m018 YVP 12          4 HOLUMB             2
[90m10[39m  [4m2[24m018 YVP 144        10 ACHMIL             2
[90m11[39m  [4m2[24m018 YVP 184         4 HOLUMB             2
[90m12[39m  [4m2[24m018 YVP 185        

In [386]:
# Attach the full observation rows from `df` to each double-counted key.
# The join keys are spelled out so that the match never depends on whichever
# columns the two tables happen to share.
view_doubles <- dbl_counts %>%
  left_join(
    df %>% mutate(year = as.numeric(substring(date, 0, 4))),
    by = c("year", "plot_code", "subplot", "species_code")
  )

Joining, by = c("year", "plot_code", "subplot", "species_code")



In [387]:
# show the structure of the joined double-count table
str(view_doubles)

tibble [92 × 11] (S3: tbl_df/tbl/data.frame)
 $ year        : num [1:92] 2017 2017 2017 2017 2017 ...
 $ plot_code   : chr [1:92] "YVP 144" "YVP 144" "YVP 180" "YVP 180" ...
 $ subplot     : int [1:92] 2 2 7 7 4 4 10 10 9 9 ...
 $ species_code: chr [1:92] "VERVER" "VERVER" "FRIPUD" "FRIPUD" ...
 $ counted     : int [1:92] 2 2 2 2 2 2 2 2 2 2 ...
 $ plot_loc    : chr [1:92] NA NA NA NA ...
 $ plot_rep    : chr [1:92] NA NA NA NA ...
 $ plot_num    : chr [1:92] "144" "144" "180" "180" ...
 $ date        : Date[1:92], format: "2017-05-30" "2017-05-30" ...
 $ species_key : chr [1:92] NA NA NA NA ...
 $ cover_pct   : int [1:92] 3 4 1 1 10 1 20 2 4 1 ...


In [388]:
# one row per double-counted date / plot / species / subplot, ordered for review
view_doubles %>%
  select(date, plot_code, species_code, subplot) %>%
  distinct() %>%
  arrange(date, plot_code, species_code)

date,plot_code,species_code,subplot
<date>,<chr>,<chr>,<int>
2017-05-08,YVP NB294,MICGRA,8
2017-05-18,YVP 203,COLLIN,4
2017-05-25,YVP N111,DRAVER,2
2017-05-30,YVP 144,VERVER,2
2017-05-31,YVP 180,FRIPUD,7
2017-06-02,YVP 355,PSESPI,10
2017-06-06,YVP 44,ORTTEN,9
2018-05-28,YVP 144,ACHMIL,10
2018-05-28,YVP N278,ARESER,2
2018-05-28,YVP N522,LITRUD,1


In [389]:
# one row per date / plot / subplot that contains a double count, with 'plot_loc'
view_doubles %>%
  select(date, plot_code, subplot, plot_loc) %>%
  distinct() %>%
  arrange(date, plot_code)

date,plot_code,subplot,plot_loc
<date>,<chr>,<int>,<chr>
2017-05-08,YVP NB294,8,N
2017-05-18,YVP 203,4,
2017-05-25,YVP N111,2,N
2017-05-30,YVP 144,2,
2017-05-31,YVP 180,7,
2017-06-02,YVP 355,10,
2017-06-06,YVP 44,9,
2018-05-28,YVP 144,10,
2018-05-28,YVP N278,2,N
2018-05-28,YVP N522,1,N


## Resolve double counts

* If the 'cover_pct' values are equal, delete one of the rows
* If the 'cover_pct' values are not equal, delete one of the rows and change cover_pct for the remaining one to NA

In [390]:
# re-inspect the joined double-count table before resolving duplicates
str(view_doubles)

tibble [92 × 11] (S3: tbl_df/tbl/data.frame)
 $ year        : num [1:92] 2017 2017 2017 2017 2017 ...
 $ plot_code   : chr [1:92] "YVP 144" "YVP 144" "YVP 180" "YVP 180" ...
 $ subplot     : int [1:92] 2 2 7 7 4 4 10 10 9 9 ...
 $ species_code: chr [1:92] "VERVER" "VERVER" "FRIPUD" "FRIPUD" ...
 $ counted     : int [1:92] 2 2 2 2 2 2 2 2 2 2 ...
 $ plot_loc    : chr [1:92] NA NA NA NA ...
 $ plot_rep    : chr [1:92] NA NA NA NA ...
 $ plot_num    : chr [1:92] "144" "144" "180" "180" ...
 $ date        : Date[1:92], format: "2017-05-30" "2017-05-30" ...
 $ species_key : chr [1:92] NA NA NA NA ...
 $ cover_pct   : int [1:92] 3 4 1 1 10 1 20 2 4 1 ...


In [0]:
# unique date / plot / subplot / species keys that were recorded more than once
distinct_doubles <- view_doubles %>%
  select(date, plot_code, subplot, species_code) %>%
  distinct() %>%
  arrange(date, plot_code)

In [392]:
# show the structure of the duplicate-key table
str(distinct_doubles)

tibble [46 × 4] (S3: tbl_df/tbl/data.frame)
 $ date        : Date[1:46], format: "2017-05-08" "2017-05-18" ...
 $ plot_code   : chr [1:46] "YVP NB294" "YVP 203" "YVP N111" "YVP 144" ...
 $ subplot     : int [1:46] 8 4 2 2 7 10 9 10 2 1 ...
 $ species_code: chr [1:46] "MICGRA" "COLLIN" "DRAVER" "VERVER" ...


In [393]:
# number of duplicate keys that the resolution loop will visit
nrow(distinct_doubles)

In [394]:
# Resolve every duplicated date / plot / species / subplot key in `df`:
#   * matching rows are printed for review;
#   * if all 'cover_pct' values agree, the extra rows are dropped ("EQUAL");
#   * otherwise the first row is kept with 'cover_pct' set to NA and the
#     extra rows are dropped ("NOT EQUAL").
# seq_len() makes the loop a no-op when `distinct_doubles` is empty
# (1:nrow() would iterate over c(1, 0)).
for (row in seq_len(nrow(distinct_doubles))) {
  dbl_ref <- distinct_doubles[row, ]

  # identify indices of the duplicate observations in the original dataframe
  # (date, plot_code, species_code, subplot); which() ignores NA comparisons
  selected_indices <- which(
    df$date == dbl_ref$date &
      df$plot_code == dbl_ref$plot_code &
      df$species_code == dbl_ref$species_code &
      df$subplot == dbl_ref$subplot
  )

  # Display for Review (row names reset so the printout is numbered from 1)
  selected_rows <- df[selected_indices, ]
  rownames(selected_rows) <- NULL
  print(selected_rows)

  # Nothing to resolve unless at least two rows share the key. This guard
  # also prevents `df[-integer(0), ]`, which would remove every row.
  if (length(selected_indices) < 2) {
    next
  }

  # compare "cover_pct" observations for equality; a missing value is treated
  # as a disagreement instead of raising an error inside `if`
  cover_values <- selected_rows$cover_pct
  if (!anyNA(cover_values) && length(unique(cover_values)) == 1) {
    print("EQUAL")
  } else {
    print("NOT EQUAL")
    # set first row "cover_pct" to NA
    df$cover_pct[selected_indices[1]] <- NA
  }

  # keep the first observation, drop every other duplicate (not only the last)
  df <- df[-selected_indices[-1], ]
}

  plot_code plot_loc plot_rep plot_num       date subplot species_key
1 YVP NB294        N        B      294 2017-05-08       8        <NA>
2 YVP NB294        N        B      294 2017-05-08       8        <NA>
  species_code cover_pct
1       MICGRA         1
2       MICGRA         1
[1] "EQUAL"
  plot_code plot_loc plot_rep plot_num       date subplot species_key
1   YVP 203     <NA>     <NA>      203 2017-05-18       4        <NA>
2   YVP 203     <NA>     <NA>      203 2017-05-18       4        <NA>
  species_code cover_pct
1       COLLIN        10
2       COLLIN         1
[1] "NOT EQUAL"
  plot_code plot_loc plot_rep plot_num       date subplot species_key
1  YVP N111        N     <NA>      111 2017-05-25       2        <NA>
2  YVP N111        N     <NA>      111 2017-05-25       2        <NA>
  species_code cover_pct
1       DRAVER         1
2       DRAVER         1
[1] "EQUAL"
  plot_code plot_loc plot_rep plot_num       date subplot species_key
1   YVP 144     <NA>     <NA>      

In [395]:
# inspect `df` after the duplicate-resolution loop (row count should have dropped)
str(df)

'data.frame':	21682 obs. of  9 variables:
 $ plot_code   : chr  "YVP 10" "YVP 10" "YVP 10" "YVP 10" ...
 $ plot_loc    : chr  NA NA NA NA ...
 $ plot_rep    : chr  NA NA NA NA ...
 $ plot_num    : chr  "10" "10" "10" "10" ...
 $ date        : Date, format: "2017-06-09" "2017-06-09" ...
 $ subplot     : int  1 1 1 1 1 1 1 1 1 1 ...
 $ species_key : chr  NA NA NA NA ...
 $ species_code: chr  "BOESPP" "CREINT" "EUPESU" "FESCAM" ...
 $ cover_pct   : int  1 1 5 25 25 10 1 1 5 1 ...


In [396]:
# Rescan for double observations: any plant species still recorded more than
# once within the same year-plot-subplot combination ends up in `dbl_recount`.
dbl_recount <- df %>%
  mutate(year = as.numeric(format(date, "%Y"))) %>%
  group_by(year, plot_code, subplot, species_code) %>%
  summarize(counted = n()) %>%
  ungroup() %>%
  filter(counted > 1) %>%
  arrange(year, plot_code, subplot, desc(counted)) %>%
  print(n = Inf)

[90m# A tibble: 0 x 5[39m
[90m# … with 5 variables: year [3m[90m<dbl>[90m[23m, plot_code [3m[90m<chr>[90m[23m, subplot [3m[90m<int>[90m[23m,[39m
[90m#   species_code [3m[90m<chr>[90m[23m, counted [3m[90m<int>[90m[23m[39m


In [409]:
# Display the previously duplicated observations for review.
# seq_len() skips the loop entirely when there were no duplicates, and the
# printed columns are chosen by name so the output does not depend on the
# column order of `df`.
review_cols <- c("plot_code", "date", "subplot", "species_code", "cover_pct")
for (row in seq_len(nrow(distinct_doubles))) {
  dbl_ref <- distinct_doubles[row, ]

  # date, plot_code, species_code, subplot
  selected_rows <- filter(df, date == dbl_ref$date &
                        plot_code == dbl_ref$plot_code &
                        species_code == dbl_ref$species_code &
                        subplot == dbl_ref$subplot)
  print(selected_rows[, review_cols])
}

  plot_code       date subplot species_code cover_pct
1 YVP NB294 2017-05-08       8       MICGRA         1
  plot_code       date subplot species_code cover_pct
1   YVP 203 2017-05-18       4       COLLIN        NA
  plot_code       date subplot species_code cover_pct
1  YVP N111 2017-05-25       2       DRAVER         1
  plot_code       date subplot species_code cover_pct
1   YVP 144 2017-05-30       2       VERVER        NA
  plot_code       date subplot species_code cover_pct
1   YVP 180 2017-05-31       7       FRIPUD         1
  plot_code       date subplot species_code cover_pct
1   YVP 355 2017-06-02      10       PSESPI        NA
  plot_code       date subplot species_code cover_pct
1    YVP 44 2017-06-06       9       ORTTEN        NA
  plot_code       date subplot species_code cover_pct
1   YVP 144 2018-05-28      10       ACHMIL         5
  plot_code       date subplot species_code cover_pct
1  YVP N278 2018-05-28       2       ARESER        NA
  plot_code       date subpl

### Export View Doubles

In [0]:
# save the double-count review table to the working directory
write.csv(view_doubles, file = "view_doubles.csv")