<a href="https://colab.research.google.com/github/samsoe/mpg_notebooks/blob/master/yvp_ground_cover_WRANGLE_2020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Documentation

[Readme fixed plot vegetation data](https://docs.google.com/document/d/16-Aq8u9Rudd78fSzfjvpCXyQgE-BstC-d2PjYfmLtcw/edit?usp=sharing)

# Security

* The user must load a `json` file containing the BigQuery API key into the local directory `/content/...`
* The user must have a Google Maps API key to enable mapping. 
   * CAUTION: make sure the key is deleted from the current instance of the notebook before sharing.

# Tools

In [None]:
library(tidyverse)
library(lubridate)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘lubridate’


The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union




# Source

## CSV

### df_groundcover

In [None]:
# Source file: 2020-10-22_yvp_ground_cover_SOURCE.csv
# Drive sharing page:
#   https://drive.google.com/file/d/1gm88qr4nHmKt7OnWCH-uaBT7DNdl5KCd/view?usp=sharing
# URL for the same file id in `uc?id=` form, kept in a variable for reading.
src_ground <- "https://drive.google.com/uc?id=1gm88qr4nHmKt7OnWCH-uaBT7DNdl5KCd"

In [None]:
# Read the ground cover CSV from its URL; readr guesses the column types.
df_groundcover <- src_ground %>% read_csv()


[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
cols(
  plot_num = [32mcol_double()[39m,
  plot_code = [31mcol_character()[39m,
  date = [33mcol_logical()[39m,
  subplot = [32mcol_double()[39m,
  groundcover_type = [31mcol_character()[39m,
  groundcover_pct = [32mcol_double()[39m,
  comments = [31mcol_character()[39m
)




In [None]:
# Inspect column names, types and leading values of the raw ground cover data.
glimpse(df_groundcover)

Rows: 4,640
Columns: 7
$ plot_num         [3m[90m<dbl>[39m[23m 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
$ plot_code        [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10", "Y…
$ date             [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ subplot          [3m[90m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3…
$ groundcover_type [3m[90m<chr>[39m[23m "BV", "L", "LIC", "M", "G", "BG", "R", "WD", "BV", "…
$ groundcover_pct  [3m[90m<dbl>[39m[23m 70, 20, 5, 5, 1, 0, 0, 0, 50, 30, 20, 2, 1, 1, 0, 0,…
$ comments         [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …


In [None]:
# Show the first rows of the raw ground cover data.
head(df_groundcover)

plot_num,plot_code,date,subplot,groundcover_type,groundcover_pct,comments
<dbl>,<chr>,<lgl>,<dbl>,<chr>,<dbl>,<chr>
10,YVP 10,,1,BV,70,
10,YVP 10,,1,L,20,
10,YVP 10,,1,LIC,5,
10,YVP 10,,1,M,5,
10,YVP 10,,1,G,1,
10,YVP 10,,1,BG,0,


### df_meta

In [None]:
# Source file: 2020-10-22_yvp_survey_metadata_SOURCE.csv
# Drive sharing page:
#   https://drive.google.com/file/d/19I0quIj8ALzP91VkxxIR-D1PWgXRZ_90/view?usp=sharing
# URL for the same file id in `uc?id=` form, kept in a variable for reading.
src_meta <- "https://drive.google.com/uc?id=19I0quIj8ALzP91VkxxIR-D1PWgXRZ_90"

In [None]:
# Read the survey metadata CSV (all years), then print its structure.
df_meta_full <- read_csv(src_meta)
glimpse(df_meta_full)


[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
cols(
  Plot = [31mcol_character()[39m,
  Date = [31mcol_character()[39m,
  Initials = [31mcol_character()[39m,
  `GPS Lat 1` = [32mcol_double()[39m,
  `GPS Long 1` = [32mcol_double()[39m,
  `GPS Lat 2` = [32mcol_double()[39m,
  `GPS Long 2` = [32mcol_double()[39m,
  Azimuth = [32mcol_double()[39m,
  `comments 2020` = [31mcol_character()[39m,
  `Comments 2019` = [31mcol_character()[39m,
  `Comments 2018` = [31mcol_character()[39m,
  `Comments 2017` = [31mcol_character()[39m
)




Rows: 232
Columns: 12
$ Plot            [3m[90m<chr>[39m[23m "NA294", "NB294", "NC294", "N278", "NA292", "NB292", …
$ Date            [3m[90m<chr>[39m[23m "5/8/17", "5/8/17", "5/8/17", "5/9/17", "5/9/17", "5/…
$ Initials        [3m[90m<chr>[39m[23m "MED", "MED", "MED", "MED", "MED", "MED", "MED", "MED…
$ `GPS Lat 1`     [3m[90m<dbl>[39m[23m 46.67863, 46.67960, 46.68005, 46.68213, 46.67934, 46.…
$ `GPS Long 1`    [3m[90m<dbl>[39m[23m -113.9934, -113.9935, -113.9933, -114.0009, -113.9995…
$ `GPS Lat 2`     [3m[90m<dbl>[39m[23m 46.67869, 46.67915, 46.68049, 46.68192, 46.67976, 46.…
$ `GPS Long 2`    [3m[90m<dbl>[39m[23m -113.9928, -113.9936, -113.9932, -114.0015, -113.9993…
$ Azimuth         [3m[90m<dbl>[39m[23m 118, 189, 13, 248, 25, 287, 21, 234, 101, 193, 220, 1…
$ `comments 2020` [3m[90m<chr>[39m[23m "this transect was challenging to match the sp. due t…
$ `Comments 2019` [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N

In [None]:
# Parse the month/day/year text in `Date` into a Date column, in place.
df_meta_full[["Date"]] <- mdy(df_meta_full[["Date"]])

In [None]:
# Keep only the metadata rows whose survey Date falls in 2020.
# `Date` is resolved through filter()'s data mask (the piped data frame)
# rather than via `df_meta_full$Date`, so the condition always refers to the
# rows actually being filtered. Rows with an NA Date are dropped by filter().
df_meta <- df_meta_full %>%
  filter(year(Date) == 2020) %>%
  glimpse()

Rows: 58
Columns: 12
$ Plot            [3m[90m<chr>[39m[23m "NA294", "NB294", "NC294", "N324", "N321", "NA292", "…
$ Date            [3m[90m<date>[39m[23m 2020-05-09, 2020-05-09, 2020-05-09, 2020-05-09, 2020…
$ Initials        [3m[90m<chr>[39m[23m "MED", "MED", "MED", "MED", "MED", "MED", "MED", "MED…
$ `GPS Lat 1`     [3m[90m<dbl>[39m[23m 46.67863, 46.67960, 46.68005, 46.67728, 46.67698, 46.…
$ `GPS Long 1`    [3m[90m<dbl>[39m[23m -113.9934, -113.9935, 113.9933, -113.9933, -114.0028,…
$ `GPS Lat 2`     [3m[90m<dbl>[39m[23m 46.67869, 46.67915, 46.68049, 46.67702, 46.67738, 46.…
$ `GPS Long 2`    [3m[90m<dbl>[39m[23m -113.9928, -113.9936, -113.9932, -113.9938, -114.0026…
$ Azimuth         [3m[90m<dbl>[39m[23m 118, 189, 13, 234, 21, 25, 287, 212, 101, 193, 162, 3…
$ `comments 2020` [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, "Sagebrush in this plot h…
$ `Comments 2019` [3m[90m<chr>[39m[23m "5/14/2019 MK", "5/13/2019 MK Still has barbed wire. …

# Wrangle

### plot_loc

In [None]:
# Flag plots whose 'plot_code' contains "N" by writing "N" to the new column
# 'plot_loc'; every other row gets NA.
# if_else() with NA_character_ keeps 'plot_loc' a character column even when
# no code matches (base ifelse() would return a logical NA vector there).
df_plot_loc <- df_groundcover %>%
  mutate(plot_loc = if_else(str_detect(plot_code, "N"), "N", NA_character_)) %>%
  # 'comments' is dropped here; it is not included in the schema
  select(
    plot_num, plot_code, plot_loc, date, subplot,
    groundcover_type, groundcover_pct
  ) %>%
  glimpse()

Rows: 4,640
Columns: 7
$ plot_num         [3m[90m<dbl>[39m[23m 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
$ plot_code        [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10", "Y…
$ plot_loc         [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ date             [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ subplot          [3m[90m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3…
$ groundcover_type [3m[90m<chr>[39m[23m "BV", "L", "LIC", "M", "G", "BG", "R", "WD", "BV", "…
$ groundcover_pct  [3m[90m<dbl>[39m[23m 70, 20, 5, 5, 1, 0, 0, 0, 50, 30, 20, 2, 1, 1, 0, 0,…


### plot_rep

In [None]:
# Derive 'plot_rep' from 'plot_code': the first of "A", "B", "C" (checked in
# that order) found anywhere in the code; NA when none is present.
df_plot_rep <- mutate(
  df_plot_loc,
  plot_rep = case_when(
    str_detect(plot_code, "A") ~ "A",
    str_detect(plot_code, "B") ~ "B",
    str_detect(plot_code, "C") ~ "C"
  )
)
# Fix the column order, placing 'plot_rep' right after 'plot_loc'.
df_plot_rep <- select(
  df_plot_rep,
  plot_num, plot_code, plot_loc, plot_rep, date, subplot,
  groundcover_type, groundcover_pct
)
glimpse(df_plot_rep)

Rows: 4,640
Columns: 8
$ plot_num         [3m[90m<dbl>[39m[23m 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
$ plot_code        [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10", "Y…
$ plot_loc         [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ plot_rep         [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ date             [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ subplot          [3m[90m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3…
$ groundcover_type [3m[90m<chr>[39m[23m "BV", "L", "LIC", "M", "G", "BG", "R", "WD", "BV", "…
$ groundcover_pct  [3m[90m<dbl>[39m[23m 70, 20, 5, 5, 1, 0, 0, 0, 50, 30, 20, 2, 1, 1, 0, 0,…


### grid_point

In [None]:
# Expose 'plot_num' under the name 'grid_point' (renamed inside select(), so
# no duplicate 'plot_num' column remains) and keep the other columns in order.
df_grid_point <- select(
  df_plot_rep,
  grid_point = plot_num, plot_code, plot_loc, plot_rep, date, subplot,
  groundcover_type, groundcover_pct
)
glimpse(df_grid_point)

Rows: 4,640
Columns: 8
$ grid_point       [3m[90m<dbl>[39m[23m 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
$ plot_code        [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10", "Y…
$ plot_loc         [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ plot_rep         [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ date             [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ subplot          [3m[90m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3…
$ groundcover_type [3m[90m<chr>[39m[23m "BV", "L", "LIC", "M", "G", "BG", "R", "WD", "BV", "…
$ groundcover_pct  [3m[90m<dbl>[39m[23m 70, 20, 5, 5, 1, 0, 0, 0, 50, 30, 20, 2, 1, 1, 0, 0,…


In [None]:
# Convert 'grid_point' from double to integer.
df_grid_point <- df_grid_point %>%
  mutate(grid_point = as.integer(grid_point))

In [None]:
# Confirm 'grid_point' is now an integer column.
glimpse(df_grid_point)

Rows: 4,640
Columns: 8
$ grid_point       [3m[90m<int>[39m[23m 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
$ plot_code        [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10", "Y…
$ plot_loc         [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ plot_rep         [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ date             [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ subplot          [3m[90m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3…
$ groundcover_type [3m[90m<chr>[39m[23m "BV", "L", "LIC", "M", "G", "BG", "R", "WD", "BV", "…
$ groundcover_pct  [3m[90m<dbl>[39m[23m 70, 20, 5, 5, 1, 0, 0, 0, 50, 30, 20, 2, 1, 1, 0, 0,…


In [None]:
# Review the 2020 metadata columns before joining on Plot.
glimpse(df_meta)

Rows: 58
Columns: 12
$ Plot            [3m[90m<chr>[39m[23m "NA294", "NB294", "NC294", "N324", "N321", "NA292", "…
$ Date            [3m[90m<date>[39m[23m 2020-05-09, 2020-05-09, 2020-05-09, 2020-05-09, 2020…
$ Initials        [3m[90m<chr>[39m[23m "MED", "MED", "MED", "MED", "MED", "MED", "MED", "MED…
$ `GPS Lat 1`     [3m[90m<dbl>[39m[23m 46.67863, 46.67960, 46.68005, 46.67728, 46.67698, 46.…
$ `GPS Long 1`    [3m[90m<dbl>[39m[23m -113.9934, -113.9935, 113.9933, -113.9933, -114.0028,…
$ `GPS Lat 2`     [3m[90m<dbl>[39m[23m 46.67869, 46.67915, 46.68049, 46.67702, 46.67738, 46.…
$ `GPS Long 2`    [3m[90m<dbl>[39m[23m -113.9928, -113.9936, -113.9932, -113.9938, -114.0026…
$ Azimuth         [3m[90m<dbl>[39m[23m 118, 189, 13, 234, 21, 25, 287, 212, 101, 193, 162, 3…
$ `comments 2020` [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, "Sagebrush in this plot h…
$ `Comments 2019` [3m[90m<chr>[39m[23m "5/14/2019 MK", "5/13/2019 MK Still has barbed wire. …

### date

In [None]:
# Build the join key 'Plot' from 'plot_code' by keeping everything from the
# 5th character onward (e.g. "YVP 10" -> "10").
df_join <- mutate(df_grid_point, Plot = str_sub(plot_code, start = 5))
glimpse(df_join)

Rows: 4,640
Columns: 9
$ grid_point       [3m[90m<int>[39m[23m 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
$ plot_code        [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10", "Y…
$ plot_loc         [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ plot_rep         [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ date             [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ subplot          [3m[90m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3…
$ groundcover_type [3m[90m<chr>[39m[23m "BV", "L", "LIC", "M", "G", "BG", "R", "WD", "BV", "…
$ groundcover_pct  [3m[90m<dbl>[39m[23m 70, 20, 5, 5, 1, 0, 0, 0, 50, 30, 20, 2, 1, 1, 0, 0,…
$ Plot             [3m[90m<chr>[39m[23m "10", "10", "10", "10", "10", "10", "10", "10", "10"…


In [None]:
# Join the survey Date from df_meta onto each ground cover row (matched on
# Plot) and store it in 'date'.
df_date <- df_join %>%
  left_join(select(df_meta, Plot, Date), by = "Plot") %>%
  mutate(date = Date) %>%
  # drop the join helpers by name instead of by position (columns 9 and 10)
  select(-Plot, -Date)

# A Plot that appears more than once in df_meta would silently duplicate
# ground cover rows; stop instead of carrying inflated data forward.
stopifnot(nrow(df_date) == nrow(df_join))

In [None]:
# Check that 'date' is populated and the helper columns are gone.
glimpse(df_date)

Rows: 4,640
Columns: 8
$ grid_point       [3m[90m<int>[39m[23m 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
$ plot_code        [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10", "Y…
$ plot_loc         [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ plot_rep         [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ date             [3m[90m<date>[39m[23m 2020-06-27, 2020-06-27, 2020-06-27, 2020-06-27, 202…
$ subplot          [3m[90m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3…
$ groundcover_type [3m[90m<chr>[39m[23m "BV", "L", "LIC", "M", "G", "BG", "R", "WD", "BV", "…
$ groundcover_pct  [3m[90m<dbl>[39m[23m 70, 20, 5, 5, 1, 0, 0, 0, 50, 30, 20, 2, 1, 1, 0, 0,…


### groundcover_pct

In [None]:
# Convert 'groundcover_pct' from double to integer.
df_date <- df_date %>%
  mutate(groundcover_pct = as.integer(groundcover_pct))

In [None]:
# Confirm 'groundcover_pct' is now an integer column.
glimpse(df_date)

Rows: 4,640
Columns: 8
$ grid_point       [3m[90m<int>[39m[23m 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
$ plot_code        [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10", "Y…
$ plot_loc         [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ plot_rep         [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ date             [3m[90m<date>[39m[23m 2020-06-27, 2020-06-27, 2020-06-27, 2020-06-27, 202…
$ subplot          [3m[90m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3…
$ groundcover_type [3m[90m<chr>[39m[23m "BV", "L", "LIC", "M", "G", "BG", "R", "WD", "BV", "…
$ groundcover_pct  [3m[90m<int>[39m[23m 70, 20, 5, 5, 1, 0, 0, 0, 50, 30, 20, 2, 1, 1, 0, 0,…


In [None]:
# Final wrangled table; the Summary and Output cells operate on this name.
df_output <- df_date

# Summary

In [None]:
# Per-column summary of the final table (ranges, quartiles, NA counts).
summary(df_output)

   grid_point     plot_code           plot_loc           plot_rep        
 Min.   :  7.0   Length:4640        Length:4640        Length:4640       
 1st Qu.:110.0   Class :character   Class :character   Class :character  
 Median :212.5   Mode  :character   Mode  :character   Mode  :character  
 Mean   :254.3                                                           
 3rd Qu.:395.0                                                           
 Max.   :571.0                                                           
      date               subplot     groundcover_type   groundcover_pct
 Min.   :2020-05-09   Min.   : 1.0   Length:4640        Min.   : 0.00  
 1st Qu.:2020-05-22   1st Qu.: 3.0   Class :character   1st Qu.: 0.00  
 Median :2020-06-02   Median : 5.5   Mode  :character   Median : 3.00  
 Mean   :2020-06-02   Mean   : 5.5                      Mean   :12.59  
 3rd Qu.:2020-06-17   3rd Qu.: 8.0                      3rd Qu.:20.00  
 Max.   :2020-07-01   Max.   :10.0                

In [None]:
# List any rows missing 'plot_code' or 'groundcover_type'.
filter(df_output, is.na(plot_code) | is.na(groundcover_type))

grid_point,plot_code,plot_loc,plot_rep,date,subplot,groundcover_type,groundcover_pct
<int>,<chr>,<chr>,<chr>,<date>,<dbl>,<chr>,<int>


`summary()` shows no NA values in the numeric variables, and the filter above returns no rows where the character variables `plot_code` or `groundcover_type` are NA. The other character variables, `plot_loc` and `plot_rep`, contain numerous NA values, but these are expected.

# Output

In [None]:
# Write the final table to a CSV file in the working directory.
df_output %>%
  write_csv(file = "yvp_ground_cover-WRANGLE-2020.csv")