<a href="https://colab.research.google.com/github/samsoe/mpg_notebooks/blob/master/gridVeg_survey_metadata_wrangle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tools

In [1]:
library(tidyverse)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.1     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.1     [32m✔[39m [34mdplyr  [39m 1.0.0
[32m✔[39m [34mtidyr  [39m 1.1.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



# Source

In [2]:
# Source file: 2020-06-15_gridVeg_survey_metadata_SOURCE.csv
# Direct-download URL for the copy hosted on Google Drive.
src <- sprintf(
  "https://drive.google.com/uc?export=download&id=%s",
  "12FgxXQbpC3XotrATbBLx-SYKypUKkXPt"
)

In [3]:
# Read the survey metadata with an explicit column specification instead of
# relying on readr's type guessing. `__kp_Survey` mixes numeric-looking IDs
# (e.g. "108") with UUID strings, so it must always be parsed as character;
# a guess based only on the leading rows could otherwise pick a numeric type.
# The types below are the ones readr reported for this file.
df_gridVeg <- read_csv(
  src,
  col_types = cols(
    `__kp_Survey` = col_character(),
    `_kf_Site` = col_double(),
    SurveyYear = col_double(),
    SurveyDate = col_character(),
    Surveyor1 = col_character()
  )
)

Parsed with column specification:
cols(
  `__kp_Survey` = [31mcol_character()[39m,
  `_kf_Site` = [32mcol_double()[39m,
  SurveyYear = [32mcol_double()[39m,
  SurveyDate = [31mcol_character()[39m,
  Surveyor1 = [31mcol_character()[39m
)



# Structure

## Rename
* Rename columns according to the schema specification

In [4]:
# list the column names as delivered by the source file
names(df_gridVeg)

In [5]:
# Map the source column names (old) onto the schema names (new).
# rename() expects new = old pairs, so the lookup vector holds the old names
# and is named by the new ones. all_of() makes the external-vector selection
# explicit (and errors if a source column is missing); rename() replaces the
# superseded rename_at().
oldnames <- c(
  "__kp_Survey", "_kf_Site", "SurveyYear", "SurveyDate", "Surveyor1"
)
newnames <- c("survey_ID", "grid_point", "year", "date", "surveyor")

df_gridVeg <- df_gridVeg %>%
  rename(all_of(setNames(oldnames, newnames)))

Note: Using an external vector in selections is ambiguous.
[34mℹ[39m Use `all_of(oldnames)` instead of `oldnames` to silence this message.
[34mℹ[39m See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
[90mThis message is displayed once per session.[39m



## Data Type

In [6]:
# show column types before any conversion
df_gridVeg %>% str()

tibble [1,472 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ survey_ID : chr [1:1472] "108" "107" "106" "105" ...
 $ grid_point: num [1:1472] 3 4 5 6 7 8 9 10 11 12 ...
 $ year      : num [1:1472] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
 $ date      : chr [1:1472] "8/17/10" "8/17/10" "8/17/10" "8/17/10" ...
 $ surveyor  : chr [1:1472] "EAR" "EAR" "EAR" "EAR" ...
 - attr(*, "spec")=
  .. cols(
  ..   `__kp_Survey` = [31mcol_character()[39m,
  ..   `_kf_Site` = [32mcol_double()[39m,
  ..   SurveyYear = [32mcol_double()[39m,
  ..   SurveyDate = [31mcol_character()[39m,
  ..   Surveyor1 = [31mcol_character()[39m
  .. )


In [7]:
# grid_point is read as double; store it as integer
df_gridVeg <- df_gridVeg %>%
  mutate(grid_point = as.integer(grid_point))

In [8]:
# year is read as double; store it as integer
df_gridVeg <- df_gridVeg %>%
  mutate(year = as.integer(year))

In [9]:
# parse the m/d/yy strings (e.g. "8/17/10") into Date values
df_gridVeg <- df_gridVeg %>%
  mutate(date = as.Date(date, format = "%m/%d/%y"))

In [10]:
# confirm the column types after conversion
df_gridVeg %>% str()

tibble [1,472 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ survey_ID : chr [1:1472] "108" "107" "106" "105" ...
 $ grid_point: int [1:1472] 3 4 5 6 7 8 9 10 11 12 ...
 $ year      : int [1:1472] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
 $ date      : Date[1:1472], format: "2010-08-17" "2010-08-17" ...
 $ surveyor  : chr [1:1472] "EAR" "EAR" "EAR" "EAR" ...
 - attr(*, "spec")=
  .. cols(
  ..   `__kp_Survey` = [31mcol_character()[39m,
  ..   `_kf_Site` = [32mcol_double()[39m,
  ..   SurveyYear = [32mcol_double()[39m,
  ..   SurveyDate = [31mcol_character()[39m,
  ..   Surveyor1 = [31mcol_character()[39m
  .. )


# Explore

In [11]:
# per-column summary of the wrangled table
df_gridVeg %>% summary()

  survey_ID           grid_point         year           date           
 Length:1472        Min.   :  1.0   Min.   :2010   Min.   :2010-08-10  
 Class :character   1st Qu.:118.0   1st Qu.:2011   1st Qu.:2011-07-08  
 Mode  :character   Median :264.0   Median :2012   Median :2012-07-23  
                    Mean   :268.8   Mean   :2013   Mean   :2013-09-15  
                    3rd Qu.:406.2   3rd Qu.:2016   3rd Qu.:2016-06-27  
                    Max.   :583.0   Max.   :2017   Max.   :2017-07-12  
   surveyor        
 Length:1472       
 Class :character  
 Mode  :character  
                   
                   
                   

In [12]:
# list the unique surveyor codes present in the data
distinct(df_gridVeg, surveyor)

surveyor
<chr>
EAR
MAW
MAE
PAW
RDW
JCR
TAH
MPK
RDW
RAD


In [13]:
# surveyor spellings that deviate from the upper-case codes
values <- c("Med", "med")

# show the rows that carry one of those spellings
filter(df_gridVeg, is.element(surveyor, values))

survey_ID,grid_point,year,date,surveyor
<chr>,<int>,<int>,<date>,<chr>
93FE2753-53F1-4525-8C0B-391CF7DDD8DD,80,2016,2016-05-23,Med
C6C3B9F6-EA22-40C8-80CA-AA41151681B0,104,2016,2016-06-02,Med
9820BB50-8B2D-4CEB-ABBB-F920D39C5F49,112,2016,2016-05-16,med
9F3E5065-5B34-401C-8BA5-5BCCD3B597AC,271,2016,2016-05-12,med
0ADBB23B-6019-460A-A03C-C207752A1B44,309,2016,2016-05-17,med
4FCB1E58-7B13-47C5-BC0F-63A029F73E5A,322,2016,2016-05-11,med
2DADFDB5-9B4D-4E61-9ABD-E7E84FD942AA,384,2016,2016-05-26,Med


# Tidy


In [14]:
# Standardize surveyor values.
# 1. Strip leading/trailing whitespace: distinct(surveyor) lists "RDW" twice,
#    which points to entries that differ only by invisible padding
#    (TODO confirm against the source data).
# 2. Collapse the "Med"/"med" spellings into "MED" so they match the
#    upper-case form of the other surveyor codes.
values <- c("Med", "med")

df_gridVeg <- df_gridVeg %>%
  mutate(
    surveyor = str_trim(surveyor),
    surveyor = if_else(surveyor %in% values, "MED", surveyor)
  )

In [15]:
# re-list the unique surveyor codes after standardization
distinct(df_gridVeg, surveyor)

surveyor
<chr>
EAR
MAW
MAE
PAW
RDW
JCR
TAH
MPK
RDW
RAD


# Update


2020-06-18: add `survey_sequence` variable

This variable allows us to collapse records from 2011 and 2012 into a single year for analysis and graphics. Surveys in those two years together constitute the baseline survey.

Create this new variable from `year`: recode 2011 and 2012 to "2011-12" and leave all other years unchanged. The new variable `survey_sequence` takes values in {2010, 2011-12, 2013, 2015, 2016, 2017}.

In [27]:
# Derive survey_sequence from year: 2011 and 2012 share the label "2011-12";
# every other year is kept as its own character label.
baseline_years <- c(2011L, 2012L)

df_gridVeg <- df_gridVeg %>%
  mutate(
    survey_sequence = if_else(
      year %in% baseline_years,
      "2011-12",
      as.character(year)
    )
  )

# Output

In [28]:
# updated output by esamsoe on 2020-06-18
# Write the wrangled table to a CSV file (relative path, so it lands in the
# current working directory).
# The destination is passed positionally: the argument is named `path` in
# readr 1.3.x but was renamed to `file` (with `path` deprecated) in
# readr 1.4.0, so a positional argument works with both.
output <- "2020-06-15_gridVeg_survey_metadata_WRANGLE.csv"
write_csv(df_gridVeg, output)