<a href="https://colab.research.google.com/github/samsoe/mpg_notebooks/blob/master/gridVeg_shrub_tree_wrangle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Documentation
[Readme - vegetation point transect survey](https://docs.google.com/document/d/1JWnhxNjeSQZkSnGhtHP68i_l1mDj4vPFMBdUvGqN0TA/edit?usp=sharing)

# Tools

In [8]:
library(tidyverse)

# Source

In [9]:
# Google Drive direct-download URL for the source CSV; the linked file is
# recorded as:
# 2020-06-15_gridVeg_shrub_tree_SOURCE.csv
src <- "https://drive.google.com/uc?export=download&id=1FYgqp4Q90rC_6BcS5opXX4nEbYHg3r3q"

In [10]:
# Read the CSV at `src` into df_ts. No col_types are supplied, so readr
# guesses each column's type and prints the specification it chose.
df_ts <- read_csv(src)

Parsed with column specification:
cols(
  `__kp_Survey` = col_character(),
  `_kf_Site` = col_double(),
  SurveyDate = col_character(),
  SurveyYear = col_double(),
  ShrubCvr = col_double(),
  ShrubDistribution = col_character(),
  ShrubMinHt = col_double(),
  ShrubCharHt = col_double(),
  ShrubMaxHt = col_double(),
  TreeMinHt = col_double(),
  TreeCharHt = col_double(),
  TreeMaxHt = col_double(),
  TreeMinDBH = col_double(),
  TreeCharDBH = col_double(),
  TreeMaxDBH = col_double(),
  CanopyDensity = col_double()
)



# Wrangle

In [11]:
# Row and column counts of the imported table
dim(df_ts)

In [12]:
# Per-column summary of the imported table
summary(df_ts)

 __kp_Survey           _kf_Site      SurveyDate          SurveyYear  
 Length:752         Min.   :  1.0   Length:752         Min.   :2011  
 Class :character   1st Qu.:154.0   Class :character   1st Qu.:2012  
 Mode  :character   Median :378.5   Mode  :character   Median :2016  
                    Mean   :325.5                      Mean   :2014  
                    3rd Qu.:475.0                      3rd Qu.:2016  
                    Max.   :581.0                      Max.   :2017  
                                                                     
    ShrubCvr     ShrubDistribution    ShrubMinHt     ShrubCharHt   
 Min.   :1.000   Length:752         Min.   :1.000   Min.   :1.000  
 1st Qu.:3.000   Class :character   1st Qu.:1.000   1st Qu.:2.000  
 Median :4.000   Mode  :character   Median :1.000   Median :2.000  
 Mean   :3.652                      Mean   :1.094   Mean   :2.052  
 3rd Qu.:5.000                      3rd Qu.:1.000   3rd Qu.:2.000  
 Max.   :5.000                  

## Rename Columns

In [13]:
# Column names as imported, before any renaming
names(df_ts)

In [14]:
# Column headers as they appear in the source CSV (oldnames) and the names
# used from here on (newnames). The two vectors are paired by position:
# newnames[i] replaces oldnames[i].
oldnames <- c("__kp_Survey", "_kf_Site", "SurveyDate", "SurveyYear",
              "ShrubCvr", "ShrubDistribution", "ShrubMinHt", "ShrubCharHt",
              "ShrubMaxHt", "TreeMinHt", "TreeCharHt", "TreeMaxHt",
              "TreeMinDBH", "TreeCharDBH", "TreeMaxDBH", "CanopyDensity")
newnames <- c("survey_ID", "grid_point", "date", "year",
              "shrub_cover", "shrub_dist", "shrub_ht_min", "shrub_ht_char",
              "shrub_ht_max", "tree_ht_min", "tree_ht_char", "tree_ht_max",
              "tree_dbh_min", "tree_dbh_char", "tree_dbh_max", "canopy_density")

# The pairing is positional, so a length mismatch would mis-pair names.
stopifnot(length(oldnames) == length(newnames))

# rename() expects new = "old" pairs, which setNames() builds from the two
# vectors. all_of() makes the call error if any listed source column is
# missing instead of skipping it.
df_ts <- df_ts %>% rename(all_of(setNames(oldnames, newnames)))

In [17]:
# Inspect column names and types after the rename
glimpse(df_ts)

Rows: 752
Columns: 16
$ survey_ID      <chr> "436", "558", "565", "582", "696", "439", "440", "441"…
$ grid_point     <dbl> 1, 6, 13, 14, 15, 16, 17, 18, 25, 26, 29, 30, 31, 35, …
$ date           <chr> "7/26/11", "7/19/11", "7/28/11", "7/21/11", "7/21/11",…
$ year           <dbl> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, …
$ shrub_cover    <dbl> 2, 5, 4, 3, 3, 3, 5, 5, 4, 3, 4, 3, 4, 3, 4, 5, 2, 4, …
$ shrub_dist     <chr> "Scattered", "Scattered", "Scattered", "Scattered", "S…
$ shrub_ht_min   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, …
$ shrub_ht_char  <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, …
$ shrub_ht_max   <dbl> 2, 3, 3, 3, 2, 3, 2, 2, 2, 2, 3, 3, 3, 3, 2, 3, 2, 3, …
$ tree_ht_min    <dbl> 2, NA, NA, NA, NA, 1, 1, 3, 3, NA, NA, NA, NA, NA, NA,

## Set Data Types

In [20]:
# Convert the whole-number columns from double to integer and parse the
# survey date, which arrives as text such as "7/26/11"
# (month/day/two-digit year).
df_ts <- df_ts %>%
  mutate(
    grid_point = as.integer(grid_point),
    date = as.Date(date, format = "%m/%d/%y"),
    year = as.integer(year),
    shrub_cover = as.integer(shrub_cover),
    shrub_ht_min = as.integer(shrub_ht_min),
    shrub_ht_char = as.integer(shrub_ht_char),
    shrub_ht_max = as.integer(shrub_ht_max),
    tree_ht_min = as.integer(tree_ht_min),
    tree_ht_char = as.integer(tree_ht_char),
    tree_ht_max = as.integer(tree_ht_max),
    tree_dbh_min = as.integer(tree_dbh_min),
    tree_dbh_char = as.integer(tree_dbh_char),
    tree_dbh_max = as.integer(tree_dbh_max)
  )

In [21]:
# Inspect column types after the conversions
glimpse(df_ts)

Rows: 752
Columns: 16
$ survey_ID      <chr> "436", "558", "565", "582", "696", "439", "440", "441"…
$ grid_point     <int> 1, 6, 13, 14, 15, 16, 17, 18, 25, 26, 29, 30, 31, 35, …
$ date           <date> 2011-07-26, 2011-07-19, 2011-07-28, 2011-07-21, 2011-…
$ year           <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, …
$ shrub_cover    <int> 2, 5, 4, 3, 3, 3, 5, 5, 4, 3, 4, 3, 4, 3, 4, 5, 2, 4, …
$ shrub_dist     <chr> "Scattered", "Scattered", "Scattered", "Scattered", "S…
$ shrub_ht_min   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, …
$ shrub_ht_char  <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, …
$ shrub_ht_max   <int> 2, 3, 3, 3, 2, 3, 2, 2, 2, 2, 3, 3, 3, 3, 2, 3, 2, 3, …
$ tree_ht_min    <int> 2, NA, NA, NA, NA, 1, 1, 3, 3, NA, NA, NA, NA, NA, NA,

## Explore

Each column was reviewed with `distinct()` and `arrange()` to look for outlier values. The cell below shows the check for `canopy_density`.

In [None]:
# Unique canopy_density values in sorted order, for spotting outliers by eye
arrange(distinct(df_ts, canopy_density), canopy_density)

# Output

In [43]:
# Write the wrangled table to a CSV at a path relative to the working
# directory. The destination is passed positionally because readr 1.4.0
# renamed write_csv()'s second argument from `path` to `file`; naming it
# `path` raises a deprecation warning on newer readr, while the positional
# form works on both.
output <- "gridVeg_shrub_tree_WRANGLE.csv"
write_csv(df_ts, output)