<a href="https://colab.research.google.com/github/samsoe/mpg_notebooks/blob/master/yvp_groundCover_cover_WRANGLE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Documentation

[Readme fixed plot vegetation data](https://docs.google.com/document/d/16-Aq8u9Rudd78fSzfjvpCXyQgE-BstC-d2PjYfmLtcw/edit?usp=sharing)

# Security

* The user must load a `json` file containing the BigQuery API key into the local directory `/content/...`
* The user must have a Google Maps API key to enable mapping. 
   * CAUTION make sure the key is deleted from the current instance of the notebook before sharing

# Tools

In [None]:
library(tidyverse)

* Remember that the file containing authorization keys for Big Query must be loaded into the virutual envrionment manually.

In [2]:
install.packages("bigrquery")
library(bigrquery)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘bit’, ‘bit64’, ‘gargle’, ‘rapidjsonr’




# Source

## Database Connection

In [3]:
# BigQuery API Key
bq_auth(path = "/content/mpg-data-warehouse-api_key-master.json")

In [4]:
Sys.setenv(BIGQUERY_TEST_PROJECT = "mpg-data-warehouse")

In [5]:
billing <- bq_test_project()

### yvp_ground_cover

In [20]:
sql_ground_cover <- "
  SELECT
    CONCAT(plot_code, \" \", date) AS survey_code,
    plot_code,
    SUBSTR(SAFE_CAST(date AS STRING), 0, 4) AS year,
    plot_loc,
    plot_rep,
    plot_num,
    subplot,
    groundcover_type,
    groundcover_pct
  FROM
    `mpg-data-warehouse.vegetation_fixed_plot_yvp.yvp_ground_cover`
"

In [24]:
bq_ground_cover <- bq_project_query(billing, sql_ground_cover)

In [25]:
tb_ground_cover <- bq_table_download(bq_ground_cover)

In [64]:
df_groundcover <- as.data.frame(tb_ground_cover)

### location_position_classification

In [14]:
sql_location_class <- "
  SELECT
    grid_point,
    aspect_mean_deg,
    elevation_mean_m,
    slope_mean_deg,
    cover_type_2016_gridVeg,
    type3_vegetation_indicators,
    type4_indicators_history
  FROM
    `mpg-data-warehouse.grid_point_summaries.location_position_classification`
"

In [15]:
bq_location_class <- bq_project_query(billing, sql_location_class)

In [16]:
tb_location_class <- bq_table_download(bq_location_class)

In [17]:
df_location_class <- as.data.frame(tb_location_class)

# Wrangle

## Complete

In [67]:
df_groundcover_complete <- df_groundcover %>%
  complete(survey_code,
           nesting(groundcover_type, subplot),
           fill = list(groundcover_pct = 0)) %>%
  group_by(survey_code, subplot) %>%
  select(survey_code, groundcover_type, subplot, groundcover_pct) %>%
  arrange(survey_code, subplot) %>% glimpse()

Rows: 14,080
Columns: 4
Groups: survey_code, subplot [1,760]
$ survey_code      [3m[90m<chr>[39m[23m "YVP 10 2017-06-09", "YVP 10 2017-06-09", "YVP 10 20…
$ groundcover_type [3m[90m<chr>[39m[23m "BG", "BV", "G", "L", "LIC", "M", "R", "WD", "BG", "…
$ subplot          [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3…
$ groundcover_pct  [3m[90m<dbl>[39m[23m 1, 80, 1, 10, 4, 5, 0, 0, 2, 70, 0, 15, 5, 10, 0, 0,…


## Generate Variables

### groundcover_pct_avg

The ground cover data only need to be summarized in one dimension. This makes processing a little easier than it was for the plant cover data. To process the data, sum the subplot-level ground cover values across each transect for each survey, and divide the sum by 10 to create an average for that ground cover type. Confirm that each value of ground cover possible is present in each survey. Confirm that the number of rows produced makes sense based on the source data (number of surveys * number of possible ground cover types). Join in metadata to produce the final view, as instructed in the table below. 

In [117]:
df_groundcover_sum <- 
df_groundcover_complete %>%
  group_by(survey_code, groundcover_type) %>%
  summarise(groundcover_sum = sum(groundcover_pct)) %>% ungroup() %>%
  glimpse()

`summarise()` regrouping output by 'survey_code' (override with `.groups` argument)



Rows: 1,408
Columns: 3
$ survey_code      [3m[90m<chr>[39m[23m "YVP 10 2017-06-09", "YVP 10 2017-06-09", "YVP 10 20…
$ groundcover_type [3m[90m<chr>[39m[23m "BG", "BV", "G", "L", "LIC", "M", "R", "WD", "BG", "…
$ groundcover_sum  [3m[90m<dbl>[39m[23m 24, 665, 11, 155, 74, 100, 0, 0, 26, 505, 1, 250, 11…


In [121]:
df_groundcover_avg <- df_groundcover_sum %>%
  group_by(survey_code, groundcover_type) %>%
  summarise(groundcover_pct_avg = groundcover_sum / 10) %>% ungroup() %>% 
  glimpse()

`summarise()` regrouping output by 'survey_code' (override with `.groups` argument)



Rows: 1,408
Columns: 3
$ survey_code         [3m[90m<chr>[39m[23m "YVP 10 2017-06-09", "YVP 10 2017-06-09", "YVP 10…
$ groundcover_type    [3m[90m<chr>[39m[23m "BG", "BV", "G", "L", "LIC", "M", "R", "WD", "BG"…
$ groundcover_pct_avg [3m[90m<dbl>[39m[23m 2.4, 66.5, 1.1, 15.5, 7.4, 10.0, 0.0, 0.0, 2.6, 5…


### plot_code

In [122]:
# bring plot_code back in by removing date
df_plot_code <- df_groundcover_avg %>%
  mutate(plot_code = substring(survey_code, 1,nchar(survey_code)-11)) %>% glimpse()

Rows: 1,408
Columns: 4
$ survey_code         [3m[90m<chr>[39m[23m "YVP 10 2017-06-09", "YVP 10 2017-06-09", "YVP 10…
$ groundcover_type    [3m[90m<chr>[39m[23m "BG", "BV", "G", "L", "LIC", "M", "R", "WD", "BG"…
$ groundcover_pct_avg [3m[90m<dbl>[39m[23m 2.4, 66.5, 1.1, 15.5, 7.4, 10.0, 0.0, 0.0, 2.6, 5…
$ plot_code           [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10",…


### plot_loc

In [123]:
# detect "N" in 'plot_code' and write to new column 'plot_loc'
# df_plot_loc <- 
df_plot_loc <- df_plot_code %>%
  mutate(plot_loc = ifelse(str_detect(plot_code, "N"), "N", NA)) %>% glimpse()

Rows: 1,408
Columns: 5
$ survey_code         [3m[90m<chr>[39m[23m "YVP 10 2017-06-09", "YVP 10 2017-06-09", "YVP 10…
$ groundcover_type    [3m[90m<chr>[39m[23m "BG", "BV", "G", "L", "LIC", "M", "R", "WD", "BG"…
$ groundcover_pct_avg [3m[90m<dbl>[39m[23m 2.4, 66.5, 1.1, 15.5, 7.4, 10.0, 0.0, 0.0, 2.6, 5…
$ plot_code           [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10",…
$ plot_loc            [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…


## plot_rep

In [130]:
# detect "A", "B", "C" characters in plot_code and if present write to 'plot_rep'
df_plot_rep <- df_plot_loc %>%
  mutate(plot_rep = case_when(str_detect(plot_code, "A")~"A",
                              str_detect(plot_code, "B")~"B",
                              str_detect(plot_code, "C")~"C")) %>% glimpse()

Rows: 1,408
Columns: 6
$ survey_code         [3m[90m<chr>[39m[23m "YVP 10 2017-06-09", "YVP 10 2017-06-09", "YVP 10…
$ groundcover_type    [3m[90m<chr>[39m[23m "BG", "BV", "G", "L", "LIC", "M", "R", "WD", "BG"…
$ groundcover_pct_avg [3m[90m<dbl>[39m[23m 2.4, 66.5, 1.1, 15.5, 7.4, 10.0, 0.0, 0.0, 2.6, 5…
$ plot_code           [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10",…
$ plot_loc            [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ plot_rep            [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…


### plot_num

In [132]:
# use digital values from 'plot_code' to populate 'plot_num'
# df_grid_point <- 
df_plot_num <- df_plot_rep %>%
  mutate(plot_num = str_extract(plot_code, "[:digit:].*"),
         plot_num = as.integer(plot_num)) %>% glimpse()

Rows: 1,408
Columns: 7
$ survey_code         [3m[90m<chr>[39m[23m "YVP 10 2017-06-09", "YVP 10 2017-06-09", "YVP 10…
$ groundcover_type    [3m[90m<chr>[39m[23m "BG", "BV", "G", "L", "LIC", "M", "R", "WD", "BG"…
$ groundcover_pct_avg [3m[90m<dbl>[39m[23m 2.4, 66.5, 1.1, 15.5, 7.4, 10.0, 0.0, 0.0, 2.6, 5…
$ plot_code           [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10",…
$ plot_loc            [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ plot_rep            [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ plot_num            [3m[90m<int>[39m[23m 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1…


### year

In [133]:
 df_year <- df_plot_num %>%
  mutate(year = substring(survey_code, nchar(survey_code)-9, nchar(survey_code)-6)) %>% glimpse()

Rows: 1,408
Columns: 8
$ survey_code         [3m[90m<chr>[39m[23m "YVP 10 2017-06-09", "YVP 10 2017-06-09", "YVP 10…
$ groundcover_type    [3m[90m<chr>[39m[23m "BG", "BV", "G", "L", "LIC", "M", "R", "WD", "BG"…
$ groundcover_pct_avg [3m[90m<dbl>[39m[23m 2.4, 66.5, 1.1, 15.5, 7.4, 10.0, 0.0, 0.0, 2.6, 5…
$ plot_code           [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10",…
$ plot_loc            [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ plot_rep            [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ plot_num            [3m[90m<int>[39m[23m 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1…
$ year                [3m[90m<chr>[39m[23m "2017", "2017", "2017", "2017", "2017", "2017", "…


## Join

In [134]:
df_year %>% glimpse()

Rows: 1,408
Columns: 8
$ survey_code         [3m[90m<chr>[39m[23m "YVP 10 2017-06-09", "YVP 10 2017-06-09", "YVP 10…
$ groundcover_type    [3m[90m<chr>[39m[23m "BG", "BV", "G", "L", "LIC", "M", "R", "WD", "BG"…
$ groundcover_pct_avg [3m[90m<dbl>[39m[23m 2.4, 66.5, 1.1, 15.5, 7.4, 10.0, 0.0, 0.0, 2.6, 5…
$ plot_code           [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "YVP 10",…
$ plot_loc            [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ plot_rep            [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ plot_num            [3m[90m<int>[39m[23m 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1…
$ year                [3m[90m<chr>[39m[23m "2017", "2017", "2017", "2017", "2017", "2017", "…


In [135]:
df_location_class %>% glimpse()

Rows: 582
Columns: 7
$ grid_point                  [3m[90m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13…
$ aspect_mean_deg             [3m[90m<dbl>[39m[23m 334.7050, 45.3030, 221.3340, 290.4890, 28…
$ elevation_mean_m            [3m[90m<dbl>[39m[23m 1395.64, 1456.09, 1126.90, 1166.33, 1179.…
$ slope_mean_deg              [3m[90m<dbl>[39m[23m 28.44230, 12.22630, 4.25130, 2.68361, 4.2…
$ cover_type_2016_gridVeg     [3m[90m<chr>[39m[23m "woodland/forest", "non-irrigated grassla…
$ type3_vegetation_indicators [3m[90m<chr>[39m[23m "mixed canopy conifer", "uncultivated gra…
$ type4_indicators_history    [3m[90m<chr>[39m[23m "mixed canopy conifer", "uncultivated gra…


In [136]:
df_join <- df_year %>%
  left_join(df_location_class, by = c("plot_num" = "grid_point")) %>% glimpse()

Rows: 1,408
Columns: 14
$ survey_code                 [3m[90m<chr>[39m[23m "YVP 10 2017-06-09", "YVP 10 2017-06-09",…
$ groundcover_type            [3m[90m<chr>[39m[23m "BG", "BV", "G", "L", "LIC", "M", "R", "W…
$ groundcover_pct_avg         [3m[90m<dbl>[39m[23m 2.4, 66.5, 1.1, 15.5, 7.4, 10.0, 0.0, 0.0…
$ plot_code                   [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "…
$ plot_loc                    [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ plot_rep                    [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ plot_num                    [3m[90m<int>[39m[23m 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1…
$ year                        [3m[90m<chr>[39m[23m "2017", "2017", "2017", "2017", "2017", "…
$ aspect_mean_deg             [3m[90m<dbl>[39m[23m 18.8095, 18.8095, 18.8095, 18.8095, 18.80…
$ elevation_mean_m            [3m[90m<dbl>[39m[23m 1146.9, 1146.9, 1146.9, 1146.9, 1146.9,

## Organise Columns

In [142]:
df_reorder <- df_join[c("survey_code","plot_code","year","plot_loc","plot_rep",
                         "plot_num","aspect_mean_deg","elevation_mean_m",
                         "slope_mean_deg","cover_type_2016_gridVeg","type3_vegetation_indicators",
                         "type4_indicators_history","groundcover_type","groundcover_pct_avg")] %>% glimpse()

Rows: 1,408
Columns: 14
$ survey_code                 [3m[90m<chr>[39m[23m "YVP 10 2017-06-09", "YVP 10 2017-06-09",…
$ plot_code                   [3m[90m<chr>[39m[23m "YVP 10", "YVP 10", "YVP 10", "YVP 10", "…
$ year                        [3m[90m<chr>[39m[23m "2017", "2017", "2017", "2017", "2017", "…
$ plot_loc                    [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ plot_rep                    [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ plot_num                    [3m[90m<int>[39m[23m 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1…
$ aspect_mean_deg             [3m[90m<dbl>[39m[23m 18.8095, 18.8095, 18.8095, 18.8095, 18.80…
$ elevation_mean_m            [3m[90m<dbl>[39m[23m 1146.9, 1146.9, 1146.9, 1146.9, 1146.9, 1…
$ slope_mean_deg              [3m[90m<dbl>[39m[23m 20.7940, 20.7940, 20.7940, 20.7940, 20.79…
$ cover_type_2016_gridVeg     [3m[90m<chr>[39m[23m "non-irrigated grasslands", "non-irriga

# Output

In [143]:
df_yvp_groundCover_cover <- df_reorder

In [144]:
write_csv(df_yvp_groundCover_cover, path = "yvp_groundCover_cover-WRANGLE.csv")