<a href="https://colab.research.google.com/github/samsoe/mpg_notebooks/blob/master/gridVeg_plant_functional_groups_WRANGLE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Security

* The user must load a `json` file containing the BigQuery API key into the local directory `/content/...`
* The user must have a Google Maps API key to enable mapping. 
   * CAUTION: make sure the key is deleted from the current instance of the notebook before sharing.

# Tools

In [None]:
library(tidyverse)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 1.0.0
[32m✔[39m [34mtidyr  [39m 1.1.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



* Remember that the file containing authorization keys for BigQuery must be loaded into the virtual environment manually.

In [None]:
# Install bigrquery only when it is missing, so re-running this cell in a
# session that already has the package does not trigger a reinstall.
if (!requireNamespace("bigrquery", quietly = TRUE)) {
  install.packages("bigrquery")
}
library(bigrquery)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘bit’, ‘bit64’, ‘gargle’, ‘rapidjsonr’




# Source

## Database Connection

In [None]:
# BigQuery API Key
# The key file has to be uploaded to the runtime by hand; fail early with a
# readable message when it is missing instead of a lower-level auth error.
bq_key_path <- "/content/mpg-data-warehouse-api_key-master.json"
if (!file.exists(bq_key_path)) {
  stop(
    "BigQuery key file not found at '", bq_key_path,
    "'. Upload it before running this cell.",
    call. = FALSE
  )
}
bq_auth(path = bq_key_path)

In [None]:
# Set the environment variable that names the BigQuery project; presumably
# this is what bq_test_project() picks up in the following cell.
Sys.setenv(BIGQUERY_TEST_PROJECT = "mpg-data-warehouse")

In [None]:
# Project identifier passed as the first argument to every bq_project_query()
# call in this notebook.
billing <- bq_test_project()

## Database Query

### gridVeg_plant_intercepts

In [None]:
# SQL text selecting the per-plant intercept columns used by the wrangle steps
# below from the gridVeg_plant_intercepts table.
sql_plant_intercepts <- 
"
  SELECT 
    survey_ID,
    grid_point,
    key_plant_code,
    plant_native_status,
    plant_life_cycle,
    plant_life_form,
    intercepts_pct
  FROM 
    `mpg-data-warehouse.vegetation_gridVeg_summaries.gridVeg_plant_intercepts`
"

In [None]:
# Run the plant-intercepts query against the `billing` project.
bq_plant_intercepts <- bq_project_query(billing, sql_plant_intercepts)

In [None]:
# Download the query result into the R session.
tb_plant_intercepts <- bq_table_download(bq_plant_intercepts)

In [None]:
# Convert the downloaded result to a base data.frame.
df_plant_intercepts <- as.data.frame(tb_plant_intercepts)

In [None]:
# Quick look at the first four intercept records.
df_plant_intercepts %>%
  head(n = 4)

Unnamed: 0_level_0,survey_ID,grid_point,key_plant_code,plant_native_status,plant_life_cycle,plant_life_form,intercepts_pct
Unnamed: 0_level_1,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<dbl>
1,436,1,HEUCYL,native,perennial,forb,2.5
2,436,1,ALLCER,native,perennial,forb,0.5
3,436,1,GEUTRI,native,perennial,forb,1.0
4,436,1,ERIG_SP,native,unknown,forb,0.5


### gridVeg_survey_metadata

In [None]:
# SQL text selecting survey_ID, year and survey_sequence from the
# gridVeg_survey_metadata table. The fully qualified table name is
# backtick-quoted because the project id contains hyphens, matching the other
# queries in this notebook.
sql_survey_metadata <- "
  SELECT
    survey_ID,
    year,
    survey_sequence
  FROM
    `mpg-data-warehouse.vegetation_point_intercept_gridVeg.gridVeg_survey_metadata`
"

In [None]:
# Run the survey-metadata query against the `billing` project.
bq_survey_metadata <- bq_project_query(billing, sql_survey_metadata)

In [None]:
# Download the query result into the R session.
tb_survey_metadata <- bq_table_download(bq_survey_metadata)

In [None]:
# Convert the downloaded result to a base data.frame.
df_survey_metadata <- as.data.frame(tb_survey_metadata)

### location_position_classification

In [None]:
# SQL text selecting the per-grid_point terrain and classification columns
# from the location_position_classification table.
sql_position_class <- "
  SELECT
    grid_point,
    aspect_mean_deg,
    elevation_mean_m,
    slope_mean_deg,
    cover_type_2016_gridVeg,
    type3_vegetation_indicators,
    type4_indicators_history
  FROM
    `mpg-data-warehouse.grid_point_summaries.location_position_classification` 
"

In [None]:
# Run the position-classification query against the `billing` project.
bq_position_class <- bq_project_query(billing, sql_position_class)

In [None]:
# Download the query result into the R session.
tb_position_class <- bq_table_download(bq_position_class)

In [None]:
# Convert the downloaded result to a base data.frame.
df_position_class <- as.data.frame(tb_position_class)

# Wrangle

Start with the view vegetation_gridVeg_summaries:gridVeg_plant_intercepts. Remove records where key_plant_code = “NV” (corresponds with key_plant_species = 360).

## Explore key_plant_code "NV"

In [None]:
# Show the records carrying the "NV" plant code before they are removed.
filter(df_plant_intercepts, key_plant_code == "NV")

## Remove key_plant_code "NV"

In [None]:
# Start the functional-group table from every intercept record whose plant
# code is not "NV".
df_plant_functional_groups <- filter(
  df_plant_intercepts,
  key_plant_code != "NV"
)

## Recode

Recode the levels of plant_life_cycle to simplify them (re-coded values are supplied in the Readme).


In [None]:
# Life-cycle labels that combine more than one category; each is collapsed
# into the single level "multiple". All other values are left unchanged.
mixed_life_cycles <- c(
  "biennial perennial",
  "annual perennial",
  "annual biennial perennial",
  "annual biennial"
)

# `%in%` replaces the chained `==` / `|` comparisons: it states the set
# membership directly and never yields NA for the test itself (NA inputs
# still come through as NA because the "else" branch returns the original
# value).
df_plant_functional_groups <- df_plant_functional_groups %>%
  mutate(
    plant_life_cycle = ifelse(
      plant_life_cycle %in% mixed_life_cycles,
      "multiple",
      plant_life_cycle
    )
  )

## Calculate Detection Rate

Group the data on {survey_ID, plant_native_status, plant_life_cycle, plant_life_form}. Sum the intercepts_pct within each group, and divide the sums by 4 to obtain the detection_rate per 100 intercepts possible. 

In [None]:
# Preview of the detection-rate calculation for a single survey ("436").
# The group sum is computed once and reused for detection_rate (sum / 4), and
# `.groups = "drop"` returns an ungrouped result explicitly instead of relying
# on summarise()'s default regrouping followed by ungroup().
df_plant_functional_groups %>%
  group_by(survey_ID, plant_native_status, plant_life_cycle, plant_life_form) %>%
  summarise(
    intercepts_pct_sum = sum(intercepts_pct),
    detection_rate = intercepts_pct_sum / 4,
    .groups = "drop"
  ) %>%
  filter(survey_ID == "436")

`summarise()` regrouping output by 'survey_ID', 'plant_native_status', 'plant_life_cycle' (override with `.groups` argument)



survey_ID,plant_native_status,plant_life_cycle,plant_life_form,intercepts_pct_sum,detection_rate
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
436,native,annual,forb,3.5,0.875
436,native,multiple,forb,1.0,0.25
436,native,perennial,forb,8.5,2.125
436,native,perennial,graminoid,44.5,11.125
436,native,perennial,shrub,10.5,2.625
436,native,perennial,tree,2.0,0.5
436,native,unknown,forb,0.5,0.125
436,nonnative,annual,forb,1.5,0.375
436,nonnative,annual,graminoid,5.5,1.375
436,nonnative,multiple,forb,1.0,0.25


In [None]:
# Collapse to one row per survey and plant functional group.
# intercepts_pct_sum is the summed intercepts_pct within the group, and
# detection_rate is that sum divided by 4. The sum is computed once and
# reused; `.groups = "drop"` returns an ungrouped result explicitly instead of
# relying on summarise()'s default regrouping followed by ungroup().
df_plant_functional_groups <- df_plant_functional_groups %>%
  group_by(survey_ID, plant_native_status, plant_life_cycle, plant_life_form) %>%
  summarise(
    intercepts_pct_sum = sum(intercepts_pct),
    detection_rate = intercepts_pct_sum / 4,
    .groups = "drop"
  )

`summarise()` regrouping output by 'survey_ID', 'plant_native_status', 'plant_life_cycle' (override with `.groups` argument)



In [None]:
# Inspect the column types and first values of the summarised table.
glimpse(df_plant_functional_groups)

Rows: 9,015
Columns: 6
$ survey_ID           [3m[90m<chr>[39m[23m "012C5FAD-2451-41B0-9E2F-432D1ECEB55C", "012C5FAD…
$ plant_native_status [3m[90m<chr>[39m[23m "native", "native", "native", "native", "native",…
$ plant_life_cycle    [3m[90m<chr>[39m[23m "annual", "multiple", "perennial", "perennial", "…
$ plant_life_form     [3m[90m<chr>[39m[23m "forb", "forb", "forb", "graminoid", "shrub", "fo…
$ intercepts_pct_sum  [3m[90m<dbl>[39m[23m 6.5, 1.0, 14.5, 5.0, 8.5, 5.0, 46.0, 15.0, 25.0, …
$ detection_rate      [3m[90m<dbl>[39m[23m 1.625, 0.250, 3.625, 1.250, 2.125, 1.250, 11.500,…


Then, make sure all combinations of functional groups that are found in the data are represented in each survey_ID. For those groups which were not detected at a survey_ID, fill the detection_rate with 0. This will complete the data set and make averages and other statistical comparisons more meaningful.

The number of records produced in the final dataset should be predictable from the number of surveys and the possible combinations of plant functional groups (pfg). With 1242 surveys and 25 pfg, we should end up with 31,050 records in the final data set.

In [None]:
# Number of surveys
length(unique(df_plant_functional_groups$survey_ID))

In [None]:
# Possible combinations of pfg
# distinct() with column arguments keeps only those columns, so the separate
# select() step is folded into it.
df_plant_functional_groups %>%
  distinct(plant_native_status, plant_life_cycle, plant_life_form) %>%
  arrange(plant_native_status, plant_life_cycle)

## Group Fill
Indeed, 31,050 records are produced. 

In [None]:
# Expand the table so that every survey_ID has a row for every functional
# group combination that occurs in the data; rows created by the expansion get
# 0 for both measures.
df_plant_functional_groups <- complete(
  df_plant_functional_groups,
  survey_ID,
  nesting(plant_native_status, plant_life_cycle, plant_life_form),
  fill = list(intercepts_pct_sum = 0, detection_rate = 0)
)
glimpse(df_plant_functional_groups)

Rows: 31,050
Columns: 9
$ survey_ID           [3m[90m<chr>[39m[23m "012C5FAD-2451-41B0-9E2F-432D1ECEB55C", "012C5FAD…
$ plant_native_status [3m[90m<chr>[39m[23m "native", "native", "native", "native", "native",…
$ plant_life_cycle    [3m[90m<chr>[39m[23m "annual", "annual", "biennial", "multiple", "mult…
$ plant_life_form     [3m[90m<chr>[39m[23m "forb", "graminoid", "forb", "forb", "graminoid",…
$ intercepts_pct_sum  [3m[90m<dbl>[39m[23m 6.5, 0.0, 0.0, 1.0, 0.0, 14.5, 5.0, 8.5, 0.0, 0.0…
$ detection_rate      [3m[90m<dbl>[39m[23m 1.625, 0.000, 0.000, 0.250, 0.000, 3.625, 1.250, …
$ grid_point          [3m[90m<int>[39m[23m 285, 285, 285, 285, 285, 285, 285, 285, 285, 285,…
$ year                [3m[90m<int>[39m[23m 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2…
$ survey_sequence     [3m[90m<chr>[39m[23m "2016", "2016", "2016", "2016", "2016", "2016", "…


In [None]:
# First rows of the completed table.
head(df_plant_functional_groups)

survey_ID,plant_native_status,plant_life_cycle,plant_life_form,intercepts_pct_sum,detection_rate,grid_point,year,survey_sequence
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<chr>
012C5FAD-2451-41B0-9E2F-432D1ECEB55C,native,annual,forb,6.5,1.625,285,2016,2016
012C5FAD-2451-41B0-9E2F-432D1ECEB55C,native,annual,graminoid,0.0,0.0,285,2016,2016
012C5FAD-2451-41B0-9E2F-432D1ECEB55C,native,biennial,forb,0.0,0.0,285,2016,2016
012C5FAD-2451-41B0-9E2F-432D1ECEB55C,native,multiple,forb,1.0,0.25,285,2016,2016
012C5FAD-2451-41B0-9E2F-432D1ECEB55C,native,multiple,graminoid,0.0,0.0,285,2016,2016
012C5FAD-2451-41B0-9E2F-432D1ECEB55C,native,perennial,forb,14.5,3.625,285,2016,2016


## Join Metadata

Finally, join the metadata from requested tables or views and complete the schema below. 

In [None]:
# bring gridpoints back in
# grid_point is not among the grouping columns of the summarise step, so it is
# looked up again from the raw intercept records, one row per survey.
grid_point_ref <- df_plant_intercepts %>%
  distinct(survey_ID, grid_point)

# A survey mapped to more than one grid_point would duplicate rows in the join
# below and break the expected record count, so stop if that ever happens.
stopifnot(anyDuplicated(grid_point_ref$survey_ID) == 0L)

# Name the key explicitly: an implicit natural join silently changes its keys
# whenever the two tables happen to share additional column names.
df_plant_functional_groups <- df_plant_functional_groups %>%
  left_join(grid_point_ref, by = "survey_ID")

Joining, by = c("survey_ID", "grid_point")



In [None]:
# Attach year and survey_sequence to every row. The key is named explicitly so
# the join cannot silently pick up extra shared columns as keys.
df_plant_functional_groups <- df_plant_functional_groups %>%
  left_join(df_survey_metadata, by = "survey_ID") %>%
  glimpse()

Joining, by = c("survey_ID", "year", "survey_sequence")



Rows: 31,050
Columns: 9
$ survey_ID           [3m[90m<chr>[39m[23m "012C5FAD-2451-41B0-9E2F-432D1ECEB55C", "012C5FAD…
$ plant_native_status [3m[90m<chr>[39m[23m "native", "native", "native", "native", "native",…
$ plant_life_cycle    [3m[90m<chr>[39m[23m "annual", "annual", "biennial", "multiple", "mult…
$ plant_life_form     [3m[90m<chr>[39m[23m "forb", "graminoid", "forb", "forb", "graminoid",…
$ intercepts_pct_sum  [3m[90m<dbl>[39m[23m 6.5, 0.0, 0.0, 1.0, 0.0, 14.5, 5.0, 8.5, 0.0, 0.0…
$ detection_rate      [3m[90m<dbl>[39m[23m 1.625, 0.000, 0.000, 0.250, 0.000, 3.625, 1.250, …
$ grid_point          [3m[90m<int>[39m[23m 285, 285, 285, 285, 285, 285, 285, 285, 285, 285,…
$ year                [3m[90m<int>[39m[23m 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2…
$ survey_sequence     [3m[90m<chr>[39m[23m "2016", "2016", "2016", "2016", "2016", "2016", "…


In [None]:
# Attach the grid-point terrain and classification columns. The key is named
# explicitly, consistent with the other joins, which also silences the
# "Joining, by = ..." message.
df_plant_functional_groups <- df_plant_functional_groups %>%
  left_join(df_position_class, by = "grid_point") %>%
  glimpse()

Joining, by = "grid_point"



Rows: 31,050
Columns: 15
$ survey_ID                   [3m[90m<chr>[39m[23m "012C5FAD-2451-41B0-9E2F-432D1ECEB55C", "…
$ plant_native_status         [3m[90m<chr>[39m[23m "native", "native", "native", "native", "…
$ plant_life_cycle            [3m[90m<chr>[39m[23m "annual", "annual", "biennial", "multiple…
$ plant_life_form             [3m[90m<chr>[39m[23m "forb", "graminoid", "forb", "forb", "gra…
$ intercepts_pct_sum          [3m[90m<dbl>[39m[23m 6.5, 0.0, 0.0, 1.0, 0.0, 14.5, 5.0, 8.5, …
$ detection_rate              [3m[90m<dbl>[39m[23m 1.625, 0.000, 0.000, 0.250, 0.000, 3.625,…
$ grid_point                  [3m[90m<int>[39m[23m 285, 285, 285, 285, 285, 285, 285, 285, 2…
$ year                        [3m[90m<int>[39m[23m 2016, 2016, 2016, 2016, 2016, 2016, 2016,…
$ survey_sequence             [3m[90m<chr>[39m[23m "2016", "2016", "2016", "2016", "2016", "…
$ aspect_mean_deg             [3m[90m<dbl>[39m[23m 138.749, 138.749, 138.749, 138.749, 13

# Output

In [None]:
# Write the final table to the working directory. The destination is passed
# positionally because the second argument was renamed from `path` to `file`
# in readr 1.4.0 (`path` is deprecated there); the positional form works with
# both older and newer readr.
write_csv(
  df_plant_functional_groups,
  "gridVeg_plant_functional_groups-WRANGLE.csv"
)