# Security

* This notebook uses Application Default Credentials (ADC) for BigQuery authentication.
* Authenticate with gcloud before running: `gcloud auth application-default login`
* No API key files needed - authentication is handled through your gcloud session.

# Tools

In [1]:
# Package and library installation
# Names of the packages this notebook attaches.
packages_needed <- c("tidyverse", "knitr")
# TRUE for each entry of packages_needed that is already in the local library.
packages_installed <- packages_needed %in% rownames(installed.packages())

# Install only the packages that are missing.
if (!all(packages_installed)) {
  install.packages(packages_needed[!packages_installed])
}

# seq_along() is safe for a zero-length vector (1:length(x) would give c(1, 0)),
# and TRUE is used instead of T because T can be reassigned.
for (i in seq_along(packages_needed)) {
  library(packages_needed[i], character.only = TRUE)
}

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.1     [32m✔[39m [34mstringr  [39m 1.6.0
[32m✔[39m [34mggplot2  [39m 4.0.0     [32m✔[39m [34mtibble   [39m 3.3.0
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.2.0     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


* Using Application Default Credentials - no manual key file needed if you've run `gcloud auth application-default login`

In [2]:
# Install bigrquery only when it is not already available, so that re-running
# the notebook does not reinstall the package on every execution.
if (!requireNamespace("bigrquery", quietly = TRUE)) {
  install.packages("bigrquery")
}
library(bigrquery)

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



# Source

## Database Connection

In [3]:
# Authenticate this R session with BigQuery.
# Prerequisite (run once in a terminal): gcloud auth application-default login
# With that in place no key file has to be supplied here.
bigrquery::bq_auth()

In [4]:
# Set the BIGQUERY_TEST_PROJECT environment variable for this R session.
# Presumably this is the value bq_test_project() picks up in the next cell;
# confirm against the bigrquery documentation.
Sys.setenv(BIGQUERY_TEST_PROJECT = "mpg-data-warehouse")

In [5]:
# Project identifier passed as the first argument to bq_project_query() in the
# query cells below.
# NOTE(review): bq_test_project() presumably resolves to the
# BIGQUERY_TEST_PROJECT value set in the previous cell; confirm.
billing <- bq_test_project()

## Database Query

### gridVeg_plant_intercepts

In [6]:
# Query text: per-plant intercept percentages and plant traits for every
# survey, read from the gridVeg_foliar_cover_all summary view.
sql_plant_intercepts <-
"
  SELECT
    survey_ID,
    grid_point,
    key_plant_code,
    plant_native_status,
    plant_life_cycle,
    plant_life_form,
    intercepts_pct
  FROM
    `mpg-data-warehouse.vegetation_gridVeg_summaries.gridVeg_foliar_cover_all`
"

# Run the query under the billing project, then download the result.
bq_plant_intercepts <- bq_project_query(billing, sql_plant_intercepts)
tb_plant_intercepts <- bq_table_download(bq_plant_intercepts)

# Keep a plain data.frame copy and print its structure.
df_plant_intercepts <- as.data.frame(tb_plant_intercepts)
glimpse(df_plant_intercepts)

Rows: 28,083
Columns: 7
$ survey_ID           [3m[90m<chr>[39m[23m "436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"…
$ grid_point          [3m[90m<int>[39m[23m 1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m,[39m…
$ key_plant_code      [3m[90m<chr>[39m[23m "ACHMIL"[90m, [39m"SEDLAN"[90m, [39m"ELYTRA"[90m, [39m"VALDIO"[90m, [39m"GALAPA"[90m, [39m…
$ plant_native_status [3m[90m<chr>[39m[23m "native"[90m, [39m"native"[90m, [39m"native"[90m, [39m"native"[90m, [39m"native"[90m, [39m…
$ plant_life_cycle    [3m[90m<chr>[39m[23m "unknown"[90m, [39m"unknown"[90m, [39m"unknown"[90m, [39m"unknown"[90m, [39m"unkno…
$ plant_life_form     [3m[90m<chr>[39m[23m "forb"[90m, [39m"forb"[90m, [39m"graminoid"[90m, 

### gridVeg_survey_metadata

In [7]:
# Query text: survey-level metadata (year, sequence, grid point) keyed by
# survey_ID. The table path is backtick-quoted because the project name
# contains dashes, matching the quoting used in the plant-intercepts query.
sql_survey_metadata <- "
  SELECT
    survey_ID,
    year,
    survey_sequence,
    grid_point
  FROM
    `mpg-data-warehouse.vegetation_point_intercept_gridVeg.gridVeg_survey_metadata`
"
# Run the query under the billing project, download the result, and print the
# structure of the data.frame copy.
bq_survey_metadata <- bq_project_query(billing, sql_survey_metadata)
tb_survey_metadata <- bq_table_download(bq_survey_metadata)
df_survey_metadata <- as.data.frame(tb_survey_metadata) %>% glimpse()

Rows: 1,723
Columns: 4
$ survey_ID       [3m[90m<chr>[39m[23m "138"[90m, [39m"139"[90m, [39m"135"[90m, [39m"134"[90m, [39m"137"[90m, [39m"136"[90m, [39m"141"[90m, [39m"143"…
$ year            [3m[90m<int>[39m[23m 2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m…
$ survey_sequence [3m[90m<chr>[39m[23m "2010"[90m, [39m"2010"[90m, [39m"2010"[90m, [39m"2010"[90m, [39m"2010"[90m, [39m"2010"[90m, [39m"2010"…
$ grid_point      [3m[90m<int>[39m[23m 61[90m, [39m60[90m, [39m52[90m, [39m51[90m, [39m62[90m, [39m44[90m, [39m57[90m, [39m66[90m, [39m65[90m, [39m59[90m, [39m69[90m, [39m76[90m, [39m75[90m, [39m68…


# Wrangle

Start with the view `vegetation_gridVeg_summaries.gridVeg_foliar_cover_all` (queried above into `df_plant_intercepts`). Remove records where key_plant_code = "NV" (corresponds with key_plant_species = 360).

## Explore key_plant_code "NV"
No records should exist with no vegetation (NV) because this is built with the gridVeg_foliar_cover_all data.

In [8]:
# Show the structure of any rows coded "NV"; zero rows are expected here.
glimpse(filter(df_plant_intercepts, key_plant_code == "NV"))

Rows: 0
Columns: 7
$ survey_ID           [3m[90m<chr>[39m[23m 
$ grid_point          [3m[90m<int>[39m[23m 
$ key_plant_code      [3m[90m<chr>[39m[23m 
$ plant_native_status [3m[90m<chr>[39m[23m 
$ plant_life_cycle    [3m[90m<chr>[39m[23m 
$ plant_life_form     [3m[90m<chr>[39m[23m 
$ intercepts_pct      [3m[90m<dbl>[39m[23m 


In [9]:
# Working copy of the intercept data with any "NV" records excluded.
df_plant_functional_groups <- filter(df_plant_intercepts, key_plant_code != "NV")

## Calculate Detection Rate

Group the data on {survey_ID, plant_native_status, plant_life_cycle, plant_life_form}. Sum the intercepts_pct within each group.

In [10]:
# Preview the detection-rate calculation for a single survey ("436").
# survey_ID is one of the grouping keys, so filtering first gives the same
# rows while aggregating far less data. `.groups = "drop"` returns an
# ungrouped result and silences the summarise() grouping message.
df_plant_functional_groups %>%
  filter(survey_ID == "436") %>%
  group_by(survey_ID, plant_native_status, plant_life_cycle, plant_life_form) %>%
  summarise(detection_rate = sum(intercepts_pct), .groups = "drop")

[1m[22m`summarise()` has grouped output by 'survey_ID', 'plant_native_status',
'plant_life_cycle'. You can override using the `.groups` argument.


survey_ID,plant_native_status,plant_life_cycle,plant_life_form,detection_rate
<chr>,<chr>,<chr>,<chr>,<dbl>
436,native,unknown,forb,13.5
436,native,unknown,graminoid,44.5
436,native,unknown,shrub,10.5
436,native,unknown,tree,2.0
436,nonnative,unknown,forb,3.0
436,nonnative,unknown,graminoid,5.5


In [11]:
# Detection rate = sum of intercepts_pct within each
# {survey_ID, plant_native_status, plant_life_cycle, plant_life_form} group.
# `.groups = "drop"` returns an ungrouped result directly, replacing the
# trailing ungroup() and silencing the summarise() grouping message.
df_detection_rate <- df_plant_functional_groups %>%
  group_by(survey_ID, plant_native_status, plant_life_cycle, plant_life_form) %>%
  summarise(detection_rate = sum(intercepts_pct), .groups = "drop")

[1m[22m`summarise()` has grouped output by 'survey_ID', 'plant_native_status',
'plant_life_cycle'. You can override using the `.groups` argument.


In [12]:
# Print the structure of the per-survey detection rates.
glimpse(df_detection_rate)

Rows: 7,062
Columns: 5
$ survey_ID           [3m[90m<chr>[39m[23m "012C5FAD-2451-41B0-9E2F-432D1ECEB55C"[90m, [39m"012C5FAD-…
$ plant_native_status [3m[90m<chr>[39m[23m "native"[90m, [39m"native"[90m, [39m"native"[90m, [39m"nonnative"[90m, [39m"nonnat…
$ plant_life_cycle    [3m[90m<chr>[39m[23m "unknown"[90m, [39m"unknown"[90m, [39m"unknown"[90m, [39m"unknown"[90m, [39m"unkno…
$ plant_life_form     [3m[90m<chr>[39m[23m "forb"[90m, [39m"graminoid"[90m, [39m"shrub"[90m, [39m"forb"[90m, [39m"graminoid"[90m,[39m…
$ detection_rate      [3m[90m<dbl>[39m[23m 22.0[90m, [39m5.0[90m, [39m8.5[90m, [39m45.0[90m, [39m46.0[90m, [39m22.5[90m, [39m15.5[90m, [39m72.5[90m, [39m3.0[90m,[39m…


Then, make sure all combinations of functional groups that are found in the data are represented in each survey_ID. For those groups which were not detected at a survey_ID, fill the detection_rate with 0. This will complete the data set and make averages and other statistical comparisons more meaningful.

The number of records produced in the final dataset should be predictable from the number of surveys and the possible combinations of plant functional groups (pfg): records = surveys × pfg combinations. For example, 1242 surveys and 23 pfg would yield 28,566 records in the final data set.

In [13]:
# Number of surveys (count of distinct survey_ID values)
n_distinct(df_detection_rate$survey_ID)

In [14]:
# Possible combinations of pfg
# One row per observed {native status, life cycle, life form} combination,
# ordered by native status and then life cycle.
df_detection_rate %>%
  distinct(plant_native_status, plant_life_cycle, plant_life_form) %>%
  arrange(plant_native_status, plant_life_cycle)

plant_native_status,plant_life_cycle,plant_life_form
<chr>,<chr>,<chr>
native,perennial,forb
native,perennial,graminoid
native,unknown,forb
native,unknown,graminoid
native,unknown,shrub
native,unknown,tree
native,unknown,vine
nonnative,unknown,forb
nonnative,unknown,graminoid
nonnative,unknown,vine


## Group Fill

In [15]:
# Expand to every survey_ID crossed with every functional-group combination
# that occurs in the data (nesting() restricts to observed combinations).
# Combinations missing from a survey get detection_rate = 0.
df_plant_functional_groups_comp <- complete(
  ungroup(df_detection_rate),
  survey_ID,
  nesting(plant_native_status, plant_life_cycle, plant_life_form),
  fill = list(detection_rate = 0)
)
glimpse(df_plant_functional_groups_comp)

Rows: 19,344
Columns: 5
$ survey_ID           [3m[90m<chr>[39m[23m "012C5FAD-2451-41B0-9E2F-432D1ECEB55C"[90m, [39m"012C5FAD-…
$ plant_native_status [3m[90m<chr>[39m[23m "native"[90m, [39m"native"[90m, [39m"native"[90m, [39m"native"[90m, [39m"native"[90m, [39m…
$ plant_life_cycle    [3m[90m<chr>[39m[23m "perennial"[90m, [39m"perennial"[90m, [39m"unknown"[90m, [39m"unknown"[90m, [39m"u…
$ plant_life_form     [3m[90m<chr>[39m[23m "forb"[90m, [39m"graminoid"[90m, [39m"forb"[90m, [39m"graminoid"[90m, [39m"shrub"[90m,[39m…
$ detection_rate      [3m[90m<dbl>[39m[23m 0.0[90m, [39m0.0[90m, [39m22.0[90m, [39m5.0[90m, [39m8.5[90m, [39m0.0[90m, [39m0.0[90m, [39m45.0[90m, [39m46.0[90m, [39m0.…


## Join Metadata

Finally, join the metadata from requested tables or views and complete the schema below.

In [16]:
# Lookup of the distinct survey_ID / grid_point pairs in the intercept data.
# NOTE(review): the join in the next cell takes grid_point from
# df_survey_metadata; this lookup does not appear to be used in the visible cells.
grid_point_ref <- distinct(df_plant_intercepts, survey_ID, grid_point)

In [17]:
# Attach survey metadata (year, survey_sequence, grid_point) to the completed
# functional-group table, put the metadata columns first, and sort.
# relationship = "many-to-one" makes the join fail loudly if a survey_ID
# matches more than one metadata row, instead of silently duplicating records.
df_plant_functional_groups_join <-
  df_plant_functional_groups_comp %>%
  left_join(
    df_survey_metadata,
    by = "survey_ID",
    relationship = "many-to-one"
  ) %>%
  select(survey_ID, year, survey_sequence, grid_point, everything()) %>%
  arrange(year, grid_point) %>%
  glimpse()

Rows: 19,344
Columns: 8
$ survey_ID           [3m[90m<chr>[39m[23m "436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"…
$ year                [3m[90m<int>[39m[23m 2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m20…
$ survey_sequence     [3m[90m<chr>[39m[23m "2011-12"[90m, [39m"2011-12"[90m, [39m"2011-12"[90m, [39m"2011-12"[90m, [39m"2011-…
$ grid_point          [3m[90m<int>[39m[23m 1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m2[90m, [39m2[90m, [39m2[90m, [39m2[90m,[39m…
$ plant_native_status [3m[90m<chr>[39m[23m "native"[90m, [39m"native"[90m, [39m"native"[90m, [39m"native"[90m, [39m"native"[90m, [39m…
$ plant_life_cycle    [3m[90m<chr>[39m[23m "perennial"[90m, [39m"

In [18]:
# List the survey years present after the join.
distinct(df_plant_functional_groups_join, year)

year
<int>
2011
2012
2013
2015
2016
2017
2021
2022
2023
2024


## Filter

In [19]:
# Keep only records from survey years after 2022 for the output table.
df_plant_functional_groups_output <- filter(
  df_plant_functional_groups_join,
  year > 2022
)

# Output

In [None]:
# updated output 2022-08-18 esamsoe
# write_csv(df_plant_functional_groups_output, file = "gridVeg_plant_functional_groups-WRANGLE.csv")

# updated output 2025-11-04 esamsoe
# write_csv(df_plant_functional_groups_output, file = "../data/processed/gridVeg_plant_functional_groups-WRANGLE-251104.csv")