# Security


* This notebook uses Application Default Credentials (ADC) for BigQuery authentication.
* Authenticate with gcloud before running: `gcloud auth application-default login`
* No API key files needed - authentication is handled through your gcloud session.


# Tools

In [27]:
# Package and library installation
# Comma-delimited vector of the packages this notebook needs.
packages_needed <- c("tidyverse", "knitr")
# TRUE for each needed package that is already present in the local library.
packages_installed <- packages_needed %in% rownames(installed.packages())

# Install only what is missing so a rerun does not reinstall everything.
if (!all(packages_installed)) {
  install.packages(packages_needed[!packages_installed])
}
# Attach every needed package. character.only = TRUE is required because the
# package name is held in a character variable rather than typed as a symbol.
for (pkg in packages_needed) {
  library(pkg, character.only = TRUE)
}

* Using Application Default Credentials - no manual key file needed if you've run `gcloud auth application-default login`


In [28]:
# bigrquery supplies the BigQuery client functions (bq_*) used below.
# Install it only when it is missing, so rerunning the notebook does not
# download and reinstall the package every time.
if (!requireNamespace("bigrquery", quietly = TRUE)) {
  install.packages("bigrquery")
}
library(bigrquery)

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



# Source

## Database Connection

In [29]:
# BigQuery authentication using Application Default Credentials (ADC).
# Prerequisite, run once in a terminal: gcloud auth application-default login
# NOTE(review): bq_auth() is called with no arguments, so the credential it
# uses is chosen by bigrquery itself; presumably the gcloud ADC is picked up
# automatically, but an interactive OAuth prompt may appear instead - confirm.
bq_auth()

In [30]:
# Set the BIGQUERY_TEST_PROJECT environment variable to the warehouse project
# ID. Sys.setenv() only affects the current R process.
Sys.setenv(BIGQUERY_TEST_PROJECT = "mpg-data-warehouse")

In [31]:
# Project ID handed to every bq_project_query() call in this notebook.
# bq_test_project() presumably returns the value of the BIGQUERY_TEST_PROJECT
# environment variable - confirm against the bigrquery documentation.
billing <- bq_test_project()

## Database Query

### gridVeg_plant_intercepts

In [32]:
# vegetation_gridVeg_summaries:gridVeg_foliar_cover_all
# Selects survey_ID, grid_point, year and key_plant_species from the
# gridVeg_foliar_cover_all table.
# NOTE(review): the section heading and the variable names say
# "plant_intercepts", but the query reads gridVeg_foliar_cover_all - confirm
# this is the intended source table.
sql_plant_intercepts <-
"
  SELECT
    survey_ID, grid_point, year, key_plant_species
  FROM
    `mpg-data-warehouse.vegetation_gridVeg_summaries.gridVeg_foliar_cover_all`
"
# Submit the query with `billing` as the project, then download the result.
bq_plant_intercepts <- bq_project_query(billing, sql_plant_intercepts)
tb_plant_intercepts <- bq_table_download(bq_plant_intercepts)

In [33]:
# Convert the download to a plain data frame and label every row as a
# point-intercept detection, then print a column summary.
df_plant_intercepts <- tb_plant_intercepts %>%
  as.data.frame() %>%
  mutate(detection_type = "point_intercept") %>%
  glimpse()

Rows: 28,083
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"43…
$ grid_point        [3m[90m<int>[39m[23m 1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1…
$ year              [3m[90m<int>[39m[23m 2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011…
$ key_plant_species [3m[90m<int>[39m[23m 5[90m, [39m481[90m, [39m201[90m, [39m554[90m, [39m241[90m, [39m16[90m, [39m250[90m, [39m102[90m, [39m267[90m, [39m154[90m, [39m57[90m, [39m2…
$ detection_type    [3m[90m<chr>[39m[23m "point_intercept"[90m, [39m"point_intercept"[90m, [39m"point_interce…


In [34]:
# Sanity check: show any point-intercept rows that lack a species key.
filter(df_plant_intercepts, is.na(key_plant_species))

survey_ID,grid_point,year,key_plant_species,detection_type
<chr>,<int>,<int>,<int>,<chr>


### gridVeg_additional_species

In [35]:
# vegetation_point_intercept_gridVeg:gridVeg_additional_species
# Selects the same four columns as the plant intercepts query (survey_ID,
# grid_point, year, key_plant_species) from gridVeg_additional_species.
sql_add_spec <-
  "
    SELECT
      survey_ID, grid_point, year, key_plant_species
    FROM
      `mpg-data-warehouse.vegetation_point_intercept_gridVeg.gridVeg_additional_species`
  "
# Submit the query with `billing` as the project, then download the result.
bq_add_spec <- bq_project_query(billing, sql_add_spec)
tb_add_spec <- bq_table_download(bq_add_spec)

In [36]:
# Convert the download to a plain data frame, discard rows that have no
# species key, and label the remainder as supplemental observations.
df_add_spec <- tb_add_spec %>%
  as.data.frame() %>%
  filter(!is.na(key_plant_species)) %>%
  mutate(detection_type = "supplemental_obs") %>%
  glimpse()

Rows: 14,038
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"30…
$ grid_point        [3m[90m<int>[39m[23m 324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m23…
$ year              [3m[90m<int>[39m[23m 2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011…
$ key_plant_species [3m[90m<int>[39m[23m 69[90m, [39m72[90m, [39m5[90m, [39m82[90m, [39m75[90m, [39m174[90m, [39m230[90m, [39m67[90m, [39m428[90m, [39m529[90m, [39m240[90m, [39m308[90m,[39m…
$ detection_type    [3m[90m<chr>[39m[23m "supplemental_obs"[90m, [39m"supplemental_obs"[90m, [39m"supplementa…


In [37]:
# Sanity check: show any supplemental rows that lack a species key. Rows with
# a missing key were already filtered out when df_add_spec was built.
filter(df_add_spec, is.na(key_plant_species))

survey_ID,grid_point,year,key_plant_species,detection_type
<chr>,<int>,<int>,<int>,<chr>


### gridVeg_survey_metadata


In [38]:
# vegetation_point_intercept_gridVeg:gridVeg_survey_metadata
# Downloads every column (SELECT *) of the gridVeg_survey_metadata table.
sql_survey_meta <-
  "
    SELECT
      *
    FROM
      `mpg-data-warehouse.vegetation_point_intercept_gridVeg.gridVeg_survey_metadata`
  "
# Submit the query with `billing` as the project, download the result, and
# keep a plain data frame copy; glimpse() prints a column summary.
bq_survey_meta <- bq_project_query(billing, sql_survey_meta)
tb_survey_meta <- bq_table_download(bq_survey_meta)
df_survey_meta <- as.data.frame(tb_survey_meta) %>% glimpse()

Rows: 1,723
Columns: 6
$ survey_ID       [3m[90m<chr>[39m[23m "138"[90m, [39m"139"[90m, [39m"135"[90m, [39m"134"[90m, [39m"137"[90m, [39m"136"[90m, [39m"141"[90m, [39m"143"…
$ grid_point      [3m[90m<int>[39m[23m 61[90m, [39m60[90m, [39m52[90m, [39m51[90m, [39m62[90m, [39m44[90m, [39m57[90m, [39m66[90m, [39m65[90m, [39m59[90m, [39m69[90m, [39m76[90m, [39m75[90m, [39m68…
$ year            [3m[90m<int>[39m[23m 2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m…
$ date            [3m[90m<date>[39m[23m 2010-08-27[90m, [39m2010-08-27[90m, [39m2010-08-27[90m, [39m2010-08-27[90m, [39m2010-…
$ survey_sequence [3m[90m<chr>[39m[23m "2010"[90m, [39m"2010"[90m, [39m"2010"[90m, [39m"2010"[90m, [39m"2010"[90m, [39m"2010"[90m, [39m"2010"…
$ surveyor        [3m[90m<chr>[39m[23m "EAR"[90m, [39m"EAR"[90m, [39m"EAR"[90m, [3

### location_position_classification

In [39]:
# grid_point_summaries:location_position_classification
# Downloads every column (SELECT *) of the location_position_classification
# table.
sql_loc_pos <-
  "
    SELECT
      *
    FROM
      `mpg-data-warehouse.grid_point_summaries.location_position_classification`
  "
# Submit the query with `billing` as the project, download the result, and
# keep a plain data frame copy; glimpse() prints a column summary.
bq_loc_pos <- bq_project_query(billing, sql_loc_pos)
tb_loc_pos <- bq_table_download(bq_loc_pos)
df_loc_pos <- as.data.frame(tb_loc_pos) %>% glimpse()

Rows: 588
Columns: 16
$ grid_point                  [3m[90m<int>[39m[23m 1[90m, [39m2[90m, [39m3[90m, [39m4[90m, [39m5[90m, [39m6[90m, [39m7[90m, [39m8[90m, [39m9[90m, [39m10[90m, [39m11[90m, [39m12[90m, [39m13[90m,[39m…
$ lat                         [3m[90m<dbl>[39m[23m 46.73193[90m, [39m46.72972[90m, [39m46.72443[90m, [39m46.72487[90m, [39m46…
$ long                        [3m[90m<dbl>[39m[23m -114.0017[90m, [39m-114.0010[90m, [39m-114.0227[90m, [39m-114.0195…
$ aspect_mean_deg             [3m[90m<dbl>[39m[23m 321.18675[90m, [39m65.92522[90m, [39m240.01216[90m, [39m290.66936[90m,[39m…
$ aspect_direction            [3m[90m<chr>[39m[23m "NW"[90m, [39m"ENE"[90m, [39m"WSW"[90m, [39m"WNW"[90m, [39m"W"[90m, [39m"SW"[90m, [39m"W"[90m,[39m…
$ aspect_northness            [3m[90m<dbl>[39m[23m 0.77919303[90m, [39m0.40792854[90m, [39m-0.49981612[90m, [39m0.352…
$ aspect_eastness             [3m[90m<dbl>

### vegetation_species_metadata

In [40]:
# vegetation_species_metadata:vegetation_species_metadata
# Downloads every column (SELECT *) of the vegetation_species_metadata table.
sql_species_meta <-
  "
    SELECT
      *
    FROM
      `mpg-data-warehouse.vegetation_species_metadata.vegetation_species_metadata`
  "
# Submit the query with `billing` as the project, download the result, and
# keep a plain data frame copy; glimpse() prints a column summary.
bq_species_meta <- bq_project_query(billing, sql_species_meta)
tb_species_meta <- bq_table_download(bq_species_meta)
df_species_meta <- as.data.frame(tb_species_meta) %>% glimpse()

Rows: 769
Columns: 9
$ key_plant_species   [3m[90m<int>[39m[23m 656[90m, [39m285[90m, [39m21[90m, [39m22[90m, [39m130[90m, [39m131[90m, [39m129[90m, [39m635[90m, [39m659[90m, [39m685[90m, [39m74…
$ key_plant_code      [3m[90m<chr>[39m[23m "DYSBOT"[90m, [39m"KOCSCO"[90m, [39m"AMABLI"[90m, [39m"AMARET"[90m, [39m"CHEFRE"[90m, [39m…
$ plant_name_sci      [3m[90m<chr>[39m[23m "Dysphania botrys"[90m, [39m"Kochia scoparia"[90m, [39m"Amaranthus…
$ plant_name_syn      [3m[90m<chr>[39m[23m "Chenopodium botrys"[90m, [39m"Bassia scoparia"[90m, [39m[31mNA[39m[90m, [39m[31mNA[39m[90m, [39m[31mN[39m…
$ plant_name_common   [3m[90m<chr>[39m[23m "Jerusalem oak goosefoot"[90m, [39m"common kochia"[90m, [39m"mat p…
$ plant_name_family   [3m[90m<chr>[39m[23m "Amaranthaceae"[90m, [39m"Amaranthaceae"[90m, [39m"Amaranthaceae"[90m,[39m…
$ plant_native_status [3m[90m<chr>[39m[23m "nonnative"[90m, [39m"nonnative"[90m, [39m"n

# Wrangle

In [41]:
# Review the point-intercept table before combining it with the supplemental one.
glimpse(df_plant_intercepts)

Rows: 28,083
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"43…
$ grid_point        [3m[90m<int>[39m[23m 1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1…
$ year              [3m[90m<int>[39m[23m 2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011…
$ key_plant_species [3m[90m<int>[39m[23m 5[90m, [39m481[90m, [39m201[90m, [39m554[90m, [39m241[90m, [39m16[90m, [39m250[90m, [39m102[90m, [39m267[90m, [39m154[90m, [39m57[90m, [39m2…
$ detection_type    [3m[90m<chr>[39m[23m "point_intercept"[90m, [39m"point_intercept"[90m, [39m"point_interce…


In [42]:
# Review the supplemental-observation table before combining.
glimpse(df_add_spec)

Rows: 14,038
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"30…
$ grid_point        [3m[90m<int>[39m[23m 324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m23…
$ year              [3m[90m<int>[39m[23m 2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011…
$ key_plant_species [3m[90m<int>[39m[23m 69[90m, [39m72[90m, [39m5[90m, [39m82[90m, [39m75[90m, [39m174[90m, [39m230[90m, [39m67[90m, [39m428[90m, [39m529[90m, [39m240[90m, [39m308[90m,[39m…
$ detection_type    [3m[90m<chr>[39m[23m "supplemental_obs"[90m, [39m"supplemental_obs"[90m, [39m"supplementa…


## Bind rows

In [43]:
# Stack the point-intercept rows on top of the supplemental rows.
df_bind <- bind_rows(
  # Remove "NV" key plant code which is equal to 360
  filter(df_plant_intercepts, key_plant_species != 360),
  df_add_spec
) %>%
  glimpse()

Rows: 42,121
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"43…
$ grid_point        [3m[90m<int>[39m[23m 1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1…
$ year              [3m[90m<int>[39m[23m 2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011…
$ key_plant_species [3m[90m<int>[39m[23m 5[90m, [39m481[90m, [39m201[90m, [39m554[90m, [39m241[90m, [39m16[90m, [39m250[90m, [39m102[90m, [39m267[90m, [39m154[90m, [39m57[90m, [39m2…
$ detection_type    [3m[90m<chr>[39m[23m "point_intercept"[90m, [39m"point_intercept"[90m, [39m"point_interce…


## Identify multiples

In [44]:
# List every (survey_ID, key_plant_species) pair that appears more than once
# in the combined table, most frequent first. `multiple_key` is the two key
# columns pasted together as "<survey_ID>_<key_plant_species>"; the original
# columns are kept alongside it (remove = FALSE).
df_multiple <- df_bind %>%
  group_by(survey_ID, key_plant_species) %>%
  # .groups = "drop" returns an ungrouped result, which silences the
  # "summarise() has grouped output" message and makes ungroup() unnecessary.
  summarise(n = n(), .groups = "drop") %>%
  filter(n > 1) %>%
  arrange(desc(n)) %>%
  # Name the key columns explicitly rather than by positional range, so the
  # key does not change if a column is ever inserted between them.
  unite("multiple_key", c(survey_ID, key_plant_species), remove = FALSE) %>%
  glimpse()

[1m[22m`summarise()` has grouped output by 'survey_ID'. You can override using the
`.groups` argument.


Rows: 1,361
Columns: 4
$ multiple_key      [3m[90m<chr>[39m[23m "249_174"[90m, [39m"326_57"[90m, [39m"376_62"[90m, [39m"425_250"[90m, [39m"450_5"[90m, [39m"…
$ survey_ID         [3m[90m<chr>[39m[23m "249"[90m, [39m"326"[90m, [39m"376"[90m, [39m"425"[90m, [39m"450"[90m, [39m"483"[90m, [39m"485"[90m, [39m"CF…
$ key_plant_species [3m[90m<int>[39m[23m 174[90m, [39m57[90m, [39m62[90m, [39m250[90m, [39m5[90m, [39m240[90m, [39m5[90m, [39m529[90m, [39m132[90m, [39m459[90m, [39m435[90m, [39m39[90m,[39m…
$ n                 [3m[90m<int>[39m[23m 3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m2[90m, [39m2[90m, [39m2[90m, [39m2[90m, [39m2[90m, [39m2[90m, [39m2[90m, [39m2[90m, [39m2[90m, [39m2…


In [45]:
# display multiple example
df_bind %>%
  filter(survey_ID == 249, key_plant_species == 174)

survey_ID,grid_point,year,key_plant_species,detection_type
<chr>,<int>,<int>,<int>,<chr>
249,67,2011,174,point_intercept
249,67,2011,174,supplemental_obs
249,67,2011,174,supplemental_obs


## Remove multiples

In [46]:
# use df_multiple to select and remove from df_supplemental
df_clean_supp <- df_add_spec %>%
  unite("multiple_key", c(survey_ID, key_plant_species), remove = FALSE) %>%
  anti_join(df_multiple, by = "multiple_key") %>%
  select(,-multiple_key) %>%
  glimpse()

Rows: 12,586
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"30…
$ grid_point        [3m[90m<int>[39m[23m 324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m230[90m, [39m23…
$ year              [3m[90m<int>[39m[23m 2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011…
$ key_plant_species [3m[90m<int>[39m[23m 69[90m, [39m72[90m, [39m5[90m, [39m82[90m, [39m75[90m, [39m174[90m, [39m230[90m, [39m67[90m, [39m529[90m, [39m240[90m, [39m308[90m, [39m230[90m,[39m…
$ detection_type    [3m[90m<chr>[39m[23m "supplemental_obs"[90m, [39m"supplemental_obs"[90m, [39m"supplementa…


## Join clean dataframes

In [47]:
# Review the de-duplicated supplemental table before the final bind.
glimpse(df_clean_supp)

Rows: 12,586
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"308"[90m, [39m"30…
$ grid_point        [3m[90m<int>[39m[23m 324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m324[90m, [39m230[90m, [39m23…
$ year              [3m[90m<int>[39m[23m 2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011…
$ key_plant_species [3m[90m<int>[39m[23m 69[90m, [39m72[90m, [39m5[90m, [39m82[90m, [39m75[90m, [39m174[90m, [39m230[90m, [39m67[90m, [39m529[90m, [39m240[90m, [39m308[90m, [39m230[90m,[39m…
$ detection_type    [3m[90m<chr>[39m[23m "supplemental_obs"[90m, [39m"supplemental_obs"[90m, [39m"supplementa…


In [48]:
# Stack the point-intercept rows on top of the de-duplicated supplemental rows.
df_clean_bind <- bind_rows(
  # Remove "NV" key plant code which is equal to 360
  filter(df_plant_intercepts, key_plant_species != 360),
  df_clean_supp
) %>%
  glimpse()

Rows: 40,669
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"436"[90m, [39m"43…
$ grid_point        [3m[90m<int>[39m[23m 1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1[90m, [39m1…
$ year              [3m[90m<int>[39m[23m 2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011[90m, [39m2011…
$ key_plant_species [3m[90m<int>[39m[23m 5[90m, [39m481[90m, [39m201[90m, [39m554[90m, [39m241[90m, [39m16[90m, [39m250[90m, [39m102[90m, [39m267[90m, [39m154[90m, [39m57[90m, [39m2…
$ detection_type    [3m[90m<chr>[39m[23m "point_intercept"[90m, [39m"point_intercept"[90m, [39m"point_interce…


In [49]:
# double check for multiples
df_clean_bind %>%
  group_by(survey_ID, key_plant_species) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  filter(n > 1) %>%
  # I'm not clear if I'm using ungroup in a meaningful way here
  ungroup() %>%
  glimpse()

[1m[22m`summarise()` has grouped output by 'survey_ID'. You can override using the
`.groups` argument.


Rows: 0
Columns: 3
$ survey_ID         [3m[90m<chr>[39m[23m 
$ key_plant_species [3m[90m<int>[39m[23m 
$ n                 [3m[90m<int>[39m[23m 


# Join in Additional Columns

## Filter

In [50]:
# Keep only records from years after 2022 that have a species key.
# Conditions separated by commas in filter() are combined with AND.
gridVeg_species_richness <- filter(
  df_clean_bind,
  year > 2022,
  !is.na(key_plant_species)
) %>%
  glimpse()
  

Rows: 2,597
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "27869B01-61AE-4DB3-A2AC-AD4D8C9A7ECD"[90m, [39m"27869B01-61…
$ grid_point        [3m[90m<int>[39m[23m 3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m3[90m, [39m4…
$ year              [3m[90m<int>[39m[23m 2023[90m, [39m2023[90m, [39m2023[90m, [39m2023[90m, [39m2023[90m, [39m2023[90m, [39m2023[90m, [39m2023[90m, [39m2023…
$ key_plant_species [3m[90m<int>[39m[23m 529[90m, [39m232[90m, [39m320[90m, [39m265[90m, [39m80[90m, [39m286[90m, [39m153[90m, [39m82[90m, [39m411[90m, [39m576[90m, [39m437[90m,[39m…
$ detection_type    [3m[90m<chr>[39m[23m "point_intercept"[90m, [39m"point_intercept"[90m, [39m"point_interce…


# Output

In [51]:
# 2022-08-17 ES
# write_csv(gridVeg_species_richness, "gridVeg_species_richness-WRANGLE.csv")

# 2025-11-04 esamsoe
# write_csv(gridVeg_species_richness, "../data/processed/gridVeg_species_richness_WRANGLE-251104.csv")
