# Security

* This notebook uses Application Default Credentials (ADC) for BigQuery authentication.
* Authenticate with gcloud before running: `gcloud auth application-default login`
* No API key files needed - authentication is handled through your gcloud session.

# Tools

In [1]:
# Package and library installation
# Install any required package that is missing, then attach all of them.
packages_needed <- c("tidyverse", "knitr") # comma delimited vector of package names
packages_installed <- packages_needed %in% rownames(installed.packages())

if (any(!packages_installed)) {
  install.packages(packages_needed[!packages_installed])
}
# Iterate over the names directly so an empty vector results in zero iterations
# (1:length(x) would yield c(1, 0) in that case).
for (pkg in packages_needed) {
  # character.only = TRUE because the package name is held in a variable
  library(pkg, character.only = TRUE)
}

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.2.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


* Using Application Default Credentials - no manual key file needed if you've run `gcloud auth application-default login`

In [2]:
# Install bigrquery only when it is not already available, so re-running the
# notebook does not reinstall the package every time, then attach it.
if (!requireNamespace("bigrquery", quietly = TRUE)) {
  install.packages("bigrquery")
}
library(bigrquery)

also installing the dependency ‘nanoparquet’


Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



# Source

## Database Connection

In [3]:
# BigQuery authentication using Application Default Credentials (ADC).
# Prerequisite (run once in a terminal): gcloud auth application-default login
# NOTE(review): bq_auth() is called with no arguments; whether it picks up the
# gcloud ADC session automatically depends on the bigrquery/gargle setup -- confirm.
bq_auth()

In [4]:
# Set the environment variable naming the Google Cloud project for this session;
# the next cell obtains the billing project via bq_test_project().
Sys.setenv(BIGQUERY_TEST_PROJECT = "mpg-data-warehouse")

In [5]:
# Billing project used for every query and connection below.
# NOTE(review): presumably resolves to the BIGQUERY_TEST_PROJECT value set in the
# previous cell -- confirm against the bigrquery documentation.
billing <- bq_test_project()

### vegetation_point_intercept_gridVeg

In [6]:
# Open a DBI connection to the vegetation_point_intercept_gridVeg dataset in the
# mpg-data-warehouse project, billed to the `billing` project defined above.
# In the code shown here the connection is used only to list the dataset's tables;
# the data pulls below go through bq_project_query() instead.
con_point_intercept <- dbConnect(
  bigrquery::bigquery(),
  project = "mpg-data-warehouse",
  dataset = "vegetation_point_intercept_gridVeg",
  billing = billing
)

In [7]:
# List the tables available in the connected dataset (inspection only)
dbListTables(con_point_intercept)

In [8]:
# Ground-cover intercepts summarised per survey, grid point, and ground code.
# NOTE(review): COUNT(...) / 2 is labelled as a percentage, which presumably
# assumes 200 intercepts per grid point -- confirm against the survey protocol.
ground_point_intercept_sql <- "
  SELECT
    survey_ID,
    grid_point,
    intercept_ground_code,
    COUNT(intercept_ground_code) / 2 AS intercepts_pct
  FROM
    `mpg-data-warehouse.vegetation_point_intercept_gridVeg.gridVeg_point_intercept_ground`
  GROUP BY
    survey_ID, grid_point, intercept_ground_code
"

# Run the query under the billing project, then download the result table
bq_ground_point_intercept <- bq_project_query(billing, query = ground_point_intercept_sql)
tb_ground_point_intercept <- bq_table_download(bq_ground_point_intercept)

# Drop rows without a usable ground code: both true missing values and the
# literal string "NA". Per the original notes, these only occur in
# intercept_ground_code, always carry zero intercepts_pct, and are re-expanded
# later with complete().
# History: an earlier drop_na() call was replaced by this filter (BL, 2021-01-22).
df_ground_point_intercept <-
  tb_ground_point_intercept %>%
  as.data.frame() %>%
  filter(!is.na(intercept_ground_code), intercept_ground_code != "NA")

glimpse(df_ground_point_intercept)

Rows: 10,357
Columns: 4
$ survey_ID             [3m[90m<chr>[39m[23m "D4CB77CE-87D8-481F-90CB-EC627082C96F"[90m, [39m"6654E7B…
$ grid_point            [3m[90m<int>[39m[23m 558[90m, [39m190[90m, [39m119[90m, [39m98[90m, [39m112[90m, [39m165[90m, [39m549[90m, [39m62[90m, [39m30[90m, [39m561[90m, [39m1…
$ intercept_ground_code [3m[90m<chr>[39m[23m "BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BV"[90m, [39m"G"[90m, [39m"L"[90m, [39m"L"[90m, [39m"L"…
$ intercepts_pct        [3m[90m<dbl>[39m[23m 22.5[90m, [39m23.0[90m, [39m28.0[90m, [39m21.5[90m, [39m32.0[90m, [39m18.0[90m, [39m33.5[90m, [39m48.0[90m, [39m…


### gridVeg_survey_metadata

In [9]:
# Survey-level metadata: one year and survey sequence per survey_ID
survey_metadata_sql <- "
  SELECT
    survey_ID,
    year,
    survey_sequence
  FROM
    `mpg-data-warehouse.vegetation_point_intercept_gridVeg.gridVeg_survey_metadata`
"

# Run the query under the billing project, then download the result table
bq_survey_metadata <- bq_project_query(billing, query = survey_metadata_sql)
tb_survey_metadata <- bq_table_download(bq_survey_metadata)

# Convert to a plain data frame and print a structural overview
df_survey_metadata <- as.data.frame(tb_survey_metadata)
glimpse(df_survey_metadata)

Rows: 1,723
Columns: 3
$ survey_ID       [3m[90m<chr>[39m[23m "138"[90m, [39m"139"[90m, [39m"135"[90m, [39m"134"[90m, [39m"137"[90m, [39m"136"[90m, [39m"141"[90m, [39m"143"…
$ year            [3m[90m<int>[39m[23m 2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m2010[90m, [39m…
$ survey_sequence [3m[90m<chr>[39m[23m "2010"[90m, [39m"2010"[90m, [39m"2010"[90m, [39m"2010"[90m, [39m"2010"[90m, [39m"2010"[90m, [39m"2010"…


### ground_cover_metadata

In [10]:
# Lookup table mapping each ground code to its ground group
sql_ground_meta <- "
SELECT
  intercept_ground_code,
  ground_group
FROM
  `mpg-data-warehouse.vegetation_point_intercept_gridVeg.gridVeg_ground_cover_metadata`
"

# Run the query under the billing project, then download the result table
bq_ground_meta <- bq_project_query(billing, query = sql_ground_meta)
tb_ground_meta <- bq_table_download(bq_ground_meta)

# Sort by code so the printed lookup table is easy to scan
df_ground_meta <-
  tb_ground_meta %>%
  as.data.frame() %>%
  arrange(intercept_ground_code)

kable(df_ground_meta, format = "pandoc")



intercept_ground_code   ground_group 
----------------------  -------------
BG                      inorganic    
BV                      vas_plant    
G                       inorganic    
L                       litter       
LIC                     lichen       
M                       nonvas_plant 
M/L                     nonvas_plant 
OTHER                   inorganic    
R                       inorganic    
SC                      dung         
SD                      dung         
SE                      dung         
SH                      dung         
SU                      dung         
WDL                     woody        
WDS                     woody        
WDSTUMP                 woody        
WDT                     woody        

# Wrangle

In [11]:
# Show the unique intercept ground codes present in the downloaded data,
# sorted alphabetically (inspection only; nothing is assigned)
df_ground_point_intercept %>%
  distinct(intercept_ground_code) %>%
  arrange(intercept_ground_code)

intercept_ground_code
<chr>
BG
BV
G
L
LIC
M
M/L
OTHER
R
SC


## Complete intercept_ground_code
- Complete the data frame for `intercept_ground_code` and fill null cells with 0
- As of 2020-08-24: with 18 ground cover codes and 1244 survey IDs, we should end up with 22,392 rows after completing the data frame
- We should also end up with 1244 distinct combinations of survey_ID and grid_point
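- What should be expected for 2025 updates? The same rule applies: rows after completing = number of ground cover codes × number of distinct survey_ID/grid_point combinations. The run below shows 1,495 combinations and 26,910 rows (18 × 1,495).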

In [12]:
# Expand the data so every survey_ID/grid_point pair that occurs in the data has
# one row per ground code; combinations that were absent get intercepts_pct = 0.
# nesting() restricts the expansion to survey_ID/grid_point pairs actually observed.
df_ground_point_intercept_fill <- complete(
  df_ground_point_intercept,
  intercept_ground_code,
  nesting(survey_ID, grid_point),
  fill = list(intercepts_pct = 0)
)
glimpse(df_ground_point_intercept_fill)

# Number of distinct survey_ID/grid_point pairs after completing
df_ground_point_intercept_fill %>%
  distinct(survey_ID, grid_point) %>%
  tally()

Rows: 26,910
Columns: 4
$ intercept_ground_code [3m[90m<chr>[39m[23m "BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m…
$ survey_ID             [3m[90m<chr>[39m[23m "012C5FAD-2451-41B0-9E2F-432D1ECEB55C"[90m, [39m"0133805…
$ grid_point            [3m[90m<int>[39m[23m 285[90m, [39m505[90m, [39m401[90m, [39m23[90m, [39m126[90m, [39m306[90m, [39m305[90m, [39m323[90m, [39m142[90m, [39m511[90m,[39m…
$ intercepts_pct        [3m[90m<dbl>[39m[23m 15.5[90m, [39m14.5[90m, [39m16.5[90m, [39m16.5[90m, [39m35.0[90m, [39m20.5[90m, [39m1.5[90m, [39m0.0[90m, [39m7.…


n
<int>
1495


## Join datasets

In [13]:
# Attach survey metadata (year, survey_sequence) and the ground-group lookup to
# the completed intercept table, then put the columns in their output order.
# relationship = "many-to-one" makes each join fail loudly if a metadata table
# ever contains duplicate keys, instead of silently multiplying intercept rows.
df_gridVeg_groundCover_intercepts_join <-
  df_ground_point_intercept_fill %>%
  left_join(
    df_survey_metadata,
    by = "survey_ID",
    relationship = "many-to-one"
  ) %>%
  left_join(
    df_ground_meta,
    by = "intercept_ground_code",
    relationship = "many-to-one"
  ) %>%
  select(
    survey_ID,
    year,
    survey_sequence,
    grid_point,
    intercept_ground_code,
    ground_group,
    intercepts_pct
  ) %>%
  glimpse()

Rows: 26,910
Columns: 7
$ survey_ID             [3m[90m<chr>[39m[23m "012C5FAD-2451-41B0-9E2F-432D1ECEB55C"[90m, [39m"0133805…
$ year                  [3m[90m<int>[39m[23m 2016[90m, [39m2016[90m, [39m2017[90m, [39m2016[90m, [39m2025[90m, [39m2023[90m, [39m2021[90m, [39m2015[90m, [39m…
$ survey_sequence       [3m[90m<chr>[39m[23m "2016"[90m, [39m"2016"[90m, [39m"2017"[90m, [39m"2016"[90m, [39m"2025"[90m, [39m"2023"[90m, [39m…
$ grid_point            [3m[90m<int>[39m[23m 285[90m, [39m505[90m, [39m401[90m, [39m23[90m, [39m126[90m, [39m306[90m, [39m305[90m, [39m323[90m, [39m142[90m, [39m511[90m,[39m…
$ intercept_ground_code [3m[90m<chr>[39m[23m "BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m…
$ ground_group          [3m[90m<chr>[39m[23m "inorganic"[90m, [39m"inorganic"[90m, [39m"inorganic"[90m, [39m"inorgani…
$ intercepts_pct      

## Filter

In [16]:
# Keep only surveys after 2022 and exclude grid point 586.
# NOTE(review): the reason for excluding grid point 586 is not recorded in this
# notebook -- confirm and document.
df_gridVeg_groundCover_intercepts <- filter(
  df_gridVeg_groundCover_intercepts_join,
  year > 2022,
  grid_point != 586
)
glimpse(df_gridVeg_groundCover_intercepts)

Rows: 1,944
Columns: 7
$ survey_ID             [3m[90m<chr>[39m[23m "01C59CA8-FEA9-4CAC-AB3E-1B08D747F8C5"[90m, [39m"01E1461…
$ year                  [3m[90m<int>[39m[23m 2025[90m, [39m2023[90m, [39m2023[90m, [39m2025[90m, [39m2023[90m, [39m2024[90m, [39m2024[90m, [39m2025[90m, [39m…
$ survey_sequence       [3m[90m<chr>[39m[23m "2025"[90m, [39m"2023"[90m, [39m"2023"[90m, [39m"2025"[90m, [39m"2023"[90m, [39m"2024"[90m, [39m…
$ grid_point            [3m[90m<int>[39m[23m 126[90m, [39m306[90m, [39m227[90m, [39m28[90m, [39m305[90m, [39m319[90m, [39m320[90m, [39m466[90m, [39m119[90m, [39m55[90m, [39m…
$ intercept_ground_code [3m[90m<chr>[39m[23m "BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m"BG"[90m, [39m…
$ ground_group          [3m[90m<chr>[39m[23m "inorganic"[90m, [39m"inorganic"[90m, [39m"inorganic"[90m, [39m"inorgani…
$ intercepts_pct       

## QC
* No rows contain missing data, and this makes sense since `complete()` was used to fill response variables.
* Numeric variables appear correct in the summary

In [17]:
# QC: list any rows that contain a missing value (an empty result means none)
df_gridVeg_groundCover_intercepts[!complete.cases(df_gridVeg_groundCover_intercepts), ]

survey_ID,year,survey_sequence,grid_point,intercept_ground_code,ground_group,intercepts_pct
<chr>,<int>,<chr>,<int>,<chr>,<chr>,<dbl>


In [18]:
# QC: per-column summary to eyeball ranges of year, grid_point, and intercepts_pct
summary(df_gridVeg_groundCover_intercepts)

  survey_ID              year      survey_sequence      grid_point    
 Length:1944        Min.   :2023   Length:1944        Min.   :  2.00  
 Class :character   1st Qu.:2023   Class :character   1st Qu.: 72.25  
 Mode  :character   Median :2024   Mode  :character   Median :134.50  
                    Mean   :2024                      Mean   :202.62  
                    3rd Qu.:2025                      3rd Qu.:309.25  
                    Max.   :2025                      Max.   :582.00  
 intercept_ground_code ground_group       intercepts_pct  
 Length:1944           Length:1944        Min.   : 0.000  
 Class :character      Class :character   1st Qu.: 0.000  
 Mode  :character      Mode  :character   Median : 0.000  
                                          Mean   : 5.554  
                                          3rd Qu.: 3.000  
                                          Max.   :95.500  

In [19]:
# QC: number of distinct surveys per year.
# Reduce to unique year/survey_ID pairs first, then count the rows in each year.
df_gridVeg_groundCover_intercepts %>%
  distinct(year, survey_ID) %>%
  group_by(year) %>%
  summarise(count_distinct = n())

year,count_distinct
<int>,<int>
2023,36
2024,33
2025,39


# Output

In [None]:
# CSV exports of the final data frame. Both writes are commented out so that
# re-running the notebook does not overwrite files; uncomment the one needed.

# Output 2022-08-17 esamsoe
# write_csv(df_gridVeg_groundCover_intercepts, file = "gridVeg_groundCover_intercepts_WRANGLE.csv")

# Output 2025-11-04 esamsoe
# write_csv(df_gridVeg_groundCover_intercepts, file = "../data/processed/gridVeg_groundCover_intercepts_WRANGLE-251104.csv")
