# Tools

In [None]:
library(tidyverse)

“running command 'timedatectl' had status 1”
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



In [None]:
# Install bigrquery only when it is not already available, so re-running this
# cell does not re-download the package and its dependencies; then attach it.
if (!requireNamespace("bigrquery", quietly = TRUE)) {
  install.packages("bigrquery")
}
library(bigrquery)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘Rcpp’, ‘rapidjsonr’




# Source

## Database Connection

In [None]:
# BigQuery API Key
# Authenticate bigrquery using the JSON credentials file at this path.
# NOTE(review): the file presumably has to be uploaded to /content before this
# cell is run -- confirm against the notebook's setup instructions.
bq_auth(path = "/content/mpg-data-warehouse-api_key-master.json")

In [None]:
# Set the environment variable that bq_test_project() reads to find the
# project ID.
Sys.setenv(BIGQUERY_TEST_PROJECT = "mpg-data-warehouse")

In [None]:
# Project ID passed as the first argument to every bq_project_query() call
# below.
billing <- bq_test_project()

## Database Query

### gridVeg_plant_intercepts

In [None]:
# Point-intercept detections: one row per survey_ID / grid_point / year /
# key_plant_species, read from the vegetation_gridVeg_summaries dataset.
# NOTE(review): the section heading and the original comment name
# gridVeg_plant_intercepts, but the query reads gridVeg_foliar_cover_all --
# confirm which table is intended.
sql_plant_intercepts <-
"
  SELECT
    survey_ID, grid_point, year, key_plant_species
  FROM
    `mpg-data-warehouse.vegetation_gridVeg_summaries.gridVeg_foliar_cover_all`
"
# Run the query under the billing project, then download the result table.
bq_plant_intercepts <- billing %>% bq_project_query(sql_plant_intercepts)
tb_plant_intercepts <- bq_plant_intercepts %>% bq_table_download()

In [None]:
# Convert the download to a data frame and tag every row with its detection
# method so the source is still known after binding with supplemental rows.
df_plant_intercepts <- tb_plant_intercepts %>%
  as.data.frame() %>%
  mutate(detection_type = "point_intercept") %>%
  glimpse()

Rows: 26,387
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "436", "436", "436", "436", "436", "436", "436", "43…
$ grid_point        [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ year              [3m[90m<int>[39m[23m 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
$ key_plant_species [3m[90m<int>[39m[23m 5, 481, 201, 554, 241, 16, 250, 102, 267, 154, 57, 2…
$ detection_type    [3m[90m<chr>[39m[23m "point_intercept", "point_intercept", "point_interce…


In [None]:
# Show any point-intercept rows that have no species key.
filter(df_plant_intercepts, is.na(key_plant_species))

survey_ID,grid_point,year,key_plant_species,detection_type
<chr>,<int>,<int>,<int>,<chr>


### gridVeg_additional_species

In [None]:
# Supplemental species observations, read from
# vegetation_point_intercept_gridVeg:gridVeg_additional_species with the same
# four columns as the point-intercept query.
sql_add_spec <-
  "
    SELECT
      survey_ID, grid_point, year, key_plant_species
    FROM
      `mpg-data-warehouse.vegetation_point_intercept_gridVeg.gridVeg_additional_species`
  "
# Run the query under the billing project, then download the result table.
bq_add_spec <- billing %>% bq_project_query(sql_add_spec)
tb_add_spec <- bq_add_spec %>% bq_table_download()

In [None]:
# Convert the download to a data frame, drop rows without a species key, and
# tag the remaining rows as supplemental observations.
df_add_spec <- tb_add_spec %>%
  as.data.frame() %>%
  filter(!is.na(key_plant_species)) %>%
  mutate(detection_type = "supplemental_obs") %>%
  glimpse()

Rows: 13,110
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "447", "447", "447", "447", "447", "447", "447", "44…
$ grid_point        [3m[90m<int>[39m[23m 230, 230, 230, 230, 230, 230, 230, 230, 230, 230, 23…
$ year              [3m[90m<int>[39m[23m 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
$ key_plant_species [3m[90m<int>[39m[23m 187, 266, 20, 307, 240, 522, 125, 31, 75, 5, 230, 30…
$ detection_type    [3m[90m<chr>[39m[23m "supplemental_obs", "supplemental_obs", "supplementa…


In [None]:
# Confirm that no supplemental rows without a species key remain.
filter(df_add_spec, is.na(key_plant_species))

survey_ID,grid_point,year,key_plant_species,detection_type
<chr>,<int>,<int>,<int>,<chr>


### gridVeg_survey_metadata


In [None]:
# Survey metadata: every column of
# vegetation_point_intercept_gridVeg:gridVeg_survey_metadata.
sql_survey_meta <-
  "
    SELECT
      *
    FROM
      `mpg-data-warehouse.vegetation_point_intercept_gridVeg.gridVeg_survey_metadata`
  "
# Query, download, and convert to a data frame, previewing the result.
bq_survey_meta <- billing %>% bq_project_query(sql_survey_meta)
tb_survey_meta <- bq_survey_meta %>% bq_table_download()
df_survey_meta <- tb_survey_meta %>%
  as.data.frame() %>%
  glimpse()

Rows: 1,615
Columns: 6
$ survey_ID       [3m[90m<chr>[39m[23m "138", "139", "135", "134", "137", "136", "141", "143"…
$ grid_point      [3m[90m<int>[39m[23m 61, 60, 52, 51, 62, 44, 57, 66, 65, 59, 69, 76, 75, 68…
$ year            [3m[90m<int>[39m[23m 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, …
$ date            [3m[90m<date>[39m[23m 2010-08-27, 2010-08-27, 2010-08-27, 2010-08-27, 2010-…
$ survey_sequence [3m[90m<chr>[39m[23m "2010", "2010", "2010", "2010", "2010", "2010", "2010"…
$ surveyor        [3m[90m<chr>[39m[23m "EAR", "EAR", "EAR", "EAR", "EAR", "EAR", "EAR", "EAR"…


### location_position_classification

In [None]:
# Grid point location attributes: every column of
# grid_point_summaries:location_position_classification.
sql_loc_pos <-
  "
    SELECT
      *
    FROM
      `mpg-data-warehouse.grid_point_summaries.location_position_classification`
  "
# Query, download, and convert to a data frame, previewing the result.
bq_loc_pos <- billing %>% bq_project_query(sql_loc_pos)
tb_loc_pos <- bq_loc_pos %>% bq_table_download()
df_loc_pos <- tb_loc_pos %>%
  as.data.frame() %>%
  glimpse()

Rows: 588
Columns: 16
$ grid_point                  [3m[90m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,…
$ lat                         [3m[90m<dbl>[39m[23m 46.73193, 46.72972, 46.72443, 46.72487, 46…
$ long                        [3m[90m<dbl>[39m[23m -114.0017, -114.0010, -114.0227, -114.0195…
$ aspect_mean_deg             [3m[90m<dbl>[39m[23m 321.18675, 65.92522, 240.01216, 290.66936,…
$ aspect_direction            [3m[90m<chr>[39m[23m "NW", "ENE", "WSW", "WNW", "W", "SW", "W",…
$ aspect_northness            [3m[90m<dbl>[39m[23m 0.77919303, 0.40792854, -0.49981612, 0.352…
$ aspect_eastness             [3m[90m<dbl>[39m[23m -0.62678403, 0.91301386, -0.86613154, -0.9…
$ elevation_mean_m            [3m[90m<dbl>[39m[23m 1395.468, 1455.145, 1126.677, 1164.549, 11…
$ slope_mean_deg              [3m[90m<dbl>[39m[23m 28.139095, 20.516106, 5.503165, 6.102558, …
$ cover_type_2016_gridVeg     [3m[90m<chr>[39m[23m "woodland/forest", "non-irrigate

### vegetation_species_metadata

In [None]:
# Species lookup table: every column of
# vegetation_species_metadata:vegetation_species_metadata.
sql_species_meta <-
  "
    SELECT
      *
    FROM
      `mpg-data-warehouse.vegetation_species_metadata.vegetation_species_metadata`
  "
# Query, download, and convert to a data frame, previewing the result.
bq_species_meta <- billing %>% bq_project_query(sql_species_meta)
tb_species_meta <- bq_species_meta %>% bq_table_download()
df_species_meta <- tb_species_meta %>%
  as.data.frame() %>%
  glimpse()

Rows: 765
Columns: 9
$ key_plant_species   [3m[90m<int>[39m[23m 360, 719, 137, 668, 201, 532, 335, 671, 235, 265, …
$ key_plant_code      [3m[90m<chr>[39m[23m "NV", "GLYBOR", "CINLAT", "DANINT", "ELYTRA", "TRI…
$ plant_name_sci      [3m[90m<chr>[39m[23m "no vegetation", "Glyceria borealis", "Cinna latif…
$ plant_name_syn      [3m[90m<chr>[39m[23m NA, NA, NA, NA, NA, "Trisetum canescens", NA, "Tri…
$ plant_name_common   [3m[90m<chr>[39m[23m "no vegetation", "small floating manna grass", "dr…
$ plant_name_family   [3m[90m<chr>[39m[23m "None", "Poaceae", "Poaceae", "Poaceae", "Poaceae"…
$ plant_native_status [3m[90m<chr>[39m[23m "none", "native", "native", "native", "native", "n…
$ plant_life_cycle    [3m[90m<chr>[39m[23m "unknown", "perennial", "perennial", "perennial", …
$ plant_life_form     [3m[90m<chr>[39m[23m "none", "graminoid", "graminoid", "graminoid", "gr…


# Wrangle

In [None]:
# Preview the point-intercept detections before wrangling.
glimpse(df_plant_intercepts)

Rows: 26,387
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "436", "436", "436", "436", "436", "436", "436", "43…
$ grid_point        [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ year              [3m[90m<int>[39m[23m 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
$ key_plant_species [3m[90m<int>[39m[23m 5, 481, 201, 554, 241, 16, 250, 102, 267, 154, 57, 2…
$ detection_type    [3m[90m<chr>[39m[23m "point_intercept", "point_intercept", "point_interce…


In [None]:
# Preview the supplemental observations before wrangling.
glimpse(df_add_spec)

Rows: 13,110
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "447", "447", "447", "447", "447", "447", "447", "44…
$ grid_point        [3m[90m<int>[39m[23m 230, 230, 230, 230, 230, 230, 230, 230, 230, 230, 23…
$ year              [3m[90m<int>[39m[23m 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
$ key_plant_species [3m[90m<int>[39m[23m 187, 266, 20, 307, 240, 522, 125, 31, 75, 5, 230, 30…
$ detection_type    [3m[90m<chr>[39m[23m "supplemental_obs", "supplemental_obs", "supplementa…


## Bind rows

In [None]:
# Stack the point-intercept detections on top of the supplemental
# observations. Key 360 is the "NV" key plant code and is excluded from the
# point-intercept rows first.
df_bind <- bind_rows(
  filter(df_plant_intercepts, key_plant_species != 360),
  df_add_spec
) %>%
  glimpse()

Rows: 39,497
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "436", "436", "436", "436", "436", "436", "436", "43…
$ grid_point        [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ year              [3m[90m<int>[39m[23m 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
$ key_plant_species [3m[90m<int>[39m[23m 5, 481, 201, 554, 241, 16, 250, 102, 267, 154, 57, 2…
$ detection_type    [3m[90m<chr>[39m[23m "point_intercept", "point_intercept", "point_interce…


## Identify multiples

In [None]:
# Find survey_ID / key_plant_species pairs that occur more than once in the
# bound data. count() returns an ungrouped result, so no `.groups` message is
# emitted and no ungroup() is needed. multiple_key is survey_ID and
# key_plant_species pasted together with "_" (the unite() default).
df_multiple <- df_bind %>%
  count(survey_ID, key_plant_species, name = "n") %>%
  arrange(desc(n)) %>%
  filter(n > 1) %>%
  unite("multiple_key", c(survey_ID, key_plant_species), remove = FALSE) %>%
  glimpse()

[1m[22m`summarise()` has grouped output by 'survey_ID'. You can override using the
`.groups` argument.


Rows: 1,292
Columns: 4
$ multiple_key      [3m[90m<chr>[39m[23m "249_174", "326_57", "376_62", "425_250", "450_5", "…
$ survey_ID         [3m[90m<chr>[39m[23m "249", "326", "376", "425", "450", "483", "485", "CF…
$ key_plant_species [3m[90m<int>[39m[23m 174, 57, 62, 250, 5, 240, 5, 529, 132, 435, 39, 68, …
$ n                 [3m[90m<int>[39m[23m 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…


In [None]:
# display multiple example
# survey_ID is a character column, so compare it against a string rather than
# relying on implicit coercion of a numeric literal.
df_bind %>%
  filter(survey_ID == "249", key_plant_species == 174)

survey_ID,grid_point,year,key_plant_species,detection_type
<chr>,<int>,<int>,<int>,<chr>
249,67,2011,174,point_intercept
249,67,2011,174,supplemental_obs
249,67,2011,174,supplemental_obs


## Remove multiples

In [None]:
# Keep only the supplemental observations that add a species to a survey.
#
# df_multiple is deliberately not used for this: it also flags pairs that are
# duplicated only within the supplemental table, and anti-joining on it removes
# every copy of such a pair, so the species disappears from the survey
# entirely. Instead:
#   1. drop supplemental rows whose survey_ID / key_plant_species pair was
#      already detected by point intercept (same "NV" = 360 exclusion as
#      df_bind, so the comparison set matches the bound data);
#   2. collapse supplemental rows repeated within the same survey to one row.
df_clean_supp <- df_add_spec %>%
  anti_join(
    filter(df_plant_intercepts, key_plant_species != 360),
    by = c("survey_ID", "key_plant_species")
  ) %>%
  distinct(survey_ID, key_plant_species, .keep_all = TRUE) %>%
  glimpse()

Rows: 11,730
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "447", "447", "447", "447", "447", "447", "447", "44…
$ grid_point        [3m[90m<int>[39m[23m 230, 230, 230, 230, 230, 230, 230, 230, 230, 230, 23…
$ year              [3m[90m<int>[39m[23m 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
$ key_plant_species [3m[90m<int>[39m[23m 266, 20, 240, 522, 125, 31, 75, 5, 230, 308, 225, 17…
$ detection_type    [3m[90m<chr>[39m[23m "supplemental_obs", "supplemental_obs", "supplementa…


## Join clean dataframes

In [None]:
# Preview the cleaned supplemental observations before binding.
glimpse(df_clean_supp)

Rows: 11,730
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "447", "447", "447", "447", "447", "447", "447", "44…
$ grid_point        [3m[90m<int>[39m[23m 230, 230, 230, 230, 230, 230, 230, 230, 230, 230, 23…
$ year              [3m[90m<int>[39m[23m 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
$ key_plant_species [3m[90m<int>[39m[23m 266, 20, 240, 522, 125, 31, 75, 5, 230, 308, 225, 17…
$ detection_type    [3m[90m<chr>[39m[23m "supplemental_obs", "supplemental_obs", "supplementa…


In [None]:
# Stack the point-intercept detections on top of the cleaned supplemental
# observations. Key 360 is the "NV" key plant code and is excluded from the
# point-intercept rows first.
df_clean_bind <- bind_rows(
  filter(df_plant_intercepts, key_plant_species != 360),
  df_clean_supp
) %>%
  glimpse()

Rows: 38,117
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "436", "436", "436", "436", "436", "436", "436", "43…
$ grid_point        [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ year              [3m[90m<int>[39m[23m 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
$ key_plant_species [3m[90m<int>[39m[23m 5, 481, 201, 554, 241, 16, 250, 102, 267, 154, 57, 2…
$ detection_type    [3m[90m<chr>[39m[23m "point_intercept", "point_intercept", "point_interce…


In [None]:
# double check for multiples: zero rows means every survey_ID /
# key_plant_species pair is now unique. count() returns an ungrouped result,
# so no ungroup() step is required and no `.groups` message is emitted.
df_clean_bind %>%
  count(survey_ID, key_plant_species, name = "n") %>%
  arrange(desc(n)) %>%
  filter(n > 1) %>%
  glimpse()

[1m[22m`summarise()` has grouped output by 'survey_ID'. You can override using the
`.groups` argument.


Rows: 0
Columns: 3
$ survey_ID         [3m[90m<chr>[39m[23m 
$ key_plant_species [3m[90m<int>[39m[23m 
$ n                 [3m[90m<int>[39m[23m 


# Join in Additional Columns

## Filter

In [None]:
# Restrict the cleaned detections to the 2022 surveys, keeping only rows that
# have a species key.
gridVeg_species_richness <- df_clean_bind %>%
  filter(!is.na(key_plant_species), year == 2022) %>%
  glimpse()

Rows: 1,261
Columns: 5
$ survey_ID         [3m[90m<chr>[39m[23m "9D272F52-DDBB-4CC1-8DC5-C12EEB2D4EBA", "9D272F52-DD…
$ grid_point        [3m[90m<int>[39m[23m 45, 45, 45, 45, 45, 45, 45, 54, 54, 54, 54, 54, 54, …
$ year              [3m[90m<int>[39m[23m 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022…
$ key_plant_species [3m[90m<int>[39m[23m 525, 225, 82, 648, 529, 405, 406, 525, 320, 522, 405…
$ detection_type    [3m[90m<chr>[39m[23m "point_intercept", "point_intercept", "point_interce…


# Output

In [None]:
# 2022-08-17 ES
# Save the filtered detections as a CSV in the current working directory.
write_csv(
  x = gridVeg_species_richness,
  file = "gridVeg_species_richness-WRANGLE.csv"
)