<a href="https://colab.research.google.com/github/samsoe/mpg_notebooks/blob/master/gridVeg_plant_abundance_matrix_wrangle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Security

* The user must load a `json` file containing the BigQuery API key into the local directory `/content/...`
* The user must have a Google Maps API key to enable mapping. 
   * CAUTION: make sure the key is deleted from the current instance of the notebook before sharing.

# Tools

* Remember that the file containing authorization keys for BigQuery must be loaded into the virtual environment manually.

In [1]:
install.packages("bigrquery")
library(bigrquery)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘bit’, ‘bit64’, ‘gargle’, ‘rapidjsonr’




In [2]:
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

✔ ggplot2 3.3.3     ✔ purrr   0.3.4
✔ tibble  3.0.5     ✔ dplyr   1.0.3
✔ tidyr   1.1.2     ✔ stringr 1.4.0
✔ readr   1.4.0     ✔ forcats 0.5.0

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()



# Source

## Database Connection

In [3]:
# BigQuery API key
# The JSON key file has to be uploaded to /content by hand before this cell
# runs. Check for it up front so a missing upload produces a clear, actionable
# error instead of a failure inside the authentication call.
bq_key_path <- "/content/mpg-data-warehouse-api_key-master.json"
if (!file.exists(bq_key_path)) {
  stop(
    "BigQuery key file not found at '", bq_key_path, "'. ",
    "Upload the JSON key to /content before running this cell.",
    call. = FALSE
  )
}
bq_auth(path = bq_key_path)

In [4]:
# Set the BIGQUERY_TEST_PROJECT environment variable to the warehouse project
# ID for the current R session.
Sys.setenv(BIGQUERY_TEST_PROJECT = "mpg-data-warehouse")

In [5]:
# Keep the project returned by bq_test_project() as `billing`; it is supplied
# as the billing project to the connections and queries in the cells below.
# NOTE(review): presumably bq_test_project() reads the BIGQUERY_TEST_PROJECT
# variable set in the previous cell — confirm against the bigrquery docs.
billing <- bq_test_project()

### Survey Data: Plant Intercepts

In [None]:
# Open a DBI connection to the `vegetation_gridVeg_summaries` dataset (the
# dataset that holds the plant-intercepts table queried below), using
# `billing` as the billing project.
con_survey_effort <- dbConnect(
  drv = bigrquery::bigquery(),
  billing = billing,
  project = "mpg-data-warehouse",
  dataset = "vegetation_gridVeg_summaries"
)

In [None]:
# List the tables visible through the summaries connection, as a quick check
# of what the dataset exposes.
dbListTables(con_survey_effort)

In [None]:
# Query text for the plant-intercepts table: selects survey_ID, grid_point,
# key_plant_code, and intercepts_pct. The pieces are joined with newlines so
# the resulting string matches the original multi-line literal exactly.
intercepts_sql <- paste(
  c(
    "",
    "  SELECT survey_ID, grid_point, key_plant_code, intercepts_pct",
    "  FROM `mpg-data-warehouse.vegetation_gridVeg_summaries.gridVeg_plant_intercepts`",
    "  "
  ),
  collapse = "\n"
)

In [None]:
# Submit the intercepts query with `billing` as the project argument. The
# returned object is handed to bq_table_download() in the next cell.
bq_intercepts <- bq_project_query(billing, intercepts_sql)

In [None]:
# Download the result of the intercepts query into the R session.
tb_intercepts <- bq_table_download(bq_intercepts)

In [None]:
# Coerce the downloaded intercepts to a base data.frame; this is the long-form
# input to the reshape step in the Wrangle section.
df_intercepts <- as.data.frame(tb_intercepts)

In [None]:
# Preview the column names, types, and leading values of the intercepts data.
glimpse(df_intercepts)

Rows: 25,089
Columns: 4
$ survey_ID      [3m[90m<chr>[39m[23m "436", "436", "436", "436", "436", "436", "436", "436"…
$ grid_point     [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ key_plant_code [3m[90m<chr>[39m[23m "ELYTRA", "POAPAL", "FESCAM", "ARESER", "HEUCYL", "CAR…
$ intercepts_pct [3m[90m<dbl>[39m[23m 2.5, 1.5, 14.5, 1.0, 2.5, 1.0, 1.0, 0.5, 0.5, 0.5, 0.5…


### Survey Metadata

In [None]:
# Open a DBI connection to the `vegetation_point_intercept_gridVeg` dataset
# (the dataset that holds the survey metadata table queried below), using
# `billing` as the billing project.
con_survey_meta <- dbConnect(
  drv = bigrquery::bigquery(),
  billing = billing,
  project = "mpg-data-warehouse",
  dataset = "vegetation_point_intercept_gridVeg"
)

In [None]:
# List the tables visible through the metadata connection, as a quick check
# of what the dataset exposes.
dbListTables(con_survey_meta)

In [None]:
# Query text for the survey metadata table: selects survey_ID, year, and
# survey_sequence. paste() joins the two clauses with a single space, which
# reproduces the original one-line string.
meta_sql <- paste(
  "SELECT survey_ID, year, survey_sequence",
  "FROM `mpg-data-warehouse.vegetation_point_intercept_gridVeg.gridVeg_survey_metadata`"
)

In [None]:
# Submit the metadata query with `billing` as the project argument. The
# returned object is handed to bq_table_download() in the next cell.
bq_meta <- bq_project_query(billing, meta_sql)

In [None]:
# Download the result of the metadata query into the R session.
tb_meta <- bq_table_download(bq_meta)

In [None]:
# Coerce the downloaded metadata to a base data.frame; it is joined onto the
# abundance matrix by survey_ID in the Wrangle section.
df_meta <- as.data.frame(tb_meta)

In [None]:
# Preview the column names, types, and leading values of the survey metadata.
glimpse(df_meta)

Rows: 1,472
Columns: 3
$ survey_ID       [3m[90m<chr>[39m[23m "F31C56A8-912D-410C-A17D-4C2DD75F71A4", "A19E87E6-A89…
$ year            [3m[90m<int>[39m[23m 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016,…
$ survey_sequence [3m[90m<chr>[39m[23m "2016", "2016", "2016", "2016", "2016", "2016", "2016…


# Wrangle

## Reshape Intercepts/Effort

Transform the long-form data to a wide species-samples matrix. 

In [None]:
# Build the wide species-by-sample matrix: one row per survey_ID/grid_point
# pair and one column per key_plant_code. Sorting by plant code first makes
# the species columns come out in alphabetical order; combinations with no
# recorded intercepts are filled with 0.
df_intercepts_mat <- pivot_wider(
  arrange(df_intercepts, key_plant_code),
  id_cols = c(survey_ID, grid_point),
  names_from = key_plant_code,
  values_from = intercepts_pct,
  values_fill = 0
)

# Preview the reshaped matrix.
glimpse(df_intercepts_mat)

Rows: 1,244
Columns: 491
$ survey_ID  [3m[90m<chr>[39m[23m "826", "945", "822", "944", "840", "E883773B-70B8-472F-8D5…
$ grid_point [3m[90m<int>[39m[23m 399, 415, 424, 425, 494, 338, 399, 415, 399, 489, 117, 371…
$ ABIGRA     [3m[90m<dbl>[39m[23m 10.0, 1.5, 2.0, 2.0, 1.5, 0.5, 23.5, 10.5, 17.0, 0.0, 0.0,…
$ ABILAS     [3m[90m<dbl>[39m[23m 0.5, 0.0, 0.0, 0.0, 14.5, 0.0, 0.0, 0.0, 0.0, 0.5, 31.5, 4…
$ ACEGLA     [3m[90m<dbl>[39m[23m 7.0, 1.5, 0.5, 12.5, 0.5, 0.0, 5.0, 0.5, 9.5, 8.5, 0.0, 9.…
$ ACHMIL     [3m[90m<dbl>[39m[23m 0.0, 0.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.5…
$ ACTRUB     [3m[90m<dbl>[39m[23m 0.0, 2.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0…
$ AGAURT     [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ AGOAUR     [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ AGOGLA     [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Join Tables

Join the `year` and `survey_sequence` variables from the survey metadata table, and move the new variables to the left side of the matrix, consistent with the requested schema.

In [None]:
# Attach `year` and `survey_sequence` from the survey metadata to every row of
# the matrix (matched on survey_ID), place them directly after survey_ID so
# the identifying columns sit on the left, then order rows by year and
# grid point.
df_abundance_matrix <-
  left_join(df_intercepts_mat, df_meta, by = "survey_ID") %>%
  relocate(year, survey_sequence, .after = survey_ID) %>%
  arrange(year, grid_point)

# Preview the joined matrix.
glimpse(df_abundance_matrix)

Rows: 1,244
Columns: 493
$ survey_ID       [3m[90m<chr>[39m[23m "436", "437", "561", "560", "559", "558", "695", "438…
$ year            [3m[90m<int>[39m[23m 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,…
$ survey_sequence [3m[90m<chr>[39m[23m "2011-12", "2011-12", "2011-12", "2011-12", "2011-12"…
$ grid_point      [3m[90m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17…
$ ABIGRA          [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ ABILAS          [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ ACEGLA          [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ ACHMIL          [3m[90m<dbl>[39m[23m 1.5, 0.0, 0.0, 1.0, 3.5, 0.5, 1.5, 0.5, 1.5, 0.0, 1.5…
$ ACTRUB          [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ AGAURT          [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### Check for errors
No NA values found, schema is correct and variable types are consistent. 

In [None]:
# Count the NA cells in the abundance matrix; 0 means every survey matched a
# metadata row and no abundance value is missing.
sum(is.na(df_abundance_matrix))

# Output