<a href="https://colab.research.google.com/github/samsoe/mpg_notebooks/blob/master/gridVeg_groundCover_intercepts_WRANGLE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Security

* The user must load a `json` file containing the BigQuery API key into the local directory `/content/...`
* The user must have a Google Maps API key to enable mapping. 
   * CAUTION: make sure the key is deleted from the current instance of the notebook before sharing it.

# Tools

In [1]:
# Package and library installation
# Install whichever required packages are missing, then attach every one of them.
packages_needed <- c("tidyverse", "knitr") # comma delimited vector of package names
packages_installed <- packages_needed %in% rownames(installed.packages())

if (!all(packages_installed)) {
  install.packages(packages_needed[!packages_installed])
}
# seq_along() stays correct for an empty vector, where 1:length(x) would give c(1, 0)
for (i in seq_along(packages_needed)) {
  library(packages_needed[i], character.only = TRUE)
}

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.5     [32m✔[39m [34mdplyr  [39m 1.0.3
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



* Remember that the file containing authorization keys for BigQuery must be loaded into the virtual environment manually.

In [None]:
# Install bigrquery only when it is not already available, so the package and
# its dependencies are not re-downloaded on every run; then attach it.
if (!requireNamespace("bigrquery", quietly = TRUE)) {
  install.packages("bigrquery")
}
library(bigrquery)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘bit’, ‘bit64’, ‘gargle’, ‘rapidjsonr’




# Source

## Database Connection

In [None]:
# BigQuery API Key
# Authenticate bigrquery using the JSON key file at this path. The file must
# already have been uploaded to the runtime before this cell is run.
bq_auth(path = "/content/mpg-data-warehouse-api_key-master.json")

In [None]:
# Set the BIGQUERY_TEST_PROJECT environment variable to the project name for this R session.
Sys.setenv(BIGQUERY_TEST_PROJECT = "mpg-data-warehouse")

In [None]:
# Project identifier passed as the billing project to the queries and connection below.
# NOTE(review): bq_test_project() presumably returns the value of the
# BIGQUERY_TEST_PROJECT environment variable — confirm against the bigrquery docs.
billing <- bq_test_project()

### vegetation_point_intercept_gridVeg

In [None]:
# Open a DBI connection to the vegetation_point_intercept_gridVeg dataset in the
# mpg-data-warehouse project, passing `billing` as the billing project.
con_point_intercept <- dbConnect(
  bigrquery::bigquery(),
  billing = billing,
  dataset = "vegetation_point_intercept_gridVeg",
  project = "mpg-data-warehouse"
)

In [None]:
# List the tables reachable through the connection (quick check that it works).
dbListTables(con_point_intercept)

In [None]:
# For every survey_ID x grid_point x intercept_ground_code combination, count
# the ground intercepts and halve the count to give intercepts_pct.
ground_point_intercept_sql <- "
  SELECT
    survey_ID,
    grid_point,
    intercept_ground_code,
    COUNT(intercept_ground_code) / 2 AS intercepts_pct
  FROM 
    `mpg-data-warehouse.vegetation_point_intercept_gridVeg.gridVeg_point_intercept_ground`
  GROUP BY
    survey_ID, grid_point, intercept_ground_code
"
bq_ground_point_intercept <- bq_project_query(billing, ground_point_intercept_sql)
tb_ground_point_intercept <- bq_table_download(bq_ground_point_intercept)

# Drop NA rows. These only occur in intercept_ground_code, are always associated
# with zero values in intercepts_pct, and will be expanded to true values later
# using complete().
# History (BL, 2021-01-22): an earlier version used drop_na() here; it was
# replaced by a comparison against the literal string "NA". That comparison also
# discards true missing values, because filter() drops rows whose condition
# evaluates to NA; the is.na() test below just makes that explicit.
df_ground_point_intercept <- tb_ground_point_intercept %>%
  as.data.frame() %>%
  filter(!is.na(intercept_ground_code), intercept_ground_code != "NA") %>%
  glimpse()

Rows: 9,518
Columns: 4
$ survey_ID             [3m[90m<chr>[39m[23m "69", "69", "69", "234", "234", "234", "234", "…
$ grid_point            [3m[90m<int>[39m[23m 329, 329, 329, 48, 48, 48, 48, 295, 295, 295, 2…
$ intercept_ground_code [3m[90m<chr>[39m[23m "L", "WDT", "BV", "L", "BV", "M/L", "G", "L", "…
$ intercepts_pct        [3m[90m<dbl>[39m[23m 78.0, 1.0, 18.5, 41.5, 19.5, 12.0, 16.5, 51.5, …


### gridVeg_survey_metadata

In [None]:
# Fetch survey_ID, year and survey_sequence from the survey metadata table.
survey_metadata_sql <- "
  SELECT
    survey_ID,
    year,
    survey_sequence
  FROM
    `mpg-data-warehouse.vegetation_point_intercept_gridVeg.gridVeg_survey_metadata`
"
bq_survey_metadata <- bq_project_query(billing, survey_metadata_sql)
tb_survey_metadata <- bq_table_download(bq_survey_metadata)

# Convert the download to a plain data frame and print its structure.
df_survey_metadata <- as.data.frame(tb_survey_metadata)
glimpse(df_survey_metadata)

Rows: 1,472
Columns: 3
$ survey_ID       [3m[90m<chr>[39m[23m "F31C56A8-912D-410C-A17D-4C2DD75F71A4", "A19E87E6-A89…
$ year            [3m[90m<int>[39m[23m 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016,…
$ survey_sequence [3m[90m<chr>[39m[23m "2016", "2016", "2016", "2016", "2016", "2016", "2016…


year,survey_sequence
<int>,<chr>
2016,2016
2010,2010
2011,2011-12
2012,2011-12
2017,2017
2013,2013
2015,2015


### ground_cover_metadata

In [None]:
# Fetch the lookup table that maps each intercept_ground_code to its ground_group.
sql_ground_meta <- "
SELECT
  intercept_ground_code,
  ground_group
FROM
  `mpg-data-warehouse.vegetation_point_intercept_gridVeg.gridVeg_ground_cover_metadata`
"
bq_ground_meta <- bq_project_query(billing, sql_ground_meta)
tb_ground_meta <- bq_table_download(bq_ground_meta)

# Plain data frame, sorted by code so the printed table is easy to scan.
df_ground_meta <- tb_ground_meta %>%
  as.data.frame() %>%
  arrange(intercept_ground_code)

kable(df_ground_meta, format = "pandoc")



intercept_ground_code   ground_group 
----------------------  -------------
BG                      inorganic    
BV                      vas_plant    
G                       inorganic    
L                       litter       
LIC                     lichen       
M                       nonvas_plant 
M/L                     nonvas_plant 
OTHER                   inorganic    
R                       inorganic    
S                       inorganic    
SC                      dung         
SD                      dung         
SE                      dung         
SH                      dung         
SU                      dung         
WDL                     woody        
WDS                     woody        
WDSTUMP                 woody        
WDT                     woody        

# Wrangle

In [None]:
# Sorted list of the distinct ground codes present in the intercept data
arrange(
  distinct(df_ground_point_intercept, intercept_ground_code),
  intercept_ground_code
)

intercept_ground_code
<chr>
BG
BV
G
L
LIC
M
M/L
OTHER
R
S


## Complete intercept_ground_code
- Complete the data frame for `intercept_ground_code` and fill null cells with 0
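- As of 2020-08-24: with 18 ground cover codes and 1244 survey IDs, we should end up with 22,392 rows (18 × 1244) after completing the data frame; in general, the expected row count is the number of distinct codes present in the data multiplied by 1244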
- We should also end up with 1244 distinct combinations of survey_ID and grid_point

In [None]:
# Expand the data so that every observed (survey_ID, grid_point) pair has one row
# for every ground code found in the data; newly created rows get intercepts_pct = 0.
df_ground_point_intercept_fill <- complete(
  df_ground_point_intercept,
  intercept_ground_code,
  nesting(survey_ID, grid_point),
  fill = list(intercepts_pct = 0)
)
glimpse(df_ground_point_intercept_fill)

# Number of distinct survey_ID / grid_point pairs after completion
count(distinct(df_ground_point_intercept_fill, survey_ID, grid_point))

Rows: 23,636
Columns: 4
$ intercept_ground_code [3m[90m<chr>[39m[23m "BG", "BG", "BG", "BG", "BG", "BG", "BG", "BG",…
$ survey_ID             [3m[90m<chr>[39m[23m "012C5FAD-2451-41B0-9E2F-432D1ECEB55C", "013380…
$ grid_point            [3m[90m<int>[39m[23m 285, 505, 401, 23, 323, 511, 402, 196, 226, 564…
$ intercepts_pct        [3m[90m<dbl>[39m[23m 7.0, 4.5, 4.5, 9.0, 0.0, 0.0, 14.0, 18.5, 20.5,…


n
<int>
1244


## Join datasets

In [None]:
# Attach survey metadata (matched on survey_ID) and the ground-cover group
# (matched on intercept_ground_code), then put the columns in their final order.
df_gridVeg_groundCover_intercepts <- df_ground_point_intercept_fill %>%
  left_join(df_survey_metadata, by = "survey_ID") %>%
  left_join(df_ground_meta, by = "intercept_ground_code") %>%
  select(
    survey_ID,
    year,
    survey_sequence,
    grid_point,
    intercept_ground_code,
    ground_group,
    intercepts_pct
  )
glimpse(df_gridVeg_groundCover_intercepts)

Rows: 23,636
Columns: 7
$ survey_ID             [3m[90m<chr>[39m[23m "012C5FAD-2451-41B0-9E2F-432D1ECEB55C", "013380…
$ year                  [3m[90m<int>[39m[23m 2016, 2016, 2017, 2016, 2015, 2016, 2015, 2016,…
$ survey_sequence       [3m[90m<chr>[39m[23m "2016", "2016", "2017", "2016", "2015", "2016",…
$ grid_point            [3m[90m<int>[39m[23m 285, 505, 401, 23, 323, 511, 402, 196, 226, 564…
$ intercept_ground_code [3m[90m<chr>[39m[23m "BG", "BG", "BG", "BG", "BG", "BG", "BG", "BG",…
$ ground_group          [3m[90m<chr>[39m[23m "inorganic", "inorganic", "inorganic", "inorgan…
$ intercepts_pct        [3m[90m<dbl>[39m[23m 7.0, 4.5, 4.5, 9.0, 0.0, 0.0, 14.0, 18.5, 20.5,…


## QC
* No rows contain missing data, and this makes sense since `complete()` was used to fill response variables.
* Numeric variables appear correct in the summary

In [None]:
# Show every row that has at least one missing value
df_gridVeg_groundCover_intercepts[!complete.cases(df_gridVeg_groundCover_intercepts), ]

survey_ID,year,survey_sequence,grid_point,intercept_ground_code,ground_group,intercepts_pct
<chr>,<int>,<chr>,<int>,<chr>,<chr>,<dbl>


In [None]:
# Per-column summary of the joined data frame, used to eyeball value ranges during QC.
summary(df_gridVeg_groundCover_intercepts)

  survey_ID              year      survey_sequence      grid_point   
 Length:24880       Min.   :2011   Length:24880       Min.   :  1.0  
 Class :character   1st Qu.:2011   Class :character   1st Qu.:136.8  
 Mode  :character   Median :2015   Mode  :character   Median :290.0  
                    Mean   :2014                      Mean   :286.3  
                    3rd Qu.:2016                      3rd Qu.:431.0  
                    Max.   :2017                      Max.   :583.0  
 intercept_ground_code ground_group       intercepts_pct  
 Length:24880          Length:24880       Min.   : 0.000  
 Class :character      Class :character   1st Qu.: 0.000  
 Mode  :character      Mode  :character   Median : 0.000  
                                          Mean   : 4.997  
                                          3rd Qu.: 2.000  
                                          Max.   :99.500  

In [None]:
# Spot-check the rows of one survey. survey_ID is a character column, so compare
# it with a string instead of relying on implicit numeric-to-character coercion.
df_gridVeg_groundCover_intercepts %>% filter(survey_ID == "234")

survey_ID,year,survey_sequence,grid_point,intercept_ground_code,ground_group,intercepts_pct
<chr>,<int>,<chr>,<int>,<chr>,<chr>,<dbl>
234,2011,2011-12,48,BG,inorganic,4.5
234,2011,2011-12,48,BV,vas_plant,19.5
234,2011,2011-12,48,G,inorganic,16.5
234,2011,2011-12,48,L,litter,41.5
234,2011,2011-12,48,LIC,lichen,0.0
234,2011,2011-12,48,M,nonvas_plant,0.0
234,2011,2011-12,48,M/L,nonvas_plant,12.0
234,2011,2011-12,48,OTHER,inorganic,0.0
234,2011,2011-12,48,R,inorganic,3.0
234,2011,2011-12,48,S,inorganic,2.0


# Output

In [None]:
# Revision not yet output
# NOTE(review): as of readr 1.4.0 the `path` argument of write_csv() is deprecated
# in favor of `file`; switch the argument name before re-enabling the line below.
# write_csv(df_gridVeg_groundCover_intercepts, path = "gridVeg_groundCover_intercepts_WRANGLE.csv")