<a href="https://colab.research.google.com/github/samsoe/mpg_notebooks/blob/master/Bird_point_count_monitoring_Wrangle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Documentation: [Readme bird point count monitoring](https://drive.google.com/open?id=1PRryJzGOUtfr-fKXzb3tbr48xiaTuAMVk18XFXlvxcM)

# Tools

In [0]:
library(tidyverse)
library(lubridate)

In [0]:
# naniar provides replace_with_na(), used below to blank out invalid survey times.
# Install only when it is missing so re-running the notebook does not reinstall it.
if (!requireNamespace("naniar", quietly = TRUE)) {
  install.packages("naniar")
}
library(naniar)

# Source

In [0]:
# Source file on Google Drive: 2020-05-15_bird_surveyInfo_records_function.csv
src <- "https://drive.google.com/uc?id=1Cmo8e2H0Td2pWXrB4-_YHwdmUJP2Isnj"

In [0]:
# Load the source CSV; text columns stay character rather than becoming factors
df <- read.csv(file = src, stringsAsFactors = FALSE)

# Rename Columns

Rename columns to match the [DB Schema](https://docs.google.com/document/d/1PRryJzGOUtfr-fKXzb3tbr48xiaTuAMVk18XFXlvxcM/edit#heading=h.5inr0hjz4m4m).

In [0]:
# Target column names. They are applied positionally, so the order here must
# match the column order of df.
new_names <- c(
  "survey_ID", "survey_date", "survey_time_start_MDT", "survey_time_end_MDT",
  "survey_visit", "survey_grid_point", "survey_observer", "survey_wind_code",
  "survey_sky_code", "survey_noise_code", "survey_temperature_F",
  "records_ID", "records_spe_code", "records_abundance", "records_sex",
  "records_detect_song", "records_detect_call", "records_detect_visual",
  "records_detect_other", "records_detect_distance_min_meters",
  "records_interval", "records_location_code",
  "habitat_location_description", "function_spe_name_common",
  "function_habitat_preference", "function_food_preference",
  "function_nesting_behavior", "function_feeding_behavior",
  "function_taxa_order", "function_taxa_family"
)

# setNames() does not check lengths: a shorter vector leaves the trailing
# columns named NA. Fail loudly if the source file's column count changes.
stopifnot(length(new_names) == ncol(df))

df <- setNames(df, new_names)

# Clean

In [30]:
# Inspect column names and types after renaming, before any cleaning
str(df)

'data.frame':	117911 obs. of  30 variables:
 $ survey_ID                         : int  1 1 1 1 1 1 1 1 1 1 ...
 $ survey_date                       : chr  "2010-05-31" "2010-05-31" "2010-05-31" "2010-05-31" ...
 $ survey_time_start_MDT             : int  709 709 709 709 709 709 709 709 709 709 ...
 $ survey_time_end_MDT               : int  719 719 719 719 719 719 719 719 719 719 ...
 $ survey_visit                      : int  1 1 1 1 1 1 1 1 1 1 ...
 $ survey_grid_point                 : int  476 476 476 476 476 476 476 476 476 476 ...
 $ survey_observer                   : chr  "Eric Rasmussen" "Eric Rasmussen" "Eric Rasmussen" "Eric Rasmussen" ...
 $ survey_wind_code                  : int  1 1 1 1 1 1 1 1 1 1 ...
 $ survey_sky_code                   : int  2 2 2 2 2 2 2 2 2 2 ...
 $ survey_noise_code                 : int  2 2 2 2 2 2 2 2 2 2 ...
 $ survey_temperature_F              : int  50 50 50 50 50 50 50 50 50 50 ...
 $ records_ID                        : int  1 2 3 4 5 6 7 

## records_sex

In variable `records_sex`, limit values to c("Female", "Male", "Unknown"). Correct typos and capitalization errors.

In [31]:
# Tabulate every distinct records_sex value with its row count,
# to spot typos and capitalization variants
fct_count(df$records_sex)

f,n
<fct>,<int>
,1
1,2
Female,1869
M,1
male,5
Male,80968
MaMe,1
unknown,5
Unknown,35059


In [32]:
# Display deviants: rows whose records_sex is not one of the allowed values.
# %in% never returns NA, so rows with a missing value are listed as well;
# chained `!=` tests evaluate to NA for them and filter() drops those rows.
df %>%
  select(survey_date, records_sex) %>%
  filter(!records_sex %in% c("Female", "Male", "Unknown"))

survey_date,records_sex
<chr>,<chr>
2010-06-25,unknown
2011-07-01,unknown
2012-07-11,male
2013-06-17,unknown
2013-06-18,unknown
2013-05-22,male
2013-05-17,male
2014-05-16,male
2014-06-14,MaMe
2015-05-21,unknown


In [33]:
# Title-case every records_sex value so case variants collapse
# (e.g. "male" -> "Male", "unknown" -> "Unknown")
df <- mutate(df, records_sex = str_to_title(records_sex))

# Re-tabulate to see which non-standard values remain
fct_count(df$records_sex)

f,n
<fct>,<int>
,1
1,2
Female,1869
M,1
Male,80973
Mame,1
Unknown,35064


In [0]:
# Map the remaining non-standard values onto the allowed set:
#   "M", "Mame" -> "Male"
#   "1", ""     -> "Unknown"
# All other values, including NA, are left unchanged.
# The column is character, so it is compared against the string "1" rather
# than relying on implicit coercion of the number 1.
df <- df %>%
  mutate(records_sex = case_when(
    records_sex %in% c("M", "Mame") ~ "Male",
    records_sex %in% c("1", "") ~ "Unknown",
    TRUE ~ records_sex
  ))

In [35]:
# Re-tabulate records_sex to check the result of the recoding
fct_count(df$records_sex)

f,n
<fct>,<int>
Female,1869
Male,80975
Unknown,35067


## records_location_code

In variable `records_location_code`, capitalize instances of "f" and "u".

In [36]:
# Tabulate the distinct records_location_code values and their row counts
fct_count(df$records_location_code)

f,n
<fct>,<int>
0,11959
1,18433
10,141
11,1452
12,860
13,783
14,256
15,5448
16,8384
17,1694


In [0]:
# Capitalize "f","u".
# str_to_upper() is used instead of str_to_title(): title-casing would
# lowercase the trailing letters of any multi-letter code, while upper-casing
# only ever raises lowercase letters. Numeric codes are unaffected.
df <- df %>%
  mutate(records_location_code = str_to_upper(records_location_code))

In [38]:
# Re-tabulate records_location_code after the case change
fct_count(df$records_location_code)

f,n
<fct>,<int>
0,11959
1,18433
10,141
11,1452
12,860
13,783
14,256
15,5448
16,8384
17,1694


## survey_observer

In [39]:
# List the distinct observer names in sorted order
sort(unique(df$survey_observer))

In [40]:
# Relabel the placeholder observer "n" as "unknown"; other values are kept
df <- mutate(
  df,
  survey_observer = replace(
    survey_observer,
    which(survey_observer == "n"),
    "unknown"
  )
)

# Show the resulting set of observer names
unique(df$survey_observer)

## records_detect_other


Leave values as is for the foreseeable future.

In [0]:
# Tabulate records_detect_other values; no cleaning is applied to this column
fct_count(df$records_detect_other)

## records_interval

In [42]:
# Tabulate records_interval values and their row counts to find malformed entries
fct_count(df$records_interval)

f,n
<fct>,<int>
,281
1,85175
1`,1
2,29592
After survey,1838
Before survey,1023
U,1


In [117]:
# Fix the typo "1`" and turn empty strings into missing values.
# The replacement must be the value NA_character_: the original passed the
# function `is.na` itself, which ifelse() cannot replicate, so it fails as
# soon as an empty string is present.
df <- df %>%
  mutate(records_interval = case_when(
    records_interval == "1`" ~ "1",
    records_interval == "" ~ NA_character_,
    TRUE ~ records_interval
  ))

fct_count(df$records_interval)

f,n
<fct>,<int>
1,85176
2,29592
After survey,1838
Before survey,1023
U,1
,281


## Resolve "na" and ""

['Habitat', 'Food', 'Nesting', 'Behavior', 'Order', 'Family'] all have 2 "na" values and 451 "" values.  Update these cells to NA

In [46]:
# Tabulate function_habitat_preference to see how many "na" and ""
# placeholder values it holds
fct_count(df$function_habitat_preference)

f,n
<fct>,<int>
,451
Forest,19233
Grassland,41108
Lake/Pond,2380
Marsh,1344
Mountains,1353
na,2
Open Woodland,42342
River/Stream,201
Scrub,6067


In [47]:
df <- df %>%
  # Convert both placeholders ("na" and "") to NA in a single pass over every
  # character column
  mutate_if(is.character, ~ replace(., . %in% c("na", ""), NA)) %>%
  # droplevels() only affects factor columns; the text columns were read as
  # character (stringsAsFactors = FALSE), so this is kept only as a safeguard
  droplevels()

fct_count(df$function_habitat_preference)

f,n
<fct>,<int>
Forest,19233
Grassland,41108
Lake/Pond,2380
Marsh,1344
Mountains,1353
Open Woodland,42342
River/Stream,201
Scrub,6067
Shoreline,506
Town,2924


## Time

### survey_time_start

In [50]:
# List the distinct start times in ascending order to spot implausible values
sort(unique(df$survey_time_start_MDT))

In [0]:
# Set the listed start-time values to NA (replace_with_na() is from naniar).
# NOTE(review): presumably these are data-entry errors found in the listing of
# distinct start times; confirm the list if the source data changes.
df <- replace_with_na(
  df,
  replace = list(survey_time_start_MDT = c(0, 1, 50, 98))
)

### survey_time_end

In [118]:
# List the distinct end times in ascending order.
# The full column name is used: `df$survey_time_end` only resolved through
# `$` partial matching, which breaks on tibbles or if another column with the
# same prefix is added.
sort(unique(df$survey_time_end_MDT))

In [0]:
# Set the listed end-time values to NA (replace_with_na() is from naniar).
# NOTE(review): presumably these are data-entry errors found in the listing of
# distinct end times; confirm the list if the source data changes.
df <- replace_with_na(
  df,
  replace = list(survey_time_end_MDT = c(6658, 1, 2, 85, 107))
)

# Set DataTypes

In [72]:
# Check the current column types before converting them
str(df)

'data.frame':	117911 obs. of  30 variables:
 $ survey_ID                         : int  1 1 1 1 1 1 1 1 1 1 ...
 $ survey_date                       : Date, format: "2010-05-31" "2010-05-31" ...
 $ survey_time_start_MDT             : int  709 709 709 709 709 709 709 709 709 709 ...
 $ survey_time_end_MDT               : int  719 719 719 719 719 719 719 719 719 719 ...
 $ survey_visit                      : int  1 1 1 1 1 1 1 1 1 1 ...
 $ survey_grid_point                 : int  476 476 476 476 476 476 476 476 476 476 ...
 $ survey_observer                   : chr  "Eric Rasmussen" "Eric Rasmussen" "Eric Rasmussen" "Eric Rasmussen" ...
 $ survey_wind_code                  : int  1 1 1 1 1 1 1 1 1 1 ...
 $ survey_sky_code                   : int  2 2 2 2 2 2 2 2 2 2 ...
 $ survey_noise_code                 : int  2 2 2 2 2 2 2 2 2 2 ...
 $ survey_temperature_F              : int  50 50 50 50 50 50 50 50 50 50 ...
 $ records_ID                        : int  1 2 3 4 5 6 7 8 9 10 ...
 $ rec

## survey_date

In [0]:
# Parse the survey_date strings into Date values (lubridate::as_date)
df <- df %>%
  mutate(survey_date = as_date(survey_date))

## survey_time_*

In [0]:
# Integer clock readings (e.g. 709) are parsed as hour-minute ("HM") and then
# stored as time-of-day values (709 becomes 07:09:00).
# hms::as_hms() replaces hms::as.hms(), which is deprecated.
df$survey_time_start_MDT <- hms::as_hms(
  parse_date_time(df$survey_time_start_MDT, "HM")
)
df$survey_time_end_MDT <- hms::as_hms(
  parse_date_time(df$survey_time_end_MDT, "HM")
)

## records_detect_*

In [0]:
# Convert the song / call / visual detection flags to logical.
# The columns are named explicitly (they are positions 16:18 after renaming)
# so a reordered or extended source file cannot silently convert the wrong
# columns. records_detect_other is deliberately not converted.
df <- df %>%
  mutate_at(
    vars(records_detect_song, records_detect_call, records_detect_visual),
    as.logical
  )

## records_location_code

In [115]:
# Tabulate records_location_code values and their row counts
fct_count(df$records_location_code)

f,n
<fct>,<int>
0,11959
1,18433
10,141
11,1452
12,860
13,783
14,256
15,5448
16,8384
17,1694


In [119]:
# Confirm the column types after the conversions
str(df)

'data.frame':	117911 obs. of  30 variables:
 $ survey_ID                         : int  1 1 1 1 1 1 1 1 1 1 ...
 $ survey_date                       : Date, format: "2010-05-31" "2010-05-31" ...
 $ survey_time_start_MDT             : 'hms' num  07:09:00 07:09:00 07:09:00 07:09:00 ...
  ..- attr(*, "units")= chr "secs"
 $ survey_time_end_MDT               : 'hms' num  07:19:00 07:19:00 07:19:00 07:19:00 ...
  ..- attr(*, "units")= chr "secs"
 $ survey_visit                      : int  1 1 1 1 1 1 1 1 1 1 ...
 $ survey_grid_point                 : int  476 476 476 476 476 476 476 476 476 476 ...
 $ survey_observer                   : chr  "Eric Rasmussen" "Eric Rasmussen" "Eric Rasmussen" "Eric Rasmussen" ...
 $ survey_wind_code                  : int  1 1 1 1 1 1 1 1 1 1 ...
 $ survey_sky_code                   : int  2 2 2 2 2 2 2 2 2 2 ...
 $ survey_noise_code                 : int  2 2 2 2 2 2 2 2 2 2 ...
 $ survey_temperature_F              : int  50 50 50 50 50 50 50 50 50 50 ...
 

# Push to BigQuery

Based on my understanding the push to BQ from bigrquery is still convoluted.  I was able to push to a table in BQ but the column order was not preserved.

## API Key

In [0]:
# Authenticate with the service-account key file uploaded to the runtime.
# Namespace-qualified because no library(bigrquery) call is visible in this
# notebook.
bigrquery::bq_auth(path = "/content/mpg-data-warehouse-34434e1a9914.json")

## Connection

In [0]:
# bq_test_project() returns the project named in BIGQUERY_TEST_PROJECT; that
# value is used as the billing project for the connection.
# Namespace-qualified because no library(bigrquery) call is visible in this
# notebook.
Sys.setenv(BIGQUERY_TEST_PROJECT = "mpg-data-warehouse")
billing <- bigrquery::bq_test_project()

In [0]:
# Open a DBI connection to the bird_point_counts dataset.
# dbConnect() is the DBI generic; it is qualified because no library(DBI)
# call is visible in this notebook.
con <- DBI::dbConnect(
  bigrquery::bigquery(),
  project = "mpg-data-warehouse",
  dataset = "bird_point_counts",
  billing = billing
)

In [0]:
# Print the connection to confirm the dataset and billing project
con

<BigQueryConnection>
  Dataset: mpg-data-warehouse.bird_point_counts
  Billing: mpg-data-warehouse

In [0]:
# List the tables available through the connection.
# Qualified because no library(DBI) call is visible in this notebook.
DBI::dbListTables(con)

## Push

### testing


In [0]:
# Reference to the bird_point_counts dataset in the mpg-data-warehouse project.
# Qualified because no library(bigrquery) call is visible in this notebook.
ds <- bigrquery::bq_dataset("mpg-data-warehouse", "bird_point_counts")

In [0]:
# Print the dataset reference
ds

<bq_dataset> mpg-data-warehouse.bird_point_counts

In [0]:
# Reference to the scratch table "testing" used as the upload target.
# Qualified because no library(bigrquery) call is visible in this notebook.
bq_test <- bigrquery::bq_table(ds, "testing")

In [216]:
# Check whether the bird_surveyInfo_records_function table already exists.
# Qualified because no library(bigrquery) call is visible in this notebook.
bigrquery::bq_table_exists(
  bigrquery::bq_table(ds, "bird_surveyInfo_records_function")
)

In [0]:
# Upload the cleaned data frame to the "testing" table.
# Qualified because no library(bigrquery) call is visible in this notebook.
bigrquery::bq_table_upload(bq_test, df)

In [206]:


# The original call had unbalanced parentheses and did not parse.
# bq_table_create() takes a table reference as its first argument, so the
# reference is built from the dataset and a table name.
# NOTE(review): the intended table name is assumed to be "some_x" — confirm.
bq_x <- bigrquery::bq_table_create(bigrquery::bq_table(ds, "some_x"))

ERROR: ignored

In [0]:
# Open the documentation for bq_table_upload(); the package is named so the
# lookup works even when bigrquery is not attached
help("bq_table_upload", package = "bigrquery")

### odbc

In [209]:
# Template for connecting to BigQuery through an ODBC driver instead of
# bigrquery. The bracketed strings are placeholders that must be replaced
# before this cell can run.
# NOTE(review): this reassigns `con`, replacing the bigrquery connection
# created earlier in the notebook — confirm that is intended.
con <- DBI::dbConnect(odbc::odbc(),
                      Driver         = "[your driver's name]",
                      Catalog        = "[BigQuery project name]",
                      Email          = "[Google service account email]",
                      KeyFilePath    = "[Full path to key file]",
                      OAuthMechanism = 0)

<bq_dataset> mpg-data-warehouse.bird_point_counts

In [72]:
# Inspect the structure of df
str(df)

'data.frame':	117911 obs. of  30 variables:
 $ survey_ID                         : int  1 1 1 1 1 1 1 1 1 1 ...
 $ survey_date                       : Date, format: "2010-05-31" "2010-05-31" ...
 $ survey_time_start_MDT             : int  709 709 709 709 709 709 709 709 709 709 ...
 $ survey_time_end_MDT               : int  719 719 719 719 719 719 719 719 719 719 ...
 $ survey_visit                      : int  1 1 1 1 1 1 1 1 1 1 ...
 $ survey_grid_point                 : int  476 476 476 476 476 476 476 476 476 476 ...
 $ survey_observer                   : chr  "Eric Rasmussen" "Eric Rasmussen" "Eric Rasmussen" "Eric Rasmussen" ...
 $ survey_wind_code                  : int  1 1 1 1 1 1 1 1 1 1 ...
 $ survey_sky_code                   : int  2 2 2 2 2 2 2 2 2 2 ...
 $ survey_noise_code                 : int  2 2 2 2 2 2 2 2 2 2 ...
 $ survey_temperature_F              : int  50 50 50 50 50 50 50 50 50 50 ...
 $ records_ID                        : int  1 2 3 4 5 6 7 8 9 10 ...
 $ rec