
Use chromote for web scraping (#78)
- Update `wdpa_fetch()` to use the _chromote_ package to handle web scraping
 (instead of _webdriver_).
- Update `wdpa_fetch()` so that it can download country-level data in either
  shapefile or file geodatabase format (using the new `datatype` parameter).
  Since file geodatabase data appears to be more robust, `wdpa_fetch()`
  now defaults to downloading data in file geodatabase format.
- Update `wdpa_clean()` to standardize column names so that cleaning
  either shapefile or file geodatabase data results in the same output.
- Update `wdpa_clean()` so that it removes leading/trailing white space
  characters from the `"MANG_PLAN"` field.
- Fix bug in `wdpa_read()` that causes output objects to contain no columns.
- Update README and vignette to be compatible with updates to _ggmap_ package.
- Remove _withr_ package from DESCRIPTION because it is not used.
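As a usage sketch of the new `datatype` parameter (the `"gdb"` default appears in the diff below; the `"shp"` value and the country name are assumptions for illustration, and a download requires an internet connection plus a Chromium-based browser):

```r
# Sketch of downloading country-level data with the new datatype parameter
# (assumes the updated wdpar package; "shp" is an assumed alternative value).
library(wdpar)

# download data in file geodatabase format (the new default)
lie_gdb <- wdpa_fetch("Liechtenstein", wait = TRUE, datatype = "gdb")

# or request shapefile format instead
lie_shp <- wdpa_fetch("Liechtenstein", wait = TRUE, datatype = "shp")
```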
jeffreyhanson committed Apr 25, 2024
1 parent ce28c41 commit 9b1ef74
Showing 58 changed files with 816 additions and 415 deletions.
11 changes: 11 additions & 0 deletions .github/workflows/R-CMD-check-macos.yaml
@@ -50,16 +50,27 @@ jobs:
brew install mpfr
brew install automake
brew install gdal
brew install --cask chromium
- uses: r-lib/actions/setup-r-dependencies@v2
with:
cache-version: 4
extra-packages: |
any::rcmdcheck
any::remotes
cran/XML
prepr=?ignore
needs: |
check
- name: Install spatial R packages
run: |
options(repos = "https://cloud.r-project.org/")
install.packages("sf")
install.packages("lwgeom")
remotes::install_github("dickoa/prepr")
shell: Rscript {0}

- name: Session information
run: |
options(width = 100)
3 changes: 2 additions & 1 deletion .github/workflows/R-CMD-check-ubuntu.yaml
@@ -52,7 +52,8 @@ jobs:
sudo apt update
sudo apt-get install \
libgdal-dev libgeos-dev libproj-dev \
libgmp3-dev libmpfr-dev libudunits2-dev
libgmp3-dev libmpfr-dev libudunits2-dev \
chromium-browser
- uses: r-lib/actions/setup-r-dependencies@v2
with:
9 changes: 8 additions & 1 deletion .github/workflows/R-CMD-check-windows.yaml
@@ -46,6 +46,10 @@ jobs:

- uses: r-lib/actions/setup-pandoc@v2

- uses: browser-actions/setup-chrome@v1
with:
install-dependencies: true

- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.r }}
@@ -56,14 +60,17 @@
cache-version: 4
extra-packages: |
any::rcmdcheck
any::remotes
any::XML
prepr=?ignore
needs: |
check
- name: Install spatial R packages
run: |
install.packages("remotes")
options(repos = "https://cloud.r-project.org/")
install.packages("sf")
install.packages("lwgeom")
remotes::install_github("dickoa/prepr")
shell: Rscript {0}

4 changes: 3 additions & 1 deletion .github/workflows/documentation.yaml
@@ -50,7 +50,8 @@ jobs:
sudo apt update
sudo apt-get install \
libgdal-dev libgeos-dev libproj-dev \
libgmp3-dev libmpfr-dev libudunits2-dev
libgmp3-dev libmpfr-dev libudunits2-dev \
chromium-browser
- uses: r-lib/actions/setup-r-dependencies@v2
with:
@@ -84,6 +85,7 @@ jobs:
run: |
result <- urlchecker::url_check()
result <- result[!startsWith(result$URL, "https://doi.org/"), , drop = FALSE]
result <- result[!startsWith(result$URL, "http://geo.abds.is"), , drop = FALSE]
if (nrow(result) > 0) {
print(result)
stop("Invalid URLs detected")
13 changes: 6 additions & 7 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: wdpar
Type: Package
Version: 1.3.7.1
Version: 1.3.7.2
Title: Interface to the World Database on Protected Areas
Description: Fetch and clean data from the World Database on Protected
Areas (WDPA) and the World Database on Other Effective Area-Based
@@ -18,8 +18,7 @@ Imports:
curl (>= 3.2),
httr (>= 1.3.1),
countrycode (>= 1.1.0),
withr (>= 2.5.0),
webdriver (>= 1.0.6),
chromote (>= 0.2.0),
xml2 (>= 1.2.0),
cli (>= 1.0.1),
lwgeom (>= 0.2-1),
@@ -31,9 +30,9 @@ Suggests:
knitr (>= 1.2.0),
roxygen2 (>= 6.1.1),
rmarkdown (>= 1.10),
ggmap (>= 2.6.1),
ggmap (>= 4.0.0),
ggplot2 (>= 3.1.0),
prepr (>= 0.1.9000),
prepr (>= 0.3.0),
dplyr (>= 1.0.7),
ps (>= 1.5.0)
Depends:
@@ -45,7 +44,7 @@ Language: en-US
URL: https://prioritizr.github.io/wdpar/, https://github.com/prioritizr/wdpar
BugReports: https://github.com/prioritizr/wdpar/issues
VignetteBuilder: knitr
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
Collate:
'internal.R'
'package.R'
@@ -61,4 +60,4 @@ Collate:
'zzz.R'
Roxygen: list(markdown = TRUE)
Remotes:
dickoa/prepr
prioritizr/prepr
18 changes: 17 additions & 1 deletion NEWS.md
@@ -1,3 +1,19 @@
# wdpar 1.3.7.2

- Update `wdpa_fetch()` to use the _chromote_ package to handle web scraping
  (instead of _webdriver_).
- Update `wdpa_fetch()` so that it can download country-level data in either
shapefile or file geodatabase format (using the new `datatype` parameter).
Since file geodatabase data appears to be more robust, `wdpa_fetch()`
now defaults to downloading data in file geodatabase format.
- Update `wdpa_clean()` to standardize column names so that cleaning
either shapefile or file geodatabase data results in the same output.
- Update `wdpa_clean()` so that it removes leading/trailing white space
characters from the `"MANG_PLAN"` field.
- Fix bug in `wdpa_read()` that causes output objects to contain no columns.
- Update README and vignette to be compatible with updates to _ggmap_ package.
- Remove _withr_ package from DESCRIPTION because it is not used.

# wdpar 1.3.7.1

- Update tests to accommodate corrupted data from Protected Planet.
@@ -97,7 +113,7 @@

- CRAN release.
- Update `read_sf_n` to import data faster.
- Remove withr R package from DESCRIPTION because it is not used.
- Remove _withr_ package from DESCRIPTION because it is not used.

# wdpar 1.3.1.6

4 changes: 2 additions & 2 deletions R/package.R
@@ -29,9 +29,9 @@
#' *Science*, **350**: 1255--1258.
#'
#' @name wdpar
#' @aliases wdpar-package
#' @docType package
NULL
#' @aliases wdpar-package
"_PACKAGE"

#' @import sf
NULL
29 changes: 29 additions & 0 deletions R/wdpa_clean.R
@@ -80,6 +80,12 @@ NULL
#' This step is only performed if the argument to `exclude_unesco` is
#' `TRUE`.
#'
#' \item Standardize column names. This is important so that data
#' imported in shapefile or file geodatabase format have the
#' same column names. Specifically, if present, the `"PARENT_ISO3"` field is
#' renamed to `"PARENT_ISO"` and the `"SHAPE"` field is renamed to
#' `"geometry"`.
#'
#' \item Create a field (`"GEOMETRY_TYPE"`) indicating if areas are
#' represented as point localities (`"POINT"`) or as polygons
#' (`"POLYGON"`).
@@ -144,6 +150,8 @@ NULL
#' \item The size of areas are calculated in square kilometers and stored in
#' the field `"AREA_KM2"`.
#'
#' \item Extra leading or trailing white space characters (e.g., `" "`,
#' `"\n"`, `"\r"`) are trimmed from the `"MANG_PLAN"` field.
#' }
#'
#' @section Recommended practices for large datasets:
@@ -284,6 +292,20 @@ wdpa_clean <- function(x,
cli::cli_progress_step("retaining UNESCO Biosphere Reserves")
}
}
# standardize column names
if (verbose) {
cli::cli_progress_step("standardizing field names")
}
if ("PARENT_ISO3" %in% names(x)) {
names(x)[names(x) == "PARENT_ISO3"] <- "PARENT_ISO"
}
if ("SHAPE" %in% names(x)) {
names(x)[names(x) == "SHAPE"] <- "geometry"
x <- sf::st_set_geometry(x, "geometry")
}
## assign column indicating geometry type
is_point <- vapply(sf::st_geometry(x), inherits, logical(1),
c("POINT", "MULTIPOINT"))
@@ -434,6 +456,13 @@ wdpa_clean <- function(x,
}
areas <- as.numeric(sf::st_area(x)) * 1e-6
x$AREA_KM2 <- as.numeric(areas)
## trim white space characters
if (verbose) {
cli::cli_progress_step(
"trimming extra white space characters from MANG_PLAN"
)
}
x$MANG_PLAN <- trimws(x$MANG_PLAN)
## move geometry to last column
if ((!"geometry" %in% names(x))) {
geom_col <- attr(x, "sf_column")
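The two cleaning steps added above can be illustrated on a plain data frame. This is a minimal sketch of the assumed behavior (hypothetical input values, without the rest of the `wdpa_clean()` pipeline):

```r
# Minimal sketch of the new wdpa_clean() steps: renaming PARENT_ISO3 to
# PARENT_ISO, and trimming white space from MANG_PLAN (hypothetical data).
x <- data.frame(
  PARENT_ISO3 = "NZL",
  MANG_PLAN = "  Management plan\r\n",
  stringsAsFactors = FALSE
)

# standardize column names (only if the field is present)
if ("PARENT_ISO3" %in% names(x)) {
  names(x)[names(x) == "PARENT_ISO3"] <- "PARENT_ISO"
}

# trim leading/trailing white space characters (e.g., " ", "\n", "\r")
x$MANG_PLAN <- trimws(x$MANG_PLAN)

print(names(x))    # "PARENT_ISO" "MANG_PLAN"
print(x$MANG_PLAN) # "Management plan"
```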
47 changes: 39 additions & 8 deletions R/wdpa_fetch.R
@@ -42,6 +42,8 @@ NULL
#' reported? Defaults to `TRUE` in an interactive session, otherwise
#' `FALSE`.
#'
#' @inheritParams wdpa_url
#'
#' @details
#' This function obtains and imports data from Protected Planet.
#' By default (per `force_download = FALSE`), it will check to see if the
Expand Down Expand Up @@ -69,13 +71,33 @@ NULL
#' (UNEP-WCMC 2019).
#'
#' @section Troubleshooting:
#' This function will sometimes return the error message
#' `PhantomJS signals port = 4567 is already in use`.
#' This error message can occur when you have previously run the function and
#' it threw an error, or it terminated early.
#' It can also occur when attempting to run the function in multiple
#' sessions on the same computer.
#' To address this issue, you will need to restart your computer.
#' The function requires a Chromium-based browser
#' (e.g., Google Chrome, Chromium, or Brave) to be installed.
#' This is because it uses the \pkg{chromote} package to find the URL
#' for downloading data from Protected Planet.
#' If you don't have one of these browsers installed, then please try
#' installing Google Chrome.
#' If you do have one of these browsers installed and this function
#' throws an error indicating that it can't find the browser,
#' try setting the `CHROMOTE_CHROME` environment variable to the
#' file path of the executable. For example, you could do this with:
#' ```
#' Sys.setenv(CHROMOTE_CHROME = "INSERT_FILE_PATH_HERE.exe")
#' ```
#'
#' Also, the function will sometimes produce a message
#' that complains about a `handle_read_frame` error. Please understand
#' that this message is, in fact, not an error and can be safely ignored
#' (see <https://github.com/rstudio/chromote/pull/111>).
#' As such, if you see this message when running the function,
#' you can assume that the function still worked correctly.
#' For reference, the misleading message will look something like this:
#' ```
#' [2024-04-23 12:06:36] [error] handle_read_frame error: websocketpp.transport:7 (End of File)
#' ```
#'
#' For further help with troubleshooting, please refer to the documentation
#' for the \pkg{chromote} package (https://rstudio.github.io/chromote/).
#'
#' @return [sf::sf()] object.
#'
@@ -132,6 +154,7 @@ wdpa_fetch <- function(x, wait = FALSE,
check_version = TRUE,
n = NULL,
page_wait = 2,
datatype = "gdb",
verbose = interactive()) {
# check that arguments are valid
## check that classes are correct
@@ -157,10 +180,18 @@
## find latest version of the dataset
current_month_year <- wdpa_latest_version()
## find the download link and set file path to save the data
download_url <- wdpa_url(x, wait = wait, page_wait = page_wait)
download_url <- wdpa_url(
x, wait = wait, page_wait = page_wait, datatype = datatype
)
## note that file name conventions on protectedplanet.net have changed
## (detected on 8th Oct 2020) and so file names are manually changed
## to follow the previous convention
##
  ## also, note that to ensure backwards compatibility with previous
  ## versions of wdpar, data that are downloaded in file geodatabase format
  ## will also be renamed to end with "-shapefile.zip" (even though they do
  ## not contain shapefile data), and wdpa_read() contains logic to
  ## correctly import the data
if (!identical(x, "global")) {
file_name <- paste0("WDPA_", current_month_year, "_", country_code(x),
"-shapefile.zip")
31 changes: 27 additions & 4 deletions R/wdpa_read.R
@@ -117,15 +117,38 @@ wdpa_read <- function(x, n = NULL) {
wdpa_point_data <- wdpa_point_data[, point_matching_cols]
wdpa_data <- rbind(wdpa_polygon_data, wdpa_point_data)
} else {
## extract any data stored in zip files
## load country-level data
### extract data stored in zip files
zip_path <- dir(tdir, "^.*\\.zip$", recursive = TRUE, full.names = TRUE)
if (length(zip_path) > 0)
if (length(zip_path) > 0) {
result <- Map(utils::unzip, zip_path,
exdir = gsub(".zip", "", zip_path, fixed = TRUE))
## import shapefile data
}
    ### try to find shapefiles and file geodatabases in the unzipped files
shapefile_path <- dir(tdir, "^.*\\.shp$", recursive = TRUE,
full.names = TRUE)
wdpa_data <- lapply(shapefile_path, read_sf_n, n = n)
gdb_path <- dir(tdir, "^.*\\.gdb$", recursive = TRUE,
full.names = TRUE, include.dirs = TRUE)
if (length(shapefile_path) > 0) {
### if has shapefiles, then...
### import shapefile data
wdpa_data <- lapply(shapefile_path, read_sf_n, n = n)
### exclude any shapefiles that are empty and don't contain any data
if (length(wdpa_data) > 1) {
wdpa_data <- wdpa_data[vapply(wdpa_data, nrow, integer(1)) > 0]
}
} else if (length(gdb_path) > 0) {
### if has file geodatabase, then...
### determine which layers to import
d <- sf::st_layers(gdb_path)
is_d_spatial <- !vapply(d$crs, is.na, logical(1))
wdpa_data <- lapply(d$name[is_d_spatial], sf::read_sf, dsn = gdb_path)
} else {
stop(
"Couldn't find shapefile or file geodatabase inside zip file.",
call. = FALSE
)
}
## merge shapefile data together
if (length(wdpa_data) > 1) {
col_names <- Reduce(base::intersect, lapply(wdpa_data, names))
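The file geodatabase import path added to `wdpa_read()` can be summarized as: list the layers in the `.gdb`, keep those with a defined coordinate reference system (the spatial layers, since attribute-only tables report a missing CRS), and read each with \pkg{sf}. A sketch of that logic, assuming the \pkg{sf} package and a hypothetical geodatabase path:

```r
# Sketch of the file geodatabase import logic added to wdpa_read()
# (hypothetical path; requires the sf package and a real .gdb on disk).
library(sf)

gdb_path <- "WDPA_Apr2024_LIE.gdb"  # hypothetical file geodatabase

# list available layers and keep those with a coordinate reference system,
# i.e., the spatial layers (attribute-only tables have a missing CRS)
d <- sf::st_layers(gdb_path)
is_spatial <- !vapply(d$crs, is.na, logical(1))

# import each spatial layer as an sf object
wdpa_data <- lapply(d$name[is_spatial], sf::read_sf, dsn = gdb_path)
```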
