Skip to content

Commit

Permalink
Spring cleaning 2024 (#18)
Browse files Browse the repository at this point in the history
* Create qualtrics_fetch2 dataset

* Update link to iptools

* Reassign column names when rename = TRUE

Fix bug where column names are wrong when rename = TRUE and column names are specified

* Remove .data$ from column references

Avoids tidyselect errors but still has "no visible binding for global variable" notes when checking

* Revert "Remove .data$ from column references"

This reverts commit 98f5b51.

* Remove .data from selections

* Fix tests with rename = FALSE

* Tidy up DESCRIPTION
  • Loading branch information
JeffreyRStevens committed Jan 12, 2024
1 parent fff109a commit 4837033
Show file tree
Hide file tree
Showing 24 changed files with 190 additions and 69 deletions.
47 changes: 21 additions & 26 deletions DESCRIPTION
@@ -1,32 +1,24 @@
Package: excluder
Title: Checks for Exclusion Criteria in Online Data
Version: 0.5.0
Authors@R:
c(person(given = "Jeffrey R.",
family = "Stevens",
role = c("aut", "cre", "cph"),
email = "jeffrey.r.stevens@protonmail.com",
Authors@R: c(
person("Jeffrey R.", "Stevens", , "jeffrey.r.stevens@protonmail.com", role = c("aut", "cre", "cph"),
comment = c(ORCID = "0000-0003-2375-1360")),
person(given = "Joseph",
family = "O'Brien",
role = c("rev"),
comment = c(ORCID = "0000-0001-9851-5077")),
person(given = "Julia",
family = "Silge",
role = c("rev"),
email = "julia.silge@gmail.com",
comment = c(ORCID = "0000-0002-3671-836X")))
Description: Data that are collected through online sources such as Mechanical
Turk may require excluding rows because of IP address duplication,
geolocation, or completion duration. This package facilitates
exclusion of these data for Qualtrics datasets.
person("Joseph", "O'Brien", role = "rev",
comment = c(ORCID = "0000-0001-9851-5077")),
person("Julia", "Silge", , "julia.silge@gmail.com", role = "rev",
comment = c(ORCID = "0000-0002-3671-836X"))
)
Description: Data that are collected through online sources such as
Mechanical Turk may require excluding rows because of IP address
duplication, geolocation, or completion duration. This package
facilitates exclusion of these data for Qualtrics datasets.
License: GPL (>= 3)
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
URL: https://docs.ropensci.org/excluder/, https://github.com/ropensci/excluder/
URL: https://docs.ropensci.org/excluder/,
https://github.com/ropensci/excluder/
BugReports: https://github.com/ropensci/excluder/issues/
Depends:
R (>= 3.5.0)
Imports:
cli,
curl,
Expand All @@ -40,8 +32,6 @@ Imports:
stringr,
tidyr,
tidyselect
Depends:
R (>= 3.5.0)
Suggests:
covr,
knitr,
Expand All @@ -50,5 +40,10 @@ Suggests:
rmarkdown,
testthat (>= 3.0.0),
withr
VignetteBuilder:
knitr
Config/testthat/edition: 3
VignetteBuilder: knitr
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
10 changes: 6 additions & 4 deletions R/duplicates.R
Expand Up @@ -77,6 +77,8 @@ mark_duplicates <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
ip_col <- "IPAddress"
}

# Check for presence of required columns
Expand All @@ -100,7 +102,7 @@ mark_duplicates <- function(x,
no_nas_ip <- tidyr::drop_na(x, tidyselect::all_of(ip_col))
n_nas_ip <- nrow(x) - nrow(no_nas_ip)
same_ip <- janitor::get_dupes(no_nas_ip, tidyselect::all_of(ip_col)) %>%
dplyr::select(-.data$dupe_count)
dplyr::select(-"dupe_count")
n_same_ip <- nrow(same_ip)
if (identical(quiet, FALSE)) {
cli::cli_alert_info(
Expand All @@ -125,7 +127,7 @@ mark_duplicates <- function(x,
no_nas_loc,
tidyselect::all_of(location_col)
) %>%
dplyr::select(-.data$dupe_count)
dplyr::select(-"dupe_count")
n_same_location <- nrow(same_location)
if (identical(quiet, FALSE)) {
cli::cli_alert_info(
Expand Down Expand Up @@ -244,7 +246,7 @@ check_duplicates <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_duplicates == "duplicates") %>%
keep_marked_column(.data$exclusion_duplicates, keep)
keep_marked_column("exclusion_duplicates", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -316,7 +318,7 @@ exclude_duplicates <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_duplicates != "duplicates") %>%
dplyr::select(-.data$exclusion_duplicates)
dplyr::select(-"exclusion_duplicates")

# Print exclusion statement
if (identical(silent, FALSE)) {
Expand Down
5 changes: 3 additions & 2 deletions R/duration.R
Expand Up @@ -61,6 +61,7 @@ mark_duration <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
}

# Check for presence of required columns
Expand Down Expand Up @@ -192,7 +193,7 @@ check_duration <- function(x,
) %>%
dplyr::filter(.data$exclusion_duration == "duration_quick" |
.data$exclusion_duration == "duration_slow") %>%
keep_marked_column(.data$exclusion_duration, keep)
keep_marked_column("exclusion_duration", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -256,7 +257,7 @@ exclude_duration <- function(x,
) %>%
dplyr::filter(.data$exclusion_duration != "duration_quick" &
.data$exclusion_duration != "duration_slow") %>%
dplyr::select(-.data$exclusion_duration)
dplyr::select(-"exclusion_duration")

# Print exclusion statement

Expand Down
8 changes: 5 additions & 3 deletions R/ip.R
Expand Up @@ -75,6 +75,8 @@ mark_ip <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
ip_col <- "IPAddress"
}

# Check for presence of required column
Expand Down Expand Up @@ -123,7 +125,7 @@ mark_ip <- function(x,
outside_country <- !ipaddress::is_within_any(survey_ips, country_ip_ranges)
filtered_data <- dplyr::bind_cols(filtered_data, outside = outside_country)
filtered_data <- dplyr::filter(filtered_data, .data$outside == TRUE) %>%
dplyr::select(-.data$outside)
dplyr::select(-"outside")
n_outside_country <- nrow(filtered_data)

# Filter NAs when requested
Expand Down Expand Up @@ -231,7 +233,7 @@ check_ip <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_ip == "ip") %>%
keep_marked_column(.data$exclusion_ip, keep)
keep_marked_column("exclusion_ip", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -300,7 +302,7 @@ exclude_ip <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_ip != "ip") %>%
dplyr::select(-.data$exclusion_ip)
dplyr::select(-"exclusion_ip")

# Print exclusion statement
if (identical(silent, FALSE)) {
Expand Down
8 changes: 5 additions & 3 deletions R/location.R
Expand Up @@ -65,6 +65,8 @@ mark_location <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
location_col <- c("LocationLatitude", "LocationLongitude")
}

# Check for presence of required column
Expand All @@ -91,7 +93,7 @@ mark_location <- function(x,
# Determine if geolocation is within US
no_nas$country <- maps::map.where(database = "usa", longitude, latitude)
outside_us <- dplyr::filter(no_nas, is.na(.data$country)) %>%
dplyr::select(-.data$country)
dplyr::select(-"country")
n_outside_us <- nrow(outside_us)

# Combine no location with outside US
Expand Down Expand Up @@ -189,7 +191,7 @@ check_location <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_location == "location") %>%
keep_marked_column(.data$exclusion_location, keep)
keep_marked_column("exclusion_location", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -249,7 +251,7 @@ exclude_location <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_location != "location") %>%
dplyr::select(-.data$exclusion_location)
dplyr::select(-"exclusion_location")

# Print exclusion statement
if (identical(silent, FALSE)) {
Expand Down
6 changes: 4 additions & 2 deletions R/preview.R
Expand Up @@ -51,6 +51,8 @@ mark_preview <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
preview_col <- "Status"
}

# Check for presence of required column
Expand Down Expand Up @@ -148,7 +150,7 @@ check_preview <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_preview == "preview") %>%
keep_marked_column(.data$exclusion_preview, keep)
keep_marked_column("exclusion_preview", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -205,7 +207,7 @@ exclude_preview <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_preview != "preview") %>%
dplyr::select(-.data$exclusion_preview)
dplyr::select(-"exclusion_preview")

# Print exclusion statement
if (identical(silent, FALSE)) {
Expand Down
5 changes: 3 additions & 2 deletions R/progress.R
Expand Up @@ -64,6 +64,7 @@ mark_progress <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
}

# Check for presence of required column
Expand Down Expand Up @@ -185,7 +186,7 @@ check_progress <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_progress == "progress") %>%
keep_marked_column(.data$exclusion_progress, keep)
keep_marked_column("exclusion_progress", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -253,7 +254,7 @@ exclude_progress <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_progress != "progress") %>%
dplyr::select(-.data$exclusion_progress)
dplyr::select(-"exclusion_progress")

# Print exclusion statement
if (identical(silent, FALSE)) {
Expand Down
55 changes: 49 additions & 6 deletions R/qualtrics_data.R
Expand Up @@ -38,7 +38,8 @@
#'
#' A dataset containing the metadata from a standard Qualtrics survey with
#' browser metadata collected and exported with "Use numeric values".
#' These data were randomly generated using [iptools::ip_random()] and
#' These data were randomly generated using [iptools::ip_random()](
#' https://hrbrmstr.github.io/iptools/reference/ip_random.html) and
#' [rgeolocate::ip2location()](
#' https://cran.r-project.org/package=rgeolocate) functions.
#'
Expand Down Expand Up @@ -71,7 +72,8 @@
#'
#' A dataset containing the metadata from a standard Qualtrics survey with
#' browser metadata collected and exported with "Use choice text".
#' These data were randomly generated using [iptools::ip_random()] and
#' These data were randomly generated using [iptools::ip_random()](
#' https://hrbrmstr.github.io/iptools/reference/ip_random.html) and
#' [rgeolocate::ip2location()](
#' https://cran.r-project.org/package=rgeolocate) functions.
#'
Expand Down Expand Up @@ -108,10 +110,8 @@
#' were imported using
#' [`qualtRics::fetch_survey()`](
#' https://docs.ropensci.org/qualtRics/reference/fetch_survey.html).
#' and then the secondary labels were assigned as column names with
#' [`sjlabelled::get_label()`](
#' https://strengejacke.github.io/sjlabelled/reference/get_label.html).
#' These data were randomly generated using [iptools::ip_random()] and
#' These data were randomly generated using [iptools::ip_random()](
#' https://hrbrmstr.github.io/iptools/reference/ip_random.html) and
#' [rgeolocate::ip2location()](
#' https://cran.r-project.org/package=rgeolocate) functions.
#'
Expand Down Expand Up @@ -141,3 +141,46 @@
#' }
#' @family data
"qualtrics_fetch"

#' Example numeric metadata imported with `qualtRics::fetch_survey()` from
#' simulated Qualtrics study but with labels included as column names
#'
#' A dataset containing the metadata from a standard Qualtrics survey with
#' browser metadata collected and exported with "Use numeric values". The data
#' were imported using
#' [`qualtRics::fetch_survey()`](
#' https://docs.ropensci.org/qualtRics/reference/fetch_survey.html)
#' and then the secondary labels were assigned as column names with
#' [`sjlabelled::get_label()`](
#' https://strengejacke.github.io/sjlabelled/reference/get_label.html).
#' These data were randomly generated using [iptools::ip_random()](
#' https://hrbrmstr.github.io/iptools/reference/ip_random.html) and
#' [rgeolocate::ip2location()](
#' https://cran.r-project.org/package=rgeolocate) functions.
#'
#' @format A data frame with 100 rows and 17 variables:
#' \describe{
#' \item{Start Date}{date and time data collection started, in ISO 8601 format}
#' \item{End Date}{date and time data collection ended, in ISO 8601 format}
#' \item{Response Type}{numeric flag for preview (1) vs. implemented survey (0)
#' entries}
#' \item{IP Address}{participant IP address (truncated for anonymity)}
#' \item{Progress}{percentage of survey completed}
#' \item{Duration (in seconds)}{duration of time required to complete survey,
#' in seconds}
#' \item{Finished}{numeric flag for whether survey was completed (1) or
#' progress was < 100 (0)}
#' \item{Recorded Date}{date and time survey was recorded, in ISO 8601 format}
#' \item{Response ID}{random ID for participants}
#' \item{Location Latitude}{latitude geolocated from IP address}
#' \item{Location Longitude}{longitude geolocated from IP address}
#' \item{User Language}{language set in Qualtrics}
#' \item{Click to write the question text - Browser}{user web browser type}
#' \item{Click to write the question text - Version}{user web browser version}
#' \item{Click to write the question text - Operating System}{user operating
#' system}
#' \item{Click to write the question text - Resolution}{user screen resolution}
#' \item{like}{response to question about whether the user liked the survey
#' (1 = Yes, 0 = No)}
#' }
#' @family data
"qualtrics_fetch2"
18 changes: 9 additions & 9 deletions R/rename_columns.R
Expand Up @@ -58,15 +58,15 @@ rename_columns <- function(x, alert = TRUE) {
# Rename columns
x %>%
dplyr::rename(
StartDate = .data$`Start Date`,
EndDate = .data$`End Date`,
Status = .data$`Response Type`,
IPAddress = .data$`IP Address`,
RecordedDate = .data$`Recorded Date`,
ResponseId = .data$`Response ID`,
LocationLatitude = .data$`Location Latitude`,
LocationLongitude = .data$`Location Longitude`,
UserLanguage = .data$`User Language`
StartDate = "Start Date",
EndDate = "End Date",
Status = "Response Type",
IPAddress = "IP Address",
RecordedDate = "Recorded Date",
ResponseId = "Response ID",
LocationLatitude = "Location Latitude",
LocationLongitude = "Location Longitude",
UserLanguage = "User Language"
) %>%
dplyr::rename_with(~ gsub(throwaway, "", .x), dplyr::contains(throwaway))
} else if (any(grepl("_Resolution", column_names))) {
Expand Down

0 comments on commit 4837033

Please sign in to comment.