Skip to content

Commit

Permalink
Spring cleaning 2024 (#18)
Browse files Browse the repository at this point in the history
* Create qualtrics_fetch2 dataset

* Update link to iptools

* Reassign column names when rename = TRUE

Fix bug where column names are wrong when rename = TRUE and column names are specified

* Remove .data$ from column references

Avoids tidyselect errors but still has "no visible binding for global variable" notes when checking

* Revert "Remove .data$ from column references"

This reverts commit 98f5b51.

* Remove .data from selections

* Fix tests with rename = FALSE

* Tidy up DESCRIPTION
  • Loading branch information
JeffreyRStevens committed Jan 12, 2024
1 parent fff109a commit 4837033
Show file tree
Hide file tree
Showing 24 changed files with 190 additions and 69 deletions.
47 changes: 21 additions & 26 deletions DESCRIPTION
@@ -1,32 +1,24 @@
Package: excluder
Title: Checks for Exclusion Criteria in Online Data
Version: 0.5.0
Authors@R:
c(person(given = "Jeffrey R.",
family = "Stevens",
role = c("aut", "cre", "cph"),
email = "jeffrey.r.stevens@protonmail.com",
Authors@R: c(
person("Jeffrey R.", "Stevens", , "jeffrey.r.stevens@protonmail.com", role = c("aut", "cre", "cph"),
comment = c(ORCID = "0000-0003-2375-1360")),
person(given = "Joseph",
family = "O'Brien",
role = c("rev"),
comment = c(ORCID = "0000-0001-9851-5077")),
person(given = "Julia",
family = "Silge",
role = c("rev"),
email = "julia.silge@gmail.com",
comment = c(ORCID = "0000-0002-3671-836X")))
Description: Data that are collected through online sources such as Mechanical
Turk may require excluding rows because of IP address duplication,
geolocation, or completion duration. This package facilitates
exclusion of these data for Qualtrics datasets.
person("Joseph", "O'Brien", role = "rev",
comment = c(ORCID = "0000-0001-9851-5077")),
person("Julia", "Silge", , "julia.silge@gmail.com", role = "rev",
comment = c(ORCID = "0000-0002-3671-836X"))
)
Description: Data that are collected through online sources such as
Mechanical Turk may require excluding rows because of IP address
duplication, geolocation, or completion duration. This package
facilitates exclusion of these data for Qualtrics datasets.
License: GPL (>= 3)
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
URL: https://docs.ropensci.org/excluder/, https://github.com/ropensci/excluder/
URL: https://docs.ropensci.org/excluder/,
https://github.com/ropensci/excluder/
BugReports: https://github.com/ropensci/excluder/issues/
Depends:
R (>= 3.5.0)
Imports:
cli,
curl,
Expand All @@ -40,8 +32,6 @@ Imports:
stringr,
tidyr,
tidyselect
Depends:
R (>= 3.5.0)
Suggests:
covr,
knitr,
Expand All @@ -50,5 +40,10 @@ Suggests:
rmarkdown,
testthat (>= 3.0.0),
withr
VignetteBuilder:
knitr
Config/testthat/edition: 3
VignetteBuilder: knitr
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
10 changes: 6 additions & 4 deletions R/duplicates.R
Expand Up @@ -77,6 +77,8 @@ mark_duplicates <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
ip_col <- "IPAddress"
}

# Check for presence of required columns
Expand All @@ -100,7 +102,7 @@ mark_duplicates <- function(x,
no_nas_ip <- tidyr::drop_na(x, tidyselect::all_of(ip_col))
n_nas_ip <- nrow(x) - nrow(no_nas_ip)
same_ip <- janitor::get_dupes(no_nas_ip, tidyselect::all_of(ip_col)) %>%
dplyr::select(-.data$dupe_count)
dplyr::select(-"dupe_count")
n_same_ip <- nrow(same_ip)
if (identical(quiet, FALSE)) {
cli::cli_alert_info(
Expand All @@ -125,7 +127,7 @@ mark_duplicates <- function(x,
no_nas_loc,
tidyselect::all_of(location_col)
) %>%
dplyr::select(-.data$dupe_count)
dplyr::select(-"dupe_count")
n_same_location <- nrow(same_location)
if (identical(quiet, FALSE)) {
cli::cli_alert_info(
Expand Down Expand Up @@ -244,7 +246,7 @@ check_duplicates <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_duplicates == "duplicates") %>%
keep_marked_column(.data$exclusion_duplicates, keep)
keep_marked_column("exclusion_duplicates", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -316,7 +318,7 @@ exclude_duplicates <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_duplicates != "duplicates") %>%
dplyr::select(-.data$exclusion_duplicates)
dplyr::select(-"exclusion_duplicates")

# Print exclusion statement
if (identical(silent, FALSE)) {
Expand Down
5 changes: 3 additions & 2 deletions R/duration.R
Expand Up @@ -61,6 +61,7 @@ mark_duration <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
}

# Check for presence of required columns
Expand Down Expand Up @@ -192,7 +193,7 @@ check_duration <- function(x,
) %>%
dplyr::filter(.data$exclusion_duration == "duration_quick" |
.data$exclusion_duration == "duration_slow") %>%
keep_marked_column(.data$exclusion_duration, keep)
keep_marked_column("exclusion_duration", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -256,7 +257,7 @@ exclude_duration <- function(x,
) %>%
dplyr::filter(.data$exclusion_duration != "duration_quick" &
.data$exclusion_duration != "duration_slow") %>%
dplyr::select(-.data$exclusion_duration)
dplyr::select(-"exclusion_duration")

# Print exclusion statement

Expand Down
8 changes: 5 additions & 3 deletions R/ip.R
Expand Up @@ -75,6 +75,8 @@ mark_ip <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
ip_col <- "IPAddress"
}

# Check for presence of required column
Expand Down Expand Up @@ -123,7 +125,7 @@ mark_ip <- function(x,
outside_country <- !ipaddress::is_within_any(survey_ips, country_ip_ranges)
filtered_data <- dplyr::bind_cols(filtered_data, outside = outside_country)
filtered_data <- dplyr::filter(filtered_data, .data$outside == TRUE) %>%
dplyr::select(-.data$outside)
dplyr::select(-"outside")
n_outside_country <- nrow(filtered_data)

# Filter NAs when requested
Expand Down Expand Up @@ -231,7 +233,7 @@ check_ip <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_ip == "ip") %>%
keep_marked_column(.data$exclusion_ip, keep)
keep_marked_column("exclusion_ip", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -300,7 +302,7 @@ exclude_ip <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_ip != "ip") %>%
dplyr::select(-.data$exclusion_ip)
dplyr::select(-"exclusion_ip")

# Print exclusion statement
if (identical(silent, FALSE)) {
Expand Down
8 changes: 5 additions & 3 deletions R/location.R
Expand Up @@ -65,6 +65,8 @@ mark_location <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
location_col <- c("LocationLatitude", "LocationLongitude")
}

# Check for presence of required column
Expand All @@ -91,7 +93,7 @@ mark_location <- function(x,
# Determine if geolocation is within US
no_nas$country <- maps::map.where(database = "usa", longitude, latitude)
outside_us <- dplyr::filter(no_nas, is.na(.data$country)) %>%
dplyr::select(-.data$country)
dplyr::select(-"country")
n_outside_us <- nrow(outside_us)

# Combine no location with outside US
Expand Down Expand Up @@ -189,7 +191,7 @@ check_location <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_location == "location") %>%
keep_marked_column(.data$exclusion_location, keep)
keep_marked_column("exclusion_location", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -249,7 +251,7 @@ exclude_location <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_location != "location") %>%
dplyr::select(-.data$exclusion_location)
dplyr::select(-"exclusion_location")

# Print exclusion statement
if (identical(silent, FALSE)) {
Expand Down
6 changes: 4 additions & 2 deletions R/preview.R
Expand Up @@ -51,6 +51,8 @@ mark_preview <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
preview_col <- "Status"
}

# Check for presence of required column
Expand Down Expand Up @@ -148,7 +150,7 @@ check_preview <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_preview == "preview") %>%
keep_marked_column(.data$exclusion_preview, keep)
keep_marked_column("exclusion_preview", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -205,7 +207,7 @@ exclude_preview <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_preview != "preview") %>%
dplyr::select(-.data$exclusion_preview)
dplyr::select(-"exclusion_preview")

# Print exclusion statement
if (identical(silent, FALSE)) {
Expand Down
5 changes: 3 additions & 2 deletions R/progress.R
Expand Up @@ -64,6 +64,7 @@ mark_progress <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
}

# Check for presence of required column
Expand Down Expand Up @@ -185,7 +186,7 @@ check_progress <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_progress == "progress") %>%
keep_marked_column(.data$exclusion_progress, keep)
keep_marked_column("exclusion_progress", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -253,7 +254,7 @@ exclude_progress <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_progress != "progress") %>%
dplyr::select(-.data$exclusion_progress)
dplyr::select(-"exclusion_progress")

# Print exclusion statement
if (identical(silent, FALSE)) {
Expand Down
55 changes: 49 additions & 6 deletions R/qualtrics_data.R
Expand Up @@ -38,7 +38,8 @@
#'
#' A dataset containing the metadata from a standard Qualtrics survey with
#' browser metadata collected and exported with "Use numeric values".
#' These data were randomly generated using [iptools::ip_random()] and
#' These data were randomly generated using [iptools::ip_random()](
#' https://hrbrmstr.github.io/iptools/reference/ip_random.html) and
#' [rgeolocate::ip2location()](
#' https://cran.r-project.org/package=rgeolocate) functions.
#'
Expand Down Expand Up @@ -71,7 +72,8 @@
#'
#' A dataset containing the metadata from a standard Qualtrics survey with
#' browser metadata collected and exported with "Use choice text".
#' These data were randomly generated using [iptools::ip_random()] and
#' These data were randomly generated using [iptools::ip_random()](
#' https://hrbrmstr.github.io/iptools/reference/ip_random.html) and
#' [rgeolocate::ip2location()](
#' https://cran.r-project.org/package=rgeolocate) functions.
#'
Expand Down Expand Up @@ -108,10 +110,8 @@
#' were imported using
#' [`qualtRics::fetch_survey()`](
#' https://docs.ropensci.org/qualtRics/reference/fetch_survey.html).
#' and then the secondary labels were assigned as column names with
#' [`sjlabelled::get_label()`](
#' https://strengejacke.github.io/sjlabelled/reference/get_label.html).
#' These data were randomly generated using [iptools::ip_random()] and
#' These data were randomly generated using [iptools::ip_random()](
#' https://hrbrmstr.github.io/iptools/reference/ip_random.html) and
#' [rgeolocate::ip2location()](
#' https://cran.r-project.org/package=rgeolocate) functions.
#'
Expand Down Expand Up @@ -141,3 +141,46 @@
#' }
#' @family data
"qualtrics_fetch"

#' Example numeric metadata imported with `qualtRics::fetch_survey()` from
#' simulated Qualtrics study but with labels included as column names
#'
#' A dataset containing the metadata from a standard Qualtrics survey with
#' browser metadata collected and exported with "Use numeric values". The data
#' were imported using
#' [`qualtRics::fetch_survey()`](
#' https://docs.ropensci.org/qualtRics/reference/fetch_survey.html)
#' and then the secondary labels were assigned as column names with
#' [`sjlabelled::get_label()`](
#' https://strengejacke.github.io/sjlabelled/reference/get_label.html).
#' These data were randomly generated using [iptools::ip_random()](
#' https://hrbrmstr.github.io/iptools/reference/ip_random.html) and
#' [rgeolocate::ip2location()](
#' https://cran.r-project.org/package=rgeolocate) functions.
#'
#' @format A data frame with 100 rows and 17 variables:
#' \describe{
#' \item{Start Date}{date and time data collection started, in ISO 8601 format}
#' \item{End Date}{date and time data collection ended, in ISO 8601 format}
#' \item{Response Type}{numeric flag for preview (1) vs. implemented survey (0)
#' entries}
#' \item{IP Address}{participant IP address (truncated for anonymity)}
#' \item{Progress}{percentage of survey completed}
#' \item{Duration (in seconds)}{duration of time required to complete survey,
#' in seconds}
#' \item{Finished}{numeric flag for whether survey was completed (1) or
#' progress was < 100 (0)}
#' \item{Recorded Date}{date and time survey was recorded, in ISO 8601 format}
#' \item{Response ID}{random ID for participants}
#' \item{Location Latitude}{latitude geolocated from IP address}
#' \item{Location Longitude}{longitude geolocated from IP address}
#' \item{User Language}{language set in Qualtrics}
#' \item{Click to write the question text - Browser}{user web browser type}
#' \item{Click to write the question text - Version}{user web browser version}
#' \item{Click to write the question text - Operating System}{user operating
#' system}
#' \item{Click to write the question text - Resolution}{user screen resolution}
#' \item{like}{response to question about whether the user liked the survey
#' (1 = Yes, 0 = No)}
#' }
#' @family data
"qualtrics_fetch2"
18 changes: 9 additions & 9 deletions R/rename_columns.R
Expand Up @@ -58,15 +58,15 @@ rename_columns <- function(x, alert = TRUE) {
# Rename columns
x %>%
dplyr::rename(
StartDate = .data$`Start Date`,
EndDate = .data$`End Date`,
Status = .data$`Response Type`,
IPAddress = .data$`IP Address`,
RecordedDate = .data$`Recorded Date`,
ResponseId = .data$`Response ID`,
LocationLatitude = .data$`Location Latitude`,
LocationLongitude = .data$`Location Longitude`,
UserLanguage = .data$`User Language`
StartDate = "Start Date",
EndDate = "End Date",
Status = "Response Type",
IPAddress = "IP Address",
RecordedDate = "Recorded Date",
ResponseId = "Response ID",
LocationLatitude = "Location Latitude",
LocationLongitude = "Location Longitude",
UserLanguage = "User Language"
) %>%
dplyr::rename_with(~ gsub(throwaway, "", .x), dplyr::contains(throwaway))
} else if (any(grepl("_Resolution", column_names))) {
Expand Down

0 comments on commit 4837033

Please sign in to comment.