diff --git a/R/followers.R b/R/followers.R
index 69c56bc7..d0c53c26 100644
--- a/R/followers.R
+++ b/R/followers.R
@@ -11,12 +11,12 @@
 #' if (auth_has_default()) {
 #'   users <- get_followers("_R_Foundation")
 #'   users
-#' 
+#'
 #'   # use `cursor` to find the next "page" of results
 #'   more_users <- get_followers("_R_Foundation", cursor = users)
 #'
 #' }
-#' @return A tibble data frame with one column named "from_id" with the 
+#' @return A tibble data frame with one column named "from_id" with the
 #'   followers and another one "to_id" with the user used as input.
 #' @export
 get_followers <- function(user, n = 5000,
@@ -26,25 +26,25 @@ get_followers <- function(user, n = 5000,
                           verbose = TRUE,
                           token = NULL,
                           page = lifecycle::deprecated()) {
-  
+
   if (lifecycle::is_present(page)) {
     lifecycle::deprecate_warn("1.0.0", "get_followers(page)", "get_followers(cursor)")
     cursor <- page
   }
   stopifnot(is.atomic(user), isTRUE(length(user) == 1))
-  
+
   params <- list(stringify_ids = TRUE)
   params[[user_type(user)]] <- user
-  results <- TWIT_paginate_cursor(token, "/1.1/followers/ids", params, 
-    page_size = if(n >= 5000) 5000 else n,
+  results <- TWIT_paginate_cursor(token, "/1.1/followers/ids", params,
+    page_size = if (n >= 5000) 5000 else n,
     n = n,
     cursor = cursor,
     retryonratelimit = retryonratelimit,
     verbose = verbose
   )
-  
+
   if (parse) {
     df <- tibble::tibble(from_id = unlist(lapply(results, function(x) x$ids)),
                          to_id = user)
diff --git a/R/http.R b/R/http.R
index 2a27856f..24d955af 100644
--- a/R/http.R
+++ b/R/http.R
@@ -1,18 +1,18 @@
 TWIT_get <- function(token, api, params = NULL, ..., host = "api.twitter.com") {
-  resp <- TWIT_method("GET", 
-    token = token, 
+  resp <- TWIT_method("GET",
+    token = token,
     api = api,
     params = params,
     ...,
     host = host
   )
-  
+
   from_js(resp)
 }
 
 TWIT_post <- function(token, api, params = NULL, body = NULL, ..., host = "api.twitter.com") {
-  TWIT_method("POST", 
-    token = token, 
+  TWIT_method("POST",
+    token = token,
     api = api,
     params = params,
     body = body,
@@ -21,8 +21,8 @@ TWIT_post <- function(token, api, params = NULL, body = NULL, ..., host = "api.t
   )
 }
 
-TWIT_method <- function(method, token, api, 
-                        params = NULL, 
+TWIT_method <- function(method, token, api,
+                        params = NULL,
                         host = "api.twitter.com",
                         retryonratelimit = NULL,
                         verbose = TRUE,
@@ -33,18 +33,18 @@ TWIT_method <- function(method, token, api,
   token <- check_token(token)
   url <- paste0("https://", host, api, ".json")
-  
+
   resp <- switch(method,
     GET = httr::GET(url, query = params, token, ...),
     POST = httr::POST(url, query = params, token, ...),
     stop("Unsupported method", call. = FALSE)
   )
-  
+
   switch(resp_type(resp),
     ok = NULL,
     protected = handle_protected(resp, params),
     rate_limit = handle_rate_limit(
-      resp, api, 
+      resp, api,
      retryonratelimit = retryonratelimit,
      verbose = verbose
    ),
@@ -55,11 +55,11 @@ TWIT_method <- function(method, token, api,
 }
 
 #' Pagination
-#' 
-#' @description 
+#'
+#' @description
 #' `r lifecycle::badge("experimental")`
 #' These are internal functions used for pagination inside of rtweet.
-#' 
+#'
 #' @keywords internal
 #' @param token Expert use only. Use this to override authentication for
 #'   a single API call. In most cases you are better off changing the
@@ -67,69 +67,69 @@ TWIT_method <- function(method, token, api,
 #' @param n Desired number of results to return. Results are downloaded
 #'   in pages when `n` is large; the default value will download a single
 #'   page. Set `n = Inf` to download as many results as possible.
-#' 
+#'
 #'   The Twitter API rate limits the number of requests you can perform
-#'   in each 15 minute period. The easiest way to download more than that is 
+#'   in each 15 minute period. The easiest way to download more than that is
 #'   to use `retryonratelimit = TRUE`.
-#' 
+#'
 #'   You are not guaranteed to get exactly `n` results back. You will get
-#'   fewer results when tweets have been deleted or if you hit a rate limit. 
+#'   fewer results when tweets have been deleted or if you hit a rate limit.
 #'   You will get more results if you ask for a number of tweets that's not
 #'   a multiple of page size, e.g. if you request `n = 150` and the page
 #'   size is 200, you'll get 200 results back.
-#' @param get_id A single argument function that returns a vector of ids given 
+#' @param get_id A single argument function that returns a vector of ids given
 #'   the JSON response. The defaults are chosen to cover the most common cases,
 #'   but you'll need to double check whenever implementing pagination for
 #'   a new endpoint.
-#' @param max_id Supply a vector of ids or a data frame of previous results to 
+#' @param max_id Supply a vector of ids or a data frame of previous results to
 #'   find tweets **older** than `max_id`.
-#' @param since_id Supply a vector of ids or a data frame of previous results to 
+#' @param since_id Supply a vector of ids or a data frame of previous results to
 #'   find tweets **newer** than `since_id`.
 #' @param retryonratelimit If `TRUE`, and a rate limit is exhausted, will wait
 #'   until it refreshes. Most Twitter rate limits refresh every 15 minutes.
 #'   If `FALSE`, and the rate limit is exceeded, the function will terminate
-#'   early with a warning; you'll still get back all results received up to 
-#'   that point. The default value, `NULL`, consults the option 
-#'   `rtweet.retryonratelimit` so that you can globally set it to `TRUE`, 
+#'   early with a warning; you'll still get back all results received up to
+#'   that point. The default value, `NULL`, consults the option
+#'   `rtweet.retryonratelimit` so that you can globally set it to `TRUE`,
 #'   if desired.
-#' 
-#'   If you expect a query to take hours or days to perform, you should not 
+#'
+#'   If you expect a query to take hours or days to perform, you should not
 #'   rely solely on `retryonratelimit` because it does not handle other common
 #'   failure modes like temporarily losing your internet connection.
-#' @param parse If `TRUE`, the default, returns a tidy data frame. Use `FALSE` 
-#'   to return the "raw" list corresponding to the JSON returned from the 
+#' @param parse If `TRUE`, the default, returns a tidy data frame. Use `FALSE`
+#'   to return the "raw" list corresponding to the JSON returned from the
 #'   Twitter API.
-#' @param verbose Show progress bars and other messages indicating current 
+#' @param verbose Show progress bars and other messages indicating current
 #'   progress?
-TWIT_paginate_max_id <- function(token, api, params, 
-                                 get_id = function(x) x$id_str, 
-                                 n = 1000, 
-                                 page_size = 200, 
+TWIT_paginate_max_id <- function(token, api, params,
+                                 get_id = function(x) x$id_str,
+                                 n = 1000,
+                                 page_size = 200,
                                  since_id = NULL,
                                  max_id = NULL,
-                                 count_param = "count", 
+                                 count_param = "count",
                                  retryonratelimit = NULL,
                                  verbose = TRUE) {
   if (!is.null(max_id)) {
-    max_id <- rtweet::max_id(max_id) 
+    max_id <- rtweet::max_id(max_id)
   }
   if (!is.null(since_id)) {
-    since_id <- rtweet::since_id(since_id) 
+    since_id <- rtweet::since_id(since_id)
   }
-  
+
   params$since_id <- since_id
-  params[[count_param]] <- page_size 
-  pages <- ceiling(n / page_size)
-  results <- vector("list", if (is.finite(pages)) pages else 1000)
-  
+  params[[count_param]] <- page_size
+  pages <- if (is.infinite(n)) page_size else max(c(n %/% page_size), 1)
+  results <- vector("list", pages)
+
   if (verbose) {
     pb <- progress::progress_bar$new(
       format = "Downloading multiple pages :bar",
       total = pages
-    ) 
+    )
     withr::defer(pb$terminate())
   }
-  
+
   i <- 0
   while (i < pages) {
     i <- i + 1
@@ -140,13 +140,13 @@ TWIT_paginate_max_id <- function(token, api, params,
     json <- catch_rate_limit(
       TWIT_get(
-        token, api, params, 
+        token, api, params,
        retryonratelimit = retryonratelimit,
        verbose = verbose
      )
    )
    if (is_rate_limit(json)) {
-      warn_early_term(json, 
+      warn_early_term(json,
        hint = paste0("Set `max_id = '", max_id, "' to continue."),
        hint_if = !is.null(max_id)
      )
@@ -158,14 +158,14 @@ TWIT_paginate_max_id <- function(token, api, params,
    if (length(id) == 0) {
      break
    }
-    if(i > length(results)) { 
+    if(i > length(results)) {
      # double length per https://en.wikipedia.org/wiki/Dynamic_array#Geometric_expansion_and_amortized_cost
      length(results) <- 2 * length(results)
    }
-    
+
    max_id <- max_id(id)
    results[[i]] <- json
-    
+
    if (verbose) {
      pb$tick()
    }
@@ -175,14 +175,14 @@
 
 # https://developer.twitter.com/en/docs/pagination
 #' @rdname TWIT_paginate_max_id
-#' 
-#' @param cursor Which page of results to return. The default will return 
-#'   the first page; you can supply the result from a previous call to 
+#'
+#' @param cursor Which page of results to return. The default will return
+#'   the first page; you can supply the result from a previous call to
 #'   continue pagination from where it left off.
-TWIT_paginate_cursor <- function(token, api, params, 
-                                 n = 5000, 
-                                 page_size = 5000, 
-                                 cursor = "-1", 
+TWIT_paginate_cursor <- function(token, api, params,
+                                 n = 5000,
+                                 page_size = 5000,
+                                 cursor = "-1",
                                  get_id = function(x) x$ids,
                                  retryonratelimit = NULL,
                                  verbose = TRUE) {
@@ -192,17 +192,18 @@ TWIT_paginate_cursor <- function(token, api, params,
    # Last request retrieved all available results
    return(list())
  }
-  
+
  # TODO: consider if it's worth using fastmap::faststack() here
-  results <- vector("list", if (is.infinite(n)) page_size else n)
+  pages <- if (is.infinite(n)) page_size else max(c(n %/% page_size), 1)
+  results <- vector("list", pages)
  i <- 1
  n_seen <- 0
-  
+
  if (verbose) {
    pb <- progress::progress_bar$new(
      format = "Downloading multiple pages :bar",
-      total = if (is.infinite(n)) page_size else n
-    ) 
+      total = length(results)
+    )
    withr::defer(pb$terminate())
  }
@@ -210,41 +211,53 @@
    params$cursor <- cursor
    json <- catch_rate_limit(
      TWIT_get(
-        token, api, params, 
+        token, api, params,
        retryonratelimit = retryonratelimit,
        verbose = verbose
      )
    )
+    # Rate limit hit, repeat call to continue
+    error_limit <- "errors" %in% names(json)
+    continue_limit <- !is.null(retryonratelimit) && retryonratelimit
+    if (error_limit && continue_limit) {
+      json <- catch_rate_limit(
+        TWIT_get(
+          token, api, params,
+          retryonratelimit = retryonratelimit,
+          verbose = verbose
+        ))
+    }
+
    if (is_rate_limit(json)) {
      if (!is.null(retryonratelimit)){
-        warn_early_term(json, 
+        warn_early_term(json,
          hint = paste0("Set `cursor = '", cursor, "' to continue."),
          hint_if = !identical(cursor, "-1")
        )
      }
-      break
+      next
    }
    if (i > length(results)) {
      # double length per https://en.wikipedia.org/wiki/Dynamic_array#Geometric_expansion_and_amortized_cost
      length(results) <- 2 * length(results)
    }
-    results[[i]] <- json
    if (any(grepl("next_cursor", names(json)))) {
-      cursor <- ifelse(!is.null(json$next_cursor_str), 
-                       json$next_cursor_str, 
+      cursor <- ifelse(!is.null(json$next_cursor_str),
+                       json$next_cursor_str,
                       json$next_cursor)
    } else {
      # If next_cursor is missing there are no messages within the last 30 days
-      cursor <- "0" 
+      cursor <- "0"
    }
+    results[[i]] <- json
    n_seen <- n_seen + length(get_id(json))
    i <- i + 1
    empty_response <- !is.null(json$events) && length(json$events) == 0
    if (identical(cursor, "0") || n_seen >= n || empty_response) {
      break
    }
-    
+
    if (verbose) {
      pb$update(n_seen / n)
    }
@@ -255,19 +268,19 @@
 
 #' @rdname TWIT_paginate_max_id
 #' @keywords internal
-TWIT_paginate_chunked <- function(token, api, params_list, 
-                                  retryonratelimit = NULL, 
+TWIT_paginate_chunked <- function(token, api, params_list,
+                                  retryonratelimit = NULL,
                                   verbose = TRUE) {
-  
+
  pages <- length(params_list)
  results <- vector("list", pages)
-  
+
  if (verbose) {
    pb <- progress::progress_bar$new(
      format = "Downloading multiple pages :bar",
      total = pages
-    ) 
+    )
    withr::defer(pb$terminate())
  }
@@ -275,31 +288,42 @@ TWIT_paginate_chunked <- function(token, api, params_list,
    params <- params_list[[i]]
    json <- catch_rate_limit(
      TWIT_get(
-        token, api, params, 
+        token, api, params,
        retryonratelimit = retryonratelimit,
        verbose = verbose
      )
    )
+    # Rate limit hit, repeat call to continue
+    error_limit <- "errors" %in% names(json)
+    continue_limit <- !is.null(retryonratelimit) && retryonratelimit
+    if (error_limit && continue_limit) {
+      json <- catch_rate_limit(
+        TWIT_get(
+          token, api, params,
+          retryonratelimit = retryonratelimit,
+          verbose = verbose
+        ))
+    }
    if (is_rate_limit(json)) {
      warn_early_term(json, hint_if = FALSE)
      break
    }
-    
+
    results[[i]] <- json
-    
+
    if (verbose) {
      pb$tick()
    }
  }
  results
-} 
+}
 
 #' @rdname TWIT_paginate_max_id
-TWIT_paginate_premium <- function(token, api, params, 
+TWIT_paginate_premium <- function(token, api, params,
                                   n = 100,
-                                  page_size = 100, 
-                                  cursor = "next", 
+                                  page_size = 100,
+                                  cursor = "next",
                                   retryonratelimit = NULL,
                                   verbose = TRUE) {
  if (identical(cursor, "next")) {
@@ -309,48 +333,62 @@ TWIT_paginate_premium <- function(token, api, params,
    cursor <- next_cursor(cursor)
  }
  params[["next"]] <- cursor
-  
+
  # TODO: consider if it's worth using fastmap::faststack() here
-  results <- vector("list", ceiling(n/page_size))
+  pages <- if (is.infinite(n)) page_size else max(c(n %/% page_size), 1)
+  results <- vector("list", pages)
  i <- 1
  n_seen <- 0
-  
+
  if (length(results) > 1) {
    params[["maxResults"]] <- page_size
  }
-  
+
  if (verbose) {
    pb <- progress::progress_bar$new(
      format = "Downloading multiple pages :bar",
-      total = length(results)) 
+      total = length(results))
    withr::defer(pb$terminate())
  }
 
  # Time to sleep to avoid hitting the lowest rate limits
-  min_sleep <- 0.9 
+  min_sleep <- 0.9
  if (page_size == 500) {
    min_sleep <- min_sleep * 2
  }
  repeat({
-    
-    Sys.sleep(min_sleep) 
+
+    Sys.sleep(min_sleep)
    params[["next"]] <- cursor
    json <- catch_rate_limit(
      TWIT_get(
-        token, api, params, 
+        token, api, params,
        retryonratelimit = retryonratelimit,
        verbose = verbose
      )
    )
-    
+
    if (is_rate_limit(json)) {
      if (!is.null(retryonratelimit)){
-        warn_early_term(json, 
+        warn_early_term(json,
          hint = paste0("Set `continue = '", cursor, "' to continue."),
          hint_if = !identical(cursor, "next")
        )
      }
      break
    }
+
+    # Rate limit hit, repeat call to continue
+    error_limit <- "errors" %in% names(json)
+    continue_limit <- !is.null(retryonratelimit) && retryonratelimit
+    if (error_limit && continue_limit) {
+      json <- catch_rate_limit(
+        TWIT_get(
+          token, api, params,
+          retryonratelimit = retryonratelimit,
+          verbose = verbose
+        ))
+    }
+
    if (i > length(results)) {
      # double length per https://en.wikipedia.org/wiki/Dynamic_array#Geometric_expansion_and_amortized_cost
      length(results) <- 2 * length(results)
@@ -358,19 +396,19 @@ TWIT_paginate_premium <- function(token, api, params,
    results[[i]] <- json$results
    if (any(grepl("next", names(json)))) {
      cursor <- if (!is.null(json[["next"]])) json[["next"]]
-    } 
+    }
    n_seen <- n_seen + nrow(json$results)
    i <- i + 1
    empty_response <- !is.null(json$results) && length(json$results) == 0
    if ( length(n_seen) == 0 || n_seen >= n || empty_response) {
      break
    }
-    
+
    if (verbose) {
      pb$update(n_seen / n)
    }
  })
-  
+
  structure(results, "next" = cursor)
 }
@@ -410,14 +448,14 @@ handle_rate_limit <- function(x, api, retryonratelimit = NULL, verbose = TRUE) {
  headers <- httr::headers(x)
  n <- headers$`x-rate-limit-limit`
  when <- .POSIXct(as.numeric(headers$`x-rate-limit-reset`))
-  
+
  retryonratelimit <- retryonratelimit %||% getOption("rtweet.retryonratelimit", FALSE)
-  
+
  if (retryonratelimit) {
    wait_until(when, api, verbose = verbose)
  } else {
    message <- c(
-      paste0("Rate limit exceeded for Twitter endpoint '", api, "'"), 
+      paste0("Rate limit exceeded for Twitter endpoint '", api, "'"),
      paste0("Will receive ", n, " more requests at ", format(when, "%H:%M"))
    )
    abort(message, class = "rtweet_rate_limit", when = when)
@@ -445,7 +483,7 @@ warn_early_term <- function(cnd, hint, hint_if) {
 
 # https://developer.twitter.com/en/support/twitter-api/error-troubleshooting
 handle_error <- function(x) {
  if (!is.null(x$headers[["content-type"]])) {
-    stop("Twitter API failed [", x$status_code, "]\n",
stop("Twitter API failed [", x$status_code, "]\n", + stop("Twitter API failed [", x$status_code, "]\n", "Check error message at https://developer.twitter.com/en/support/twitter-api/error-troubleshooting", call. = FALSE) } json <- from_js(x) @@ -456,7 +494,7 @@ handle_error <- function(x) { handle_protected <- function(resp, params) { handle_codes(resp) - + if (any(c("screen_name", "user_id") %in% names(params))) { account <- params$screen_name if (is.null(account)) account <- params$user_id @@ -497,7 +535,7 @@ handle_codes <- function(x) { warning) funct(paste0(errors$message[e], " (", errors$code[e], ")"), call. = FALSE) - + } } } diff --git a/man/lookup_users.Rd b/man/lookup_users.Rd index 13f0a020..6d91719e 100644 --- a/man/lookup_users.Rd +++ b/man/lookup_users.Rd @@ -50,7 +50,7 @@ if (auth_has_default()) { users <- c("twitter", "rladiesglobal", "_R_Foundation") users <- lookup_users(users) users - + # latest tweet from each user tweets_data(users) } diff --git a/man/post_tweet.Rd b/man/post_tweet.Rd index dbe6d5ae..17776d68 100644 --- a/man/post_tweet.Rd +++ b/man/post_tweet.Rd @@ -93,8 +93,8 @@ grid(8, lwd = .15, lty = 2, col = "#00000088") dev.off() ## post tweet with media attachment -post_tweet("a tweet with media attachment", media = tmp, - media_alt_text = "Random points example of rtweet::post_tweet. +post_tweet("a tweet with media attachment", media = tmp, + media_alt_text = "Random points example of rtweet::post_tweet. rtweet requires alt text with all media") # example of replying within a thread diff --git a/tests/testthat/test-limits.R b/tests/testthat/test-limits.R new file mode 100644 index 00000000..d8f1f306 --- /dev/null +++ b/tests/testthat/test-limits.R @@ -0,0 +1,6 @@ +test_that("pagination with rate limits works", { + skip("requires manual testing") + # One needs to wait 15 minutes... + df_follower <- get_followers("CDU", n = 80000L, retryonratelimit = TRUE) + expect_equal(nrow(df_follower), 80000L) +})