From b20d01c72924d1fe434adc6cb63046723ec0bf64 Mon Sep 17 00:00:00 2001 From: Jenny Bryan Date: Tue, 11 Nov 2025 09:20:18 -0800 Subject: [PATCH 1/3] Pass the arguments of encode_base64() along --- R/utils.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/utils.R b/R/utils.R index 925893d..81e1d95 100644 --- a/R/utils.R +++ b/R/utils.R @@ -171,9 +171,9 @@ encode_base64 <- function(x, line_length = 76L, newline = "\r\n") { } if (is.raw(x)) { - base64encode(x, 76L, newline) + base64encode(x, line_length, newline) } else { - base64encode(charToRaw(x), 76L, "\r\n") + base64encode(charToRaw(x), line_length, newline) } } From e5cdfddff6cb5d645765c6c85a24a45a176760d4 Mon Sep 17 00:00:00 2001 From: Jenny Bryan Date: Tue, 11 Nov 2025 11:35:26 -0800 Subject: [PATCH 2/3] Handle long-ish `Subject` with non-ASCII characters --- NEWS.md | 2 + R/gm_mime.R | 151 +++++++++++++++++++++++++++------- tests/testthat/test-gm_mime.R | 85 +++++++++++++++++-- 3 files changed, 201 insertions(+), 37 deletions(-) diff --git a/NEWS.md b/NEWS.md index 9b11b9f..7d24f5b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,8 @@ - Previously: `c("condition", "error", "gmail_error")` - Now: `c("gmailr_error", "gargle_error_request_failed", "http_error_{XXX}", "gargle_error", "rlang_error", "error", "condition")` +* Text headers, such as `Subject`, are now properly prepared as per RFC 2047, fixing a problem with long-ish headers that contain non-ASCII characters (#193). + ## Deprecations * Functions that lack the `gm_` prefix have been removed, concluding a deprecation process that kicked off with gmailr 1.0.0 (released 2019-08-23). These functions were hard deprecated in gmailr 2.0.0 (released 2023-06-30). This eliminates many name conflicts with other packages (including the base package). diff --git a/R/gm_mime.R b/R/gm_mime.R index ca31931..04e9ada 100644 --- a/R/gm_mime.R +++ b/R/gm_mime.R @@ -205,32 +205,6 @@ gm_attach_file <- function(mime, filename, type = NULL, id = NULL, ...) { ) } -header_encode <- function(x) { - x <- enc2utf8(unlist(strsplit(as.character(x), ", ?"))) - - # this won't deal with <> used in quotes, but I think it is rare enough that - # is ok - m <- rematch2::re_match(x, "^(?[^<]*?)(?: *<(?[^>]+)>)?$") - res <- character(length(x)) - - # simple addresses contain no <>, so we don't need to do anything further - simple <- !nzchar(m$addr_spec) - res[simple] <- m$phrase[simple] - - # complex addresses may need to be base64-encoded - needs_encoding <- Encoding(m$phrase) != "unknown" - res[needs_encoding] <- sprintf( - "=?utf-8?B?%s?=", - vcapply(m$phrase[needs_encoding], encode_base64) - ) - res[!needs_encoding] <- m$phrase[!needs_encoding] - - # Add the addr_spec onto non-simple examples - res[!simple] <- sprintf("%s <%s>", res[!simple], m$addr_spec[!simple]) - - paste0(res, collapse = ", ") -} - #' Convert a mime object to character representation #' #' This function converts a mime object into a character vector @@ -240,8 +214,9 @@ header_encode <- function(x) { #' @param ... further arguments ignored #' @export as.character.mime <- function(x, newline = "\r\n", ...) { - # encode headers - x$header <- lapply(x$header, header_encode) + for (i in seq_along(x$header)) { + x$header[[i]] <- encode_header(names(x$header)[i], x$header[[i]]) + } # Check if we need nested structure ((text + HTML) + attachments) has_both_bodies <- exists_list(x$parts, TEXT_PART) && @@ -364,3 +339,123 @@ with_defaults <- function(defaults, ...) { missing <- setdiff(names(defaults), names(args)) c(defaults[missing], args) } + +# Header encoding helpers ------------------------------------------------------ +# +# In general, the Gmail API requires following RFC 2822 Internet Message Format +# https://datatracker.ietf.org/doc/html/rfc2822 +# +# Then, within that, non-ASCII text in headers is addressed in RFC 2047 MIME +# Part Three: Message Header Extensions for Non-ASCII Text +# https://datatracker.ietf.org/doc/html/rfc2047 +# +# Refactoring the header processing was motivated by +# https://github.com/r-lib/gmailr/issues/193 + +# Strategy: Divide headers into address headers vs. everything else. +# +# Use existing helper to encode address headers, as it was clearly written for +# that use case. +# +# Use a new helper for other headers, that can deal with "folding" (see the RFC) +# long-ish, non-ASCII text, e.g. in the Subject. + +encode_header <- function(name, value) { + address_headers <- c( + "To", + "From", + "Cc", + "Bcc", + "Reply-To", + "Sender", + "Resent-To", + "Resent-From", + "Resent-Cc", + "Resent-Bcc", + "Resent-Sender" + ) + + fun <- if (name %in% address_headers) { + header_encode_address + } else { + header_encode_text + } + fun(value) +} + +# Pre-existing helper now renamed to reflect its motivating use case. +# - May contain multiple comma-separated addresses +# - Each address may have the format "Name" +# - Only the "Name" part needs encoding, not the email address +header_encode_address <- function(x) { + x <- enc2utf8(unlist(strsplit(as.character(x), ", ?"))) + + # this won't deal with <> used in quotes, but I think it is rare enough that + # is ok + m <- rematch2::re_match(x, "^(?[^<]*?)(?: *<(?[^>]+)>)?$") + res <- character(length(x)) + + # simple addresses contain no <>, so we don't need to do anything further + simple <- !nzchar(m$addr_spec) + res[simple] <- m$phrase[simple] + + # complex addresses may need to be base64-encoded + needs_encoding <- Encoding(m$phrase) != "unknown" + res[needs_encoding] <- sprintf( + "=?utf-8?B?%s?=", + vcapply(m$phrase[needs_encoding], encode_base64) + ) + res[!needs_encoding] <- m$phrase[!needs_encoding] + + # Add the addr_spec onto non-simple examples + res[!simple] <- sprintf("%s <%s>", res[!simple], m$addr_spec[!simple]) + + paste0(res, collapse = ", ") +} + +# New helper for a generic "text" header +# - Single value (not comma-separated) +# - May contain long Unicode text that exceeds RFC 2047's 75-character limit +# - Must be "folded" into multiple encoded-words if too long +header_encode_text <- function(x) { + if (length(x) == 0 || is.null(x)) { + return(x) + } + + x <- enc2utf8(as.character(x)) + + # Pass pure ASCII through unchanged + if (Encoding(x) == "unknown") { + return(x) + } + + # First, get a single base64-encoded string + b64_full <- encode_base64(x, line_length = 0L, newline = "") + b64_len <- nchar(b64_full) + + # encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" + # charset is utf-8 + # encoding is "B" (as opposed to "Q"), as in "BASE64" + encode_word <- function(b64) sprintf("=?utf-8?B?%s?=", b64) + + # RFC 2047: "An 'encoded-word' may not be more than 75 characters long, + # including 'charset', 'encoding', 'encoded-text', and delimiters." + # Format: =?utf-8?B??= + # The formalities account for 12 characters, which leaves up to 63 characters + # for the encoded text. However, base64 works in 4-character groups, so we + # must use a multiple of 4: the largest is 60. + max_b64_per_word <- 60 + + # Return as single encoded-word, if possible + if (b64_len <= max_b64_per_word) { + return(encode_word(b64_full)) + } + + # Otherwise, split into multiple encoded-words + starts <- seq(1L, b64_len, by = max_b64_per_word) + stops <- c(starts[-1] - 1L, b64_len) + encoded_words <- encode_word(substring(b64_full, starts, stops)) + + # Join multiple encoded-words with CRLF SPACE per RFC 2047 + paste0(encoded_words, collapse = "\r\n ") +} diff --git a/tests/testthat/test-gm_mime.R b/tests/testthat/test-gm_mime.R index 1db4523..66ee57c 100644 --- a/tests/testthat/test-gm_mime.R +++ b/tests/testthat/test-gm_mime.R @@ -4,39 +4,39 @@ test_that("MIME - Basic functions", { expect_true(length(msg$header) > 0) rv <- gm_to(msg, "adam@ali.as") - expect_equal(header_encode(rv$header$To), "adam@ali.as") + expect_equal(header_encode_address(rv$header$To), "adam@ali.as") rv <- gm_from(msg, "bob@ali.as") - expect_equal(header_encode(rv$header$From), "bob@ali.as") + expect_equal(header_encode_address(rv$header$From), "bob@ali.as") rv <- gm_to(msg, c("adam@ali.as", "another@ali.as", "bob@ali.as")) expect_equal( - header_encode(rv$header$To), + header_encode_address(rv$header$To), "adam@ali.as, another@ali.as, bob@ali.as" ) rv <- gm_cc(msg, c("adam@ali.as", "another@ali.as", "bob@ali.as")) expect_equal( - header_encode(rv$header$Cc), + header_encode_address(rv$header$Cc), "adam@ali.as, another@ali.as, bob@ali.as" ) rv <- gm_bcc(msg, c("adam@ali.as", "another@ali.as", "bob@ali.as")) expect_equal( - header_encode(rv$header$Bcc), + header_encode_address(rv$header$Bcc), "adam@ali.as, another@ali.as, bob@ali.as" ) }) -test_that("header_encode encodes non-ascii values as base64", { - expect_equal(header_encode("f\U00F6\U00F6"), "=?utf-8?B?ZsO2w7Y=?=") +test_that("header_encode_address encodes non-ascii values as base64", { + expect_equal(header_encode_address("f\U00F6\U00F6"), "=?utf-8?B?ZsO2w7Y=?=") expect_equal( - header_encode('"f\U00F6\U00F6 b\U00Er1" '), + header_encode_address('"f\U00F6\U00F6 b\U00Er1" '), "=?utf-8?B?ImbDtsO2IGIOcjEi?= " ) - res <- header_encode( + res <- header_encode_address( c( '"f\U00F6\U00F6 b\U00E1r" ', '"foo bar" ', @@ -248,3 +248,70 @@ test_that("trailing whitespace", { quoted_printable_encode("foo\t \n \t") ) }) + +test_that("header_encode_text() passes ASCII-only text through", { + ascii_subject <- "This is a plain ASCII subject" + result <- header_encode_text(ascii_subject) + expect_equal(result, ascii_subject) + + long_ascii <- strrep("a", 100) + result <- header_encode_text(long_ascii) + expect_equal(result, long_ascii) +}) + +test_that("header_encode_text() encodes short Unicode text", { + # Short subject with Unicode that fits in single encoded-word + short_unicode <- "Hello \u00E1\u00E9\u00ED\u00F3\u00FA" # "Hello áéíóú" + result <- header_encode_text(short_unicode) + + # Should not contain CRLF (no folding) + expect_no_match(result, "\r\n", fixed = TRUE) + # Should be a single encoded-word + expect_match(result, "^=[?]utf-8[?]B[?][A-Za-z0-9+/=]+[?]=$") + # Should be within RFC 2047 limit + expect_lte(nchar(result), 75) +}) + +# https://github.com/r-lib/gmailr/issues/193 +test_that("header_encode_text() folds long non-ASCII text", { + long_subject <- paste0("\u00E1", strrep("a", 54), "\u00E1") + result <- header_encode_text(long_subject) + + # Should contain CRLF SPACE (folded into multiple encoded-words) + expect_match(result, "\r\n ", fixed = TRUE) + + # Each line should be an encoded-word within RFC 2047 limit + lines <- strsplit(result, "\r\n ", fixed = TRUE)[[1]] + expect_gt(length(lines), 1) + for (line in lines) { + expect_lte(nchar(line), 75) + expect_match(line, "^=[?]utf-8[?]B[?][A-Za-z0-9+/=]+[?]=$") + } +}) + +test_that("header_encode_text() roundtrip: encode then decode", { + # this is to make sure we break up the encoded-text in chunks of 4 characters + original <- "\U0001F389\U0001F38A\U0001F388 Célébration extraordinaire à Zürich! \U0001F973\U0001F382\U0001F37E Join us for a très spécial soirée! \U0001F942\U0001F377\U0001F95C" + encoded <- header_encode_text(original) + encoded_words <- strsplit(encoded, "\r\n ", fixed = TRUE)[[1]] + encoded_text <- sub("[?]=$", "", sub("^=[?]utf-8[?]B[?]", "", encoded_words)) + decoded <- paste0(rawToChar(base64decode(encoded_text)), collapse = "") + expect_equal(decoded, original) +}) + +test_that("gm_subject() uses proper encoding in full MIME message", { + # Long subject - should be folded + long_subject <- paste0("\u00E1", strrep("a", 100), "\u00E1") + msg_long <- gm_mime() |> + gm_to("test@example.com") |> + gm_subject(long_subject) |> + gm_text_body("Body") + + msg_long_chr <- as.character(msg_long) + + # The subject should span multiple lines with proper folding + expect_match( + msg_long_chr, + "Subject: =[?]utf-8[?]B[?][A-Za-z0-9+/=]+[?]=\r\n =[?]utf-8[?]B[?]" + ) +}) From fa27f2dfcecc529dd3cf7dadab60989f70b9957d Mon Sep 17 00:00:00 2001 From: Jenny Bryan Date: Tue, 11 Nov 2025 14:40:13 -0800 Subject: [PATCH 3/3] Harden against R 4.1 on Windows --- tests/testthat/test-gm_mime.R | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-gm_mime.R b/tests/testthat/test-gm_mime.R index 66ee57c..888f9d6 100644 --- a/tests/testthat/test-gm_mime.R +++ b/tests/testthat/test-gm_mime.R @@ -261,7 +261,7 @@ test_that("header_encode_text() passes ASCII-only text through", { test_that("header_encode_text() encodes short Unicode text", { # Short subject with Unicode that fits in single encoded-word - short_unicode <- "Hello \u00E1\u00E9\u00ED\u00F3\u00FA" # "Hello áéíóú" + short_unicode <- "Hello \u00E1\u00E9\u00ED\u00F3\u00FA" result <- header_encode_text(short_unicode) # Should not contain CRLF (no folding) @@ -291,11 +291,15 @@ test_that("header_encode_text() folds long non-ASCII text", { test_that("header_encode_text() roundtrip: encode then decode", { # this is to make sure we break up the encoded-text in chunks of 4 characters - original <- "\U0001F389\U0001F38A\U0001F388 Célébration extraordinaire à Zürich! \U0001F973\U0001F382\U0001F37E Join us for a très spécial soirée! \U0001F942\U0001F377\U0001F95C" + original <- "\U0001F389\U0001F38A\U0001F388 C\u00E9l\u00E9bration extraordinaire \u00E0 Z\u00FCrich! \U0001F973\U0001F382\U0001F37E Join us for a tr\u00E8s sp\u00E9cial soir\u00E9e! \U0001F942\U0001F377\U0001F95C" encoded <- header_encode_text(original) + encoded_words <- strsplit(encoded, "\r\n ", fixed = TRUE)[[1]] encoded_text <- sub("[?]=$", "", sub("^=[?]utf-8[?]B[?]", "", encoded_words)) - decoded <- paste0(rawToChar(base64decode(encoded_text)), collapse = "") + + # Decode each chunk separately (to verify each is valid base64), then concatenate + decoded <- rawToChar(unlist(lapply(encoded_text, base64decode))) + Encoding(decoded) <- "UTF-8" expect_equal(decoded, original) })