Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
- Previously: `c("condition", "error", "gmail_error")`
- Now: `c("gmailr_error", "gargle_error_request_failed", "http_error_{XXX}", "gargle_error", "rlang_error", "error", "condition")`

* Text headers, such as `Subject`, are now properly prepared as per RFC 2047, fixing a problem with long-ish headers that contain non-ASCII characters (#193).

## Deprecations

* Functions that lack the `gm_` prefix have been removed, concluding a deprecation process that kicked off with gmailr 1.0.0 (released 2019-08-23). These functions were hard deprecated in gmailr 2.0.0 (released 2023-06-30). This eliminates many name conflicts with other packages (including the base package).
Expand Down
151 changes: 123 additions & 28 deletions R/gm_mime.R
Original file line number Diff line number Diff line change
Expand Up @@ -205,32 +205,6 @@ gm_attach_file <- function(mime, filename, type = NULL, id = NULL, ...) {
)
}

header_encode <- function(x) {
x <- enc2utf8(unlist(strsplit(as.character(x), ", ?")))

# this won't deal with <> used in quotes, but I think it is rare enough that
# is ok
m <- rematch2::re_match(x, "^(?<phrase>[^<]*?)(?: *<(?<addr_spec>[^>]+)>)?$")
res <- character(length(x))

# simple addresses contain no <>, so we don't need to do anything further
simple <- !nzchar(m$addr_spec)
res[simple] <- m$phrase[simple]

# complex addresses may need to be base64-encoded
needs_encoding <- Encoding(m$phrase) != "unknown"
res[needs_encoding] <- sprintf(
"=?utf-8?B?%s?=",
vcapply(m$phrase[needs_encoding], encode_base64)
)
res[!needs_encoding] <- m$phrase[!needs_encoding]

# Add the addr_spec onto non-simple examples
res[!simple] <- sprintf("%s <%s>", res[!simple], m$addr_spec[!simple])

paste0(res, collapse = ", ")
}

#' Convert a mime object to character representation
#'
#' This function converts a mime object into a character vector
Expand All @@ -240,8 +214,9 @@ header_encode <- function(x) {
#' @param ... further arguments ignored
#' @export
as.character.mime <- function(x, newline = "\r\n", ...) {
# encode headers
x$header <- lapply(x$header, header_encode)
for (i in seq_along(x$header)) {
x$header[[i]] <- encode_header(names(x$header)[i], x$header[[i]])
}

# Check if we need nested structure ((text + HTML) + attachments)
has_both_bodies <- exists_list(x$parts, TEXT_PART) &&
Expand Down Expand Up @@ -364,3 +339,123 @@ with_defaults <- function(defaults, ...) {
missing <- setdiff(names(defaults), names(args))
c(defaults[missing], args)
}

# Header encoding helpers ------------------------------------------------------
#
# In general, the Gmail API requires following RFC 2822 Internet Message Format
# https://datatracker.ietf.org/doc/html/rfc2822
#
# Then, within that, non-ASCII text in headers is addressed in RFC 2047 MIME
# Part Three: Message Header Extensions for Non-ASCII Text
# https://datatracker.ietf.org/doc/html/rfc2047
#
# Refactoring the header processing was motivated by
# https://github.com/r-lib/gmailr/issues/193

# Strategy: Divide headers into address headers vs. everything else.
#
# Use existing helper to encode address headers, as it was clearly written for
# that use case.
#
# Use a new helper for other headers, that can deal with "folding" (see the RFC)
# long-ish, non-ASCII text, e.g. in the Subject.

encode_header <- function(name, value) {
address_headers <- c(
"To",
"From",
"Cc",
"Bcc",
"Reply-To",
"Sender",
"Resent-To",
"Resent-From",
"Resent-Cc",
"Resent-Bcc",
"Resent-Sender"
)

fun <- if (name %in% address_headers) {
header_encode_address
} else {
header_encode_text
}
fun(value)
}

# Pre-existing helper now renamed to reflect its motivating use case.
# - May contain multiple comma-separated addresses
# - Each address may have the format "Name" <email@example.com>
# - Only the "Name" part needs encoding, not the email address
header_encode_address <- function(x) {
x <- enc2utf8(unlist(strsplit(as.character(x), ", ?")))

# this won't deal with <> used in quotes, but I think it is rare enough that
# is ok
m <- rematch2::re_match(x, "^(?<phrase>[^<]*?)(?: *<(?<addr_spec>[^>]+)>)?$")
res <- character(length(x))

# simple addresses contain no <>, so we don't need to do anything further
simple <- !nzchar(m$addr_spec)
res[simple] <- m$phrase[simple]

# complex addresses may need to be base64-encoded
needs_encoding <- Encoding(m$phrase) != "unknown"
res[needs_encoding] <- sprintf(
"=?utf-8?B?%s?=",
vcapply(m$phrase[needs_encoding], encode_base64)
)
res[!needs_encoding] <- m$phrase[!needs_encoding]

# Add the addr_spec onto non-simple examples
res[!simple] <- sprintf("%s <%s>", res[!simple], m$addr_spec[!simple])

paste0(res, collapse = ", ")
}

# New helper for a generic "text" header
# - Single value (not comma-separated)
# - May contain long Unicode text that exceeds RFC 2047's 75-character limit
# - Must be "folded" into multiple encoded-words if too long
header_encode_text <- function(x) {
if (length(x) == 0 || is.null(x)) {
return(x)
}

x <- enc2utf8(as.character(x))

# Pass pure ASCII through unchanged
if (Encoding(x) == "unknown") {
return(x)
}

# First, get a single base64-encoded string
b64_full <- encode_base64(x, line_length = 0L, newline = "")
b64_len <- nchar(b64_full)

# encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
# charset is utf-8
# encoding is "B" (as opposed to "Q"), as in "BASE64"
encode_word <- function(b64) sprintf("=?utf-8?B?%s?=", b64)

# RFC 2047: "An 'encoded-word' may not be more than 75 characters long,
# including 'charset', 'encoding', 'encoded-text', and delimiters."
# Format: =?utf-8?B?<encoded-text>?=
# The formalities account for 12 characters, which leaves up to 63 characters
# for the encoded text. However, base64 works in 4-character groups, so we
# must use a multiple of 4: the largest is 60.
max_b64_per_word <- 60

# Return as single encoded-word, if possible
if (b64_len <= max_b64_per_word) {
return(encode_word(b64_full))
}

# Otherwise, split into multiple encoded-words
starts <- seq(1L, b64_len, by = max_b64_per_word)
stops <- c(starts[-1] - 1L, b64_len)
encoded_words <- encode_word(substring(b64_full, starts, stops))

# Join multiple encoded-words with CRLF SPACE per RFC 2047
paste0(encoded_words, collapse = "\r\n ")
}
4 changes: 2 additions & 2 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,9 @@ encode_base64 <- function(x, line_length = 76L, newline = "\r\n") {
}

if (is.raw(x)) {
base64encode(x, 76L, newline)
base64encode(x, line_length, newline)
} else {
base64encode(charToRaw(x), 76L, "\r\n")
base64encode(charToRaw(x), line_length, newline)
}
}

Expand Down
89 changes: 80 additions & 9 deletions tests/testthat/test-gm_mime.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,39 +4,39 @@ test_that("MIME - Basic functions", {
expect_true(length(msg$header) > 0)

rv <- gm_to(msg, "adam@ali.as")
expect_equal(header_encode(rv$header$To), "adam@ali.as")
expect_equal(header_encode_address(rv$header$To), "adam@ali.as")

rv <- gm_from(msg, "bob@ali.as")
expect_equal(header_encode(rv$header$From), "bob@ali.as")
expect_equal(header_encode_address(rv$header$From), "bob@ali.as")

rv <- gm_to(msg, c("adam@ali.as", "another@ali.as", "bob@ali.as"))
expect_equal(
header_encode(rv$header$To),
header_encode_address(rv$header$To),
"adam@ali.as, another@ali.as, bob@ali.as"
)

rv <- gm_cc(msg, c("adam@ali.as", "another@ali.as", "bob@ali.as"))
expect_equal(
header_encode(rv$header$Cc),
header_encode_address(rv$header$Cc),
"adam@ali.as, another@ali.as, bob@ali.as"
)

rv <- gm_bcc(msg, c("adam@ali.as", "another@ali.as", "bob@ali.as"))
expect_equal(
header_encode(rv$header$Bcc),
header_encode_address(rv$header$Bcc),
"adam@ali.as, another@ali.as, bob@ali.as"
)
})

test_that("header_encode encodes non-ascii values as base64", {
expect_equal(header_encode("f\U00F6\U00F6"), "=?utf-8?B?ZsO2w7Y=?=")
test_that("header_encode_address encodes non-ascii values as base64", {
expect_equal(header_encode_address("f\U00F6\U00F6"), "=?utf-8?B?ZsO2w7Y=?=")

expect_equal(
header_encode('"f\U00F6\U00F6 b\U00Er1" <baz@qux.com>'),
header_encode_address('"f\U00F6\U00F6 b\U00Er1" <baz@qux.com>'),
"=?utf-8?B?ImbDtsO2IGIOcjEi?= <baz@qux.com>"
)

res <- header_encode(
res <- header_encode_address(
c(
'"f\U00F6\U00F6 b\U00E1r" <baz@qux.com>',
'"foo bar" <foo.bar@baz.com>',
Expand Down Expand Up @@ -248,3 +248,74 @@ test_that("trailing whitespace", {
quoted_printable_encode("foo\t \n \t")
)
})

test_that("header_encode_text() passes ASCII-only text through", {
ascii_subject <- "This is a plain ASCII subject"
result <- header_encode_text(ascii_subject)
expect_equal(result, ascii_subject)

long_ascii <- strrep("a", 100)
result <- header_encode_text(long_ascii)
expect_equal(result, long_ascii)
})

test_that("header_encode_text() encodes short Unicode text", {
# Short subject with Unicode that fits in single encoded-word
short_unicode <- "Hello \u00E1\u00E9\u00ED\u00F3\u00FA"
result <- header_encode_text(short_unicode)

# Should not contain CRLF (no folding)
expect_no_match(result, "\r\n", fixed = TRUE)
# Should be a single encoded-word
expect_match(result, "^=[?]utf-8[?]B[?][A-Za-z0-9+/=]+[?]=$")
# Should be within RFC 2047 limit
expect_lte(nchar(result), 75)
})

# https://github.com/r-lib/gmailr/issues/193
test_that("header_encode_text() folds long non-ASCII text", {
long_subject <- paste0("\u00E1", strrep("a", 54), "\u00E1")
result <- header_encode_text(long_subject)

# Should contain CRLF SPACE (folded into multiple encoded-words)
expect_match(result, "\r\n ", fixed = TRUE)

# Each line should be an encoded-word within RFC 2047 limit
lines <- strsplit(result, "\r\n ", fixed = TRUE)[[1]]
expect_gt(length(lines), 1)
for (line in lines) {
expect_lte(nchar(line), 75)
expect_match(line, "^=[?]utf-8[?]B[?][A-Za-z0-9+/=]+[?]=$")
}
})

test_that("header_encode_text() roundtrip: encode then decode", {
# this is to make sure we break up the encoded-text in chunks of 4 characters
original <- "\U0001F389\U0001F38A\U0001F388 C\u00E9l\u00E9bration extraordinaire \u00E0 Z\u00FCrich! \U0001F973\U0001F382\U0001F37E Join us for a tr\u00E8s sp\u00E9cial soir\u00E9e! \U0001F942\U0001F377\U0001F95C"
encoded <- header_encode_text(original)

encoded_words <- strsplit(encoded, "\r\n ", fixed = TRUE)[[1]]
encoded_text <- sub("[?]=$", "", sub("^=[?]utf-8[?]B[?]", "", encoded_words))

# Decode each chunk separately (to verify each is valid base64), then concatenate
decoded <- rawToChar(unlist(lapply(encoded_text, base64decode)))
Encoding(decoded) <- "UTF-8"
expect_equal(decoded, original)
})

test_that("gm_subject() uses proper encoding in full MIME message", {
# Long subject - should be folded
long_subject <- paste0("\u00E1", strrep("a", 100), "\u00E1")
msg_long <- gm_mime() |>
gm_to("test@example.com") |>
gm_subject(long_subject) |>
gm_text_body("Body")

msg_long_chr <- as.character(msg_long)

# The subject should span multiple lines with proper folding
expect_match(
msg_long_chr,
"Subject: =[?]utf-8[?]B[?][A-Za-z0-9+/=]+[?]=\r\n =[?]utf-8[?]B[?]"
)
})