Skip to content

Commit

Permalink
[Minor] Unify converters usage
Browse files Browse the repository at this point in the history
  • Loading branch information
vstakhov committed Nov 19, 2019
1 parent bd45569 commit 7428ea0
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 22 deletions.
3 changes: 2 additions & 1 deletion src/libmime/archives.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ rspamd_archive_file_try_utf (struct rspamd_task *task,
struct rspamd_charset_converter *conv;
UConverter *utf8_converter;

conv = rspamd_mime_get_converter_cached (charset, &uc_err);
conv = rspamd_mime_get_converter_cached (charset, task->task_pool,
FALSE, &uc_err);
utf8_converter = rspamd_get_utf8_converter ();

if (conv == NULL) {
Expand Down
45 changes: 40 additions & 5 deletions src/libmime/mime_encoding.c
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,10 @@ rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,


struct rspamd_charset_converter *
rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err)
rspamd_mime_get_converter_cached (const gchar *enc,
rspamd_mempool_t *pool,
gboolean is_canon,
UErrorCode *err)
{
const gchar *canon_name;
static rspamd_lru_hash_t *cache;
Expand All @@ -147,7 +150,19 @@ rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err)
rspamd_str_equal);
}

canon_name = ucnv_getStandardName (enc, "IANA", err);
if (enc == NULL) {
return NULL;
}

if (!is_canon) {
rspamd_ftok_t cset_tok;

RSPAMD_FTOK_FROM_STR (&cset_tok, enc);
canon_name = rspamd_mime_detect_charset (&cset_tok, pool);
}
else {
canon_name = enc;
}

if (canon_name == NULL) {
return NULL;
Expand Down Expand Up @@ -306,7 +321,7 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
UConverter *utf8_converter;
struct rspamd_charset_converter *conv;

conv = rspamd_mime_get_converter_cached (in_enc, &uc_err);
conv = rspamd_mime_get_converter_cached (in_enc, pool, TRUE, &uc_err);
utf8_converter = rspamd_get_utf8_converter ();

if (conv == NULL) {
Expand Down Expand Up @@ -370,7 +385,8 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
UConverter *utf8_converter;
struct rspamd_charset_converter *conv;

conv = rspamd_mime_get_converter_cached (charset, &uc_err);
conv = rspamd_mime_get_converter_cached (charset, task->task_pool,
TRUE, &uc_err);
utf8_converter = rspamd_get_utf8_converter ();

if (conv == NULL) {
Expand Down Expand Up @@ -429,6 +445,7 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
gboolean
rspamd_mime_to_utf8_byte_array (GByteArray *in,
GByteArray *out,
rspamd_mempool_t *pool,
const gchar *enc)
{
gint32 r, clen, dlen;
Expand All @@ -438,6 +455,24 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
struct rspamd_charset_converter *conv;
rspamd_ftok_t charset_tok;

if (in == NULL || in->len == 0) {
return FALSE;
}

if (enc == NULL) {
/* Assume utf ? */
if (rspamd_fast_utf8_validate (in->data, in->len) == 0) {
g_byte_array_set_size (out, in->len);
memcpy (out->data, in->data, out->len);

return TRUE;
}
else {
/* Bad stuff, keep out */
return FALSE;
}
}

RSPAMD_FTOK_FROM_STR (&charset_tok, enc);

if (rspamd_mime_charset_utf_check (&charset_tok, (gchar *)in->data, in->len,
Expand All @@ -449,7 +484,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
}

utf8_converter = rspamd_get_utf8_converter ();
conv = rspamd_mime_get_converter_cached (enc, &uc_err);
conv = rspamd_mime_get_converter_cached (enc, pool, TRUE, &uc_err);

if (conv == NULL) {
return FALSE;
Expand Down
33 changes: 22 additions & 11 deletions src/libmime/mime_encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ const gchar *rspamd_mime_detect_charset (const rspamd_ftok_t *in,
* @param pool
* @param input
* @param len
* @param in_enc
* @param in_enc canon charset
* @param olen
* @param err
* @return
Expand All @@ -57,14 +57,20 @@ gchar *rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
gsize *olen, GError **err);

/**
* Converts data from `in` to `out`, returns `FALSE` if `enc` is not a valid iconv charset
* Converts data from `in` to `out`,
* returns `FALSE` if `enc` is not a valid iconv charset
*
* This function, in fact, copies `in` to `out`, replacing the content of
* `out` entirely.
* @param in
* @param out
* @param enc
* @param enc validated canonical charset name. If NULL, the input is only validated as UTF-8 and copied as is
* @return
*/
gboolean rspamd_mime_to_utf8_byte_array (GByteArray *in,
GByteArray *out, const gchar *enc);
GByteArray *out,
rspamd_mempool_t *pool,
const gchar *enc);

/**
* Maybe convert part to utf-8
Expand All @@ -83,7 +89,8 @@ void rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
* @return
*/
gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
gchar *in, gsize len, gboolean content_check);
gchar *in, gsize len,
gboolean content_check);

/**
* Ensure that all characters in string are valid utf8 chars or replace them
Expand All @@ -93,14 +100,18 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
*/
void rspamd_mime_charset_utf_enforce (gchar *in, gsize len);

/**
* Gets cached converter
* @param enc
* @param err
* @return
*/
/**
* Gets cached converter
* @param enc input encoding
* @param pool pool to use for temporary normalisation
* @param is_canon TRUE if `enc` is already a canonical name, FALSE if normalisation is needed
* @param err output error
* @return converter
*/
struct rspamd_charset_converter *rspamd_mime_get_converter_cached (
const gchar *enc,
rspamd_mempool_t *pool,
gboolean is_canon,
UErrorCode *err);

/**
Expand Down
21 changes: 16 additions & 5 deletions src/libmime/mime_headers.c
Original file line number Diff line number Diff line change
Expand Up @@ -512,9 +512,12 @@ rspamd_mime_headers_process (struct rspamd_task *task,
}

static void
rspamd_mime_header_maybe_save_token (rspamd_mempool_t *pool, GString *out,
GByteArray *token, GByteArray *decoded_token,
rspamd_ftok_t *old_charset, rspamd_ftok_t *new_charset)
rspamd_mime_header_maybe_save_token (rspamd_mempool_t *pool,
GString *out,
GByteArray *token,
GByteArray *decoded_token,
rspamd_ftok_t *old_charset,
rspamd_ftok_t *new_charset)
{
if (new_charset->len == 0) {
g_assert_not_reached ();
Expand All @@ -538,14 +541,22 @@ rspamd_mime_header_maybe_save_token (rspamd_mempool_t *pool, GString *out,
}

/* We need to flush and decode old token to out string */
if (rspamd_mime_to_utf8_byte_array (token, decoded_token,
if (rspamd_mime_to_utf8_byte_array (token, decoded_token, pool,
rspamd_mime_detect_charset (new_charset, pool))) {
g_string_append_len (out, decoded_token->data, decoded_token->len);
}

/* We also reset buffer */
g_byte_array_set_size (token, 0);
/* Propagate charset */
/*
* Propagate charset
*
* Here are dragons: we save the original charset to allow buffers concat
* in the condition at the beginning of the function.
* However, it will likely cause unnecessary calls for
* `rspamd_mime_detect_charset` which could be relatively expensive.
* But we ignore that for now...
*/
memcpy (old_charset, new_charset, sizeof (*old_charset));
}

Expand Down

0 comments on commit 7428ea0

Please sign in to comment.