Skip to content

Commit

Permalink
[Minor] Lua_util: Add normalize_utf8 utility
Browse files Browse the repository at this point in the history
Issue: #4475
  • Loading branch information
vstakhov committed May 5, 2023
1 parent 9d0a7d7 commit 7f7e053
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 5 deletions.
2 changes: 1 addition & 1 deletion src/libserver/url.h
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ int rspamd_url_cmp_qsort(const void *u1, const void *u2);
*/
#define rspamd_url_normalise_propagate_flags(pool, input, len_out, url_flags_out) \
do { \
enum rspamd_normalise_result norm_res; \
enum rspamd_utf8_normalise_result norm_res; \
norm_res = rspamd_normalise_unicode_inplace((input), (len_out)); \
if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) { \
url_flags_out |= RSPAMD_URL_FLAG_UNNORMALISED; \
Expand Down
4 changes: 2 additions & 2 deletions src/libutil/cxx/utf8_util.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ rspamd_string_unicode_trim_inplace (const char *str, size_t *len)
return ret;
}

enum rspamd_normalise_result
enum rspamd_utf8_normalise_result
rspamd_normalise_unicode_inplace(char *start, size_t *len)
{
UErrorCode uc_err = U_ZERO_ERROR;
Expand Down Expand Up @@ -156,7 +156,7 @@ rspamd_normalise_unicode_inplace(char *start, size_t *len)
*len = filter_zw_spaces_and_push_back(uc_string);
}

return static_cast<enum rspamd_normalise_result>(ret);
return static_cast<enum rspamd_utf8_normalise_result>(ret);
}

struct rspamd_icu_collate_storage {
Expand Down
4 changes: 2 additions & 2 deletions src/libutil/cxx/utf8_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ extern "C" {
*/
const char* rspamd_string_unicode_trim_inplace (const char *str, size_t *len);

enum rspamd_normalise_result {
enum rspamd_utf8_normalise_result {
RSPAMD_UNICODE_NORM_NORMAL = 0,
RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0),
RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1),
Expand All @@ -49,7 +49,7 @@ enum rspamd_normalise_result {
* @param len
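* @return bit flags from enum rspamd_utf8_normalise_result describing the outcome (RSPAMD_UNICODE_NORM_NORMAL, i.e. 0, if the string was already normalised)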
*/
enum rspamd_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len);
enum rspamd_utf8_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len);

/**
* Compare two strings using libicu collator
Expand Down
47 changes: 47 additions & 0 deletions src/lua/lua_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,19 @@ LUA_FUNCTION_DEF (util, strlen_utf8);
*/
LUA_FUNCTION_DEF (util, lower_utf8);

/***
* @function util.normalize_utf8(str)
* Gets a string in UTF8 and normalises it to NFKC_Casefold form
* @param {string} str utf8 encoded string
* @return {string,integer} normalised (NFKC_Casefold) utf8 string + result of the normalisation as bit flags (use bit.band to check):
* RSPAMD_UNICODE_NORM_NORMAL = 0,
* RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0),
* RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1),
* RSPAMD_UNICODE_NORM_ERROR = (1 << 2),
* RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3)
*/
LUA_FUNCTION_DEF (util, normalize_utf8);

/***
* @function util.strequal_caseless(str1, str2)
* Compares two strings regardless of their case using ascii comparison.
Expand Down Expand Up @@ -672,6 +685,7 @@ static const struct luaL_reg utillib_f[] = {
LUA_INTERFACE_DEF (util, parse_mail_address),
LUA_INTERFACE_DEF (util, strlen_utf8),
LUA_INTERFACE_DEF (util, lower_utf8),
LUA_INTERFACE_DEF (util, normalize_utf8),
LUA_INTERFACE_DEF (util, strequal_caseless),
LUA_INTERFACE_DEF (util, strequal_caseless_utf8),
LUA_INTERFACE_DEF (util, get_ticks),
Expand Down Expand Up @@ -1605,6 +1619,39 @@ lua_util_lower_utf8 (lua_State *L)
return 1;
}

/*
 * Lua binding for util.normalize_utf8(str).
 *
 * Accepts either a Lua string or a text userdata at stack index 1, runs
 * rspamd_normalise_unicode_inplace() on a private copy of its bytes and
 * pushes two values: the processed data (a text object if the argument was
 * userdata, a plain Lua string otherwise) followed by the integer value of
 * the normalisation result.
 *
 * Raises a Lua error ("invalid arguments") if the argument cannot be
 * obtained as text.
 */
static gint
lua_util_normalize_utf8 (lua_State *L)
{
	LUA_TRACE_POINT;
	/* Record the argument kind up front: the kind of the result mirrors it */
	const bool input_is_text = (lua_type (L, 1) == LUA_TUSERDATA);
	struct rspamd_lua_text *src = lua_check_text_or_string (L, 1);

	if (src == NULL) {
		return luaL_error(L, "invalid arguments");
	}

	/* Work on a NUL-terminated heap copy so the caller's data is untouched */
	gsize norm_len = src->len;
	char *buf = g_malloc (norm_len + 1);
	memcpy (buf, src->start, norm_len);
	buf[norm_len] = '\0';

	/* norm_len is passed by pointer so the callee can report the new length */
	enum rspamd_utf8_normalise_result norm_flags =
		rspamd_normalise_unicode_inplace (buf, &norm_len);

	if (!input_is_text) {
		/* lua_pushlstring makes its own copy, so the scratch buffer is released here */
		lua_pushlstring (L, buf, norm_len);
		g_free (buf);
	}
	else {
		/*
		 * The buffer is handed to the new text object and flagged as owned;
		 * NOTE(review): presumably the text object frees it on collection - confirm
		 * against the rspamd_lua_text implementation.
		 */
		struct rspamd_lua_text *result = lua_new_text (L, buf, norm_len, FALSE);
		result->flags |= RSPAMD_TEXT_FLAG_OWN;
	}

	lua_pushinteger (L, norm_flags);

	return 2;
}

static gint
lua_util_strequal_caseless (lua_State *L)
{
Expand Down

0 comments on commit 7f7e053

Please sign in to comment.