Skip to content

Commit

Permalink
[Feature] Add order to urls structure
Browse files Browse the repository at this point in the history
  • Loading branch information
vstakhov committed Jul 25, 2023
1 parent c82c2cc commit 5fd7a90
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 19 deletions.
23 changes: 13 additions & 10 deletions src/libmime/message.c
Original file line number Diff line number Diff line change
Expand Up @@ -763,7 +763,8 @@ rspamd_message_process_plain_text_part (struct rspamd_task *task,

static gboolean
rspamd_message_process_html_text_part (struct rspamd_task *task,
struct rspamd_mime_text_part *text_part)
struct rspamd_mime_text_part *text_part,
uint16_t *cur_url_order)
{
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML;

Expand All @@ -786,7 +787,8 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
&text_part->exceptions,
MESSAGE_FIELD (task, urls),
text_part->mime_part->urls,
task->cfg ? task->cfg->enable_css_parser : true);
task->cfg ? task->cfg->enable_css_parser : true,
cur_url_order);
rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);

if (text_part->utf_content.len == 0) {
Expand Down Expand Up @@ -842,7 +844,8 @@ rspamd_message_part_can_be_parsed_as_text (struct rspamd_task *task,
static gboolean
rspamd_message_process_text_part_maybe (struct rspamd_task *task,
struct rspamd_mime_part *mime_part,
enum rspamd_message_part_is_text_result is_text)
enum rspamd_message_part_is_text_result is_text,
uint16_t *cur_url_order)
{
struct rspamd_mime_text_part *text_part;
guint flags = 0;
Expand All @@ -864,7 +867,7 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
text_part->flags |= flags;

if (is_text == RSPAMD_MESSAGE_PART_IS_TEXT_HTML) {
if (!rspamd_message_process_html_text_part (task, text_part)) {
if (!rspamd_message_process_html_text_part (task, text_part, cur_url_order)) {
return FALSE;
}
}
Expand Down Expand Up @@ -911,27 +914,27 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
* Use strict extraction mode: we will extract missing urls from
* an html part if needed
*/
rspamd_url_text_extract (task->task_pool, task, text_part,
rspamd_url_text_extract (task->task_pool, task, text_part, cur_url_order,
RSPAMD_URL_FIND_STRICT);
}
else {
/*
* Fall back to full text extraction using TLD patterns
*/
rspamd_url_text_extract (task->task_pool, task, text_part,
rspamd_url_text_extract (task->task_pool, task, text_part, cur_url_order,
RSPAMD_URL_FIND_ALL);
}
}
else {
/*
* Fall back to full text extraction using TLD patterns
*/
rspamd_url_text_extract (task->task_pool, task, text_part,
rspamd_url_text_extract (task->task_pool, task, text_part, cur_url_order,
RSPAMD_URL_FIND_ALL);
}
}
else {
rspamd_url_text_extract (task->task_pool, task, text_part,
rspamd_url_text_extract (task->task_pool, task, text_part, cur_url_order,
RSPAMD_URL_FIND_STRICT);
}

Expand Down Expand Up @@ -1487,13 +1490,14 @@ rspamd_message_process (struct rspamd_task *task)
}
}

uint16_t cur_url_order = 0;
g_array_sort(detected_text_parts, rspamd_mime_text_part_position_compare_func);
/* One more iteration to process text parts in a more specific order */
for (i = 0; i < detected_text_parts->len; i ++) {
part = g_ptr_array_index (MESSAGE_FIELD (task, parts),
g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).pos);
rspamd_message_process_text_part_maybe(task, part,
g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).res);
g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).res, &cur_url_order);
}

g_array_free (detected_text_parts, TRUE);
Expand Down Expand Up @@ -1640,7 +1644,6 @@ rspamd_message_process (struct rspamd_task *task)
}

rspamd_images_link (task);

rspamd_tokenize_meta_words (task);
}

Expand Down
16 changes: 12 additions & 4 deletions src/libserver/html/html.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -1328,7 +1328,8 @@ html_process_input(struct rspamd_task *task,
GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
GPtrArray *part_urls,
bool allow_css) -> html_content *
bool allow_css,
std::uint16_t *cur_url_order) -> html_content *
{
const gchar *p, *c, *end, *start;
guchar t;
Expand Down Expand Up @@ -1372,6 +1373,7 @@ html_process_input(struct rspamd_task *task,
g_assert (task != NULL);

auto *pool = task->task_pool;
auto cur_url_part_order = 0u;

auto *hc = new html_content;
rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc);
Expand Down Expand Up @@ -1472,6 +1474,10 @@ html_process_input(struct rspamd_task *task,
struct rspamd_url *maybe_existing =
rspamd_url_set_add_or_return(url_set, maybe_url.value());
if (maybe_existing == maybe_url.value()) {
if (cur_url_order) {
	/*
	 * Take the current message-wide position and bump the shared counter.
	 * The parentheses matter: `*(p)++` would advance the pointer itself,
	 * leaving the counter untouched and reading past it on the next URL.
	 */
	url->order = (*cur_url_order)++;
}
url->part_order = cur_url_part_order++;
html_process_query_url(pool, url, url_set,
part_urls);
}
Expand Down Expand Up @@ -2273,10 +2279,11 @@ rspamd_html_process_part_full(struct rspamd_task *task,
GByteArray *in, GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
GPtrArray *part_urls,
bool allow_css)
bool allow_css,
uint16_t *cur_url_order)
{
return rspamd::html::html_process_input(task, in, exceptions, url_set,
part_urls, allow_css);
part_urls, allow_css, cur_url_order);
}

void *
Expand All @@ -2286,9 +2293,10 @@ rspamd_html_process_part(rspamd_mempool_t *pool,
struct rspamd_task fake_task;
memset(&fake_task, 0, sizeof(fake_task));
fake_task.task_pool = pool;
uint16_t order = 0;

return rspamd_html_process_part_full (&fake_task, in, NULL,
NULL, NULL, FALSE);
NULL, NULL, FALSE, &order);
}

guint
Expand Down
3 changes: 2 additions & 1 deletion src/libserver/html/html.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ void *rspamd_html_process_part_full(struct rspamd_task *task,
GByteArray *in, GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
GPtrArray *part_urls,
bool allow_css);
bool allow_css,
uint16_t *cur_url_order);

/*
* Returns true if a specified tag has been seen in a part
Expand Down
3 changes: 2 additions & 1 deletion src/libserver/html/html.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,8 @@ auto html_process_input(struct rspamd_task *task,
GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
GPtrArray *part_urls,
bool allow_css) -> html_content *;
bool allow_css,
std::uint16_t *cur_url_order) -> html_content *;
auto html_debug_structure(const html_content &hc) -> std::string;

}
Expand Down
6 changes: 3 additions & 3 deletions src/libserver/html/html_tests.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ TEST_CASE("html parsing")
SUBCASE((std::string("extract tags from: ") + c.first).c_str()) {
GByteArray *tmp = g_byte_array_sized_new(c.first.size());
g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size());
auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true);
auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true, nullptr);
CHECK(hc != nullptr);
auto dump = html_debug_structure(*hc);
CHECK(c.second == dump);
Expand Down Expand Up @@ -215,7 +215,7 @@ TEST_CASE("html text extraction")
SUBCASE((fmt::format("html extraction case {}", i)).c_str()) {
GByteArray *tmp = g_byte_array_sized_new(c.first.size());
g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size());
auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true);
auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true, nullptr);
CHECK(hc != nullptr);
replace_newlines(hc->parsed);
auto expected = c.second;
Expand Down Expand Up @@ -259,7 +259,7 @@ TEST_CASE("html urls extraction")
auto input = std::get<0>(c);
GByteArray *tmp = g_byte_array_sized_new(input.size());
g_byte_array_append(tmp, (const guint8 *)input.data(), input.size());
auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, purls, true);
auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, purls, true, nullptr);
CHECK(hc != nullptr);
auto &expected_text = std::get<2>(c);
if (expected_text.has_value()) {
Expand Down
19 changes: 19 additions & 0 deletions src/libserver/url.c
Original file line number Diff line number Diff line change
Expand Up @@ -2244,6 +2244,9 @@ rspamd_url_parse (struct rspamd_url *uri,
memset (uri, 0, sizeof (*uri));
memset (&u, 0, sizeof (u));
uri->count = 1;
/* Order is not assigned yet: -1 in the uint16_t fields marks it as undefined */
uri->order = -1;
uri->part_order = -1;

if (*uristring == '\0') {
return URI_ERRNO_EMPTY;
Expand Down Expand Up @@ -3453,6 +3456,8 @@ struct rspamd_url_mimepart_cbdata {
struct rspamd_task *task;
struct rspamd_mime_text_part *part;
gsize url_len;
uint16_t *cur_url_order; /* Global ordering */
uint16_t cur_part_order; /* Per part ordering */
};

static gboolean
Expand Down Expand Up @@ -3488,6 +3493,12 @@ rspamd_url_query_callback (struct rspamd_url *url, gsize start_offset,
if (cbd->part && cbd->part->mime_part->urls) {
g_ptr_array_add (cbd->part->mime_part->urls, url);
}

url->part_order = cbd->cur_part_order ++;

if (cbd->cur_url_order) {
	/*
	 * Record the message-wide position and increment the shared counter.
	 * `*(ptr)++` would post-increment the pointer stored in cbd rather
	 * than the counter it points to, so dereference first.
	 */
	url->order = (*cbd->cur_url_order)++;
}
}

return TRUE;
Expand Down Expand Up @@ -3542,6 +3553,11 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,

if (rspamd_url_set_add_or_increase(MESSAGE_FIELD (task, urls), url, false) &&
cbd->part->mime_part->urls) {
url->part_order = cbd->cur_part_order ++;

if (cbd->cur_url_order) {
	/*
	 * Record the message-wide position and increment the shared counter.
	 * `*(ptr)++` would post-increment the pointer stored in cbd rather
	 * than the counter it points to, so dereference first.
	 */
	url->order = (*cbd->cur_url_order)++;
}
g_ptr_array_add (cbd->part->mime_part->urls, url);
}

Expand All @@ -3564,6 +3580,7 @@ void
rspamd_url_text_extract (rspamd_mempool_t *pool,
struct rspamd_task *task,
struct rspamd_mime_text_part *part,
uint16_t *cur_url_order,
enum rspamd_url_find_type how)
{
struct rspamd_url_mimepart_cbdata mcbd;
Expand All @@ -3576,6 +3593,8 @@ rspamd_url_text_extract (rspamd_mempool_t *pool,
mcbd.task = task;
mcbd.part = part;
mcbd.url_len = 0;
mcbd.cur_url_order = cur_url_order;
mcbd.cur_part_order = 0;

rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data,
part->utf_stripped_content->len, how, part->newlines,
Expand Down
6 changes: 6 additions & 0 deletions src/libserver/url.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@ struct rspamd_url {
uint16_t count;
uint16_t urllen;
uint16_t rawlen;

/* Absolute order of the URL in a message */
uint16_t order;
/* Order of the URL in a specific part of message */
uint16_t part_order;
};

/**
Expand Down Expand Up @@ -156,6 +161,7 @@ void rspamd_url_deinit(void);
void rspamd_url_text_extract(rspamd_mempool_t *pool,
struct rspamd_task *task,
struct rspamd_mime_text_part *part,
uint16_t *cur_url_order, /* message-wide URL counter, may be NULL; same name as in the definition */
enum rspamd_url_find_type how);

/*
Expand Down

0 comments on commit 5fd7a90

Please sign in to comment.