html.cxx: html_process_input() appends each processed url to part_urls when
that array is non-null.
html_tests.cxx: add the "html urls extraction" test case, which feeds sample
HTML to html_process_input() and compares the collected urls with the expected
list.

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index d061f77260..bd323b43f0 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1331,6 +1331,9 @@ html_process_input(rspamd_mempool_t *pool,
 							url->count++;
 						}
 					}
+					if (part_urls) {
+						g_ptr_array_add(part_urls, url);
+					}
 					href_offset = hc->parsed.size();
 				}
 
diff --git a/src/libserver/html/html_tests.cxx b/src/libserver/html/html_tests.cxx
index 323858d71c..07618b2737 100644
--- a/src/libserver/html/html_tests.cxx
+++ b/src/libserver/html/html_tests.cxx
@@ -217,6 +217,42 @@ TEST_CASE("html text extraction")
 	rspamd_mempool_delete(pool);
 }
 
+/* Runs html_process_input() on each sample and checks that the urls collected
+ * in the array passed to it match the expected list, in order. */
+TEST_CASE("html urls extraction")
+{
+	using namespace std::string_literals;
+	const std::vector<std::pair<std::string, std::vector<std::string>>> cases{
+			{"<a href=\"https://example.com\">test</a>", {"https://example.com"}}
+	};
+
+	rspamd_url_init(nullptr);
+	auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(),
+			"html", 0);
+	auto i = 1;
+	for (const auto &c : cases) {
+		SUBCASE((fmt::format("html url extraction case {}", i)).c_str()) {
+			GPtrArray *purls = g_ptr_array_new();
+			GByteArray *tmp = g_byte_array_sized_new(c.first.size());
+			g_byte_array_append(tmp, reinterpret_cast<const guint8 *>(c.first.data()), c.first.size());
+			auto *hc = html_process_input(pool, tmp, nullptr, nullptr, purls, true);
+			CHECK(hc != nullptr);
+			const auto &expected = c.second;
+			CHECK(expected.size() == purls->len);
+			/* CHECK is non-fatal: bound by both lengths so a mismatch cannot read past purls */
+			for (std::size_t j = 0; j < expected.size() && j < purls->len; ++j) {
+				auto *url = static_cast<rspamd_url *>(g_ptr_array_index(purls, j));
+				CHECK(expected[j] == std::string{url->string, url->urllen});
+			}
+			g_byte_array_free(tmp, TRUE);
+			g_ptr_array_free(purls, TRUE);
+		}
+		++i;
+	}
+
+	rspamd_mempool_delete(pool);
+}
+
 }
 
 } /* namespace rspamd::html */