[Test] Add tests for urls extraction

rspamd · Jul 13, 2021 · ccf4d5d · ccf4d5d
1 parent e930958
commit ccf4d5d
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 0 deletions.
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
@@ -1331,6 +1331,9 @@ html_process_input(rspamd_mempool_t *pool,
 						url->count++;
 					}
 				}
+				if (part_urls) {
+					g_ptr_array_add(part_urls, url);
+				}
 
 				href_offset = hc->parsed.size();
 			}

diff --git a/src/libserver/html/html_tests.cxx b/src/libserver/html/html_tests.cxx
@@ -217,6 +217,38 @@ TEST_CASE("html text extraction")
 	rspamd_mempool_delete(pool);
 }
 
+/*
+ * Checks the URLs that html_process_input() stores in the GPtrArray handed
+ * to it by the caller.
+ *
+ * Each case pairs an HTML snippet with the URL strings that are expected to
+ * appear in that array, in the same order.
+ */
+TEST_CASE("html urls extraction")
+{
+	using namespace std::string_literals;
+	const std::vector<std::pair<std::string, std::vector<std::string>>> cases{
+			{"<a href=\"https://example.com\">test</a>", {"https://example.com"}}
+	};
+
+	rspamd_url_init(nullptr);
+	auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(),
+			"html", 0);
+	auto i = 1;
+	for (const auto &c : cases) {
+		SUBCASE((fmt::format("html url extraction case {}", i)).c_str()) {
+			GPtrArray *purls = g_ptr_array_new();
+			GByteArray *tmp = g_byte_array_sized_new(c.first.size());
+			g_byte_array_append(tmp,
+					reinterpret_cast<const guint8 *>(c.first.data()),
+					c.first.size());
+			auto *hc = html_process_input(pool, tmp, nullptr, nullptr, purls, true);
+			CHECK(hc != nullptr);
+
+			/* Borrow the expected list; no copy is needed for read-only use */
+			const auto &expected = c.second;
+			CHECK(expected.size() == purls->len);
+
+			/*
+			 * CHECK is non-fatal, so execution continues after a size
+			 * mismatch: bound the loop by both lengths to never index past
+			 * the end of either container.
+			 */
+			for (guint j = 0; j < purls->len && j < expected.size(); ++j) {
+				auto *url = static_cast<rspamd_url *>(g_ptr_array_index(purls, j));
+				CHECK(expected[j] == std::string{url->string, url->urllen});
+			}
+
+			g_byte_array_free(tmp, TRUE);
+			g_ptr_array_free(purls, TRUE);
+		}
+		/* Advance the counter so that every case gets a distinct subcase name */
+		++i;
+	}
+
+	rspamd_mempool_delete(pool);
+}
+
 }
 
 } /* namespace rspamd::html */