Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions libs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,7 @@ For sitemap sources, additional parameters can be provided, e.g.:
- `web_path`: The URL of the XML sitemap to crawl
- `filter_urls`: JSON array of URL patterns to filter pages (optional)
- `header_template`: JSON object for custom HTTP headers (optional)
- `continue_on_failure`: Whether to skip pages that fail to load instead of aborting the crawl (optional, default: `true`)

Technically, all parameters of the `SitemapLoader` from LangChain can be provided.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ async def aextract_content(
if meta_function is not None:
sitemap_loader_parameters["meta_function"] = meta_function

if "continue_on_failure" not in sitemap_loader_parameters:
sitemap_loader_parameters["continue_on_failure"] = True

document_loader = SitemapLoader(**sitemap_loader_parameters)
documents = []
try:
Expand Down Expand Up @@ -162,6 +165,21 @@ def _parse_sitemap_loader_parameters(
sitemap_loader_parameters[x.key] = json.loads(x.value)
except (json.JSONDecodeError, TypeError):
sitemap_loader_parameters[x.key] = x.value
elif x.key == "continue_on_failure":
sitemap_loader_parameters[x.key] = self._normalize_boolean(x.value)
else:
sitemap_loader_parameters[x.key] = int(x.value) if x.value.isdigit() else x.value
return sitemap_loader_parameters, parser_override

def _normalize_boolean(self, value: str) -> Optional[bool]:
    """Coerce a loosely-typed flag value into a real boolean.

    Parameters
    ----------
    value : str
        Usually the raw string taken from a key/value parameter. Values that
        are already ``bool``, ``int`` or ``float`` are accepted as well.

    Returns
    -------
    Optional[bool]
        ``value`` itself when it is already a ``bool``; ``bool(value)`` for
        other numbers; ``True``/``False`` for a recognised textual token
        (matched case-insensitively, surrounding whitespace ignored); and
        ``None`` when the input cannot be interpreted, so callers need to
        handle the ``None`` case themselves.
    """
    # bool must be tested before the numeric check: bool is a subclass of int.
    if isinstance(value, bool):
        return value
    if isinstance(value, (int, float)):
        return bool(value)
    if not isinstance(value, str):
        return None

    token_to_flag = {
        "true": True,
        "1": True,
        "yes": True,
        "y": True,
        "on": True,
        "false": False,
        "0": False,
        "no": False,
        "n": False,
        "off": False,
    }
    # Unknown tokens (including the empty string) yield None via dict.get.
    return token_to_flag.get(value.strip().lower())
13 changes: 8 additions & 5 deletions libs/extractor-api-lib/tests/sitemap_extractor_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,10 @@ async def test_aextract_content_minimal_parameters(self, mock_sitemap_loader_cla

# Verify
assert len(result) == 1
mock_sitemap_loader_class.assert_called_once_with(web_path="https://example.com/sitemap.xml")
mock_sitemap_loader_class.assert_called_once_with(
web_path="https://example.com/sitemap.xml",
continue_on_failure=True,
)

@pytest.mark.asyncio
@patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader")
Expand Down Expand Up @@ -428,8 +431,8 @@ async def test_aextract_content_edge_case_empty_kwargs(self, mock_sitemap_loader

# Verify
assert result == []
# Should still call SitemapLoader but with no additional parameters
mock_sitemap_loader_class.assert_called_once_with()
# Should still call SitemapLoader but with default failure handling
mock_sitemap_loader_class.assert_called_once_with(continue_on_failure=True)

@pytest.mark.asyncio
@patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader")
Expand All @@ -441,7 +444,7 @@ async def test_aextract_content_mixed_parameter_types(self, mock_sitemap_loader_
kwargs=[
KeyValuePair(key="web_path", value="https://example.com/sitemap.xml"),
KeyValuePair(key="max_depth", value="3"), # Will be converted to int
KeyValuePair(key="continue_on_failure", value="true"), # Will remain string
KeyValuePair(key="continue_on_failure", value="true"), # Will be converted to bool
KeyValuePair(key="filter_urls", value='["pattern1", "pattern2"]'), # Will be parsed as JSON
KeyValuePair(
key="header_template", value='{"Authorization": "Bearer token123"}'
Expand All @@ -462,7 +465,7 @@ async def test_aextract_content_mixed_parameter_types(self, mock_sitemap_loader_
call_args = mock_sitemap_loader_class.call_args[1]
assert call_args["web_path"] == "https://example.com/sitemap.xml"
assert call_args["max_depth"] == 3 # Converted to int
assert call_args["continue_on_failure"] == "true" # Remained string
assert call_args["continue_on_failure"] is True # Converted to bool
assert call_args["filter_urls"] == ["pattern1", "pattern2"] # Parsed JSON
assert call_args["header_template"] == {"Authorization": "Bearer token123"} # Parsed JSON
assert call_args["custom_param"] == "custom_value" # Remained string
5 changes: 5 additions & 0 deletions services/frontend/libs/admin-app/data-access/document.api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ export interface SitemapConfig {
headerTemplate: string;
name: string;
parser?: 'docusaurus' | 'astro' | 'generic';
continueOnFailure?: boolean;
}

export class DocumentAPI {
Expand Down Expand Up @@ -99,6 +100,10 @@ export class DocumentAPI {
payload.push({ key: 'sitemap_parser', value: config.parser });
}

if (typeof config.continueOnFailure === 'boolean') {
payload.push({ key: 'continue_on_failure', value: String(config.continueOnFailure) });
}

// add filter_urls only if provided
if (config.filterUrls && config.filterUrls.trim()) {
// Convert multiline string to array and filter out empty lines
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ const confluenceCql = ref('');
const sitemapFilterUrls = ref('');
const sitemapHeaderTemplate = ref('');
const sitemapParser = ref<'docusaurus' | 'astro' | 'generic' | undefined>(undefined);
const sitemapContinueOnFailure = ref(true);

const error = computed(() => store.error);

Expand Down Expand Up @@ -102,6 +103,7 @@ const handleConfluenceUpload = () => {
filterUrls: sitemapFilterUrls.value,
headerTemplate: sitemapHeaderTemplate.value,
parser,
continueOnFailure: sitemapContinueOnFailure.value,
});
}

Expand Down Expand Up @@ -237,6 +239,15 @@ const getErrorMessage = (errorType: string) => {
<textarea v-model="sitemapFilterUrls" placeholder="Filter URLs (optional) - one regex pattern per line" class="textarea textarea-bordered w-full" rows="3"></textarea>
<label for="sitemapHeaderTemplate" class="sr-only">Headers JSON</label>
<textarea v-model="sitemapHeaderTemplate" placeholder="Headers (optional) - JSON format: {&quot;Authorization&quot;: &quot;Bearer token&quot;}" class="textarea textarea-bordered w-full" rows="2"></textarea>
<label class="flex items-center justify-between text-sm">
<span>{{ t('documents.sitemapContinueOnFailure') }}</span>
<input
v-model="sitemapContinueOnFailure"
type="checkbox"
class="checkbox checkbox-sm"
:title="t('documents.sitemapContinueOnFailureHint')"
/>
</label>
</div>
<p class="text-xs opacity-50 mb-4">{{ t('documents.sitemapLoadDescription') }}</p>
<button class="btn btn-sm btn-accent" @click="handleSitemapUpload">
Expand Down
2 changes: 2 additions & 0 deletions services/frontend/libs/i18n/admin/de.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
"sitemapParserAstro": "Parser: Astro / Starlight",
"sitemapParserDocusaurus": "Parser: Docusaurus",
"sitemapParserGeneric": "Parser: Generisch",
"sitemapContinueOnFailure": "Bei Fehlern fortfahren",
"sitemapContinueOnFailureHint": "Fehlgeschlagene Seiten überspringen, statt den Crawl abzubrechen",
"loadSitemap": "Sitemap laden",
"fileTypeNotAllowedTitle": "Dateityp nicht erlaubt",
"fileTypeNotAllowedDescription": "Erlaubte Dateitypen:",
Expand Down
2 changes: 2 additions & 0 deletions services/frontend/libs/i18n/admin/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
"sitemapParserAstro": "Parser: Astro / Starlight",
"sitemapParserDocusaurus": "Parser: Docusaurus",
"sitemapParserGeneric": "Parser: Generic",
"sitemapContinueOnFailure": "Continue on failure",
"sitemapContinueOnFailureHint": "Skip pages that fail to load instead of aborting the crawl",
"loadSitemap": "Load Sitemap",
"select": "Select",
"chat": "Start chat",
Expand Down
Loading