diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java index 630c2034195..e9970ff5dfc 100644 --- a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java @@ -1,5 +1,5 @@ /* - * Copyright 2023-2024 the original author or authors. + * Copyright 2023-2025 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,8 +37,8 @@ import org.springframework.ai.document.Document; import org.springframework.ai.document.DocumentReader; import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig; -import org.springframework.core.io.DefaultResourceLoader; import org.springframework.core.io.Resource; +import org.springframework.core.io.support.PathMatchingResourcePatternResolver; /** * Reads the given Markdown resource and groups headers, paragraphs, or text divided by @@ -51,9 +51,9 @@ public class MarkdownDocumentReader implements DocumentReader { /** - * The resource points to the Markdown document. + * The resources read by this document reader. */ - private final Resource markdownResource; + private final Resource[] markdownResources; /** * Configuration to a parsing process. @@ -67,48 +67,72 @@ public class MarkdownDocumentReader implements DocumentReader { /** * Create a new {@link MarkdownDocumentReader} instance. 
- * @param markdownResource the resource to read + * @param markdownResources the location pattern of the resources to read; it is + * resolved via {@link PathMatchingResourcePatternResolver} */ - public MarkdownDocumentReader(String markdownResource) { - this(new DefaultResourceLoader().getResource(markdownResource), MarkdownDocumentReaderConfig.defaultConfig()); + public MarkdownDocumentReader(String markdownResources) { + this(markdownResources, MarkdownDocumentReaderConfig.defaultConfig()); } /** * Create a new {@link MarkdownDocumentReader} instance. - * @param markdownResource the resource to read + * @param markdownResources the location pattern of the resources to read; it is + * resolved via {@link PathMatchingResourcePatternResolver} * @param config the configuration to use */ - public MarkdownDocumentReader(String markdownResource, MarkdownDocumentReaderConfig config) { - this(new DefaultResourceLoader().getResource(markdownResource), config); + public MarkdownDocumentReader(String markdownResources, MarkdownDocumentReaderConfig config) { + this(resolveResources(markdownResources), config); } /** - * Create a new {@link MarkdownDocumentReader} instance. + * Create a new {@link MarkdownDocumentReader} instance using a single + * {@link Resource}. * @param markdownResource the resource to read */ public MarkdownDocumentReader(Resource markdownResource, MarkdownDocumentReaderConfig config) { - this.markdownResource = markdownResource; + this(List.of(markdownResource), config); + } + + /** + * Create a new {@link MarkdownDocumentReader} instance using already resolved + * {@link Resource resources}. 
+ * @param markdownResources the resources to read + */ + public MarkdownDocumentReader(List<Resource> markdownResources, MarkdownDocumentReaderConfig config) { + this.markdownResources = markdownResources.toArray(new Resource[0]); this.config = config; this.parser = Parser.builder().build(); } + private static List<Resource> resolveResources(String markdownResources) { + try { + return List.of(new PathMatchingResourcePatternResolver().getResources(markdownResources)); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + /** * Extracts and returns a list of documents from the resource. * @return List of extracted {@link Document} */ @Override public List<Document> get() { - try (var input = this.markdownResource.getInputStream()) { - Node node = this.parser.parseReader(new InputStreamReader(input)); - + List<Document> documents = new ArrayList<>(); + for (Resource markdownResource : this.markdownResources) { DocumentVisitor documentVisitor = new DocumentVisitor(this.config); - node.accept(documentVisitor); + try (var input = markdownResource.getInputStream()) { + Node node = this.parser.parseReader(new InputStreamReader(input, java.nio.charset.StandardCharsets.UTF_8)); - return documentVisitor.getDocuments(); - } - catch (IOException e) { - throw new RuntimeException(e); + node.accept(documentVisitor); + documents.addAll(documentVisitor.getDocuments()); + } + catch (IOException e) { + throw new RuntimeException(e); + } } + return documents; } /** diff --git a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java index 0ad03edf91c..e2928eb4163 100644 --- a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java +++ b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java @@ -28,10 +28,49 @@ import static 
org.assertj.core.groups.Tuple.tuple; /** + * Unit tests for {@link MarkdownDocumentReader}. + * * @author Piotr Olaszewski + * @author shown.Ji + * @author Eric Bottard */ class MarkdownDocumentReaderTest { + @Test + void testDirPathSingle() { + MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/dir-test-1/*.md"); + + List<Document> documents = reader.get(); + + assertThat(documents).hasSize(2) + .extracting(Document::getMetadata, Document::getText) + .containsOnly(tuple(Map.of(), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."), + tuple(Map.of("category", "blockquote"), + "Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.")); + } + + @Test + void testDirPathMultiple() { + MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/dir-test-2/*.md"); + List<Document> documents = reader.get(); + + assertThat(documents).hasSize(6) + .extracting(Document::getMetadata, Document::getText) + .containsOnly(tuple(Map.of("category", "header_1", "title", "This is a fancy header name"), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim."), + tuple(Map.of("category", "header_3", "title", "Header 3"), + "Aenean eu leo eu nibh tristique posuere quis quis massa."), + tuple(Map.of("category", "header_1", "title", "Header 1a"), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."), + tuple(Map.of("category", "header_1", "title", "Header 1b"), + "Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh."), + tuple(Map.of("category", "header_2", "title", "Header 2b"), + "Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero."), + tuple(Map.of("category", "header_2", "title", "Header 2c"), + "Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.")); + } + @Test void testOnlyHeadersWithParagraphs() { MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/only-headers.md"); diff --git a/document-readers/markdown-reader/src/test/resources/dir-test-1/blockquote.md b/document-readers/markdown-reader/src/test/resources/dir-test-1/blockquote.md new file mode 100644 index 00000000000..41781437f55 --- /dev/null +++ b/document-readers/markdown-reader/src/test/resources/dir-test-1/blockquote.md @@ -0,0 +1,7 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed +nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. + +> Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget +> sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a +> porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. 
Curabitur semper nisi vel sem interdum +> suscipit. diff --git a/document-readers/markdown-reader/src/test/resources/dir-test-1/blockquote.txt b/document-readers/markdown-reader/src/test/resources/dir-test-1/blockquote.txt new file mode 100644 index 00000000000..41781437f55 --- /dev/null +++ b/document-readers/markdown-reader/src/test/resources/dir-test-1/blockquote.txt @@ -0,0 +1,7 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed +nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. + +> Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget +> sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a +> porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum +> suscipit. diff --git a/document-readers/markdown-reader/src/test/resources/dir-test-2/only-headers.md b/document-readers/markdown-reader/src/test/resources/dir-test-2/only-headers.md new file mode 100644 index 00000000000..19c6073c096 --- /dev/null +++ b/document-readers/markdown-reader/src/test/resources/dir-test-2/only-headers.md @@ -0,0 +1,20 @@ +# Header 1a + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed +nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. + +# Header 1b + +Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed +sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh. + +## Header 2b + +Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien +odio. 
Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. + +# Header 1c + +## Header 2c + +Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit. \ No newline at end of file diff --git a/document-readers/markdown-reader/src/test/resources/dir-test-2/with-formatting.md b/document-readers/markdown-reader/src/test/resources/dir-test-2/with-formatting.md new file mode 100644 index 00000000000..1198728f22f --- /dev/null +++ b/document-readers/markdown-reader/src/test/resources/dir-test-2/with-formatting.md @@ -0,0 +1,9 @@ +# This is a fancy header name + +Lorem ipsum dolor sit amet, **consectetur adipiscing elit**. Donec tincidunt velit non bibendum gravida. Cras accumsan +tincidunt ornare. Donec hendrerit consequat tellus *blandit* accumsan. Aenean aliquam metus at ***arcu elementum*** +dignissim. + +### Header 3 + +Aenean eu leo eu nibh tristique _posuere quis quis massa_. \ No newline at end of file