Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2023-2024 the original author or authors.
* Copyright 2023-2025 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -37,8 +37,8 @@
import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;
import org.springframework.core.io.support.PathMatchingResourcePatternResolver;

/**
* Reads the given Markdown resource and groups headers, paragraphs, or text divided by
Expand All @@ -51,9 +51,9 @@
public class MarkdownDocumentReader implements DocumentReader {

/**
* The resource points to the Markdown document.
* The resources read by this document reader.
*/
private final Resource markdownResource;
private final Resource[] markdownResources;

/**
* Configuration for the parsing process.
Expand All @@ -67,48 +67,72 @@ public class MarkdownDocumentReader implements DocumentReader {

/**
* Create a new {@link MarkdownDocumentReader} instance.
* @param markdownResource the resource to read
* @param markdownResources the resources to read, will be resolved via
* {@link PathMatchingResourcePatternResolver}
*/
public MarkdownDocumentReader(String markdownResource) {
this(new DefaultResourceLoader().getResource(markdownResource), MarkdownDocumentReaderConfig.defaultConfig());
public MarkdownDocumentReader(String markdownResources) {
this(markdownResources, MarkdownDocumentReaderConfig.defaultConfig());
}

/**
* Create a new {@link MarkdownDocumentReader} instance.
* @param markdownResource the resource to read
* @param markdownResources the resources to read, will be resolved via
* {@link PathMatchingResourcePatternResolver}
* @param config the configuration to use
*/
public MarkdownDocumentReader(String markdownResource, MarkdownDocumentReaderConfig config) {
this(new DefaultResourceLoader().getResource(markdownResource), config);
public MarkdownDocumentReader(String markdownResources, MarkdownDocumentReaderConfig config) {
this(resolveResources(markdownResources), config);
}

/**
* Create a new {@link MarkdownDocumentReader} instance.
* Create a new {@link MarkdownDocumentReader} instance using a single
* {@link Resource}.
* @param markdownResource the resource to read
*/
public MarkdownDocumentReader(Resource markdownResource, MarkdownDocumentReaderConfig config) {
this.markdownResource = markdownResource;
this(List.of(markdownResource), config);
}

/**
 * Create a new {@link MarkdownDocumentReader} instance from {@link Resource resources}
 * that have already been resolved by the caller.
 * @param markdownResources the resources to read
 * @param config the configuration to use
 */
public MarkdownDocumentReader(List<Resource> markdownResources, MarkdownDocumentReaderConfig config) {
	this.parser = Parser.builder().build();
	this.config = config;
	// Snapshot the list into an array so later changes to the caller's list are not observed.
	this.markdownResources = markdownResources.toArray(Resource[]::new);
}

/**
 * Resolves a location or location pattern to the matching {@link Resource resources}
 * using a {@link PathMatchingResourcePatternResolver}.
 * @param markdownResources the location (pattern) of the Markdown resources to resolve
 * @return an unmodifiable list of the resolved resources
 * @throws RuntimeException if the resources cannot be resolved, wrapping the
 * underlying {@link IOException}
 */
private static List<Resource> resolveResources(String markdownResources) {
	try {
		return List.of(new PathMatchingResourcePatternResolver().getResources(markdownResources));
	}
	catch (IOException e) {
		// Include the requested location so the failure can be traced back to the
		// caller's input; the original cause is preserved.
		throw new RuntimeException("Failed to resolve Markdown resources for location: " + markdownResources, e);
	}
}

/**
* Extracts and returns a list of documents from the resource.
* @return List of extracted {@link Document}
*/
@Override
public List<Document> get() {
try (var input = this.markdownResource.getInputStream()) {
Node node = this.parser.parseReader(new InputStreamReader(input));

List<Document> documents = new ArrayList<>();
for (Resource markdownResource : this.markdownResources) {
DocumentVisitor documentVisitor = new DocumentVisitor(this.config);
node.accept(documentVisitor);
try (var input = markdownResource.getInputStream()) {
Node node = this.parser.parseReader(new InputStreamReader(input));

return documentVisitor.getDocuments();
}
catch (IOException e) {
throw new RuntimeException(e);
node.accept(documentVisitor);
documents.addAll(documentVisitor.getDocuments());
}
catch (IOException e) {
throw new RuntimeException(e);
}
}
return documents;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,49 @@
import static org.assertj.core.groups.Tuple.tuple;

/**
* Unit tests for {@link MarkdownDocumentReader}.
*
* @author Piotr Olaszewski
* @author shown.Ji
* @author Eric Bottard
*/
class MarkdownDocumentReaderTest {

@Test
void testDirPathSingle() {
	// Expected documents: one plain paragraph and one blockquote.
	var expectedParagraph = tuple(Map.of(),
			"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.");
	var expectedBlockquote = tuple(Map.of("category", "blockquote"),
			"Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.");

	// The location is a pattern, so it is resolved to every matching Markdown resource.
	List<Document> parsed = new MarkdownDocumentReader("classpath:/dir-test-1/*.md").get();

	assertThat(parsed).hasSize(2);
	// containsOnly ignores ordering, so the check does not depend on resource order.
	assertThat(parsed).extracting(Document::getMetadata, Document::getText)
		.containsOnly(expectedParagraph, expectedBlockquote);
}

@Test
void testDirPathMultiple() {
	// Documents expected from the fixture whose body uses inline emphasis markup.
	var fancyHeader = tuple(Map.of("category", "header_1", "title", "This is a fancy header name"),
			"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim.");
	var headerThree = tuple(Map.of("category", "header_3", "title", "Header 3"),
			"Aenean eu leo eu nibh tristique posuere quis quis massa.");

	// Documents expected from the fixture that consists of headers and paragraphs only.
	var headerOneA = tuple(Map.of("category", "header_1", "title", "Header 1a"),
			"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.");
	var headerOneB = tuple(Map.of("category", "header_1", "title", "Header 1b"),
			"Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh.");
	var headerTwoB = tuple(Map.of("category", "header_2", "title", "Header 2b"),
			"Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero.");
	var headerTwoC = tuple(Map.of("category", "header_2", "title", "Header 2c"),
			"Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.");

	List<Document> parsed = new MarkdownDocumentReader("classpath:/dir-test-2/*.md").get();

	assertThat(parsed).hasSize(6);
	// containsOnly ignores ordering, so the check does not depend on resource order.
	assertThat(parsed).extracting(Document::getMetadata, Document::getText)
		.containsOnly(fancyHeader, headerThree, headerOneA, headerOneB, headerTwoB, headerTwoC);
}

@Test
void testOnlyHeadersWithParagraphs() {
MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/only-headers.md");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed
nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.

> Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget
> sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a
> porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum
> suscipit.
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed
nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.

> Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget
> sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a
> porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum
> suscipit.
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Header 1a

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed
nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.

# Header 1b

Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed
sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh.

## Header 2b

Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien
odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero.

# Header 1c

## Header 2c

Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# This is a fancy header name

Lorem ipsum dolor sit amet, **consectetur adipiscing elit**. Donec tincidunt velit non bibendum gravida. Cras accumsan
tincidunt ornare. Donec hendrerit consequat tellus *blandit* accumsan. Aenean aliquam metus at ***arcu elementum***
dignissim.

### Header 3

Aenean eu leo eu nibh tristique _posuere quis quis massa_.