From b33444da0a5aab98c6adf15dd530cf39aae7c8d2 Mon Sep 17 00:00:00 2001 From: rivkode Date: Thu, 23 May 2024 10:24:09 +0900 Subject: [PATCH 1/5] refactor: Collectors.joining() -> String.join() - Use String.join() for better readability and simplicity --- .../springframework/ai/reader/pdf/PagePdfDocumentReader.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java index 6b48ecdb790..fbdac2113f8 100644 --- a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java +++ b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java @@ -121,7 +121,7 @@ public List get() { && pagesPerDocument >= this.config.pagesPerDocument) { pagesPerDocument = 0; - var aggregatedPageTextGroup = pageTextGroupList.stream().collect(Collectors.joining()); + var aggregatedPageTextGroup = String.join("", pageTextGroupList); if (StringUtils.hasText(aggregatedPageTextGroup)) { readDocuments.add(toDocument(aggregatedPageTextGroup, startPageNumber, pageNumber)); } @@ -150,7 +150,7 @@ public List get() { pdfTextStripper.removeRegion(PDF_PAGE_REGION); } if (!CollectionUtils.isEmpty(pageTextGroupList)) { - readDocuments.add(toDocument(pageTextGroupList.stream().collect(Collectors.joining()), startPageNumber, + readDocuments.add(toDocument(String.join("", pageTextGroupList), startPageNumber, pageNumber)); } logger.info("Processing {} pages", totalPages); From 9431a64364728855501e499705d8103f6c28cd0b Mon Sep 17 00:00:00 2001 From: rivkode Date: Thu, 23 May 2024 10:46:04 +0900 Subject: [PATCH 2/5] fix : doesNotContain -> contains --- .../ai/reader/pdf/PagePdfDocumentReaderTests.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/PagePdfDocumentReaderTests.java b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/PagePdfDocumentReaderTests.java index f42d7ef3d2a..e61135728bc 100644 --- a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/PagePdfDocumentReaderTests.java +++ b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/PagePdfDocumentReaderTests.java @@ -50,9 +50,9 @@ public void classpathRead() { assertThat(docs).hasSize(4); - String allText = docs.stream().map(d -> d.getContent()).collect(Collectors.joining(System.lineSeparator())); + String allText = docs.stream().map(Document::getContent).collect(Collectors.joining(System.lineSeparator())); - assertThat(allText).doesNotContain( + assertThat(allText).contains( List.of("Page 1 of 4", "Page 2 of 4", "Page 3 of 4", "Page 4 of 4", "PDF Bookmark Sample")); } From 27402fc1a6793067cbcb7f54f077fd1e10386ca6 Mon Sep 17 00:00:00 2001 From: rivkode Date: Thu, 23 May 2024 11:15:54 +0900 Subject: [PATCH 3/5] chore: Remove unnecessary import --- .../org/springframework/ai/reader/pdf/PagePdfDocumentReader.java | 1 - 1 file changed, 1 deletion(-) diff --git a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java index fbdac2113f8..b34d02ba1a4 100644 --- a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java +++ b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java @@ -19,7 +19,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.stream.Collectors; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; From 5c88660a85f5ef030b9778dbbb5e164299b30f1c Mon Sep 17 00:00:00 2001 From: rivkode Date: Fri, 24 May 2024 11:01:11 +0900 Subject: [PATCH 4/5] fix: rollback String.join() -> Collectors.joining() --- .../springframework/ai/reader/pdf/PagePdfDocumentReader.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java index b34d02ba1a4..6b48ecdb790 100644 --- a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java +++ b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; @@ -120,7 +121,7 @@ public List get() { && pagesPerDocument >= this.config.pagesPerDocument) { pagesPerDocument = 0; - var aggregatedPageTextGroup = String.join("", pageTextGroupList); + var aggregatedPageTextGroup = pageTextGroupList.stream().collect(Collectors.joining()); if (StringUtils.hasText(aggregatedPageTextGroup)) { readDocuments.add(toDocument(aggregatedPageTextGroup, startPageNumber, pageNumber)); } @@ -149,7 +150,7 @@ public List get() { pdfTextStripper.removeRegion(PDF_PAGE_REGION); } if (!CollectionUtils.isEmpty(pageTextGroupList)) { - readDocuments.add(toDocument(String.join("", pageTextGroupList), startPageNumber, + readDocuments.add(toDocument(pageTextGroupList.stream().collect(Collectors.joining()), startPageNumber, pageNumber)); } logger.info("Processing {} pages", totalPages); From 3efddbaf8ca467837464ad2874a4cb6e56e4f8ac Mon Sep 17 00:00:00 2001 From: jonghun Date: Sun, 26 May 2024 07:10:41 +0900 Subject: [PATCH 5/5] style: method name classpathRead() -> withoutHeadersAndFooters() --- .../ai/reader/pdf/PagePdfDocumentReaderTests.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/PagePdfDocumentReaderTests.java b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/PagePdfDocumentReaderTests.java index e61135728bc..cad075afef1 100644 --- a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/PagePdfDocumentReaderTests.java +++ b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/PagePdfDocumentReaderTests.java @@ -32,7 +32,7 @@ public class PagePdfDocumentReaderTests { @Test - public void classpathRead() { + public void withoutHeadersAndFooters() { PagePdfDocumentReader pdfReader = new PagePdfDocumentReader("classpath:/sample1.pdf", PdfDocumentReaderConfig.builder() @@ -50,9 +50,9 @@ public void classpathRead() { assertThat(docs).hasSize(4); - String allText = docs.stream().map(Document::getContent).collect(Collectors.joining(System.lineSeparator())); + String allText = docs.stream().map(d -> d.getContent()).collect(Collectors.joining(System.lineSeparator())); - assertThat(allText).contains( + assertThat(allText).doesNotContain( List.of("Page 1 of 4", "Page 2 of 4", "Page 3 of 4", "Page 4 of 4", "PDF Bookmark Sample")); }