Skip to content

Commit

Permalink
Update MarkdownReader to parse text before first header (#13327)
Browse files (browse the repository at this point in the history)
  • Loading branch information
joelrorseth committed May 7, 2024
1 parent 4431e99 commit 8cb9690
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
from pathlib import Path
from fsspec import AbstractFileSystem
from fsspec.implementations.local import LocalFileSystem
from typing import Any, Dict, List, Optional, Tuple, cast

from typing import Any, Dict, List, Optional, Tuple
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

Expand Down Expand Up @@ -44,34 +43,31 @@ def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]
lines = markdown_text.split("\n")

current_header = None
current_text = ""
current_lines = []

for line in lines:
header_match = re.match(r"^#+\s", line)
if header_match:
if current_header is not None:
if current_text == "" or None:
continue
markdown_tups.append((current_header, current_text))
# Upon first header, skip if current text chunk is empty
if current_header is not None or len(current_lines) > 0:
markdown_tups.append((current_header, "\n".join(current_lines)))

current_header = line
current_text = ""
current_lines.clear()
else:
current_text += line + "\n"
markdown_tups.append((current_header, current_text))

if current_header is not None:
# pass linting, assert keys are defined
markdown_tups = [
(re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value))
for key, value in markdown_tups
]
else:
markdown_tups = [
(key, re.sub("<.*?>", "", value)) for key, value in markdown_tups
]

return markdown_tups
current_lines.append(line)

# Append final text chunk
markdown_tups.append((current_header, "\n".join(current_lines)))

# Postprocess the tuples before returning
return [
(
key if key is None else re.sub(r"#", "", key).strip(),
re.sub(r"<.*?>", "", value),
)
for key, value in markdown_tups
]

def remove_images(self, content: str) -> str:
"""Remove images in markdown content."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ license = "MIT"
maintainers = ["FarisHijazi", "Haowjy", "ephe-meral", "hursh-desai", "iamarunbrahma", "jon-chuang", "mmaatouk", "ravi03071991", "sangwongenip", "thejessezhang"]
name = "llama-index-readers-file"
readme = "README.md"
version = "0.1.20"
version = "0.1.21"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from llama_index.readers.file import MarkdownReader


def test_parse_markdown_starting_with_header() -> None:
    """A document that opens with a header yields one tuple per header."""
    source = "# ABC\nabc\n# DEF\ndef"
    parsed = MarkdownReader().markdown_to_tups(source)
    assert parsed == [("ABC", "abc"), ("DEF", "def")]


def test_parse_markdown_with_text_before_first_header() -> None:
    """Text ahead of the first header is kept and paired with a ``None`` header."""
    source = "abc\n# ABC\ndef"
    parsed = MarkdownReader().markdown_to_tups(source)
    assert parsed == [(None, "abc"), ("ABC", "def")]


def test_parse_markdown_with_empty_lines_before_first_header() -> None:
    """Blank lines ahead of the first header still produce a ``None``-header chunk."""
    source = "\n\n\n# ABC\ndef"
    parsed = MarkdownReader().markdown_to_tups(source)
    # Three leading newlines leave two in the chunk: the one directly
    # before the header is not part of the text.
    assert parsed == [(None, "\n\n"), ("ABC", "def")]


def test_parse_markdown_with_no_headers() -> None:
    """A header-less document comes back as a single ``None``-header tuple."""
    source = "abc\ndef"
    parsed = MarkdownReader().markdown_to_tups(source)
    assert parsed == [(None, "abc\ndef")]


def test_parse_markdown_with_only_headers() -> None:
    """Headers with no body text are each paired with an empty string."""
    source = "# ABC\n# DEF"
    parsed = MarkdownReader().markdown_to_tups(source)
    assert parsed == [("ABC", ""), ("DEF", "")]


def test_parse_empty_markdown() -> None:
    """An empty document yields one tuple with a ``None`` header and empty text."""
    source = ""
    parsed = MarkdownReader().markdown_to_tups(source)
    assert parsed == [(None, "")]


def test_parse_omits_trailing_newline_before_new_header() -> None:
    """The newline directly before a header is dropped; other newlines are kept."""
    # Each entry pairs an input document with the tuples it should parse to.
    # Cases run in order against one shared reader.
    cases = [
        # Four leading newlines: only three remain in the pre-header chunk.
        (("\n" * 4) + "# ABC\nabc", [(None, "\n" * 3), ("ABC", "abc")]),
        # A newline at the very end of the document is preserved.
        (("\n" * 4) + "# ABC\nabc\n", [(None, "\n" * 3), ("ABC", "abc\n")]),
        # With no header at all, every newline is preserved.
        ("\n" * 4, [(None, "\n" * 4)]),
    ]

    parser = MarkdownReader()
    for source, wanted in cases:
        assert parser.markdown_to_tups(source) == wanted

0 comments on commit 8cb9690

Please sign in to comment.