From 693b2537418a1346a44713c28ce5f700f7d5ce46 Mon Sep 17 00:00:00 2001 From: jon-chuang <9093549+jon-chuang@users.noreply.github.com> Date: Sat, 8 Jul 2023 23:36:08 +0800 Subject: [PATCH] feat(node_parser): `MetadataExtractor` - Feature Augmentation via node parser post-processing (#6764) Co-authored-by: jon-chuang Co-authored-by: Jerry Liu --- .../MetadataExtractionSEC.ipynb | 655 ++++++++++++++++++ docs/how_to/customization/custom_documents.md | 14 + docs/how_to/index/metadata_extraction.md | 73 ++ docs/how_to/index/usage_pattern.md | 1 + llama_index/node_parser/__init__.py | 8 +- .../node_parser/extractors/__init__.py | 17 + .../extractors/metadata_extractors.py | 334 +++++++++ llama_index/node_parser/interface.py | 17 +- llama_index/node_parser/simple.py | 9 + tests/node_parser/metadata_extractor.py | 37 + 10 files changed, 1162 insertions(+), 3 deletions(-) create mode 100644 docs/examples/metadata_extraction/MetadataExtractionSEC.ipynb create mode 100644 docs/how_to/index/metadata_extraction.md create mode 100644 llama_index/node_parser/extractors/__init__.py create mode 100644 llama_index/node_parser/extractors/metadata_extractors.py create mode 100644 tests/node_parser/metadata_extractor.py diff --git a/docs/examples/metadata_extraction/MetadataExtractionSEC.ipynb b/docs/examples/metadata_extraction/MetadataExtractionSEC.ipynb new file mode 100644 index 0000000000000..6f66ab9b9affe --- /dev/null +++ b/docs/examples/metadata_extraction/MetadataExtractionSEC.ipynb @@ -0,0 +1,655 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b07531d9-7473-480d-bee6-c1ee4cbc207c", + "metadata": {}, + "source": [ + "# Extracting Metadata for Better Document Indexing and Understanding\n", + "\n", + "In many cases, especially with long documents, a chunk of text may lack the context necessary to disambiguate the chunk from other similar chunks of text. One method of addressing this is manually labelling each chunk in our dataset or knowledge base. However, this can be labour-intensive and time-consuming for a large or continually updated set of documents.\n", + "\n", + "To combat this, we use LLMs to extract certain contextual information relevant to the document to better help the retrieval and language models disambiguate similar-looking passages.\n", + "\n", + "We do this through our brand-new `MetadataExtractor` modules."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "91c807cc-1334-4f92-8a9e-9ccd702f3578", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0adb8e4a-6728-4073-8256-8b3be4ab1e64", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from llama_index import ListIndex, LLMPredictor\n", + "from langchain import OpenAI\n", + "from llama_index import download_loader, VectorStoreIndex, ServiceContext\n", + "from llama_index.schema import MetadataMode" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "a0231dff-7443-46bf-9b9d-759198d3408e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "llm_predictor = LLMPredictor(\n", + " llm=OpenAI(temperature=0, model_name=\"text-davinci-003\", max_tokens=512)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2db2cf90-f295-4a3d-a47c-4b2b1dd2d7c5", + "metadata": {}, + "source": [ + "We create a node parser that extracts the document title and hypothetical question embeddings relevant to the document chunk.\n", + "\n", + "We also show how to instantiate the `SummaryExtractor` and `KeywordExtractor`, as well as how to create your own custom extractor \n", + "based on the `MetadataFeatureExtractor` base class." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "3bda151d-6fb8-427e-82fc-0f3bb469d705", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from llama_index.node_parser import SimpleNodeParser\n", + "from llama_index.node_parser.extractors import (\n", + " MetadataExtractor,\n", + " SummaryExtractor,\n", + " QuestionsAnsweredExtractor,\n", + " TitleExtractor,\n", + " KeywordExtractor,\n", + " MetadataFeatureExtractor,\n", + ")\n", + "from llama_index.langchain_helpers.text_splitter import TokenTextSplitter\n", + "\n", + "text_splitter = TokenTextSplitter(separator=\" \", chunk_size=512, chunk_overlap=128)\n", + "\n", + "\n", + "class CustomExtractor(MetadataFeatureExtractor):\n", + " def extract(self, nodes):\n", + " metadata_list = [\n", + " {\n", + " \"custom\": node.metadata[\"document_title\"]\n", + " + \"\\n\"\n", + " + node.metadata[\"excerpt_keywords\"]\n", + " }\n", + " for node in nodes\n", + " ]\n", + " return metadata_list\n", + "\n", + "\n", + "metadata_extractor = MetadataExtractor(\n", + " extractors=[\n", + " TitleExtractor(nodes=5),\n", + " QuestionsAnsweredExtractor(questions=3),\n", + " # SummaryExtractor(summaries=[\"prev\", \"self\"]),\n", + " # KeywordExtractor(keywords=10),\n", + " # CustomExtractor()\n", + " ],\n", + ")\n", + "\n", + "node_parser = SimpleNodeParser(\n", + " text_splitter=text_splitter,\n", + " metadata_extractor=metadata_extractor,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "c72c45a9-dcad-4925-b2f7-d25fe5d80c2d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from llama_index import SimpleDirectoryReader, DocumentSummaryIndex" + ] + }, + { + "cell_type": "markdown", + "id": "e4e54937-e9e7-48ed-8600-72cd2f3c529b", + "metadata": {}, + "source": [ + "We first load the 10-K annual SEC reports for Uber and Lyft for the years 2019 and 2020, respectively."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2e5ef50-82ef-4936-bbc2-c022f67007a0", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p data\n", + "!wget -O \"data/10k-132.pdf\" \"https://www.dropbox.com/scl/fi/6dlqdk6e2k1mjhi8dee5j/uber.pdf?rlkey=2jyoe49bg2vwdlz30l76czq6g&dl=1\"\n", + "!wget -O \"data/10k-vFinal.pdf\" \"https://www.dropbox.com/scl/fi/qn7g3vrk5mqb18ko4e5in/lyft.pdf?rlkey=j6jxtjwo8zbstdo4wz3ns8zoj&dl=1\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "38a46bf6-9539-4ac2-ad97-eb909992b94d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Note the uninformative document file name, which may be a common scenario in a production setting\n", + "uber_docs = SimpleDirectoryReader(input_files=[\"data/10k-132.pdf\"]).load_data()\n", + "uber_front_pages = uber_docs[0:3]\n", + "uber_content = uber_docs[63:69]\n", + "uber_docs = uber_front_pages + uber_content" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "269f8ecc-489d-435f-9d81-a9c64fd4d400", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "uber_nodes = node_parser.get_nodes_from_documents(uber_docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8da4d824-d518-4d37-8322-a35adac05157", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'page_label': '2',\n", + " 'file_name': '10k-132.pdf',\n", + " 'document_title': 'Uber Technologies, Inc. 2019 Annual Report: Revolutionizing Mobility and Logistics Across 69 Countries and 111 Million MAPCs with $65 Billion in Gross Bookings',\n", + " 'questions_this_excerpt_can_answer': '\\n\\n1. How many countries does Uber Technologies, Inc. operate in?\\n2. What is the total number of MAPCs served by Uber Technologies, Inc.?\\n3. How much gross bookings did Uber Technologies, Inc. generate in 2019?'}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "uber_nodes[1].metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "93e70bfb-6c02-401b-be91-3827f358b22c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Note the uninformative document file name, which may be a common scenario in a production setting\n", + "lyft_docs = SimpleDirectoryReader(input_files=[\"data/10k-vFinal.pdf\"]).load_data()\n", + "lyft_front_pages = lyft_docs[0:3]\n", + "lyft_content = lyft_docs[68:73]\n", + "lyft_docs = lyft_front_pages + lyft_content" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e3720b40-c50c-4185-aaf4-289ff8ab057e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "lyft_nodes = node_parser.get_nodes_from_documents(lyft_docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "98740f96-afdd-45ff-bcc0-2c50965a7349", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'page_label': '2',\n", + " 'file_name': '10k-vFinal.pdf',\n", + " 'document_title': \"Lyft, Inc. 2021 Annual Meeting of Stockholders: Filing and Attestation of Management's Assessment Report, Filer Status, Internal Control Assessment, Shell Company Status, Market Value of Common Stock, and Analysis of Historical Financial Performance.\",\n", + " 'questions_this_excerpt_can_answer': '\\n\\n1. What is the status of the registrant as an accelerated filer?\\n2. 
Has the registrant filed a report on and attestation to its management’s assessment of the effectiveness of its internal control over financial reporting?\\n3. What is the total number of shares of Class A and Class B common stock outstanding as of February 22, 2021?'}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lyft_nodes[2].metadata" + ] + }, + { + "cell_type": "markdown", + "id": "ddd5805f-c459-40ae-a21c-5fa0de750a60", + "metadata": {}, + "source": [ + "Since we are asking fairly sophisticated questions, we utilize a subquestion query engine for all QnA pipelines below, and prompt it to pay more attention to the relevance of the retrieved sources. " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "302bb085-86cc-4b76-a452-67bc826b292d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from llama_index.question_gen.llm_generators import LLMQuestionGenerator\n", + "from llama_index.question_gen.prompts import DEFAULT_SUB_QUESTION_PROMPT_TMPL\n", + "\n", + "service_context = ServiceContext.from_defaults(\n", + " llm_predictor=llm_predictor, node_parser=node_parser\n", + ")\n", + "question_gen = LLMQuestionGenerator.from_defaults(\n", + " service_context=service_context,\n", + " prompt_template_str=\"\"\"\n", + " Follow the example, but instead of giving a question, always prefix the question \n", + " with: 'By first identifying and quoting the most relevant sources, '. \n", + " \"\"\"\n", + " + DEFAULT_SUB_QUESTION_PROMPT_TMPL,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "12cd959f-80ac-4d10-9246-d27cc6c1096a", + "metadata": {}, + "source": [ + "## Querying an Index With No Extra Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "37dd8992-3716-44da-9309-154fb5946e98", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LLM sees:\n", + " [Excerpt from document]\n", + "page_label: 65\n", + "file_name: 10k-132.pdf\n", + "Excerpt:\n", + "-----\n", + "See the section titled “Reconciliations of Non-GAAP Financial Measures” for our definition and a \n", + "reconciliation of net income (loss) attributable to Uber Technologies, Inc. to Adjusted EBITDA. \n", + " \n", + " Year Ended December 31, 2017 to 2018 2018 to 2019 \n", + "(In millions, exce pt percenta ges) 2017 2018 2019 % Chan ge % Chan ge \n", + "Adjusted EBITDA ................................ 
$ (2,642) $ (1,847) $ (2,725) 30% (48)%\n", + "-----\n" + ] + } + ], + "source": [ + "from copy import deepcopy\n", + "\n", + "nodes_no_metadata = deepcopy(uber_nodes) + deepcopy(lyft_nodes)\n", + "for node in nodes_no_metadata:\n", + " node.metadata = {\n", + " k: node.metadata[k] for k in node.metadata if k in [\"page_label\", \"file_name\"]\n", + " }\n", + "print(\"LLM sees:\\n\", (nodes_no_metadata)[9].get_content(metadata_mode=MetadataMode.LLM))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "b8ff619d-67ed-4263-bfc7-2a7a1b7320e7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from llama_index import VectorStoreIndex\n", + "from llama_index.vector_stores import FaissVectorStore\n", + "from llama_index.query_engine import SubQuestionQueryEngine\n", + "from llama_index.tools import QueryEngineTool, ToolMetadata" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "028a65d7-8065-4798-acec-1c3486633e14", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "index_no_metadata = VectorStoreIndex(nodes=nodes_no_metadata)\n", + "engine_no_metadata = index_no_metadata.as_query_engine(similarity_top_k=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "73ea9e05-ff5a-49b6-8e52-139d156cde47", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "final_engine_no_metadata = SubQuestionQueryEngine.from_defaults(\n", + " query_engine_tools=[\n", + " QueryEngineTool(\n", + " query_engine=engine_no_metadata,\n", + " metadata=ToolMetadata(\n", + " name=\"sec_filing_documents\",\n", + " description=\"financial information on companies\",\n", + " ),\n", + " )\n", + " ],\n", + " question_gen=question_gen,\n", + " use_async=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "fd5a3e51-e252-4e24-bc2b-fbc32ce078dd", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated 4 sub questions.\n", + "\u001b[36;1m\u001b[1;3m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, what was the cost due to research and development for Uber in 2019 in millions of USD?\n", + "\u001b[0m\u001b[33;1m\u001b[1;3m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, what was the cost due to sales and marketing for Uber in 2019 in millions of USD?\n", + "\u001b[0m\u001b[38;5;200m\u001b[1;3m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, what was the cost due to research and development for Lyft in 2019 in millions of USD?\n", + "\u001b[0m\u001b[32;1m\u001b[1;3m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, what was the cost due to sales and marketing for Lyft in 2019 in millions of USD?\n", + "\u001b[0m\u001b[38;5;200m\u001b[1;3m[sec_filing_documents] A: \n", + "\n", + "According to the excerpt from page_label: 69 and file_name: 10k-132.pdf, the cost due to research and development for Lyft in 2019 was 15% of total revenue, or $1.5 billion in millions of USD.\n", + "\u001b[0m\u001b[33;1m\u001b[1;3m[sec_filing_documents] A: \n", + "\n", + "According to the excerpt from page_label 66 of the 10k-132.pdf document, the cost due to sales and marketing for Uber in 2019 was $397.8 million in thousands of USD, or $397,800,000 in millions of USD. 
This cost was primarily attributable to continued investments within Uber's non-Rides offerings and an increase in corporate overhead as the business grows.\n", + "\u001b[0m\u001b[36;1m\u001b[1;3m[sec_filing_documents] A: \n", + "\n", + "According to the excerpt from page 69 of the document 10k-132.pdf, research and development expenses for Uber in 2019 were $909.1 million in thousands of USD, which equates to $909.1 million in millions of USD. This was 9% of the total costs and expenses for the year, and accounted for 34% of the total revenue.\n", + "\u001b[0m\u001b[32;1m\u001b[1;3m[sec_filing_documents] A: \n", + "\n", + "According to the excerpt from page_label 72 of the 10k-vFinal.pdf document, the cost due to sales and marketing for Lyft in 2019 was $275.1 million in thousands of USD. This can be seen in the following excerpt: \n", + "\n", + "\"Sales and marketing $ 196,437 $ 194,184 $ 163,858 $ 180,951 $ 275,129 \n", + "Year Ended December 31,2019 to 2020 \n", + "% Change2018 to 2019 \n", + "% Change\"\n", + "\u001b[0mAnswer: \n", + "{\n", + " \"Uber\": {\n", + " \"Research and Development\": 909.1,\n", + " \"Sales and Marketing\": 397.8\n", + " },\n", + " \"Lyft\": {\n", + " \"Research and Development\": 1.5,\n", + " \"Sales and Marketing\": 275.1\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "response_no_metadata = final_engine_no_metadata.query(\n", + " \"\"\"\n", + " What was the cost due to research and development v.s. sales and marketing for uber and lyft in 2019 in millions of USD?\n", + " Give your answer as a JSON.\n", + " \"\"\"\n", + ")\n", + "print(response_no_metadata.response)\n", + "# Correct answer:\n", + "# {\"Uber\": {\"Research and Development\": 4836, \"Sales and Marketing\": 4626},\n", + "# \"Lyft\": {\"Research and Development\": 1505.6, \"Sales and Marketing\": 814 }}" + ] + }, + { + "cell_type": "markdown", + "id": "e9dafdad-c18c-4e0f-8a35-b691ca73e1f2", + "metadata": {}, + "source": [ + "**RESULT**: As we can see, the QnA agent does not seem to know where to look for the right documents. As a result it gets only 1/4 of the subquestions right." + ] + }, + { + "cell_type": "markdown", + "id": "9878905f-c47d-46b2-9ad9-063538e717e1", + "metadata": {}, + "source": [ + "## Querying an Index With Extracted Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "97f00a18-e9e6-47db-bef5-cbf5bb5016be", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LLM sees:\n", + " [Excerpt from document]\n", + "page_label: 65\n", + "file_name: 10k-132.pdf\n", + "document_title: Uber Technologies, Inc. 2019 Annual Report: Revolutionizing Mobility and Logistics Across 69 Countries and 111 Million MAPCs with $65 Billion in Gross Bookings\n", + "questions_this_excerpt_can_answer: \n", + "\n", + "1. What is Uber Technologies, Inc.'s definition of Adjusted EBITDA?\n", + "2. How much did Adjusted EBITDA change from 2017 to 2018?\n", + "3. How much did Adjusted EBITDA change from 2018 to 2019?\n", + "Excerpt:\n", + "-----\n", + "See the section titled “Reconciliations of Non-GAAP Financial Measures” for our definition and a \n", + "reconciliation of net income (loss) attributable to Uber Technologies, Inc. to Adjusted EBITDA. \n", + " \n", + " Year Ended December 31, 2017 to 2018 2018 to 2019 \n", + "(In millions, exce pt percenta ges) 2017 2018 2019 % Chan ge % Chan ge \n", + "Adjusted EBITDA ................................ 
$ (2,642) $ (1,847) $ (2,725) 30% (48)%\n", + "-----\n" + ] + } + ], + "source": [ + "print(\n", + " \"LLM sees:\\n\",\n", + " (uber_nodes + lyft_nodes)[9].get_content(metadata_mode=MetadataMode.LLM),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "c7d255de-3034-4035-93bc-45d535ce1700", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "index = VectorStoreIndex(nodes=uber_nodes + lyft_nodes)\n", + "engine = index.as_query_engine(similarity_top_k=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "bbe42516-a2ca-4986-9012-cb15682323f5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "final_engine = SubQuestionQueryEngine.from_defaults(\n", + " query_engine_tools=[\n", + " QueryEngineTool(\n", + " query_engine=engine,\n", + " metadata=ToolMetadata(\n", + " name=\"sec_filing_documents\",\n", + " description=\"financial information on companies.\",\n", + " ),\n", + " )\n", + " ],\n", + " question_gen=question_gen,\n", + " use_async=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "f48ac2d9-58e9-4b98-9bad-b8ce1eea7934", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated 4 sub questions.\n", + "\u001b[36;1m\u001b[1;3m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, what was the cost due to research and development for Uber in 2019 in millions of USD?\n", + "\u001b[0m\u001b[33;1m\u001b[1;3m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, what was the cost due to sales and marketing for Uber in 2019 in millions of USD?\n", + "\u001b[0m\u001b[38;5;200m\u001b[1;3m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, what was the cost due to research and development for Lyft in 2019 in millions of USD?\n", + "\u001b[0m\u001b[32;1m\u001b[1;3m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, what was the cost due to sales and marketing for Lyft in 2019 in millions of USD?\n", + "\u001b[0m\u001b[38;5;200m\u001b[1;3m[sec_filing_documents] A: \n", + "\n", + "According to the excerpt from the document, Lyft spent $1,505 million on research and development in 2019. This was 34% of total costs and expenses for the year.\n", + "\u001b[0m\u001b[33;1m\u001b[1;3m[sec_filing_documents] A: \n", + "\n", + "According to the excerpt from page 69 of the Uber Technologies, Inc. 2019 Annual Report, the cost due to sales and marketing for Uber in 2019 was $4,626 million in USD. This cost was driven by investments in non-Rides offerings, corporate overhead, and Driver incentives.\n", + "\u001b[0m\u001b[36;1m\u001b[1;3m[sec_filing_documents] A: \n", + "\n", + "According to the excerpt from page 69 of the document, Uber Technologies, Inc. spent $4.836 billion on research and development in 2019, which was driven by a 22% increase in MAPCs due to global expansion of their Eats product offerings combined with wider market adoption of their Rides product, and overall growth in their other offerings.\n", + "\u001b[0m\u001b[32;1m\u001b[1;3m[sec_filing_documents] A: \n", + "\n", + "According to the excerpt from page 69 of the document titled \"Lyft, Inc. 
2021 Annual Meeting of Stockholders: Filing and Attestation of Management's Assessment Report, Filer Status, Internal Control Assessment, Shell Company Status, Market Value of Common Stock, and Analysis of Historical Financial Performance,\" the cost due to sales and marketing for Lyft in 2019 was $814.122 million in USD. This can be found in the table under the heading \"2020 2019 2018 (in thousands)\" which states that \"Sales and marketing $416,331 $814,122 $803,751.\"\n", + "\u001b[0mAnswer: \n", + "{\n", + " \"Uber\": {\n", + " \"Research and Development\": 4.836,\n", + " \"Sales and Marketing\": 4.626\n", + " },\n", + " \"Lyft\": {\n", + " \"Research and Development\": 1.505,\n", + " \"Sales and Marketing\": 0.814\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "response = final_engine.query(\n", + " \"\"\"\n", + " What was the cost due to research and development v.s. sales and marketing for uber and lyft in 2019 in millions of USD?\n", + " Give your answer as a JSON.\n", + " \"\"\"\n", + ")\n", + "print(response.response)\n", + "# Correct answer:\n", + "# {\"Uber\": {\"Research and Development\": 4836, \"Sales and Marketing\": 4626},\n", + "# \"Lyft\": {\"Research and Development\": 1505.6, \"Sales and Marketing\": 814 }}" + ] + }, + { + "cell_type": "markdown", + "id": "8bee6d91-84f4-4bde-89dc-d010f9aebc3e", + "metadata": {}, + "source": [ + "**RESULT**: As we can see, the LLM answers the questions correctly." + ] + }, + { + "cell_type": "markdown", + "id": "14826bae-4032-4886-87a8-50f9f28d7ace", + "metadata": {}, + "source": [ + "### Challenges Identified in the Problem Domain\n", + "\n", + "In this example, we observed that the search quality as provided by vector embeddings was rather poor. This was likely because these highly dense financial documents were not well represented in the model's training data.\n", + "\n", + "To improve the search quality, other methods of neural search that employ more keyword-based approaches, such as ColBERTv2/PLAID, may help. In particular, matching on specific keywords would help identify high-relevance chunks.\n", + "\n", + "Other valid steps may include utilizing models that are fine-tuned on financial datasets, such as BloombergGPT.\n", + "\n", + "Finally, we can further enrich the metadata by providing more information about the surrounding context in which each chunk is located.\n", + "\n", + "### Improvements to this Example\n", + "Generally, this example can be improved further with more rigorous evaluation of both the metadata extraction accuracy and the accuracy and recall of the QnA pipeline. Further, incorporating a larger set of documents, as well as full-length documents, which may provide more confounding passages that are difficult to disambiguate, could further stress-test the system we have built and suggest further improvements. 
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "llama_index_jon", + "language": "python", + "name": "llama_index_jon" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/how_to/customization/custom_documents.md b/docs/how_to/customization/custom_documents.md index 5b711d2c7bcab..1ba16f4fb8c07 100644 --- a/docs/how_to/customization/custom_documents.md +++ b/docs/how_to/customization/custom_documents.md @@ -131,3 +131,17 @@ document = Document( print("The LLM sees this: \n", document.get_content(metadata_mode=MetadataMode.LLM)) print("The Embedding model sees this: \n", document.get_content(metadata_mode=MetadataMode.EMBED)) ``` + + +## Advanced - Automatic Metadata Extraction + +We have initial examples of using LLMs themselves to perform metadata extraction. + +Take a look here! + +```{toctree} +--- +maxdepth: 1 +--- +/examples/metadata_extraction/MetadataExtractionSEC.ipynb +``` \ No newline at end of file diff --git a/docs/how_to/index/metadata_extraction.md b/docs/how_to/index/metadata_extraction.md new file mode 100644 index 0000000000000..d5e2b3b2ccbe5 --- /dev/null +++ b/docs/how_to/index/metadata_extraction.md @@ -0,0 +1,71 @@ +# Metadata Extraction + + +## Introduction +In many cases, especially with long documents, a chunk of text may lack the context necessary to disambiguate the chunk from other similar chunks of text. + +To combat this, we use LLMs to extract certain contextual information relevant to the document to better help the retrieval and language models disambiguate similar-looking passages. + +We show this in an [example notebook](https://github.com/jerryjliu/llama_index/blob/main/examples/metadata_extraction/MetadataExtractionSEC.ipynb) and demonstrate its effectiveness in processing long documents. + +## Usage + +First, we define a metadata extractor that takes in a list of feature extractors that will be processed in sequence. + +We then feed this to the node parser, which will add the additional metadata to each node. +```python +from llama_index.node_parser import SimpleNodeParser +from llama_index.node_parser.extractors import ( + MetadataExtractor, + SummaryExtractor, + QuestionsAnsweredExtractor, + TitleExtractor, + KeywordExtractor, +) + +metadata_extractor = MetadataExtractor( + extractors=[ + TitleExtractor(nodes=5), + QuestionsAnsweredExtractor(questions=3), + SummaryExtractor(summaries=["prev", "self"]), + KeywordExtractor(keywords=10), + ], +) + +node_parser = SimpleNodeParser( + metadata_extractor=metadata_extractor, +) +``` + +Here is an sample of extracted metadata: + +``` +{'page_label': '2', + 'file_name': '10k-132.pdf', + 'document_title': 'Uber Technologies, Inc. 2019 Annual Report: Revolutionizing Mobility and Logistics Across 69 Countries and 111 Million MAPCs with $65 Billion in Gross Bookings', + 'questions_this_excerpt_can_answer': '\n\n1. How many countries does Uber Technologies, Inc. operate in?\n2. What is the total number of MAPCs served by Uber Technologies, Inc.?\n3. How much gross bookings did Uber Technologies, Inc. generate in 2019?', + 'prev_section_summary': "\n\nThe 2019 Annual Report provides an overview of the key topics and entities that have been important to the organization over the past year. 
These include financial performance, operational highlights, customer satisfaction, employee engagement, and sustainability initiatives. It also provides an overview of the organization's strategic objectives and goals for the upcoming year.", + 'section_summary': '\nThis section discusses a global tech platform that serves multiple multi-trillion dollar markets with products leveraging core technology and infrastructure. It enables consumers and drivers to tap a button and get a ride or work. The platform has revolutionized personal mobility with ridesharing and is now leveraging its platform to redefine the massive meal delivery and logistics industries. The foundation of the platform is its massive network, leading technology, operational excellence, and product expertise.', + 'excerpt_keywords': '\nRidesharing, Mobility, Meal Delivery, Logistics, Network, Technology, Operational Excellence, Product Expertise, Point A, Point B'} +``` + +## Custom Extractors + +If the provided extractors do not fit your needs, you can also define a custom extractor like so: +```python +from typing import Dict, List + +from llama_index.node_parser.extractors import MetadataFeatureExtractor + +class CustomExtractor(MetadataFeatureExtractor): + def extract(self, nodes) -> List[Dict]: + metadata_list = [ + { + "custom": node.metadata["document_title"] + + "\n" + + node.metadata["excerpt_keywords"] + } + for node in nodes + ] + return metadata_list +``` + +In a more advanced example, a custom extractor can also make use of an `llm_predictor` to extract features from the node content and the existing metadata. Refer to the [source code of the provided metadata extractors](https://github.com/jerryjliu/llama_index/blob/main/llama_index/node_parser/extractors/metadata_extractors.py) for more details. \ No newline at end of file diff --git a/docs/how_to/index/usage_pattern.md b/docs/how_to/index/usage_pattern.md index 2d65383186169..b2389c4271576 100644 --- a/docs/how_to/index/usage_pattern.md +++ b/docs/how_to/index/usage_pattern.md @@ -81,5 +81,6 @@ Read more about how to deal with data sources that change over time with `Index` --- maxdepth: 1 --- +metadata_extraction.md document_management.md ``` diff --git a/llama_index/node_parser/__init__.py b/llama_index/node_parser/__init__.py index b33740e7f2610..78a3a1fbf14ea 100644 --- a/llama_index/node_parser/__init__.py +++ b/llama_index/node_parser/__init__.py @@ -1,6 +1,10 @@ """Node parsers.""" -from llama_index.node_parser.simple import SimpleNodeParser from llama_index.node_parser.interface import NodeParser +from llama_index.node_parser.simple import SimpleNodeParser + -__all__ = ["SimpleNodeParser", "NodeParser"] +__all__ = [ + "SimpleNodeParser", + "NodeParser", +] diff --git a/llama_index/node_parser/extractors/__init__.py b/llama_index/node_parser/extractors/__init__.py new file mode 100644 index 0000000000000..06924240d4bdd --- /dev/null +++ b/llama_index/node_parser/extractors/__init__.py @@ -0,0 +1,17 @@ +from llama_index.node_parser.extractors.metadata_extractors import ( + MetadataExtractor, + SummaryExtractor, + QuestionsAnsweredExtractor, + TitleExtractor, + KeywordExtractor, + MetadataFeatureExtractor, +) + +__all__ = [ + "MetadataExtractor", + "SummaryExtractor", + "QuestionsAnsweredExtractor", + "TitleExtractor", + "KeywordExtractor", + "MetadataFeatureExtractor", +] diff --git a/llama_index/node_parser/extractors/metadata_extractors.py b/llama_index/node_parser/extractors/metadata_extractors.py new file mode 100644 index 0000000000000..ed6d5443386a4 --- /dev/null +++ 
b/llama_index/node_parser/extractors/metadata_extractors.py @@ -0,0 +1,334 @@ +""" +Metadata extractors for nodes. Applied as a post-processing step after node parsing. +Currently, only `TextNode` is supported. + +Supported metadata: +Node-level: + - `SummaryExtractor`: Summary of each node, and of the previous and next nodes + - `QuestionsAnsweredExtractor`: Questions that the node can answer + - `KeywordExtractor`: Keywords that uniquely identify the node +Document-level: + - `TitleExtractor`: Document title, possibly inferred across multiple nodes + +Unimplemented (contributions welcome): +Subsection: + - Position of node in subsection hierarchy (and associated subtitles) + - Hierarchically organized summary + +The prompts used to generate the metadata are specifically aimed at helping to +disambiguate the document or subsection from other similar documents or subsections. +(similar in spirit to contrastive learning) +""" + +from abc import abstractmethod +import json +from typing import List, Optional, Sequence, cast, Dict +from functools import reduce + +from llama_index.llm_predictor.base import BaseLLMPredictor, LLMPredictor +from llama_index.node_parser.interface import BaseExtractor +from llama_index.prompts.base import Prompt +from llama_index.schema import BaseNode, TextNode + + +class MetadataFeatureExtractor(BaseExtractor): + is_text_node_only = True + + @abstractmethod + def extract(self, nodes: Sequence[BaseNode]) -> List[Dict]: + """Extracts metadata for a sequence of nodes, returning a list of + metadata dictionaries corresponding to each node. + + Args: + nodes (Sequence[BaseNode]): nodes to extract metadata from + + """ + + +DEFAULT_NODE_TEXT_TEMPLATE = """\ +[Excerpt from document]\n{metadata_str}\n\ +Excerpt:\n-----\n{content}\n-----\n""" + + +class MetadataExtractor(BaseExtractor): + """Metadata extractor.""" + + def __init__( + self, + extractors: Sequence[MetadataFeatureExtractor], + node_text_template: str = DEFAULT_NODE_TEXT_TEMPLATE, + disable_template_rewrite: bool = False, + ) -> None: + self._extractors = extractors + self._node_text_template = node_text_template + self._disable_template_rewrite = disable_template_rewrite + + def extract(self, nodes: Sequence[BaseNode]) -> List[Dict]: + """Extract metadata for a sequence of nodes. + + Args: + nodes (Sequence[BaseNode]): nodes to extract metadata from + + """ + metadata_list: List[Dict] = [{} for _ in nodes] + for extractor in self._extractors: + cur_metadata_list = extractor.extract(nodes) + for i, metadata in enumerate(metadata_list): + metadata.update(cur_metadata_list[i]) + + return metadata_list + + def process_nodes( + self, + nodes: List[BaseNode], + excluded_embed_metadata_keys: Optional[List[str]] = None, + excluded_llm_metadata_keys: Optional[List[str]] = None, + ) -> List[BaseNode]: + """Post process nodes parsed from documents. + + Allows extractors to be chained. 
+ + Args: + nodes (List[BaseNode]): nodes to post-process + excluded_embed_metadata_keys (Optional[List[str]]): + keys to exclude from embed metadata + excluded_llm_metadata_keys (Optional[List[str]]): + keys to exclude from llm metadata + """ + for extractor in self._extractors: + cur_metadata_list = extractor.extract(nodes) + for idx, node in enumerate(nodes): + node.metadata.update(cur_metadata_list[idx]) + + for idx, node in enumerate(nodes): + if excluded_embed_metadata_keys is not None: + node.excluded_embed_metadata_keys.extend(excluded_embed_metadata_keys) + if excluded_llm_metadata_keys is not None: + node.excluded_llm_metadata_keys.extend(excluded_llm_metadata_keys) + if not self._disable_template_rewrite: + if isinstance(node, TextNode): + cast(TextNode, node).text_template = self._node_text_template + return nodes + + +DEFAULT_TITLE_NODE_TEMPLATE = """\ +Context: {context_str}. Give a title that summarizes all of \ +the unique entities, titles or themes found in the context. Title: """ + + +DEFAULT_TITLE_COMBINE_TEMPLATE = """\ +{context_str}. Based on the above candidate titles and content, \ +what is the comprehensive title for this document? Title: """ + + +class TitleExtractor(MetadataFeatureExtractor): + """Title extractor. Useful for long documents. Extracts `document_title` + metadata field. + Args: + llm_predictor (Optional[BaseLLMPredictor]): LLM predictor + nodes (int): number of nodes from front to use for title extraction + node_template (str): template for node-level title clues extraction + combine_template (str): template for combining node-level clues into + a document-level title + """ + + is_text_node_only = False # can work for mixture of text and non-text nodes + + def __init__( + self, + llm_predictor: Optional[BaseLLMPredictor] = None, + nodes: int = 5, + node_template: str = DEFAULT_TITLE_NODE_TEMPLATE, + combine_template: str = DEFAULT_TITLE_COMBINE_TEMPLATE, + ) -> None: + """Init params.""" + if nodes < 1: + raise ValueError("num_nodes must be >= 1") + self._nodes = nodes + self._node_template = node_template + self._combine_template = combine_template + self._llm_predictor = llm_predictor or LLMPredictor() + + def extract(self, nodes: Sequence[BaseNode]) -> List[Dict]: + nodes_to_extract_title: List[BaseNode] = [] + for node in nodes: + if len(nodes_to_extract_title) >= self._nodes: + break + if self.is_text_node_only and not isinstance(node, TextNode): + continue + nodes_to_extract_title.append(node) + + if len(nodes_to_extract_title) == 0: + # Could not extract title + return [] + + title_candidates = [ + self._llm_predictor.predict( + Prompt(template=self._node_template), + context_str=cast(TextNode, node).text, + ) + for node in nodes_to_extract_title + ] + if len(nodes_to_extract_title) > 1: + titles = reduce( + lambda x, y: x + "," + y, title_candidates[1:], title_candidates[0] + ) + + title = self._llm_predictor.predict( + Prompt(template=self._combine_template), + context_str=titles, + ) + else: + title = title_candidates[ + 0 + ] # if single node, just use the title from that node + + metadata_list = [{"document_title": title.strip(' \t\n\r"')} for node in nodes] + return metadata_list + + +class KeywordExtractor(MetadataFeatureExtractor): + """Keyword extractor. Node-level extractor. Extracts + `excerpt_keywords` metadata field. 
+ Args: + llm_predictor (Optional[BaseLLMPredictor]): LLM predictor + keywords (int): number of keywords to extract + """ + + def __init__( + self, + llm_predictor: Optional[BaseLLMPredictor] = None, + keywords: int = 5, + ) -> None: + """Init params.""" + self._llm_predictor = llm_predictor or LLMPredictor() + if keywords < 1: + raise ValueError("num_keywords must be >= 1") + self._keywords = keywords + + def extract(self, nodes: Sequence[BaseNode]) -> List[Dict]: + metadata_list: List[Dict] = [] + for node in nodes: + if self.is_text_node_only and not isinstance(node, TextNode): + metadata_list.append({}) + continue + + # TODO: figure out a good way to allow users to customize keyword template + keywords = self._llm_predictor.predict( + Prompt( + template=f"""\ +{{context_str}}. Give {self._keywords} unique keywords for this \ +document. Format as comma separated. Keywords: """ + ), + context_str=cast(TextNode, node).text, + ) + # node.metadata["excerpt_keywords"] = keywords + metadata_list.append({"excerpt_keywords": keywords}) + return metadata_list + + +class QuestionsAnsweredExtractor(MetadataFeatureExtractor): + """ + Questions answered extractor. Node-level extractor. + Extracts `questions_this_excerpt_can_answer` metadata field. + Args: + llm_predictor (Optional[BaseLLMPredictor]): LLM predictor + questions (int): number of questions to extract + prompt_template (str): template for question extraction, + embedding_only (bool): whether to use embedding only + """ + + def __init__( + self, + llm_predictor: Optional[BaseLLMPredictor] = None, + questions: int = 5, + prompt_template: Optional[str] = None, + embedding_only: bool = True, + ) -> None: + """Init params.""" + if questions < 1: + raise ValueError("questions must be >= 1") + self._llm_predictor = llm_predictor or LLMPredictor() + self._questions = questions + self._prompt_template = prompt_template + self._embedding_only = embedding_only + + def extract(self, nodes: Sequence[BaseNode]) -> List[Dict]: + metadata_list: List[Dict] = [] + for node in nodes: + if self.is_text_node_only and not isinstance(node, TextNode): + metadata_list.append({}) + continue + # Extract the title from the first node + # TODO: figure out a good way to allow users to customize template + questions = self._llm_predictor.predict( + Prompt( + template=self._prompt_template + or f"""\ +{{context_str}}. Given the contextual information, \ +generate {self._questions} questions this document can provide \ +specific answers to which are unlikely to be found elsewhere: \ +""" + ), + context_str=f"""\ +metadata: {json.dumps(node.metadata)} \ +content: {cast(TextNode, node).text}""", + ) + if self._embedding_only: + node.excluded_llm_metadata_keys = ["questions_this_excerpt_can_answer"] + metadata_list.append({"questions_this_excerpt_can_answer": questions}) + return metadata_list + + +DEFAULT_SUMMARY_EXTRACT_TEMPLATE = """\ +Here is the content of the section: {context_str}. \ +Summarize the key topics and entities of the section. Summary: """ + + +class SummaryExtractor(MetadataFeatureExtractor): + """ + Summary extractor. Node-level extractor with adjacent sharing. 
+ Extracts `section_summary`, `prev_section_summary`, `next_section_summary` + metadata fields + Args: + llm_predictor (Optional[BaseLLMPredictor]): LLM predictor + summaries (List[str]): list of summaries to extract: 'self', 'prev', 'next' + prompt_template (str): template for summary extraction""" + + def __init__( + self, + llm_predictor: Optional[BaseLLMPredictor] = None, + summaries: List[str] = ["self"], + prompt_template: str = DEFAULT_SUMMARY_EXTRACT_TEMPLATE, + ): + self._llm_predictor = llm_predictor or LLMPredictor() + # validation + if not all([s in ["self", "prev", "next"] for s in summaries]): + raise ValueError("summaries must be one of ['self', 'prev', 'next']") + self._self_summary = "self" in summaries + self._prev_summary = "prev" in summaries + self._next_summary = "next" in summaries + self._prompt_template = prompt_template + + def extract(self, nodes: Sequence[BaseNode]) -> List[Dict]: + if not all([isinstance(node, TextNode) for node in nodes]): + raise ValueError("Only `TextNode` is allowed for `Summary` extractor") + node_summaries = [ + self._llm_predictor.predict( + Prompt(template=self._prompt_template), + context_str=cast(TextNode, node).text, + ) + for node in nodes + ] + + # Extract node-level summary metadata + metadata_list: List[Dict] = [{} for _ in nodes] + for i, metadata in enumerate(metadata_list): + if i > 0 and self._prev_summary: + metadata["prev_section_summary"] = node_summaries[i - 1] + if i < len(nodes) - 1 and self._next_summary: + metadata["next_section_summary"] = node_summaries[i + 1] + if self._self_summary: + metadata["section_summary"] = node_summaries[i] + + return metadata_list diff --git a/llama_index/node_parser/interface.py b/llama_index/node_parser/interface.py index 6b3af2f04b9b0..409a2726c907d 100644 --- a/llama_index/node_parser/interface.py +++ b/llama_index/node_parser/interface.py @@ -1,5 +1,5 @@ """Node parser interface.""" -from typing import List, Sequence +from typing import List, Sequence, Dict from abc import ABC, abstractmethod @@ -22,3 +22,18 @@ def get_nodes_from_documents( documents (Sequence[Document]): documents to parse """ + + +class BaseExtractor(ABC): + """Base interface for feature extractor.""" + + @abstractmethod + def extract( + self, + nodes: List[BaseNode], + ) -> List[Dict]: + """Post process nodes parsed from documents. 
+ + Args: + nodes (List[BaseNode]): nodes to extract from + """ diff --git a/llama_index/node_parser/simple.py b/llama_index/node_parser/simple.py index 72f0191968e62..c7ca69d9566ff 100644 --- a/llama_index/node_parser/simple.py +++ b/llama_index/node_parser/simple.py @@ -6,6 +6,7 @@ from llama_index.constants import DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE from llama_index.langchain_helpers.text_splitter import TextSplitter, TokenTextSplitter from llama_index.node_parser.interface import NodeParser +from llama_index.node_parser.extractors.metadata_extractors import MetadataExtractor from llama_index.node_parser.node_utils import get_nodes_from_document from llama_index.utils import get_tqdm_iterable from llama_index.schema import Document @@ -30,6 +31,7 @@ def __init__( include_metadata: bool = True, include_prev_next_rel: bool = True, callback_manager: Optional[CallbackManager] = None, + metadata_extractor: Optional[MetadataExtractor] = None, ) -> None: """Init params.""" self.callback_manager = callback_manager or CallbackManager([]) @@ -38,6 +40,7 @@ def __init__( ) self._include_metadata = include_metadata self._include_prev_next_rel = include_prev_next_rel + self._metadata_extractor = metadata_extractor @classmethod def from_defaults( @@ -47,6 +50,7 @@ def from_defaults( include_metadata: bool = True, include_prev_next_rel: bool = True, callback_manager: Optional[CallbackManager] = None, + metadata_extractor: Optional[MetadataExtractor] = None, ) -> "SimpleNodeParser": callback_manager = callback_manager or CallbackManager([]) chunk_size = chunk_size or DEFAULT_CHUNK_SIZE @@ -64,6 +68,7 @@ def from_defaults( include_metadata=include_metadata, include_prev_next_rel=include_prev_next_rel, callback_manager=callback_manager, + metadata_extractor=metadata_extractor, ) def get_nodes_from_documents( @@ -95,6 +100,10 @@ def get_nodes_from_documents( include_prev_next_rel=self._include_prev_next_rel, ) all_nodes.extend(nodes) + + if self._metadata_extractor is not None: + self._metadata_extractor.process_nodes(all_nodes) + self.callback_manager.on_event_end( CBEventType.NODE_PARSING, payload={EventPayload.NODES: all_nodes}, diff --git a/tests/node_parser/metadata_extractor.py b/tests/node_parser/metadata_extractor.py new file mode 100644 index 0000000000000..cc6ec5b8266a5 --- /dev/null +++ b/tests/node_parser/metadata_extractor.py @@ -0,0 +1,37 @@ +from llama_index.node_parser import SimpleNodeParser +from llama_index.node_parser.extractors import ( + MetadataExtractor, + SummaryExtractor, + QuestionsAnsweredExtractor, + TitleExtractor, + KeywordExtractor, +) +from llama_index.indices.service_context import ServiceContext +from llama_index import Document + + +def test_metadata_extractor(mock_service_context: ServiceContext) -> None: + metadata_extractor = MetadataExtractor( + extractors=[ + TitleExtractor(nodes=5), + QuestionsAnsweredExtractor(questions=3), + SummaryExtractor(summaries=["prev", "self"]), + KeywordExtractor(keywords=10), + ], + ) + + node_parser = SimpleNodeParser( + metadata_extractor=metadata_extractor, + ) + + document = Document( + text="sample text", + metadata={"filename": "README.md", "category": "codebase"}, + ) + + nodes = node_parser.get_nodes_from_documents([document]) + + assert "document_title" in nodes[0].metadata + assert "questions_this_excerpt_can_answer" in nodes[0].metadata + assert "section_summary" in nodes[0].metadata + assert "excerpt_keywords" in nodes[0].metadata
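For quick reference, here is a minimal end-to-end sketch of the flow this patch introduces, composed from the APIs added above (`MetadataExtractor`, `TitleExtractor`, `QuestionsAnsweredExtractor`, and the new `metadata_extractor` argument on `SimpleNodeParser`). It assumes an OpenAI API key is configured in the environment; the input path `data/sample.pdf` is a hypothetical placeholder, not a file shipped with this change.

```python
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.node_parser.extractors import (
    MetadataExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
)

# Chain feature extractors; each one contributes a metadata field per node.
metadata_extractor = MetadataExtractor(
    extractors=[
        TitleExtractor(nodes=5),  # infers `document_title` from the first 5 nodes
        QuestionsAnsweredExtractor(questions=3),  # adds `questions_this_excerpt_can_answer`
    ],
)

# The node parser runs the extractors as a post-processing step after chunking.
node_parser = SimpleNodeParser.from_defaults(metadata_extractor=metadata_extractor)

# "data/sample.pdf" is a placeholder path used for illustration only.
docs = SimpleDirectoryReader(input_files=["data/sample.pdf"]).load_data()
nodes = node_parser.get_nodes_from_documents(docs)
print(nodes[0].metadata)  # extracted fields appear alongside the file metadata
```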