<a href="https://colab.research.google.com/github/sugarforever/LlamaIndex-Tutorials/blob/main/05_Documents_Nodes/05_Documents_Nodes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Documents & Nodes

In [1]:
!pip install -q -U llama-index

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.1/868.1 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m66.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m77.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/143.4 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h

## Construct Document

In [4]:
from llama_index import Document

# Wrap every raw string in its own Document.
text_list = ["hello", "world"]
documents = [Document(text=raw_text) for raw_text in text_list]

In [5]:
# Display the list of Documents built in the previous cell.
documents

[Document(id_='7335bb8a-56dc-4074-8e79-0406237cadb1', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='7debe8d278fe6c55c45f979269ab268102d75f8d48644d244cd0050dae0846ac', text='hello', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='30041a91-a8e4-448f-971a-8203c0c77e06', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='12d52a924ac6bc76ec4101c5d1f55bb5b5365d5f4c524a21d6175a4b049a1962', text='world', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')]

## Customize Document

### Metadata customization

1. Customize in document construction

In [19]:
from llama_index import Document

# Metadata is supplied up front as a plain dict of key/value pairs.
document = Document(
    text='Hello World',
    metadata=dict(filename='hello_world.pdf', category='science'),
)
document

Document(id_='e38ed14a-b41b-439e-858a-10ac815b95bb', embedding=None, metadata={'filename': 'hello_world.pdf', 'category': 'science'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='5b1ac0318e8c80ae4c7d187cd698067606fd001256065952e78d7f6229d62e26', text='Hello World', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

2. Customize after the document is constructed

In [20]:
# Assigning a new dict replaces the metadata wholesale: keys that are not
# repeated here (such as 'category') are dropped.
document.metadata = dict(filename='hello_world_v2.pdf')
document

Document(id_='e38ed14a-b41b-439e-858a-10ac815b95bb', embedding=None, metadata={'filename': 'hello_world_v2.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='5b1ac0318e8c80ae4c7d187cd698067606fd001256065952e78d7f6229d62e26', text='Hello World', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

3. Customize in `SimpleDirectoryReader` usage

In [7]:
!rm -rf data && mkdir data
!echo 'hello llama!' > data/hello_llama.txt

from llama_index import SimpleDirectoryReader
filenama_hook = lambda filename: {'file_name': filename}
documents = SimpleDirectoryReader('./data', file_metadata=filenama_hook).load_data()
documents

[Document(id_='15db2314-2da5-4221-bd71-54c2e66b4908', embedding=None, metadata={'file_name': 'data/hello_llama.txt'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='9787e1e664b99bd6bd78fbcc816ab052d8f0808e5fcda36a9d89d360ede017b6', text='hello llama!\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')]

### Document ID Customization

In [18]:
from llama_index import Document
document = Document(text='Hello World')
# Overwrite the auto-generated ID; the repr below shows the new value under `id_`.
document.doc_id = "xxxx-yyyy"
document

Document(id_='xxxx-yyyy', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='a339d80c761418702b627851cbd963eb722f12dfeb360d441f9123c2d3d4fcf7', text='Hello World', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

## Construct Node

Nodes are first-class citizens in LlamaIndex. Developers can either define Nodes and all their attributes directly, or parse source Documents into Nodes through the `NodeParser` classes.

1. Construct directly

In [9]:
from llama_index.schema import TextNode

# A node can be created by hand by supplying its text and an explicit ID.
node = TextNode(
    id_="1234-5678",
    text="hello world",
)
node

TextNode(id_='1234-5678', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='351b5b79b4cf0d96b24050f568933500d93160ec03e5e996aa31c29cf2d2f654', text='hello world', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

2. Construct by NodeParser

  We will reuse the `documents` variable loaded by `SimpleDirectoryReader` above.

In [10]:
from llama_index.node_parser import SimpleNodeParser

# Build a parser with the library's default settings.
parser = SimpleNodeParser.from_defaults()
# Turn the loaded documents into nodes. As the output below shows, each node
# carries the source document's metadata and a SOURCE relationship whose
# node_id is the ID of the document it came from.
nodes = parser.get_nodes_from_documents(documents)
nodes

[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[TextNode(id_='45d7982c-c9d1-4384-b0e0-f32defff4722', embedding=None, metadata={'file_name': 'data/hello_llama.txt'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='15db2314-2da5-4221-bd71-54c2e66b4908', node_type=None, metadata={'file_name': 'data/hello_llama.txt'}, hash='9787e1e664b99bd6bd78fbcc816ab052d8f0808e5fcda36a9d89d360ede017b6')}, hash='9d29d9fb54e3e9054a39b57a34665527f97c3bf22b6f3cd2b7fdc1d72d440527', text='hello llama!', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')]

## Customize Node


1. Define node relationships

In [21]:
from llama_index.schema import NodeRelationship, RelatedNodeInfo, TextNode

hello_node = TextNode(id_="1111-1111", text="Hello")
world_node = TextNode(id_="2222-2222", text="World")

# Link the pair in both directions: "Hello" points forward to "World" ...
hello_node.relationships[NodeRelationship.NEXT] = RelatedNodeInfo(
    node_id=world_node.node_id,
    metadata={"created_by": "VerySmallWoods"},  # extra info stored on the link itself
)
# ... and "World" points back to "Hello".
world_node.relationships[NodeRelationship.PREVIOUS] = RelatedNodeInfo(
    node_id=hello_node.node_id,
)

nodes = [hello_node, world_node]
nodes

[TextNode(id_='1111-1111', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='2222-2222', node_type=None, metadata={'created_by': 'VerySmallWoods'}, hash=None)}, hash='378b8aef7d6589c0b81c83cb8eaa637088d5ae46b9c27fd5ce9d4c56ed676b57', text='Hello', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 TextNode(id_='2222-2222', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='1111-1111', node_type=None, metadata={}, hash=None)}, hash='30cc2bcfbe4881f1655525a2a613acae3ab9cc0800b5bf21cac89fce84e8197e', text='World', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')]

2. Customize node ID

In [16]:
from llama_index.schema import TextNode, NodeRelationship, RelatedNodeInfo

# Create the node with one ID, then reassign `id_` to show it can be changed
# after construction.
hello_node = TextNode(text="Hello", id_="1111-1111")
hello_node.id_ = '3333-3333'
hello_node

TextNode(id_='3333-3333', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='378b8aef7d6589c0b81c83cb8eaa637088d5ae46b9c27fd5ce9d4c56ed676b57', text='Hello', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')