In [None]:
# Optional environment setup. Every command below is commented out, so this
# cell does nothing when run; uncomment a line to execute it once.
#!pip install PyMuPDF
#%pip install llama-index-extractors-entity
#!pip uninstall nltk
#!pip install nltk
#!python.exe -m pip install --upgrade pip


* Creating Nodes and Documents
* Retrievers
* Implementing Metadata Extraction

In [17]:
from llama_index.core.schema import Document
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.vector_stores import MetadataFilter, MetadataFilters, FilterOperator

# Sample corpus: each record is a caption plus the metadata attached to it.
photo_records = [
    (
        "A photo of the Eiffel Tower at sunset",
        {
            "location": "Paris, France",
            "date_taken": "2021-07-15",
            "camera": "Nikon D3500",
        },
    ),
    (
        "A photo of the Great Wall of China",
        {
            "location": "Beijing, China",
            "date_taken": "2021-10-03",
            "camera": "Canon EOS Rebel T7",
        },
    ),
]
photos = [Document(text=caption, metadata=meta) for caption, meta in photo_records]

# Build a vector index directly from the in-memory documents.
index = VectorStoreIndex.from_documents(photos)

# Equality filter on the "location" metadata key. It is constructed here but
# NOT passed to the retriever below; to apply it, create the retriever with
# index.as_retriever(filters=filters) instead.
paris_only = MetadataFilter(
    key="location",
    operator=FilterOperator.EQ,
    value="Paris, France",
)
filters = MetadataFilters(filters=[paris_only])

# Unfiltered retrieval: both documents are candidates for the query.
retriever = index.as_retriever()
results = retriever.retrieve("show photos from china ")
print(results)

[NodeWithScore(node=TextNode(id_='20871271-acf4-4896-98f1-777dcf469f6f', embedding=None, metadata={'location': 'Beijing, China', 'date_taken': '2021-10-03', 'camera': 'Canon EOS Rebel T7'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='6d259d44-46a9-4104-8efc-8ace0efb082f', node_type='4', metadata={'location': 'Beijing, China', 'date_taken': '2021-10-03', 'camera': 'Canon EOS Rebel T7'}, hash='07653394bd612b9ee1610062dc42f41976b8965717f6b137c79141e6cf2decd9')}, metadata_template='{key}: {value}', metadata_separator='\n', text='A photo of the Great Wall of China', mimetype='text/plain', start_char_idx=0, end_char_idx=34, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), score=0.8399970647123353), NodeWithScore(node=TextNode(id_='bdd97940-053f-4a35-9c1a-9753f98a89e0', embedding=None, metadata={'location': 'Paris, France', 'date_taken': '2021-07-15', 'camera': 'Nikon D3500'}, exc

In [6]:
# Repeat the unfiltered retrieval against the photo index, this time asking
# for the Paris photo.
paris_query = "show photos from paris"
retriever = index.as_retriever()
results = retriever.retrieve(paris_query)
print(results)

[NodeWithScore(node=TextNode(id_='8761fdd8-1533-4236-a9af-7bf17b26b3fc', embedding=None, metadata={'location': 'Paris, France', 'date_taken': '2021-07-15', 'camera': 'Nikon D3500'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='3156a57e-d785-4c1e-b12d-366ac94b4c21', node_type='4', metadata={'location': 'Paris, France', 'date_taken': '2021-07-15', 'camera': 'Nikon D3500'}, hash='42afb3506f0764729a021f4cd728e45b530e634e82744b9ae490ab9869e25afd')}, metadata_template='{key}: {value}', metadata_separator='\n', text='A photo of the Eiffel Tower at sunset', mimetype='text/plain', start_char_idx=0, end_char_idx=37, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), score=0.834497249103706), NodeWithScore(node=TextNode(id_='e91f0647-e383-4ef4-b2fb-75047edee8ab', embedding=None, metadata={'location': 'Beijing, China', 'date_taken': '2021-10-03', 'camera': 'Canon EOS Rebel T7'}, excluded_

* Transformation


In [None]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,    
)
from llama_index.extractors.entity import EntityExtractor
from llama_index.readers.file.docs import PDFReader
from pathlib import Path

# Read the source PDF into LlamaIndex documents.
loader = PDFReader()
documents = loader.load_data(file=Path("./data/Cristiano_Ronaldo.pdf"))

# Transformations are handed to the index builder as an ordered list: the
# splitter comes first, followed by the metadata extractors.
transformations = [
    SentenceSplitter(chunk_size=512, chunk_overlap=50),
    EntityExtractor(prediction_threshold=0.5),
    SummaryExtractor(summaries=["prev", "self"]),
    KeywordExtractor(keywords=10),
    TitleExtractor(),
    QuestionsAnsweredExtractor(questions=3),
]

cindex = VectorStoreIndex.from_documents(documents, transformations=transformations)

# Persist the freshly built index. A later cell restores an index from this
# same directory, so without this step a clean Restart & Run All would depend
# on files this notebook never wrote.
cindex.storage_context.persist(persist_dir="./cr7_indexed_data")


In [19]:
from llama_index.core import load_index_from_storage, StorageContext

# Point a storage context at the on-disk index directory, then restore the
# index stored there.
storage_context = StorageContext.from_defaults(
    persist_dir="./cr7_indexed_data",
)
loaded_index = load_index_from_storage(storage_context=storage_context)


In [30]:
# Query the index restored from ./cr7_indexed_data (`loaded_index`).
# `index` is the two-document photo index built at the top of this notebook,
# so using it here would answer the football question from the wrong data on
# a fresh Restart & Run All.
query_engine = loaded_index.as_query_engine(streaming=True, similarity_top_k=5)
response_stream = query_engine.query(
    "whats his total goal in 2008-09 manchester united",
)
# Printing the response object shows the complete answer text in one go;
# use response_stream.print_response_stream() to print it as a stream.
print(response_stream)

Cristiano Ronaldo's total goal in the 2008-09 season for Manchester United was 26.


In [31]:
response_stream.print_response_stream()

Cristiano Ronaldo's total goal in the 2008-09 season for Manchester United was 26.
