In [None]:
%%capture
!pip install llama-index==0.10.37 llama-index-embeddings-openai==0.1.9 qdrant-client==1.9.1 llama-index-vector-stores-qdrant==0.2.8 llama-index-llms-openai==0.1.19

In [None]:
import os
import sys
from getpass import getpass
import nest_asyncio

from IPython.display import Markdown, display

from dotenv import load_dotenv

# Patch asyncio so event loops can be nested inside the notebook's already-running loop.
nest_asyncio.apply()

# Load environment variables from a .env file; the API keys are read from os.environ below.
load_dotenv("")

# Make the shared helper modules importable; this must run before `utils` is imported.
sys.path.append('../helpers')

from utils import setup_llm, setup_embed_model, setup_vector_store

In [None]:
# Use the key from the environment when it is set and non-empty; otherwise prompt for it.
# `.get` matters here: indexing `os.environ[...]` raises KeyError for an unset variable,
# which would make the getpass fallback unreachable.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or getpass("Enter your OpenAI API key: ")

In [None]:
# Location handed to setup_vector_store below. ":memory:" is the qdrant-client value for an
# in-process, non-persistent instance — presumably the helper forwards it to the client;
# verify against ../helpers/utils.py.
QDRANT_URL = ":memory:"

In [None]:
# Use the key from the environment when it is set and non-empty; otherwise prompt for it.
# `.get` matters here: indexing `os.environ[...]` raises KeyError for an unset variable,
# which would make the getpass fallback unreachable.
QDRANT_API_KEY = os.environ.get('QDRANT_API_KEY') or getpass("Enter your Qdrant API Key:")

In [None]:
from llama_index.core.settings import Settings
from utils import setup_llm, setup_embed_model

# Configure the answer-generation LLM through the project helper
# (presumably it installs the model on `Settings` — confirm in utils).
# The system prompt instructs the model to answer only from the provided context.
setup_llm(
    provider="openai",
    model="gpt-4o",
    temperature=0.75,
    api_key=OPENAI_API_KEY,
    system_prompt="""Use ONLY the provided context and generate a complete, coherent answer to the user's query. 
    Your response must be grounded in the provided context and relevant to the essence of the user's query.
    """,
)

# Configure the embedding model through the project helper; later cells read it
# back as `Settings.embed_model`.
setup_embed_model(
    provider="openai",
    api_key=OPENAI_API_KEY,
    model="text-embedding-3-small",
)

In [None]:
import random
from utils import get_documents_from_docstore, group_documents_by_author, sample_documents

# Load the source documents from the local docstore directory.
documents = get_documents_from_docstore("../data/words-of-the-senpais")

# Seed Python's RNG before sampling so the sample is repeatable
# (presumably sample_documents draws via the `random` module — confirm in utils).
random.seed(42)

# Group the documents by author; the grouped result is what gets sampled.
documents_by_author = group_documents_by_author(documents)

# num_samples=25 — presumably 25 documents per author; verify in utils.
senpai_documents = sample_documents(documents_by_author, num_samples=25)

# 🗃️ Metadata for Nodes

Metadata provides additional context or information about the nodes.

During retrieval we can leverage this additional context and information for more precise and relevant retrieval. However, the effectiveness of this approach depends on the quality and relevance of the metadata tags used. The simplest way to add metadata is to do so manually. 

Let's add some metadata describing what each of our Senpais is known for.

In [None]:
# Hand-written blurb per author, keyed by the exact author name stored in each
# document's metadata. Every blurb is a complete sentence ending in a period,
# because the text is attached to the documents as metadata.
known_for = {
    "Naval Ravikant": "Known for his insights on how to build wealth and achieve happiness through developing specific knowledge, embracing accountability, playing long-term games, and understanding the power of compound interest in all areas of life.",
    "Balaji Srinivasan": "Has insights on how to think independently, identify opportunities, and build a better future through the strategic application of technology and clear reasoning.",
    "Paul Graham": "Provides advice on the hacker mindset, arguing that hackers are really makers and creators - akin to painters - who can leverage their unique way of thinking to push boundaries, challenge the status quo, and shape the future through technology and entrepreneurship.",
    "Nassim Nicholas Taleb": "Argues for 'Skin in the Game', that is having a personal stake in the outcome is necessary for fairness as it aligns incentives and exposes individuals to both the potential rewards and risks of their decisions.",
    "Seneca": "Offers timeless advice on how to cultivate wisdom, build mental resilience, and live a life of purpose and contentment by focusing on what is essential, mastering one's emotions, and aligning oneself with nature.",
    # Previously ended with a dangling ", " (an unfinished sentence); now terminated properly.
    "Bruce Lee": "Offers profound wisdom on self-improvement, personal growth, and martial arts philosophy, emphasizing the importance of adaptability, self-expression, and embracing one's own unique path in life.",
}

In [None]:
# Attach the matching blurb to every sampled document under the 'known_for' key.
# Because .get() is used, an author missing from `known_for` is stored as None.
for document in senpai_documents:
    document.metadata.update(known_for=known_for.get(document.metadata['author']))

In [None]:
# Spot-check one document's metadata (index 42 appears to be an arbitrary pick)
# to see the attached 'known_for' entry.
senpai_documents[42].metadata


# Automatically Extract Metadata

Metadata extraction in LlamaIndex is a process that helps to disambiguate similar-looking passages of text, especially in long documents. 

This is achieved by using LLMs to extract contextual information relevant to the document. This information aids the retrieval and language models in distinguishing between similar passages.

In LlamaIndex, metadata extraction is performed using various feature extractors within the [`MetadataExtractor`](https://github.com/run-llama/llama_index/tree/954398e1957027a364d0d332fee61733ad322f8b/llama-index-core/llama_index/core/extractors) class. 

These extractors include:

 - `SummaryExtractor`: This extractor automatically generates a summary over a set of Nodes.

 - `QuestionsAnsweredExtractor`: This extractor identifies a set of questions that each Node can answer.

 - `TitleExtractor`: This extractor identifies a title over the context of each Node.

 - `KeywordExtractor`: This extractor identifies keywords that uniquely identify each Node.



In [None]:
from llama_index.core.extractors import  SummaryExtractor, QuestionsAnsweredExtractor, TitleExtractor, KeywordExtractor

In [None]:
# Show the prompt template of a SummaryExtractor built with default arguments.
print(SummaryExtractor().prompt_template)

In [None]:
# Show the prompt template of a QuestionsAnsweredExtractor built with default arguments.
print(QuestionsAnsweredExtractor().prompt_template)

In [None]:
# Show the TitleExtractor prompt, read here from its `node_template` attribute.
print(TitleExtractor().node_template)

#### KeywordExtractor has its prompt template buried in an LLM call, rather than exposed as an attribute.

Here's what it is in [the source code](https://github.com/run-llama/llama_index/blob/954398e1957027a364d0d332fee61733ad322f8b/llama-index-core/llama_index/core/extractors/metadata_extractors.py#L198):

```python
f"""\
{{context_str}}. Give {self.keywords} unique keywords for this \
document. Format as comma separated. Keywords: """
```

# Automated Metadata Extraction

Let's perform some automated metadata extraction for better retrieval results. 

We'll employ two extractors: 

 - `QuestionsAnsweredExtractor` to generate questions that a piece of text can answer
 
 - `SummaryExtractor` to extract summaries, not only of the current text, but also of adjacent texts. 
 
This strategy leads to higher-quality answers from the retrieved results.

To do this, we define metadata extractors:
 
 - `qa_extractor`
 
 - `summary_extractor`

 Note the use of `MetadataMode.EMBED`: this specifies how metadata is handled when generating embeddings for a document or node. When you call the `get_content()` function on a document and specify `MetadataMode.EMBED`, it returns the content of the document together with the metadata that is visible to the embedding model.

 We'll also use `gpt-4o` to generate the metadata.

#### 👨🏽‍💻 I encourage you to try out the other metadata extractors and see what your results look like. 

For example, you can try the `KeywordExtractor` or  `TitleExtractor` like so:

```python

keyword_extractor = KeywordExtractor(keywords=10, llm=llm)

title_extractor = TitleExtractor(nodes=5, llm=llm)

```

In [None]:
from llama_index.core.schema import MetadataMode
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.extractors import SummaryExtractor, QuestionsAnsweredExtractor
from llama_index.llms.openai import OpenAI

# LLM used only for metadata extraction. The key is passed explicitly (as the
# setup_llm call does) so that a key typed at the getpass prompt — which is never
# written to os.environ — is actually used by this client as well.
qa_llm = OpenAI(model="gpt-4o", api_key=OPENAI_API_KEY)

# Split documents into 256-token chunks with a 16-token overlap between neighbours.
text_splitter = TokenTextSplitter(chunk_size=256, chunk_overlap=16)

# Generates 2 questions per node that the node's text can answer.
# MetadataMode.EMBED: the node content passed to the extractor includes the metadata
# visible to the embedding model.
# NOTE(review): `embed_model` does not appear among QuestionsAnsweredExtractor's
# documented parameters — confirm it has an effect, or drop it.
qa_extractor = QuestionsAnsweredExtractor(
    questions=2,
    llm=qa_llm,
    metadata_mode=MetadataMode.EMBED,
    embed_model=Settings.embed_model,
    )

# Summarises each node together with its previous and next neighbours.
summary_extractor = SummaryExtractor(
    summaries=["prev", "self", "next"],
    llm=qa_llm,
    )


### 👷🏽‍♂️ 🗂️ Ingest to Qdrant and Build the Index 

In the last few videos we did the node splitting first and then ingested to Qdrant. That was to make the pattern clear to you and give you a sense of how splitting works.

But we can actually do all of this directly using the ingestion pipeline.

Note, I will leave it up to you to experiment with using one or both of the extractors and fiddling with the hyperparameters.

The parsing here took ~30 minutes.


In [None]:
from llama_index.core import StorageContext
from llama_index.core.settings import Settings

from utils import create_index, create_query_engine, ingest, setup_vector_store

# Collection name passed to the vector-store helper for the enriched nodes.
COLLECTION_NAME = "words-of-the-senpai-qa-plus-summaries-nodes"

qa_summaries_vector_store = setup_vector_store(
    QDRANT_URL,
    QDRANT_API_KEY,
    COLLECTION_NAME,
)

# Transformations in list order: split -> question extraction -> summary
# extraction -> embedding.
transforms = [
    text_splitter,
    qa_extractor,
    summary_extractor,
    Settings.embed_model,
]

# Pass the sampled documents, the transformations and the vector store to the
# ingest helper; its return value is indexed and inspected in the cells below.
qa_summaries = ingest(
    vector_store=qa_summaries_vector_store,
    transformations=transforms,
    documents=senpai_documents,
)

# Build the index from the vector store that was handed to ingest above.
qa_summaries_index = create_index(
    from_where="vector_store",
    embed_model=Settings.embed_model,
    vector_store=qa_summaries_vector_store,
)

  6%|▋         | 23/361 [00:21<04:09,  1.35it/s]

In [None]:
# Number of items returned by ingest (presumably one per text chunk — confirm in utils).
len(qa_summaries)

In [None]:
# Dump every attribute of one ingested item (index 100 appears to be an arbitrary pick).
qa_summaries[100].__dict__

In [None]:
# Print the same item's content with metadata_mode="all", i.e. text plus all of its metadata.
print(qa_summaries[100].get_content(metadata_mode="all"))

### 🔧 Setup Query Engine and Pipeline

In [None]:
from llama_index.core import PromptTemplate
from utils import create_query_engine
from prompts import HYPE_ANSWER_GEN_PROMPT

# Wrap the custom answer-generation prompt so it can replace the synthesizer's QA template.
HYPE_ANSWER_GEN_PROMPT_TEMPLATE = PromptTemplate(HYPE_ANSWER_GEN_PROMPT)

qa_summaries_query_engine = create_query_engine(
    index=qa_summaries_index,
    mode="query",
    response_mode="compact",
    # Spelling fixed (was `similiarty_top_k`): LlamaIndex retrievers read
    # `similarity_top_k`, so the misspelled keyword would not set the number of
    # retrieved nodes.
    # NOTE(review): assumes create_query_engine forwards extra keywords to the
    # index's query-engine factory — confirm in utils.
    similarity_top_k=5,
    # Maximal-marginal-relevance retrieval, with the threshold passed to the vector store.
    vector_store_query_mode="mmr",
    vector_store_kwargs={"mmr_threshold": 0.42},
    )

# Swap in the custom template for the response synthesizer's text-QA prompt.
qa_summaries_query_engine.update_prompts({'response_synthesizer:text_qa_template':HYPE_ANSWER_GEN_PROMPT_TEMPLATE})

In [None]:
from utils import create_query_pipeline

from llama_index.core.query_pipeline import InputComponent

# Chain of two components: the pipeline input, then the query engine
# (presumably linked in list order by create_query_pipeline — confirm in utils).
input_component = InputComponent()
qa_summaries_chain = [
    input_component,
    qa_summaries_query_engine,
]

qa_summaries_query_pipeline = create_query_pipeline(qa_summaries_chain)

In [None]:
# Run one sample question through the pipeline; the result is shown as the cell's output.
qa_summaries_query_pipeline.run(input="How can I ensure unswerving decision-making in my life?")