# Introduction to Advanced RAG in LlamaIndex

In [None]:
# nest_asyncio is imported and applied by the next cell.
%pip install nest_asyncio



In [None]:
# Patch asyncio so event loops can be nested inside the notebook's already-running loop.
# NOTE(review): presumably required by async calls made by llama-index later on — confirm.
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Install (or upgrade to) the latest llama-index release; the recorded run resolved to 0.12.23.
%pip install -U llama-index

Collecting llama-index
  Downloading llama_index-0.12.23-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-agent-openai<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_agent_openai-0.4.6-py3-none-any.whl.metadata (727 bytes)
Collecting llama-index-cli<0.5.0,>=0.4.1 (from llama-index)
  Downloading llama_index_cli-0.4.1-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.13.0,>=0.12.23 (from llama-index)
  Downloading llama_index_core-0.12.23.post2-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.6.8-py3-none-any.whl.metadata (3.6 kB)
Collecting llama-index-llms-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_llms_openai-0.3.25-py3-none-any.whl.metadata (3.3 kB)


## Extract

In [None]:
from llama_index.core import SimpleDirectoryReader

# Read every file found under /content/data into Document objects.
reader = SimpleDirectoryReader(input_dir="/content/data")
docs = reader.load_data()

# Alternative: use the file name as the document id.
# docs_name_as_id = SimpleDirectoryReader(
#     input_dir="/content/data", filename_as_id=True
# ).load_data()

In [None]:
len(docs)  # one Document per PDF page (53 in the recorded run)

53

In [None]:
import pprint

# Preview only the first Document. Printing the whole list dumps the full text
# of every page into the cell output and buries the rest of the notebook.
pprint.pprint(docs[:1])

[Document(id_='7b6a48e7-64b6-413d-866c-5b4c08c4c25a', embedding=None, metadata={'page_label': '1', 'file_name': 'DeepSeek_V3.pdf', 'file_path': '/content/data/DeepSeek_V3.pdf', 'file_type': 'application/pdf', 'file_size': 1667109, 'creation_date': '2025-03-08', 'last_modified_date': '2025-03-08'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='DeepSeek-V3 Technical Report\nDeepSeek-AI\nresearch@deepseek.com\nAbstract\nWe present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total\nparameters with 37B activated for each token. To achieve efficient inference and cost-effective\ntraining, DeepSeek-V3 adopts Multi-he

## Transform

In [None]:
# Raw attributes of the first Document. Note the two `excluded_*_metadata_keys`
# lists: they control which metadata keys are hidden from the LLM and from the
# embedding model.

vars(docs[0])  # a lot of data for a single doc

{'id_': '7b6a48e7-64b6-413d-866c-5b4c08c4c25a',
 'embedding': None,
 'metadata': {'page_label': '1',
  'file_name': 'DeepSeek_V3.pdf',
  'file_path': '/content/data/DeepSeek_V3.pdf',
  'file_type': 'application/pdf',
  'file_size': 1667109,
  'creation_date': '2025-03-08',
  'last_modified_date': '2025-03-08'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {},
 'metadata_template': '{key}: {value}',
 'metadata_separator': '\n',
 'text_resource': MediaResource(embeddings=None, data=None, text='DeepSeek-V3 Technical Report\nDeepSeek-AI\nresearch@deepseek.com\nAbstract\nWe present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total\nparameters with 37B activated for each token. To achieve efficient inference

In [None]:
# Quick example of what the LLM and the embedding model see for a hand-built
# test document.

from llama_index.core import Document
from llama_index.core.schema import MetadataMode

document = Document(
    text="This is a super-customized document",
    metadata={
        "file_name": "super_secret_document.txt",
        "category": "deepseek",
        "author": "LlamaIndex",
    },
    # Uncomment to hide a metadata key from one consumer only:
    # excluded_embed_metadata_keys=["file_name"],
    # excluded_llm_metadata_keys=["category"],
    # Canonical field name (see `metadata_separator` in the Document dump above);
    # the previous spelling `metadata_seperator` was a typo.
    metadata_separator="\n",
    metadata_template="{key}:{value}",
    text_template="Metadata:\n{metadata_str}\n-----\nContent:\n{content}",
)

# Swap in MetadataMode.LLM to see the text handed to the LLM instead:
# print(
#     "The LLM sees this: \n",
#     document.get_content(metadata_mode=MetadataMode.LLM),
# )
print(
    "The Embedding model sees this: \n",
    document.get_content(metadata_mode=MetadataMode.EMBED),
)

The Embedding model sees this: 
 Metadata:
file_name:super_secret_document.txt
category:deepseek
author:LlamaIndex
-----
Content:
This is a super-customized document


In [None]:
from llama_index.core.schema import MetadataMode

# What the LLM sees:
# print(docs[0].get_content(metadata_mode=MetadataMode.LLM))

# What the embedding model sees. For these documents it is the same text,
# because both exclusion lists hold the same keys by default.
embed_view = docs[0].get_content(metadata_mode=MetadataMode.EMBED)
print(embed_view)

page_label: 1
file_path: /content/data/DeepSeek_V3.pdf

DeepSeek-V3 Technical Report
DeepSeek-AI
research@deepseek.com
Abstract
We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total
parameters with 37B activated for each token. To achieve efficient inference and cost-effective
training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architec-
tures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers
an auxiliary-loss-free strategy for load balancing and sets a multi-token prediction training
objective for stronger performance. We pre-train DeepSeek-V3 on 14.8 trillion diverse and
high-quality tokens, followed by Supervised Fine-Tuning and Reinforcement Learning stages to
fully harness its capabilities. Comprehensive evaluations reveal that DeepSeek-V3 outperforms
other open-source models and achieves performance comparable to leading closed-source
models. Despite its excellent performance, DeepSeek-

In [None]:
for doc in docs:
    # Lay every page out as an explicit "Metadata / Content" pair.
    doc.text_template = "Metadata:\n{metadata_str}\n---\nContent:\n{content}"

    # Keep the page label out of the text given to the embedding model.
    # Skipping docs that already list it keeps this cell safe to re-run.
    embed_hidden_keys = doc.excluded_embed_metadata_keys
    if "page_label" in embed_hidden_keys:
        continue
    embed_hidden_keys.append("page_label")

In [None]:
# After editing the content seen by the embedding model: `page_label` is gone
# and the text follows the new Metadata/Content template.

print(docs[0].get_content(metadata_mode=MetadataMode.EMBED))

Metadata:
file_path: /content/data/DeepSeek_V3.pdf
---
Content:
DeepSeek-V3 Technical Report
DeepSeek-AI
research@deepseek.com
Abstract
We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total
parameters with 37B activated for each token. To achieve efficient inference and cost-effective
training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architec-
tures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers
an auxiliary-loss-free strategy for load balancing and sets a multi-token prediction training
objective for stronger performance. We pre-train DeepSeek-V3 on 14.8 trillion diverse and
high-quality tokens, followed by Supervised Fine-Tuning and Reinforcement Learning stages to
fully harness its capabilities. Comprehensive evaluations reveal that DeepSeek-V3 outperforms
other open-source models and achieves performance comparable to leading closed-source
models. Despite its excellent performance, D

Here are other, more advanced transformations. Some require an LLM to work. The cells below use OpenAI's `gpt-4o-mini`, which is an affordable model with high rate limits (Qwen 2.5 32B Instruct 128k through Groq is an alternative). It should be enough to extract Q&As and titles from the documents.

In [None]:
%pip install -Uq llama-index-llms-groq

In [None]:
from llama_index.llms.openai import OpenAI
import os
import getpass

# Prompt only when the key is missing. This keeps the cell idempotent: a key
# already set in the environment (or by an earlier run) is not overwritten, and
# re-running the notebook does not block on input again.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OPENAI API key: ")

Enter your OPENAI API key: ··········


In [None]:
# LLM handed to the metadata extractors in the ingestion pipeline below.
llm_transformations = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    model="gpt-4o-mini",
)

In [None]:
# Other transformations: split each page into chunks, then let the LLM attach
# a title and a set of answerable questions to every chunk.

from llama_index.core.extractors import QuestionsAnsweredExtractor, TitleExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter

text_splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=128,
    separator=" ",
)
title_extractor = TitleExtractor(nodes=5, llm=llm_transformations)
qa_extractor = QuestionsAnsweredExtractor(questions=3, llm=llm_transformations)

# Transformations run in list order: splitting first, then the two extractors.
pipeline = IngestionPipeline(
    transformations=[text_splitter, title_extractor, qa_extractor],
)

nodes = pipeline.run(documents=docs, in_place=True, show_progress=True)

Parsing nodes:   0%|          | 0/53 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:01<00:00,  1.58s/it]
100%|██████████| 2/2 [00:00<00:00,  2.68it/s]
100%|██████████| 1/1 [00:00<00:00,  1.70it/s]
100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
100%|██████████| 1/1 [00:00<00:00,  1.89it/s]
100%|██████████| 1/1 [00:00<00:00,  1.68it/s]
100%|██████████| 1/1 [00:00<00:00,  1.75it/s]
100%|██████████| 2/2 [00:00<00:00,  3.56it/s]
100%|██████████| 2/2 [00:01<00:00,  1.11it/s]
100%|██████████| 1/1 [00:00<00:00,  1.44it/s]
100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
100%|██████████| 1/1 [00:00<00:00,  1.58it/s]
100%|██████████| 2/2 [00:01<00:00,  1.69it/s]
100%|██████████| 1/1 [00:00<00:00,  1.14it/s]
100%|██████████| 1/1 [00:00<00:00,  1.71it/s]
100%|██████████| 1/1 [00:00<00:00,  1.18it/s]
100%|██████████| 1/1 [00:00<00:00,  1.29it/s]
100%|██████████| 1/1 [00:00<00:00,  2.18it/s]
100%|██████████| 1/1 [00:00<00:00,  1.35it/s]
100%|██████████| 1/1 [00:01<00:00,  1.27s/it]
100%|██████████| 1/1 [00:00<00:00,  1.34it/s]
100%|██████████| 1/1 [00:01<00:00,

By default, LlamaIndex uses OpenAI's embedding models, but you can also load a free model from HuggingFace (it will be slower).

In [None]:
# Number of chunks produced by the pipeline (70 in the recorded run, from 53 pages).
len(nodes)

70

In [None]:
import pprint

# Full attribute dump of the first node, kept for debugging:
# pprint.pprint(nodes[0].__dict__)

# Text the LLM receives for the first chunk, including the `document_title` and
# `questions_this_excerpt_can_answer` metadata added by the extractors.
print(nodes[0].get_content(metadata_mode=MetadataMode.LLM))

[Excerpt from document]
page_label: 1
file_path: /content/data/DeepSeek_V3.pdf
document_title: "DeepSeek-V3: Enhancing Performance and Efficiency in Mixture-of-Experts Language Models"
questions_this_excerpt_can_answer: Based on the provided excerpt from the document "DeepSeek-V3: Enhancing Performance and Efficiency in Mixture-of-Experts Language Models," here are three specific questions that can be answered using the context:

1. **What is the total number of parameters in the DeepSeek-V3 model, and how many of those are activated for each token?**
   - Answer: DeepSeek-V3 has a total of 671 billion parameters, with 37 billion activated for each token.

2. **What innovative strategies does DeepSeek-V3 employ to enhance its performance and efficiency compared to previous versions?**
   - Answer: DeepSeek-V3 adopts a Multi-head Latent Attention (MLA) architecture, an auxiliary-loss-free strategy for load balancing, and a multi-token prediction training objective.

3. **How does the tr

## Index

In [None]:
# HuggingFace embedding integration, imported by the next cell.
%pip install -Uq llama-index-embeddings-huggingface

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m113.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Embeddings: load a HuggingFace model and sanity-check it on a short sentence.

from llama_index.embeddings.huggingface import HuggingFaceEmbedding

hf_embeddings = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

test_embed = hf_embeddings.get_text_embedding("Hello my name is ali")

# Show the vector length and a short prefix only; printing every component
# floods the cell output without telling the reader anything more.
print(len(test_embed), test_embed[:5])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[-0.07689619064331055, 0.0013940933858975768, 0.032628051936626434, -0.0328279472887516, -0.029612097889184952, -0.033744264394044876, 0.10675576329231262, -0.011162916198372841, 0.0819464921951294, -0.03390379250049591, 0.006559085566550493, -0.085511215031147, 0.049771517515182495, -0.00915818102657795, 0.04296262562274933, -0.009315067902207375, 0.047079283744096756, -0.00946254376322031, -0.12118915468454361, -0.030095258727669716, -0.002533972728997469, 0.017935240641236305, -0.009500307030975819, -0.014419395476579666, 0.010631861165165901, 0.000514488376211375, 0.023990532383322716, 0.0264907144010067, -0.03405412659049034, -0.06939774751663208, -0.0452418252825737, 0.05594845116138458, 0.04144037142395973, 0.028954660519957542, 0.02886425144970417, -0.004896397702395916, -0.019973069429397583, 0.03083452396094799, -0.034612126648426056, -0.016962843015789986, 0.0684610977768898, -0.039916399866342545, 0.0272450540214777, 0.02459423802793026, 0.07735758274793625, -0.015415565110

In [None]:
# Create the index over the enriched nodes, embedding them with the
# HuggingFace model loaded above.

from llama_index.core import VectorStoreIndex

index = VectorStoreIndex(nodes=nodes, embed_model=hf_embeddings)

## Query

In [None]:
# LLM handed to the query engine.
llm_querying = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    model="gpt-4o-mini",
)
query_engine = index.as_query_engine(llm=llm_querying)

# The question is in Persian: "What is this article about? Please answer in Persian."
response = query_engine.query("این مقاله در خصوص چیست ؟ لطفا فارسی جواب بده ؟")
print(response)

این مقاله به بررسی پیشرفت‌های اخیر در درک زبان و ارزیابی مدل‌ها و مجموعه داده‌ها برای پاسخ به سوالات و درک مطلب می‌پردازد. همچنین، مقاله‌ای دیگر به تحلیل لایه‌های مبتنی بر Aux-Loss و لایه‌های بدون Aux-Loss در ریاضیات DM می‌پردازد.


In [None]:
# Raw response attributes; `source_nodes` lists the retrieved chunks.
vars(response)

{'response': 'این مقاله به بررسی پیشرفت\u200cهای اخیر در درک زبان و ارزیابی مدل\u200cها و مجموعه داده\u200cها برای پاسخ به سوالات و درک مطلب می\u200cپردازد. همچنین، مقاله\u200cای دیگر به تحلیل لایه\u200cهای مبتنی بر Aux-Loss و لایه\u200cهای بدون Aux-Loss در ریاضیات DM می\u200cپردازد.',
 'source_nodes': [NodeWithScore(node=TextNode(id_='e27b3867-c103-4bcb-9b52-3a03ad9abc36', embedding=None, metadata={'page_label': '39', 'file_name': 'DeepSeek_V3.pdf', 'file_path': '/content/data/DeepSeek_V3.pdf', 'file_type': 'application/pdf', 'file_size': 1667109, 'creation_date': '2025-03-08', 'last_modified_date': '2025-03-08', 'document_title': '"Recent Advances in Language Understanding: A Comprehensive Review of Evaluation Datasets and Models for Question Answering and Reading Comprehension"', 'questions_this_excerpt_can_answer': 'Based on the provided excerpt from the document titled "Recent Advances in Language Understanding: A Comprehensive Review of Evaluation Datasets and Models for Question

## Store

In [None]:
# Write the index's storage context to /content/vectors so the next cell can reload it.
index.storage_context.persist(persist_dir="/content/vectors")

In [None]:
from llama_index.core import StorageContext, load_index_from_storage

# Rebuild the storage context from the persisted directory, then load the index
# from it. The embedding model is passed again and must be the one the index
# was built with.
storage_context = StorageContext.from_defaults(persist_dir="/content/vectors")
index_from_storage = load_index_from_storage(
    storage_context=storage_context,
    embed_model=hf_embeddings,
)

In [None]:
# Query engine over the index reloaded from disk, using the same LLM as before.
qa = index_from_storage.as_query_engine(llm=llm_querying)

In [None]:
# Ask the reloaded index what the article is about, this time in English.
response = qa.query("what is this article about ?")
print(response)

The article explores the differences and characteristics of Aux-Loss-Based Layers and Aux-Loss-Free Layers within the framework of DM Mathematics. It provides a comprehensive analysis of these two types of layers, discussing their methodologies, significance, and the organization of the content related to their presentation.


# Using Vector Stores

In [None]:
# Chroma itself, plus the LlamaIndex vector-store adapter imported by the next cell.
%pip install -Uq chromadb
%pip install -Uq llama-index-vector-stores-chroma

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m67.3 MB/s[0m eta [36m0:00:00

In [None]:
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

# initialize client, setting path to save data
db = chromadb.PersistentClient(path="/content/chroma_db")

# create the collection, or reopen it if an earlier run already created it
chroma_collection = db.get_or_create_collection("Deepseek")

# assign chroma as the vector_store to the context
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# The client is persistent, so the collection survives across runs. Embed and
# insert the nodes only when the collection is still empty; otherwise re-attach
# to the vectors that are already stored. Without this guard, every re-run
# embeds all nodes again and tries to add them to a collection that already
# holds them.
if chroma_collection.count() == 0:
    index = VectorStoreIndex(
        nodes, storage_context=storage_context, embed_model=hf_embeddings
    )
else:
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store, embed_model=hf_embeddings
    )

# You can also load from documents and apply transformations in place
# index = VectorStoreIndex.from_documents(
#     docs, storage_context=storage_context, transformations=[]
# )

# Or you can initialize your index from your vector store and then add the nodes
# index = VectorStoreIndex.from_vector_store(
#     vector_store=vector_store, embed_model=hf_embeddings
# )
# index.insert_nodes(nodes)


# create a query engine and query
query_engine = index.as_query_engine(llm=llm_querying)

In [None]:
# Ask the Chroma-backed index the same question.
response = query_engine.query("what is this article about?")
print(response)

The article explores the differences and characteristics of Aux-Loss-Based Layers and Aux-Loss-Free Layers within the framework of DM Mathematics. It provides a comprehensive analysis of these two types of layers, discussing their methodologies, significance, and the organization of the content related to their presentation. The document aims to contribute to the understanding of these layers and their implications in the field of mathematics.
