In [68]:
import faiss
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core import (
    SimpleDirectoryReader,
    load_index_from_storage,
    VectorStoreIndex,
    StorageContext,
)
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core import Settings
from llama_index.core import get_response_synthesizer
import logging
import sys, os

from dotenv import load_dotenv
load_dotenv(".env")

logging.basicConfig(
    stream=sys.stdout, level=logging.INFO
)  # logging.DEBUG for more verbose output
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [37]:
# dimensions of text-ada-embedding-002
d = 1536
faiss_index = faiss.IndexFlatL2(d)
vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# vector_store = FaissVectorStore(faiss_index=faiss_index).from_persist_dir("./storage")
# storage_context = StorageContext.from_defaults(vector_store=vector_store, persist_dir="./storage")

In [5]:
llm = AzureOpenAI(
    model=os.getenv("AZURE_OPENAI_MODEL"),
    deployment_name=os.getenv("AZURE_OPENAI_NAME"),
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("AZURE_OPENAI_VERSION"),
)

# You need to deploy your own embedding model as well as your own chat completion model
embed_model = AzureOpenAIEmbedding(
    model=os.getenv("AZURE_EMBEDDING_MODEL"),
    deployment_name=os.getenv("AZURE_EMBEDDING_NAME"),
    api_key=os.getenv("AZURE_EMBEDDING_KEY"),
    azure_endpoint=os.getenv("AZURE_EMBEDDING_ENDPOINT"),
    api_version=os.getenv("AZURE_EMBEDDING_VERSION"),
)

Settings.llm = llm
Settings.embed_model = embed_model

# llamaIndex
- Example: Load resume and read from it
- Document load -> Vector Store -> Query
- https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/

## Document Load
- https://docs.llamaindex.ai/en/stable/module_guides/loading/simpledirectoryreader/

In [None]:
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

document = Document(
    text="This is a super-customized document",
    metadata={
        "file_name": "super_secret_document.txt",
        "category": "finance",
        "author": "LlamaIndex",
    },
    excluded_llm_metadata_keys=["file_name"],
    metadata_seperator="::",
    metadata_template="{key}=>{value}",
    text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
)

print(
    "The LLM sees this: \n",
    document.get_content(metadata_mode=MetadataMode.LLM),
)
print("\n######################\n")
print(
    "The Embedding model sees this: \n",
    document.get_content(metadata_mode=MetadataMode.EMBED),
)

The LLM sees this: 
 Metadata: category=>finance::author=>LlamaIndex
-----
Content: This is a super-customized document

######################

The Embedding model sees this: 
 Metadata: file_name=>super_secret_document.txt::category=>finance::author=>LlamaIndex
-----
Content: This is a super-customized document


In [6]:
reader = SimpleDirectoryReader(input_files=["./data/my_resume.pdf"])
documents = reader.load_data()
documents

[Document(id_='897d932f-3b5a-4a1b-9d8f-e157043f84bf', embedding=None, metadata={'page_label': '1', 'file_name': 'my_resume.pdf', 'file_path': 'data/my_resume.pdf', 'file_type': 'application/pdf', 'file_size': 250772, 'creation_date': '2025-04-27', 'last_modified_date': '2025-04-27'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text="GAN SHAO HONG \n(+65) 90855781 | shaohong.g97@gmail.com | www.linkedin.com/in/g-shaohong/ | https://github.com/shaohong-g  \n \nPROFESSIONAL SUMMARY \n• Machine Learning Engineer with 3+ years of experience in developing highly scalable AI models, implementing ML \npipelines, and deploying models in cloud environ

In [8]:
vars(documents[0])

{'id_': '897d932f-3b5a-4a1b-9d8f-e157043f84bf',
 'embedding': None,
 'metadata': {'page_label': '1',
  'file_name': 'my_resume.pdf',
  'file_path': 'data/my_resume.pdf',
  'file_type': 'application/pdf',
  'file_size': 250772,
  'creation_date': '2025-04-27',
  'last_modified_date': '2025-04-27'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {},
 'metadata_template': '{key}: {value}',
 'metadata_separator': '\n',
 'text_resource': MediaResource(embeddings=None, data=None, text="GAN SHAO HONG \n(+65) 90855781 | shaohong.g97@gmail.com | www.linkedin.com/in/g-shaohong/ | https://github.com/shaohong-g  \n \nPROFESSIONAL SUMMARY \n• Machine Learning Engineer with 3+ years of experience in developing highly scalable AI models, implemen

## Test Chunking (Independent section)

In [1]:
import sys, os, json
import pandas as pd

import tiktoken
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler, LlamaDebugHandler
from llama_index.core.schema import MetadataMode
from llama_index.core.node_parser import JSONNodeParser, SentenceSplitter
from llama_index.core import (
    Document,
    VectorStoreIndex,
)
from dotenv import load_dotenv
load_dotenv(".env")

True

In [2]:
EMBEDDING_ENDPOINT = os.getenv("AZURE_EMBEDDING_ENDPOINT")
EMBEDDING_API_KEY=os.getenv("AZURE_EMBEDDING_KEY")
EMBEDDING_API_VERSION=os.getenv("AZURE_EMBEDDING_VERSION")
EMBEDDING_DEPLOYMENT_NAME = os.getenv("AZURE_EMBEDDING_NAME")
EMBEDDING_MODEL_NAME= os.getenv("AZURE_EMBEDDING_MODEL")

llama_debug = LlamaDebugHandler(print_trace_on_end=True)

token_counter2 = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model(EMBEDDING_MODEL_NAME).encode
)
def get_token_info2(reset=False):
    print(
        "Embedding Tokens: ",
        token_counter2.total_embedding_token_count,
        "\n",
        "LLM Prompt Tokens: ",
        token_counter2.prompt_llm_token_count,
        "\n",
        "LLM Completion Tokens: ",
        token_counter2.completion_llm_token_count,
        "\n",
        "Total LLM Token Count: ",
        token_counter2.total_llm_token_count,
        "\n",
    )
    if reset:
        token_counter2.reset_counts()

# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

embed_model = AzureOpenAIEmbedding(
    model=EMBEDDING_MODEL_NAME,
    deployment_name=EMBEDDING_DEPLOYMENT_NAME,
    api_key=EMBEDDING_API_KEY,
    azure_endpoint=EMBEDDING_ENDPOINT,
    api_version=EMBEDDING_API_VERSION,
    callback_manager=CallbackManager([token_counter2, llama_debug])
)


In [15]:
POKEMON_FILE_SAVED = "./data/pokemon_updated.csv"
df = pd.read_csv(POKEMON_FILE_SAVED)
df.head()

Unnamed: 0.1,Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,...,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary,description
0,0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,...,1,65,65,45,grass,poison,6.9,1,0,### Bulbasaur: Detailed Summary\n\n#### 1. **C...
1,1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,...,2,80,80,60,grass,poison,13.0,1,0,### Ivysaur: Detailed Summary\n\n#### 1. **Cha...
2,2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,...,3,122,120,80,grass,poison,100.0,1,0,### **Venusaur: Detailed Summary**\n\nVenusaur...
3,3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,...,4,60,50,65,fire,,8.5,1,0,### Detailed Summary of Charmander\n\nCharmand...
4,4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,...,5,80,65,80,fire,,19.0,1,0,### Detailed Summary of Charmeleon\n\nCharmele...


In [16]:
df["description"] = df["description"].apply(lambda x: x[:1000])
df_json = df.to_json(orient = "records")
df_json = json.loads(df_json)[:3]

documents = []
for each_pokemon in df_json:
    documents.append(
        Document(
            # text=json.dumps(each_pokemon),
            text=json.dumps(each_pokemon["description"]),
            metadata={
                "name": each_pokemon["name"],
                "generation": each_pokemon["generation"],
                "type1": each_pokemon["type1"],
                "type2": each_pokemon["type2"],
                "is_legendary": each_pokemon["is_legendary"]
            },
            id_= each_pokemon["name"],
            metadata_seperator="::",
            metadata_template="{key}=>{value}",
            text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
            # excluded_embed_metadata_keys=["name","generation","type1","type2","is_legendary"],
            # excluded_llm_metadata_keys=["name","generation","type1","type2","is_legendary"]
        )
    )

In [60]:
import re
delimiter = "::|=>|\\s"
target_sentence = documents[2].get_metadata_str().replace("::", " ")
split_only_related_words = re.split(delimiter, target_sentence)
encoder = tiktoken.encoding_for_model(EMBEDDING_MODEL_NAME).encode

print(target_sentence)
print(split_only_related_words)

print(len(target_sentence), len( split_only_related_words ) )
len(tiktoken.encoding_for_model(EMBEDDING_MODEL_NAME).encode(target_sentence)), len(tiktoken.encoding_for_model(EMBEDDING_MODEL_NAME).encode(" ".join(split_only_related_words) ))

# len(tiktoken.encoding_for_model(EMBEDDING_MODEL_NAME).encode(r'"### Bulbasaur: Detailed Summary\\n\\n#### 1. **Characteristics**\\nBulbasaur is a dual-type **Grass/Poison** Pok\\u00e9mon and') )

name=>Venusaur generation=>1 type1=>grass type2=>poison is_legendary=>0
['name', 'Venusaur', 'generation', '1', 'type1', 'grass', 'type2', 'poison', 'is_legendary', '0']
71 10


(22, 17)

In [48]:
list(map(lambda x: (len(x.text), len(x.text.split()), x.text), documents))

[(1026,
  153,
  '"### Bulbasaur: Detailed Summary\\n\\n#### 1. **Characteristics**\\nBulbasaur is a dual-type **Grass/Poison** Pok\\u00e9mon and is the first Pok\\u00e9mon in the National Pok\\u00e9dex (#001). It is known for its unique appearance, which features a small, quadrupedal body with a plant bulb growing on its back. \\n\\n- **Physical Appearance**: Bulbasaur has a teal-blue body with darker spots scattered across its skin. Its large, red eyes and pointed ears give it a cute yet determined look. The bulb on its back is its most defining feature, which grows as it evolves and eventually blooms into a large flower in its final evolution, Venusaur.\\n- **Height and Weight**: Bulbasaur stands at 0.7 meters (2\'04\\") tall and weighs 6.9 kilograms (15.2 lbs).\\n- **Abilities**: Bulbasaur typically has the ability **Overgrow**, which boosts the power of Grass-type moves when its HP is low. Some Bulbasaur may also have the hidden ability **Chlorophyll**, which increases its speed i

In [89]:
node_parser = SentenceSplitter(chunk_size=100, chunk_overlap=50, callback_manager=CallbackManager([llama_debug]))

nodes = node_parser.get_nodes_from_documents(
    documents, show_progress=True
)

Parsing nodes:   0%|          | 0/3 [00:00<?, ?it/s]

In [90]:
len(encoder(nodes[1].get_content(metadata_mode=MetadataMode.EMBED)))


109

In [91]:
list(map(lambda x: x.text, nodes))

['"### Bulbasaur: Detailed Summary\\n\\n#### 1. **Characteristics**\\nBulbasaur is a dual-type **Grass/Poison** Pok\\u00e9mon and is the first Pok\\u00e9mon in the National Pok\\u00e9dex (#001).',
 '**Characteristics**\\nBulbasaur is a dual-type **Grass/Poison** Pok\\u00e9mon and is the first Pok\\u00e9mon in the National Pok\\u00e9dex (#001). It is known for its unique appearance, which features a small, quadrupedal body with a plant bulb growing on its back.',
 'It is known for its unique appearance, which features a small, quadrupedal body with a plant bulb growing on its back. \\n\\n- **Physical Appearance**: Bulbasaur has a teal-blue body with darker spots scattered across its skin. Its large, red eyes and pointed ears give it a cute yet determined look.',
 '\\n\\n- **Physical Appearance**: Bulbasaur has a teal-blue body with darker spots scattered across its skin. Its large, red eyes and pointed ears give it a cute yet determined look. The bulb on its back is its most defining fe

## Insert parsed documents to vectorstore

In [38]:
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
index.storage_context.persist()

INFO:httpx:HTTP Request: POST https://black-exposure.openai.azure.com/openai/deployments/black-ada-002/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
HTTP Request: POST https://black-exposure.openai.azure.com/openai/deployments/black-ada-002/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
HTTP Request: POST https://black-exposure.openai.azure.com/openai/deployments/black-ada-002/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
HTTP Request: POST https://black-exposure.openai.azure.com/openai/deployments/black-ada-002/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
HTTP Request: POST https://black-exposure.openai.azure.com/openai/deployments/black-ada-002/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
HTTP Request: POST https://black-exposure.openai.azure.com/openai/deployments/black-ada-002/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
HTTP Request: POST https://black-exposure.openai.azure.com/openai/deployments/black-ada-002/embeddings?api-version=2023-05-15

In [39]:
index = load_index_from_storage(storage_context=storage_context)

INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.
Loading all indices.
Loading all indices.
Loading all indices.
Loading all indices.
Loading all indices.


In [None]:
query = "How many jobs have the applicants done? Give the company name, role as well as time."
query = "How many jobs have the applicants done from 2022-2023?"

response_synthesizer = get_response_synthesizer(response_mode="compact") # refine , context_only, accumulate, compact (default)
query_engine = index.as_query_engine(response_synthesizer=response_synthesizer)
response = query_engine.query(query)

INFO:httpx:HTTP Request: POST https://black-exposure.openai.azure.com/openai/deployments/black-ada-002/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
HTTP Request: POST https://black-exposure.openai.azure.com/openai/deployments/black-ada-002/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
HTTP Request: POST https://black-exposure.openai.azure.com/openai/deployments/black-ada-002/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
HTTP Request: POST https://black-exposure.openai.azure.com/openai/deployments/black-ada-002/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
HTTP Request: POST https://black-exposure.openai.azure.com/openai/deployments/black-ada-002/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
HTTP Request: POST https://black-exposure.openai.azure.com/openai/deployments/black-ada-002/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
HTTP Request: POST https://black-exposure.openai.azure.com/openai/deployments/black-ada-002/embeddings?api-version=2023-05-15

In [81]:

print(response.get_formatted_sources())
print("query was:", query)
print("answer was:", response)

> Source (Doc id: a6455d01-5069-484e-ab17-94bf8c26202e): Neuron Mobility  
Data Engineering Intern May 2022 – Aug 2022 
• Pipeline migration from existing...

> Source (Doc id: 153458dc-dc80-4b58-b479-11b22aea1ee8): GAN SHAO HONG 
(+65) 90855781 | shaohong.g97@gmail.com | www.linkedin.com/in/g-shaohong/ | https:...
query was: How many jobs have the applicants done from 2022-2023?
answer was: The applicant has done four jobs from 2022 to 2023:

1. **Neuron Mobility** (Data Engineering Intern) - May 2022 to August 2022  
2. **Infineon Technologies** (Machine Learning and Test Verification Intern) - August 2022 to January 2023  
3. **Asurion** (Intern – Technology, Data Science, AI & Machine Learning) - January 2023 to May 2023  
4. **UBS** (Software Engineer, full-stack) - August 2023 to January 2025 (part of this role falls in 2023).


In [None]:
import json
nodes = [json.loads(x.to_json()) for x in response.source_nodes]
node1 = nodes[0]
node1.keys()

dict_keys(['node', 'score', 'class_name'])

In [63]:
print(node1["score"], nodes[1]["score"])
node1["node"]

0.48575446009635925 0.4998507499694824


{'id_': 'a6455d01-5069-484e-ab17-94bf8c26202e',
 'embedding': None,
 'metadata': {'page_label': '2',
  'file_name': 'my_resume.pdf',
  'file_path': 'data/my_resume.pdf',
  'file_type': 'application/pdf',
  'file_size': 250772,
  'creation_date': '2025-04-27',
  'last_modified_date': '2025-04-27'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {'1': {'node_id': 'abb5dcdd-6203-42a7-99fe-16177741713b',
   'node_type': '4',
   'metadata': {'page_label': '2',
    'file_name': 'my_resume.pdf',
    'file_path': 'data/my_resume.pdf',
    'file_type': 'application/pdf',
    'file_size': 250772,
    'creation_date': '2025-04-27',
    'last_modified_date': '2025-04-27'},
   'hash': '05b49a6b9291d425546cdaa3a50e53191769ff57a18d8517f3f8ce58934

# Example: Pokemon

In [1]:
import sys, os, json
import pandas as pd

import tiktoken
import faiss
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler, LlamaDebugHandler
from llama_index.core.schema import MetadataMode
from llama_index.core.llms import ChatMessage
from llama_index.core.node_parser import JSONNodeParser, SentenceSplitter
from llama_index.core import (
    Document,
    SimpleDirectoryReader,
    load_index_from_storage,
    VectorStoreIndex,
    StorageContext,
    get_response_synthesizer
)
from llama_index.vector_stores.faiss import FaissVectorStore

from dotenv import load_dotenv
load_dotenv(".env")

True

In [2]:
AZURE_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_API_KEY=os.getenv("AZURE_OPENAI_KEY")
AZURE_API_VERSION=os.getenv("AZURE_OPENAI_VERSION")
AZURE_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_NAME")
AZURE_MODEL_NAME= os.getenv("AZURE_OPENAI_MODEL")

EMBEDDING_ENDPOINT = os.getenv("AZURE_EMBEDDING_ENDPOINT")
EMBEDDING_API_KEY=os.getenv("AZURE_EMBEDDING_KEY")
EMBEDDING_API_VERSION=os.getenv("AZURE_EMBEDDING_VERSION")
EMBEDDING_DEPLOYMENT_NAME = os.getenv("AZURE_EMBEDDING_NAME")
EMBEDDING_MODEL_NAME= os.getenv("AZURE_EMBEDDING_MODEL")

llama_debug = LlamaDebugHandler(print_trace_on_end=True)

token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model(AZURE_MODEL_NAME).encode
)
token_counter2 = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model(EMBEDDING_MODEL_NAME).encode
)

def get_token_info(reset=False):
    print(
        "Embedding Tokens: ",
        token_counter.total_embedding_token_count,
        "\n",
        "LLM Prompt Tokens: ",
        token_counter.prompt_llm_token_count,
        "\n",
        "LLM Completion Tokens: ",
        token_counter.completion_llm_token_count,
        "\n",
        "Total LLM Token Count: ",
        token_counter.total_llm_token_count,
        "\n",
    )
    if reset:
        token_counter.reset_counts()
def get_token_info2(reset=False):
    print(
        "Embedding Tokens: ",
        token_counter2.total_embedding_token_count,
        "\n",
        "LLM Prompt Tokens: ",
        token_counter2.prompt_llm_token_count,
        "\n",
        "LLM Completion Tokens: ",
        token_counter2.completion_llm_token_count,
        "\n",
        "Total LLM Token Count: ",
        token_counter2.total_llm_token_count,
        "\n",
    )
    if reset:
        token_counter2.reset_counts()

In [3]:
# import logging
# import sys

# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

llm = AzureOpenAI(
    engine=AZURE_DEPLOYMENT_NAME,
    model=AZURE_MODEL_NAME,
    # temperature=0.0,
    azure_endpoint=AZURE_ENDPOINT,
    api_key=AZURE_API_KEY,
    api_version=AZURE_API_VERSION,
    callback_manager=CallbackManager([token_counter, llama_debug])
)
embed_model = AzureOpenAIEmbedding(
    model=EMBEDDING_MODEL_NAME,
    deployment_name=EMBEDDING_DEPLOYMENT_NAME,
    api_key=EMBEDDING_API_KEY,
    azure_endpoint=EMBEDDING_ENDPOINT,
    api_version=EMBEDDING_API_VERSION,
    callback_manager=CallbackManager([token_counter2, llama_debug])
)


In [4]:
# dimensions of text-embedding-3-large
d = 3072 # 3072
VECTOR_STORE_LOC = "./storage_row_doc"
VECTOR_STORE_LOC = "./storage_row_text"
VECTOR_STORE_LOC = "./storage_row_node"

## Add Description to dataset with LLMs (One-time)

In [None]:
POKEMON_FILE = "./data/pokemon.csv"
POKEMON_FILE_SAVED = "./data/pokemon_updated.csv"
df = pd.read_csv(POKEMON_FILE)
df.head()

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0


In [None]:
POKEMON_DESCRIPTION_PROMPT = """
Give me a detailed summary of the pokemon, {name}.
You should provide the following details:
1. Characteristics 
2. Behavior
3. Pokedex description
4. Notable trainers who owns them
5. Famous scene in Pokemon franchise
"""
##### Commented this to avoid running again #####
df["description"] = df["name"].apply(lambda x: str(llm.complete(POKEMON_DESCRIPTION_PROMPT.format(name=x))) )
df.to_csv(POKEMON_FILE_SAVED)

##### Tokens spent #####
get_token_info(reset=True)

Embedding Tokens:  0 
 LLM Prompt Tokens:  47120 
 LLM Completion Tokens:  755556 
 Total LLM Token Count:  802676 



## Indexing and store at index store
- https://www.reddit.com/r/LlamaIndex/comments/1ab57n4/any_ideas_for_getting_statistics_about_internal/

In [5]:
POKEMON_FILE_SAVED = "./data/pokemon_updated.csv"
df = pd.read_csv(POKEMON_FILE_SAVED)
df.head()

Unnamed: 0.1,Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,...,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary,description
0,0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,...,1,65,65,45,grass,poison,6.9,1,0,### Bulbasaur: Detailed Summary\n\n#### 1. **C...
1,1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,...,2,80,80,60,grass,poison,13.0,1,0,### Ivysaur: Detailed Summary\n\n#### 1. **Cha...
2,2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,...,3,122,120,80,grass,poison,100.0,1,0,### **Venusaur: Detailed Summary**\n\nVenusaur...
3,3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,...,4,60,50,65,fire,,8.5,1,0,### Detailed Summary of Charmander\n\nCharmand...
4,4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,...,5,80,65,80,fire,,19.0,1,0,### Detailed Summary of Charmeleon\n\nCharmele...


### Vector Search

In [6]:
faiss_index = faiss.IndexFlatL2(d)
vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
df_json = df.to_json(orient = "records")
df_json = json.loads(df_json)

documents = []
for each_pokemon in df_json:
    documents.append(
        Document(
            # text=json.dumps(each_pokemon),
            text=json.dumps(each_pokemon["description"]).replace("\n", " "),
            metadata={
                "name": each_pokemon["name"],
                "generation": each_pokemon["generation"],
                "type1": each_pokemon["type1"],
                "type2": each_pokemon["type2"],
                "is_legendary": each_pokemon["is_legendary"]
            },
            id_= each_pokemon["name"],
            metadata_seperator="::",
            metadata_template="{key}=>{value}",
            text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
            # excluded_embed_metadata_keys=["name"],
            # excluded_llm_metadata_keys=["name","generation","type1","type2","is_legendary"]
        )
    )

# parser = JSONNodeParser()
# nodes = parser.get_nodes_from_documents([document], show_progress=True)

In [10]:
document = documents[4]
print(
    "The LLM sees this: \n",
    document.get_content(metadata_mode=MetadataMode.LLM),
)
print("\n##########################\n")
print(
    "The Embedding model sees this: \n",
    document.get_content(metadata_mode=MetadataMode.EMBED),
)

The LLM sees this: 
 Metadata: name=>Charmeleon::generation=>1::type1=>fire::type2=>None::is_legendary=>0
-----
Content: "### Detailed Summary of Charmeleon\n\nCharmeleon is a Fire-type Pok\u00e9mon introduced in Generation I. It is the evolved form of Charmander and evolves into Charizard at level 36. Below is a detailed breakdown of Charmeleon\u2019s characteristics, behavior, Pok\u00e9dex description, notable trainers, and famous scenes in the Pok\u00e9mon franchise.\n\n---\n\n### **1. Characteristics**\n- **Appearance**: Charmeleon is a bipedal, reptilian Pok\u00e9mon with a red-orange body, a long tail tipped with a flame, and a horn-like structure on the back of its head. Its claws are sharp, and it has a more aggressive and mature look compared to its pre-evolution, Charmander.\n- **Height**: 1.1 meters (3'07\").\n- **Weight**: 19.0 kilograms (41.9 lbs).\n- **Type**: Fire.\n- **Abilities**: Charmeleon typically has the ability *Blaze*, which boosts the power of Fire-type moves w

In [16]:
index = VectorStoreIndex.from_documents(documents, 
                                        storage_context=storage_context, 
                                        embed_model=embed_model, 
                                        show_progress=True,
                                        transformations=[SentenceSplitter(chunk_size=1024, chunk_overlap=20)]
                                        )

Parsing nodes:   0%|          | 0/801 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1479 [00:00<?, ?it/s]

In [None]:
len(index.docstore.docs.get('96646498-32a9-411c-aed8-c5e95bdd4c33').text.split(" "))

532

In [25]:
index.storage_context.persist(persist_dir = VECTOR_STORE_LOC)

In [74]:
node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=100)

nodes = node_parser.get_nodes_from_documents(
    documents, show_progress=True
)

Parsing nodes:   0%|          | 0/801 [00:00<?, ?it/s]

In [84]:
i = 2
print(nodes[2+i].text[-100:])
print(nodes[3+i].text[:100])
nodes[5].get_content(MetadataMode.EMBED)

re uses a Venusaur during battles with the player, showcasing its Mega Evolution.\n\n---\n\n### **5.
Famous Scene in Pok\u00e9mon Franchise**\nOne of the most memorable scenes involving Venusaur occurs


'Metadata: name=>Venusaur::generation=>1::type1=>grass::type2=>poison::is_legendary=>0\n-----\nContent: Famous Scene in Pok\\u00e9mon Franchise**\\nOne of the most memorable scenes involving Venusaur occurs in the **Pok\\u00e9mon Origins** mini-series. In the climactic battle between Red and Blue, Red\\u2019s Venusaur faces off against Blue\\u2019s Charizard. The intense battle showcases Venusaur\\u2019s strategic use of moves like Solar Beam and Sleep Powder, highlighting its strength and versatility. The scene culminates in Venusaur\\u2019s victory, solidifying its role as one of Red\\u2019s most trusted Pok\\u00e9mon.\\n\\nAnother iconic moment is Venusaur\\u2019s appearance in **Pok\\u00e9mon: The First Movie**, where it battles alongside Blastoise and Charizard against their cloned counterparts created by Mewtwo. The battle is intense and visually striking, showcasing Venusaur\\u2019s raw power and determination.\\n\\n---\\n\\nVenusaur remains a fan-favorite Pok\\u00e9mon due to i

In [None]:
retriever = index.as_retriever(similarity_top_k=10)

In [None]:
##### Only Text with chunking #####

nodes = retriever.retrieve("However, it can be reckless and overconfident, especially when it underestimates its opponent")
# nodes = retriever.retrieve("What are some pokemon which can be reckless and overconfident, especially when it underestimates its opponent?")

for each_node in nodes:
    result = each_node.to_dict()
    print(f"{result['node']['metadata']['name']} ({result['score']}) -- {result['node']['text']}")

**********
Trace: query
    |_CBEventType.RETRIEVE -> 1.302882 seconds
      |_CBEventType.EMBEDDING -> 1.300432 seconds
**********
Rampardos (1.3677854537963867) -- However, Ash's Pikachu and Turtwig manage to outmaneuver it, showcasing the importance of strategy over brute strength. This battle highlights Rampardos' raw power and its role as a challenging Gym Leader Pok\u00e9mon.\n- **In the Games**: In the Sinnoh games, Roark's Rampardos is a significant challenge for players early in their journey. Its high Attack stat and moves like Headbutt can easily knock out unprepared teams, making it a memorable opponent for many players.\n\n---\n\nRampardos remains a fan-favorite fossil Pok\u00e9mon due to its unique design, incredible strength, and ties to prehistoric creatures. Its role as Roark's ace Pok\u00e9mon and its reputation as a glass cannon make it a standout in the Pok\u00e9mon franchise."
Buzzwole (1.379696249961853) -- This behavior is both comical and intimidating, showcasin

### BM25 Retriever
- https://docs.llamaindex.ai/en/stable/examples/retrievers/bm25_retriever/

In [14]:
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.response.notebook_utils import display_source_node
import Stemmer

Matplotlib is building the font cache; this may take a moment.


In [7]:
docstore = SimpleDocumentStore.from_persist_dir(persist_dir=VECTOR_STORE_LOC)

In [16]:
bm25_retriever = BM25Retriever.from_defaults(
    docstore=docstore,
    similarity_top_k=3,
    # Optional: We can pass in the stemmer and set the language for stopwords
    # This is important for removing stopwords and stemming the query + text
    # The default is english for both
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)

In [None]:
retrieved_nodes = bm25_retriever.retrieve(
    "However, it can be reckless and overconfident, especially when it underestimates its opponent"
)
for node in retrieved_nodes:
    display_source_node(node, source_length=5000)

**Node ID:** 224a4daf-c96a-452a-a81d-c6e728a5a5e9<br>**Similarity:** 8.339054107666016<br>**Text:** "### Charizard: Detailed Summary\n\nCharizard is one of the most iconic Pok\u00e9mon in the franchise, beloved by fans for its powerful design, fiery abilities, and prominent role in the Pok\u00e9mon series. Below is a detailed breakdown of Charizard's characteristics, behavior, Pok\u00e9dex description, notable trainers, and famous moments in the Pok\u00e9mon franchise.\n\n---\n\n### 1. **Characteristics**\n- **Species**: Flame Pok\u00e9mon\n- **Type**: Fire/Flying\n- **Height**: 5'07\" (1.7 m)\n- **Weight**: 199.5 lbs (90.5 kg)\n- **Evolutionary Line**: Charmander \u2192 Charmeleon \u2192 Charizard\n- **Mega Evolutions**: Charizard has two Mega Evolutions, Mega Charizard X (Fire/Dragon type) and Mega Charizard Y (Fire/Flying type), introduced in *Pok\u00e9mon X and Y*.\n- **Gigantamax Form**: Charizard also has a Gigantamax form introduced in *Pok\u00e9mon Sword and Shield*, which gives it a massive, fiery appearance with flames erupting from its wings and body.\n\nCharizard resembles a large, orange dragon with a long tail tipped with a flame. Its wings are blue on the inside and allow it to fly at high speeds. Despite its dragon-like appearance, it is not a Dragon-type Pok\u00e9mon in its base form.\n\n---\n\n### 2. **Behavior**\nCharizard is known for its fiery temperament and pride. It is a powerful and competitive Pok\u00e9mon that loves to battle strong opponents. However, it can be reckless and overconfident, especially when it underestimates its opponent. Charizard\u2019s flame burns hotter when it is excited or angry, and it is said to never use its fire breath on weaker opponents, showing a sense of honor.\n\nCharizard is also highly loyal to trainers who earn its respect, but it can be disobedient if it feels its trainer is inexperienced or unworthy, as seen in the Pok\u00e9mon anime.\n\n---\n\n### 3. **Pok\u00e9dex Description**\nCharizard has appeared in nearly every Pok\u00e9mon game, and its Pok\u00e9dex entries often emphasize its fiery breath and flying abilities. Below are some notable Pok\u00e9dex descriptions:\n\n- **Pok\u00e9mon Red/Blue**: \"Spits fire that is hot enough to melt boulders. Known to cause forest fires unintentionally.\"\n- **Pok\u00e9mon Gold**: \"If Charizard becomes furious, the flame at the tip of its tail flares up in a whitish-blue color.\"\n- **Pok\u00e9mon Sword**: \"It spits fire that is hot enough to melt boulders. It may cause forest fires by blowing flames.\"\n- **Pok\u00e9mon Shield**: \"The flame inside its body burns hotter than 3,600 degrees Fahrenheit. When Charizard roars, that temperature climbs even higher.\"\n\nThese entries highlight its destructive power and the intensity of its flames.\n\n---\n\n### 4. **Notable Trainers Who Own Charizard**\nCharizard has been owned by several notable trainers in the Pok\u00e9mon franchise, including:\n\n- **Ash Ketchum**: Ash\u2019s Charizard is one of his most famous Pok\u00e9mon. It started as a Charmander abandoned by its previous trainer, Damian, and evolved into a powerful but initially disobedient Charizard. Over time, Ash earned its respect, and it became one of his most reliable and iconic Pok\u00e9mon.\n- **Leon**: The Champion of the Galar region in *Pok\u00e9mon Sword and Shield* is known for his Gigantamax Charizard, which serves as his signature Pok\u00e9mon.<br>

**Node ID:** 9374b01e-0394-451f-a439-f56a45ad3e84<br>**Similarity:** 4.360943794250488<br>**Text:** The evolution marks a turning point in the Pok\u00e9mon\u2019s personality, as it becomes more confident and assertive while still retaining its loyalty to its owner.\n\nAnother notable scene is when Granbull is used in battles during various tournaments and showcases its powerful jaw attacks, such as *Crunch* and *Play Rough*. Its intimidating presence often surprises opponents who underestimate its strength due to its timid nature.\n\n---\n\nGranbull remains a fan-favorite Pok\u00e9mon for its unique combination of a fierce appearance and gentle personality. Its role in the franchise highlights themes of loyalty, courage, and the importance of not judging others based on their looks."<br>

**Node ID:** 2f8b9c1b-8108-404b-b531-9ec23fb9f679<br>**Similarity:** 3.9626479148864746<br>**Text:** "### Detailed Summary of Rhyhorn\n\n#### 1. **Characteristics**\nRhyhorn is a dual-type **Ground/Rock Pok\u00e9mon** introduced in Generation I. It is known for its rugged, rhinoceros-like appearance, with a body covered in thick, gray, rock-like armor that provides excellent defense. Its body is quadrupedal, with short, sturdy legs and a spiked tail. Rhyhorn has a single, prominent horn on its snout, which it uses for charging and attacking opponents. Its eyes are small and fierce, giving it an intimidating look.\n\nRhyhorn is a relatively large Pok\u00e9mon, standing at **3'03\" (1.0 m)** tall and weighing **253.5 lbs (115.0 kg)**. Its physical strength and durability make it a formidable Pok\u00e9mon in battle, especially when utilizing its high Attack and Defense stats. However, it is not known for its speed or agility, and its Special Defense is notably weak.\n\n#### 2. **Behavior**\nRhyhorn is a straightforward and somewhat reckless Pok\u00e9mon. It is known for charging at anything it sees, often without thinking. This behavior stems from its instinctive nature and its reliance on brute strength rather than strategy. However, Rhyhorn's intelligence is relatively low, and it has difficulty remembering things for long periods. For example, it may forget why it started charging in the first place.\n\nDespite its aggressive tendencies, Rhyhorn is not inherently malicious. It is simply driven by its instincts and its natural affinity for physical combat. In the wild, Rhyhorn is often seen roaming rocky terrains, deserts, and mountainous regions, where it can use its sturdy body to navigate harsh environments.\n\n#### 3. **Pokedex Description**\nRhyhorn's Pok\u00e9dex entries across various games emphasize its physical power, durability, and lack of intelligence. Here are some notable examples:\n\n- **Red/Blue:** \"Its massive bones are 1,000 times harder than human bones. It can easily knock a trailer flying.\"\n- **Yellow:** \"A Pok\u00e9mon with a one-track mind. Once it charges, it won\u2019t stop running until it falls asleep.\"\n- **Gold:** \"It is inept at turning because of its four short legs. It can only charge and run in one direction.\"\n- **Ruby/Sapphire:** \"Rhyhorn runs in a straight line, smashing everything in its path. It is not bothered even if it rushes headlong into a block of steel. This Pok\u00e9mon may feel some pain from the collision the next day, however.\"\n\nThese entries highlight Rhyhorn's incredible physical strength and endurance, as well as its somewhat clumsy and single-minded nature.\n\n#### 4. **Notable Trainers Who Own Them**\nRhyhorn has been owned by several notable trainers in the Pok\u00e9mon franchise, including:\n\n- **Blaine (Gym Leader):** In the Pok\u00e9mon Adventures manga, Blaine is shown using a Rhyhorn during battles.\n- **Kiawe (Anime):** In the Pok\u00e9mon Sun and Moon anime, Kiawe's family owns a Rhyhorn that is used for farm work.\n- **Serena (Anime):** In the Pok\u00e9mon X and Y anime, Serena is shown to have experience riding Rhyhorn, as her mother is a famous Rhyhorn racer. While Serena does not own a Rhyhorn herself, her connection to Rhyhorn racing is a significant part of her backstory.\n\n#### 5. **Famous Scene in the Pok\u00e9mon Franchise**\nOne of the most memorable scenes involving Rhyhorn occurs in the **Pok\u00e9mon X and Y anime**, where Serena is shown practicing Rhyhorn racing. This scene is significant because it highlights Serena's upbringing and her initial struggles to find her own path in life. Her mother, Grace, is a renowned Rhyhorn racer, and Serena is expected to follow in her footsteps. However, Serena ultimately decides to pursue her own dreams, which sets her on the path to becoming a Pok\u00e9mon Performer.\n\nAnother notable moment is in the **Pok\u00e9mon Adventures manga**, where Rhyhorn is used in intense battles, showcasing its raw power and durability.<br>

### KeywordTableIndex
- https://docs.llamaindex.ai/en/stable/examples/query_engine/CustomRetrievers/

In [37]:
from llama_index.core import SimpleKeywordTableIndex, VectorStoreIndex


faiss_index = faiss.IndexFlatL2(d)
vector_store = FaissVectorStore(faiss_index=faiss_index).from_persist_dir(persist_dir=VECTOR_STORE_LOC)
storage_context = StorageContext.from_defaults(vector_store=vector_store, persist_dir=VECTOR_STORE_LOC)
index = load_index_from_storage(storage_context=storage_context, embed_model=embed_model, callback_manager=CallbackManager([token_counter, llama_debug]), transformations=[SentenceSplitter(chunk_size=1024, chunk_overlap=20)] )


keyword_index = SimpleKeywordTableIndex(list(index.docstore.docs.values()), storage_context=storage_context, llm=llm)


INFO:root:Loading llama_index.vector_stores.faiss.base from ./storage_row_node/default__vector_store.json.
Loading llama_index.vector_stores.faiss.base from ./storage_row_node/default__vector_store.json.
DEBUG:llama_index.core.storage.kvstore.simple_kvstore:Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage_row_node/docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage_row_node/docstore.json.
DEBUG:fsspec.local:open file: /Users/gsh/Documents/2 - Codes/All-About-LLM/storage_row_node/docstore.json
open file: /Users/gsh/Documents/2 - Codes/All-About-LLM/storage_row_node/docstore.json
DEBUG:llama_index.core.storage.kvstore.simple_kvstore:Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage_row_node/index_store.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage_row_node/index_store.json.
DEBUG:fsspec.local:open file: /Users/gsh/Documents/2 - Codes/All-About-LLM/storage_row_node/index_store.json


In [52]:
# import QueryBundle
from llama_index.core import QueryBundle

# import NodeWithScore
# from llama_index.core.schema import NodeWithScore

# Retrievers
from llama_index.core.retrievers import (
    # BaseRetriever,
    # VectorIndexRetriever,
    KeywordTableSimpleRetriever,
)

from typing import List

keyword_retriever = KeywordTableSimpleRetriever(index=keyword_index, llm=llm, callback_manager=CallbackManager([token_counter, llama_debug]))
keyword_retriever.retrieve(QueryBundle("However, it can be reckless and overconfident, especially when it underestimates its opponent"))

INFO:llama_index.core.indices.keyword_table.retrievers:> Starting query: However, it can be reckless and overconfident, especially when it underestimates its opponent
> Starting query: However, it can be reckless and overconfident, especially when it underestimates its opponent
INFO:llama_index.core.indices.keyword_table.retrievers:query keywords: ['opponent', 'reckless', 'overconfident', 'underestimates', 'especially', 'however']
query keywords: ['opponent', 'reckless', 'overconfident', 'underestimates', 'especially', 'however']
INFO:llama_index.core.indices.keyword_table.retrievers:> Extracted keywords: ['opponent', 'however']
> Extracted keywords: ['opponent', 'however']
DEBUG:llama_index.core.indices.keyword_table.retrievers:> Querying with idx: b16eb1c4-9cb1-4c59-ab3a-9a0c47ec679d: Clair's Kingdra is infamous among players for i...
> Querying with idx: b16eb1c4-9cb1-4c59-ab3a-9a0c47ec679d: Clair's Kingdra is infamous among players for i...
DEBUG:llama_index.core.indices.keyword_ta

[NodeWithScore(node=TextNode(id_='b16eb1c4-9cb1-4c59-ab3a-9a0c47ec679d', embedding=None, metadata={'name': 'Kingdra', 'generation': 2, 'type1': 'water', 'type2': 'dragon', 'is_legendary': 0}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='Kingdra', node_type='4', metadata={'name': 'Kingdra', 'generation': 2, 'type1': 'water', 'type2': 'dragon', 'is_legendary': 0}, hash='e40c0fc668ea79f909755d8a3e08583f5a8dfe45f677f37c0e8d5c4ddfa1e350'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='358579a9-0fa1-4184-8250-02286b5180ff', node_type='1', metadata={'name': 'Kingdra', 'generation': 2, 'type1': 'water', 'type2': 'dragon', 'is_legendary': 0}, hash='0a21006532b8d340309f71b2306086bfaeaf512b9b52ec6b459bbd343e8f4b2c')}, metadata_template='{key}=>{value}', metadata_separator='::', text='Clair\'s Kingdra is infamous among players for its high stats and lack of weaknesses in Generation II, as the on

In [55]:
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
keyword_query_engine = RetrieverQueryEngine(
    retriever=keyword_retriever,
    response_synthesizer=get_response_synthesizer(llm=llm, callback_manager=CallbackManager([token_counter, llama_debug])), 
    callback_manager=CallbackManager([token_counter, llama_debug])
)
response = keyword_query_engine.query(
    "However, it can be reckless and overconfident, especially when it underestimates its opponent"
)
print(response.source_nodes)
print(str(response))
get_token_info(True)
get_token_info2(True)

INFO:llama_index.core.indices.keyword_table.retrievers:> Starting query: However, it can be reckless and overconfident, especially when it underestimates its opponent
> Starting query: However, it can be reckless and overconfident, especially when it underestimates its opponent
INFO:llama_index.core.indices.keyword_table.retrievers:query keywords: ['opponent', 'reckless', 'overconfident', 'underestimates', 'especially', 'however']
query keywords: ['opponent', 'reckless', 'overconfident', 'underestimates', 'especially', 'however']
INFO:llama_index.core.indices.keyword_table.retrievers:> Extracted keywords: ['opponent', 'however']
> Extracted keywords: ['opponent', 'however']
DEBUG:llama_index.core.indices.keyword_table.retrievers:> Querying with idx: b16eb1c4-9cb1-4c59-ab3a-9a0c47ec679d: Clair's Kingdra is infamous among players for i...
> Querying with idx: b16eb1c4-9cb1-4c59-ab3a-9a0c47ec679d: Clair's Kingdra is infamous among players for i...
DEBUG:llama_index.core.indices.keyword_ta

In [49]:
[x.get_content() for x in response.source_nodes]

['Clair\'s Kingdra is infamous among players for its high stats and lack of weaknesses in Generation II, as the only Dragon-type moves available at the time were Dragon Rage and Twister, which were not very effective. This made Kingdra a formidable opponent and a defining moment for many trainers in their journey through Johto.\\n\\nIn the Pok\\u00e9mon anime, Kingdra makes a notable appearance during Ash\'s battle against Clair. Clair\'s Kingdra demonstrates its strength and strategy, using moves like *Smokescreen* to lower visibility and *Hydro Pump* to deal massive damage. The battle showcases Kingdra\'s ability to dominate in both offense and defense, solidifying its reputation as a powerful Dragon Pok\\u00e9mon.\\n\\n---\\n\\n### Summary\\nKingdra is a majestic and powerful Pok\\u00e9mon that embodies the mystery and strength of the ocean. With its unique Water/Dragon typing, it is a versatile and challenging opponent in battles. Its association with notable trainers like Clair an

## Test RAG Techniques
- https://docs.llamaindex.ai/en/stable/examples/vector_stores/AzureAISearchIndexDemo/#query-mode
- https://docs.llamaindex.ai/en/stable/examples/retrievers/bm25_retriever/
- https://docs.llamaindex.ai/en/stable/examples/query_engine/CustomRetrievers/

In [None]:

x = list(map(lambda x: x.text, index.docstore.get_nodes(['96646498-32a9-411c-aed8-c5e95bdd4c33','c46a0b71-b28c-4d35-8e96-4d8ff19fa33f'])) )
print(x[0][-25:])
print(x[1][:50])


e Pok\u00e9mon franchise.
One of the most famous scenes involving Bulbasaur 


In [53]:
index.docstore.get_node('c46a0b71-b28c-4d35-8e96-4d8ff19fa33f').get_content(MetadataMode.ALL)

'Metadata: name=>Bulbasaur::generation=>1::type1=>grass::type2=>poison::is_legendary=>0\n-----\nContent: One of the most famous scenes involving Bulbasaur is from the **original Pok\\u00e9mon anime series**, specifically in the episode titled *\\"Bulbasaur and the Hidden Village\\"*.\\n\\n- **Episode Summary**: In this episode, Ash and his friends encounter a wild Bulbasaur that is fiercely protective of an injured Pok\\u00e9mon sanctuary in the Hidden Village. Bulbasaur initially challenges Ash to a battle, showcasing its strength and determination. After Ash defeats it in battle, Bulbasaur agrees to join his team. This episode highlights Bulbasaur\'s loyalty and protective nature, traits that remain consistent throughout its time with Ash.\\n\\nAnother iconic moment is in the **Pok\\u00e9mon movie \\"Mewtwo Strikes Back\\"**, where Ash\'s Bulbasaur fights valiantly against Mewtwo\'s cloned Bulbasaur. This scene emphasizes Bulbasaur\'s courage and fighting spirit, even when facing ove

## RAG Solution
- Hybrid search with agentic flow
- https://docs.llamaindex.ai/en/stable/examples/index_structs/struct_indices/SQLIndexDemo/#query-index
- https://docs.llamaindex.ai/en/stable/examples/query_engine/SQLRouterQueryEngine/
- https://docs.llamaindex.ai/en/stable/examples/query_engine/recursive_retriever_agents/

In [15]:
# import QueryBundle
from llama_index.core import QueryBundle

# import NodeWithScore
from llama_index.core.schema import NodeWithScore

# Retrievers
from llama_index.core.retrievers import (
    BaseRetriever,
    VectorIndexRetriever
)

from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.retrievers.bm25 import BM25Retriever

from llama_index.core.agent.workflow import FunctionAgent, ReActAgent
from llama_index.core.tools import FunctionTool
from llama_index.core.agent.workflow import AgentStream, ToolCallResult

import Stemmer
from typing import List

In [16]:
get_token_info(True)
get_token_info2(True)

Embedding Tokens:  0 
 LLM Prompt Tokens:  0 
 LLM Completion Tokens:  0 
 Total LLM Token Count:  0 

Embedding Tokens:  0 
 LLM Prompt Tokens:  0 
 LLM Completion Tokens:  0 
 Total LLM Token Count:  0 



In [17]:
llm_callback_manager = CallbackManager([token_counter, llama_debug])
embdding_callback_manager = CallbackManager([token_counter2, llama_debug])

faiss_index = faiss.IndexFlatL2(d)
vector_store = FaissVectorStore(faiss_index=faiss_index).from_persist_dir(persist_dir=VECTOR_STORE_LOC)
storage_context = StorageContext.from_defaults(vector_store=vector_store, persist_dir=VECTOR_STORE_LOC)
# index = load_index_from_storage(storage_context=storage_context, embed_model=embed_model, callback_manager=CallbackManager([token_counter, llama_debug]) )
index = load_index_from_storage(storage_context=storage_context, embed_model=embed_model, callback_manager=llm_callback_manager, transformations=[SentenceSplitter(chunk_size=1024, chunk_overlap=20)] )

**********
Trace: index_construction
**********


In [19]:
test = index.as_query_engine(llm=llm)
test.query("Give me all description of Charizard in rag.")

Response(response='Charizard\'s descriptions in the Pokédex emphasize its fiery breath and immense power. Here are the notable entries:\n\n- **Pokémon Red/Blue**: "Spits fire that is hot enough to melt boulders. Known to cause forest fires unintentionally."\n- **Pokémon Gold**: "If Charizard becomes furious, the flame at the tip of its tail flares up in a whitish-blue color."\n- **Pokémon Sword**: "It spits fire that is hot enough to melt boulders. It may cause forest fires by blowing flames."\n- **Pokémon Shield**: "The flame inside its body burns hotter than 3,600 degrees Fahrenheit. When Charizard roars, that temperature climbs even higher."', source_nodes=[NodeWithScore(node=TextNode(id_='224a4daf-c96a-452a-a81d-c6e728a5a5e9', embedding=None, metadata={'name': 'Charizard', 'generation': 1, 'type1': 'fire', 'type2': 'flying', 'is_legendary': 0}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='Cha

In [51]:
similarity_top_k = 20
vector_retriever = index.as_retriever(similarity_top_k=similarity_top_k)

docstore = SimpleDocumentStore.from_persist_dir(persist_dir=VECTOR_STORE_LOC)
bm25_retriever = BM25Retriever.from_defaults(
    docstore=docstore,
    similarity_top_k=similarity_top_k,
    # Optional: We can pass in the stemmer and set the language for stopwords
    # This is important for removing stopwords and stemming the query + text
    # The default is english for both
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)

In [52]:
query_text = "reckless and overconfident, especially when it underestimates its opponent"
response1 = vector_retriever.retrieve(query_text)
print(str(response1))
get_token_info(True)
get_token_info2(True)
response2 = bm25_retriever.retrieve(query_text)
print(str(response2))
get_token_info(True)
get_token_info2(True)

**********
Trace: query
    |_CBEventType.RETRIEVE -> 1.04649 seconds
      |_CBEventType.EMBEDDING -> 1.043023 seconds
**********
[NodeWithScore(node=TextNode(id_='313e0f97-7ae4-4759-bf2c-3924840df5f7', embedding=None, metadata={'name': 'Rampardos', 'generation': 4, 'type1': 'rock', 'type2': None, 'is_legendary': 0}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='Rampardos', node_type='4', metadata={'name': 'Rampardos', 'generation': 4, 'type1': 'rock', 'type2': None, 'is_legendary': 0}, hash='0bd77c76361ab430c630323824d81a72c662b2b67a3270f52cbbd254949d8cf6'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='e22c01e1-6c44-4815-bcc9-9beb9596f4dd', node_type='1', metadata={'name': 'Rampardos', 'generation': 4, 'type1': 'rock', 'type2': None, 'is_legendary': 0}, hash='a9ce372fa312e0c80b9de6960b11bd669f2f6ecc944fce8a1c1d4f4e5353d1e2')}, metadata_template='{key}=>{value}', metadata_separator=

In [53]:
class CustomRetriever(BaseRetriever):
    """Custom retriever that performs both semantic search and hybrid search."""

    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        keyword_retriever: BM25Retriever,
        mode: str = "AND",
        similarity_top_k=3
    ) -> None:
        """Init params."""

        self._vector_retriever = vector_retriever
        self._keyword_retriever = keyword_retriever
        if mode not in ("AND", "OR"):
            raise ValueError("Invalid mode.")
        self._mode = mode
        self._similarity_top_k=similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes given query."""

        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        keyword_nodes = self._keyword_retriever.retrieve(query_bundle)
        print(vector_nodes)
        print(keyword_nodes)
        vector_ids = {n.node.node_id for n in vector_nodes}
        keyword_ids = {n.node.node_id for n in keyword_nodes}

        combined_dict = {n.node.node_id: n for n in vector_nodes}
        combined_dict.update({n.node.node_id: n for n in keyword_nodes})

        if self._mode == "AND":
            retrieve_ids = vector_ids.intersection(keyword_ids)
        else:
            retrieve_ids = vector_ids.union(keyword_ids)

        retrieve_nodes = [combined_dict[rid] for rid in retrieve_ids]
        if len(retrieve_nodes) > self._similarity_top_k:
            retrieve_nodes.sort(key= lambda x: x.get_score(), reverse=True)
            retrieve_nodes = retrieve_nodes[:self._similarity_top_k]
        return retrieve_nodes

In [54]:
custom_retriever = CustomRetriever(vector_retriever, bm25_retriever, mode="OR", similarity_top_k=3)


In [60]:
query_text = "reckless and overconfident, especially when it underestimates its opponent"
response = custom_retriever.retrieve(query_text)
print([x.get_score() for x in response])
print([x.metadata["name"] for x in response])
print(str(response))
get_token_info(True)
get_token_info2(True)

[NodeWithScore(node=TextNode(id_='313e0f97-7ae4-4759-bf2c-3924840df5f7', embedding=None, metadata={'name': 'Rampardos', 'generation': 4, 'type1': 'rock', 'type2': None, 'is_legendary': 0}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='Rampardos', node_type='4', metadata={'name': 'Rampardos', 'generation': 4, 'type1': 'rock', 'type2': None, 'is_legendary': 0}, hash='0bd77c76361ab430c630323824d81a72c662b2b67a3270f52cbbd254949d8cf6'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='e22c01e1-6c44-4815-bcc9-9beb9596f4dd', node_type='1', metadata={'name': 'Rampardos', 'generation': 4, 'type1': 'rock', 'type2': None, 'is_legendary': 0}, hash='a9ce372fa312e0c80b9de6960b11bd669f2f6ecc944fce8a1c1d4f4e5353d1e2')}, metadata_template='{key}=>{value}', metadata_separator='::', text='However, Ash\'s Pikachu and Turtwig manage to outmaneuver it, showcasing the importance of strategy over brute strength

In [65]:
query = "What are some pokemon which can be reckless and overconfident, especially when it underestimates its opponent? Do not assume. What about charilizard? Response format should be as follows: ##### Response #####\n\n <response from llm> ##### RAG #####\n\n <Show all output retrieve from rag>"
# query = "Give top 10 pokemon which is 'reckless and overconfident, especially when it underestimates its opponent'? Match the description as close as possible and do not assume! "
# query = "Is Dratini a legendery pokemon? Give me evidence and do not assume."

response_synthesizer = get_response_synthesizer(response_mode="compact", llm=llm, callback_manager=llm_callback_manager, verbose=True ) # refine , context_only, accumulate, compact (default)
query_engine = index.as_query_engine(response_synthesizer=response_synthesizer, llm=llm)
response = query_engine.query(query)
print(str(response))
get_token_info(True)

##### Response #####

Charizard is known to be a Pokémon that can be reckless and overconfident, especially when it underestimates its opponent. Its fiery temperament and pride often lead to such behavior, although it also displays a sense of honor by not using its fire breath on weaker opponents.

##### RAG #####

Charizard is known for its fiery temperament and pride. It is a powerful and competitive Pokémon that loves to battle strong opponents. However, it can be reckless and overconfident, especially when it underestimates its opponent. Charizard’s flame burns hotter when it is excited or angry, and it is said to never use its fire breath on weaker opponents, showing a sense of honor.
Embedding Tokens:  124 
 LLM Prompt Tokens:  3420 
 LLM Completion Tokens:  242 
 Total LLM Token Count:  3662 



In [67]:
# First create a tool for the agent
from llama_index.core.tools import QueryEngineTool

tool = QueryEngineTool.from_defaults(
    query_engine=query_engine,
    name="PokemonQueryEngine",
    description="Given a question about Pokemon, will return an answer.",
)
SYSTEM_PROMPT = "You are a Pokemon expert. Answer questions related to Pokemon using only PokemonQueryEngine. Return 'Unable to find' if no result can be found in PokemonQueryEngine. Do not assume."
f_agent = FunctionAgent(
    tools=[tool],
    llm=llm,
    system_prompt=SYSTEM_PROMPT
)

In [68]:
handler = f_agent.run(query)
async for ev in handler.stream_events():
    if isinstance(ev, AgentStream):
        print(f"{ev.delta}", end="", flush=True)
    elif isinstance(ev, ToolCallResult):
        print(
            f"\nCall {ev.tool_name} with {ev.tool_kwargs}\nReturned: {ev.tool_output}"
        )
response = await handler

print("\n\n")
get_token_info(True)
print(response.model_dump_json())
print(list(map(lambda x: [x.tool_name, x.tool_output.raw_output, x.tool_kwargs],response.tool_calls)))
print(str(response))

**********
Trace: chat
    |_CBEventType.LLM -> 0.0 seconds
**********

Call PokemonQueryEngine with {'input': 'Is Charizard known to be reckless and overconfident, especially when it underestimates its opponent?'}
Returned: Yes, Charizard is known to be reckless and overconfident, especially when it underestimates its opponent. This behavior is part of its fiery temperament and competitive nature.

Call PokemonQueryEngine with {'input': 'What are some Pokémon which can be reckless and overconfident, especially when it underestimates its opponent?'}
Returned: Rampardos is a Pokémon that can be reckless and overconfident, especially due to its raw power and high Attack stat. Its role as a challenging Gym Leader Pokémon highlights its tendency to rely on brute strength, which can lead to underestimating opponents who use strategy and agility to outmaneuver it.
**********
Trace: chat
    |_CBEventType.LLM -> 0.0 seconds
**********
##### Response #####

Rampardos is a Pokémon that can be r

In [86]:
query = "What are some pokemon which can be reckless and overconfident, especially when it underestimates its opponent? Do not assume. What about charilizard?"
query = "Give top 10 pokemon which is 'reckless and overconfident, especially when it underestimates its opponent'? Match the description as close as possible and do not assume! "
query = "Is Dratini a legendery pokemon? Give me evidence and do not assume."

response_synthesizer = get_response_synthesizer(response_mode="compact", llm=llm, callback_manager=CallbackManager([llama_debug]), verbose=True ) # refine , context_only, accumulate, compact (default)
query_engine = index.as_query_engine(response_synthesizer=response_synthesizer, llm=llm )
response = query_engine.query(query)
str(response)

'No, Dratini is not a legendary Pokémon. Evidence for this includes its classification in the Pokémon games and media, where it is not designated as legendary. Additionally, its evolution line (Dratini → Dragonair → Dragonite) is a standard progression, unlike legendary Pokémon, which typically do not evolve. Furthermore, Dratini\'s rarity is highlighted in Pokédex entries, but rarity alone does not make a Pokémon legendary. Its "is_legendary" status is explicitly marked as 0, confirming it is not legendary.'