# Improving Retrieval Performance by Reranker models

This notebook showcases how to do a two-stage pass for retrieval. Use `embedding-based` retrieval with a high `top-k` value
in order to maximize recall and get a large set of candidate items. Then, use `LLM-based` retrieval
to dynamically select the nodes that are actually relevant to the query.

In [1]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!pip install cohere

Collecting cohere
  Downloading cohere-5.5.8-py3-none-any.whl (173 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.8/173.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3<2.0.0,>=1.34.0 (from cohere)
  Downloading boto3-1.34.143-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastavro<2.0.0,>=1.9.4 (from cohere)
  Downloading fastavro-1.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.21.2 (from cohere)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx-sse<0.5.0,>=0.4.0 (from cohere)
  Downloading httpx_sse-0.4.0-py3-no

In [1]:
!pip install -qqq llama-index llama-hub cohere langchain openai accelerate==0.21.0 bitsandbytes==0.40.2 transformers sentence_transformers InstructorEmbedding

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.9/103.9 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.8/173.8 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.6/983.6 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.5/328.5 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━

In [2]:
!pip install langchain sentence-transformers chromadb langchainhub

Collecting chromadb
  Downloading chromadb-0.5.4-py3-none-any.whl (581 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m581.4/581.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchainhub
  Downloading langchainhub-0.1.20-py3-none-any.whl (5.0 kB)
Collecting chroma-hnswlib==0.7.5 (from chromadb)
  Downloading chroma_hnswlib-0.7.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)
  Downloading uvicorn-0.30.1-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m7.2 MB/s[0m eta 

In [6]:

!pip install langchain_community


Collecting langchain_community
  Downloading langchain_community-0.2.7-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: langchain_community
Successfully installed langchain_community-0.2.7


In [9]:
import nest_asyncio
nest_asyncio.apply()

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

import json
import torch
from pathlib import Path
import pandas as pd
# pd.set_option("display.max_colwidth", -1)

from copy import deepcopy

# transformers
from transformers import BitsAndBytesConfig

# llama_index
from llama_index.core.prompts import PromptTemplate
from llama_index.core import download_loader, Document, VectorStoreIndex, ServiceContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.postprocessor import LLMRerank
from langchain.embeddings import HuggingFaceInstructEmbeddings
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.core.query_engine import RetrieverQueryEngine
from IPython.display import Markdown, display, HTML
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.schema import QueryBundle
from llama_index.core.indices.postprocessor import SentenceTransformerRerank


In [11]:

!pip install -q cohere llama-index-postprocessor-cohere-rerank


https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/SentenceTransformerRerank/

In [10]:

# Re-rank
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.core.postprocessor import SentenceTransformerRerank




In [37]:
!pip install llama-index-llms-huggingface

Collecting llama-index-llms-huggingface
  Downloading llama_index_llms_huggingface-0.2.4-py3-none-any.whl (11 kB)
Collecting text-generation<0.8.0,>=0.7.0 (from llama-index-llms-huggingface)
  Downloading text_generation-0.7.0-py3-none-any.whl (12 kB)
Installing collected packages: text-generation, llama-index-llms-huggingface
Successfully installed llama-index-llms-huggingface-0.2.4 text-generation-0.7.0


In [11]:
from llama_index.llms.huggingface import HuggingFaceLLM




# Setup

1. In this section we will work with the QLoRA paper and create an initial set of nodes (chunk size 512).
2. We will use Open Source LLM [`zephyr-7b-alpha`](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) and embedding [`hkunlp/instructor-large`](https://huggingface.co/hkunlp/instructor-large)

# Load Data

In [18]:

from google.colab import drive
drive.mount("/LLM")


Mounted at /LLM


In [12]:
PDFReader = download_loader("PDFReader")
loader = PDFReader()
docs = loader.load_data(file=Path("/LLM/MyDrive/RAG/Qlora.pdf"))

  PDFReader = download_loader("PDFReader")


In [13]:
from llama_index.core.node_parser import SimpleNodeParser
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
nodes = node_parser.get_nodes_from_documents(docs)

In [30]:
nodes[0]

TextNode(id_='f30cd62a-c9fb-435d-a80b-e3674411ab00', embedding=None, metadata={'page_label': '1', 'file_name': 'Qlora.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='124f53e8-af63-438a-b5d5-95b83a89fd2d', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'Qlora.pdf'}, hash='ccf1895a3cd48cab967775a8b1b159b39eceb0307b2e3afed5be7e80da928c33'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='2ffad800-6a1d-48ac-a8b9-b73d17b1a3be', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='5fb03aa5eedbf5806ef0619366f82d0880ac1d1e38c70b16a44372fa860f38a4')}, text='QL ORA: Efficient Finetuning of Quantized LLMs\nTim Dettmers∗Artidoro Pagnoni∗Ari Holtzman\nLuke Zettlemoyer\nUniversity of Washington\n{dettmers,artidoro,ahai,lsz}@cs.washington.edu\nAbstract\nWe present QLORA, an efficient finetuning approach that reduces memory us-\nage enough to finetune a 65B parameter m

In [14]:
len(nodes)

78

# Models

## LLM (`zephyr-7b-alpha`)

In [16]:

# import os
# from huggingface_hub import login
# access_token='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'
# HUGGINGFACE_TOKEN = os.environ.get(access_token)
# login(token=HUGGINGFACE_TOKEN)


In [17]:

import cohere
import os
import getpass
os.environ["TrpJilIKyuyf1jFx9ju9rtj6XDsaU687CcUODNzW"] = os.getenv("COHERE_API_KEY") or getpass.getpass()
# init client
cohere_api_key = cohere.Client(os.environ["TrpJilIKyuyf1jFx9ju9rtj6XDsaU687CcUODNzW"])


··········


In [18]:
from google.colab import userdata

# huggingface and cohere api token
# hf_token = userdata.get('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')
# cohere_api_key = userdata.get('TrpJilIKyuyf1jFx9ju9rtj6XDsaU687CcUODNzW')

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)


def messages_to_prompt(messages):
  prompt = ""
  for message in messages:
    if message.role == 'system':
      prompt += f"<|system|>\n{message.content}</s>\n"
    elif message.role == 'user':
      prompt += f"<|user|>\n{message.content}</s>\n"
    elif message.role == 'assistant':
      prompt += f"<|assistant|>\n{message.content}</s>\n"

  # ensure we start with a system prompt, insert blank if needed
  if not prompt.startswith("<|system|>\n"):
    prompt = "<|system|>\n</s>\n" + prompt

  # add final assistant prompt
  prompt = prompt + "<|assistant|>\n"

  return prompt


llm = HuggingFaceLLM(
    model_name="HuggingFaceH4/zephyr-7b-alpha",
    tokenizer_name="HuggingFaceH4/zephyr-7b-alpha",
    query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
    context_window=3900,
    max_new_tokens=256,
    model_kwargs={"quantization_config": quantization_config},
    # tokenizer_kwargs={},
    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
    messages_to_prompt=messages_to_prompt,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


## Embedding (`hkunlp/instructor-large`)

https://github.com/PromtEngineer/localGPT/issues/722

In [1]:
!pip install sentence-transformers==2.2.2




In [7]:
# from InstructorEmbedding import INSTRUCTOR
# model = INSTRUCTOR('hkunlp/instructor-large')
# sentence = "3D ActionSLAM: wearable person tracking in multi-floor environments"
# instruction = "Represent the Science title:"
# embeddings = model.encode([[instruction,sentence]])
# print(embeddings)


In [19]:
# DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
embed_model = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large"
)

load INSTRUCTOR_Transformer
max_seq_length  512


In [21]:
!pip install llama-index-embeddings-langchain

Collecting llama-index-embeddings-langchain
  Downloading llama_index_embeddings_langchain-0.1.2-py3-none-any.whl (2.5 kB)
Installing collected packages: llama-index-embeddings-langchain
Successfully installed llama-index-embeddings-langchain-0.1.2


## Configure Index and Retriever

In [22]:
# ServiceContext
service_context = ServiceContext.from_defaults(llm=llm,
                                               embed_model=embed_model
                                               )

# index
vector_index = VectorStoreIndex(
    nodes, service_context=service_context
)

# configure retriever
retriever = VectorIndexRetriever(
    index=vector_index,
    similarity_top_k=10,
    service_context=service_context)

  service_context = ServiceContext.from_defaults(llm=llm,


## Initialize Re-rankers

In [25]:

# cohere_api_key = userdata.get('TrpJilIKyuyf1jFx9ju9rtj6XDsaU687CcUODNzW')
# Define all embeddings and rerankers
RERANKERS = {
    "WithoutReranker": "None",
    "CohereRerank": CohereRerank(api_key="TrpJilIKyuyf1jFx9ju9rtj6XDsaU687CcUODNzW", top_n=5),
    "bge-reranker-base": SentenceTransformerRerank(model="BAAI/bge-reranker-base", top_n=5),
    "bge-reranker-large": SentenceTransformerRerank(model="BAAI/bge-reranker-large", top_n=5)
}



config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

## Retrieval Comparisons

In [26]:
def get_retrieved_nodes(
    query_str, reranker
):
    query_bundle = QueryBundle(query_str)

    retrieved_nodes = retriever.retrieve(query_bundle)

    if reranker != "None":
      retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
    else:
        retrieved_nodes

    return retrieved_nodes


def pretty_print(df):
    return display(HTML(df.to_html().replace("\\n", "<br>")))


def visualize_retrieved_nodes(nodes) -> None:
    result_dicts = []
    for node in nodes:
        node = deepcopy(node)
        node.node.metadata = None
        node_text = node.node.get_text()
        node_text = node_text.replace("\n", " ")

        result_dict = {"Score": node.score, "Text": node_text}
        result_dicts.append(result_dict)

    pretty_print(pd.DataFrame(result_dicts))

In [None]:
RERANKERS.items()

dict_items([('WithoutReranker', 'None'), ('CohereRerank', CohereRerank(callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7fbeacade170>, model='rerank-english-v2.0', top_n=5)), ('bge-reranker-base', SentenceTransformerRerank(callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7fbea5994a60>, model='BAAI/bge-reranker-base', top_n=5)), ('bge-reranker-large', SentenceTransformerRerank(callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7fbea59ee740>, model='BAAI/bge-reranker-large', top_n=5))])

In [48]:
# query_str = "QLoRA?"

# # Loop over rerankers
# for rerank_name, reranker in RERANKERS.items():
#     print(f"Running Evaluation for Reranker: {rerank_name}")

#     query_bundle = QueryBundle(query_str)

#     retrieved_nodes = retriever.retrieve(query_bundle)

#     if reranker != "None":
#       retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
#     else:
#         retrieved_nodes

#     print(f"Visualize Retrieved Nodes for Reranker: {rerank_name}")
#     print("retrieved_nodes : ",retrieved_nodes)
#     visualize_retrieved_nodes(retrieved_nodes)


In [49]:
query_str = "What are the top features of QLoRA?"

# Function to validate and clean nodes
def clean_nodes(nodes):
    cleaned_nodes = []
    for node in nodes:
        if node.metadata is None:
            node.metadata = {}
        cleaned_nodes.append(node)
    return cleaned_nodes

# Loop over rerankers
for rerank_name, reranker in RERANKERS.items():
    print(f"Running Evaluation for Reranker: {rerank_name}")

    query_bundle = QueryBundle(query_str)

    retrieved_nodes = retriever.retrieve(query_bundle)

    # Validate and clean retrieved nodes
    retrieved_nodes = clean_nodes(retrieved_nodes)

    if reranker != "None":
        retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
        # Clean nodes again after reranking
        retrieved_nodes = clean_nodes(retrieved_nodes)
    else:
        retrieved_nodes = clean_nodes(retrieved_nodes)

    print(f"Visualize Retrieved Nodes for Reranker: {rerank_name}")
    for idx, node in enumerate(retrieved_nodes):
        print(f"Node {idx}: {node}")
    # visualize_retrieved_nodes(retrieved_nodes)


Running Evaluation for Reranker: WithoutReranker
Visualize Retrieved Nodes for Reranker: WithoutReranker
Node 0: Node ID: 3307edf4-dc70-416a-ab5d-a89f195828d9
Text: QL ORA.Using the components described above, we define QLORAfor
a single linear layer in the quantized base model with a single LoRA
adapter as follows: YBF16=XBF16doubleDequant (cFP32 1, ck-bit 2,WNF4)
+XBF16LBF16 1LBF16 2, (5) where doubleDequant (·)is defined as:
doubleDequant (cFP32 1, ck-bit 2,Wk-bit) =dequant (dequant (cFP32 1,
ck-bit 2),W...
Score:  0.874

Node 1: Node ID: 9e47c8b4-4807-4957-9d48-f30bb5574c11
Text: QLORAhas one low-precision storage data type, in our case
usually 4-bit, and one computation data type that is usually BFloat16.
In practice, this means whenever a QLORAweight tensor is used, we
dequantize the tensor to BFloat16, and then perform a matrix
multiplication in 16-bit. We now discuss the components of QL ORA
followed by a formal defi...
Score:  0.873

Node 2: Node ID: 95e0b257-01d5-4ef2-a331-a9

In [50]:
query_str = "What are the benefits of using QLOra?"

# Function to validate and clean nodes
def clean_nodes(nodes):
    cleaned_nodes = []
    for node in nodes:
        if node.metadata is None:
            node.metadata = {}
        cleaned_nodes.append(node)
    return cleaned_nodes

# Loop over rerankers
for rerank_name, reranker in RERANKERS.items():
    print(f"Running Evaluation for Reranker: {rerank_name}")

    query_bundle = QueryBundle(query_str)

    retrieved_nodes = retriever.retrieve(query_bundle)

    # Validate and clean retrieved nodes
    retrieved_nodes = clean_nodes(retrieved_nodes)

    if reranker != "None":
        retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
        # Clean nodes again after reranking
        retrieved_nodes = clean_nodes(retrieved_nodes)
    else:
        retrieved_nodes = clean_nodes(retrieved_nodes)

    print(f"Visualize Retrieved Nodes for Reranker: {rerank_name}")
    for idx, node in enumerate(retrieved_nodes):
        print(f"Node {idx}: {node}")
    # visualize_retrieved_nodes(retrieved_nodes)


Running Evaluation for Reranker: WithoutReranker
Visualize Retrieved Nodes for Reranker: WithoutReranker
Node 0: Node ID: 95e0b257-01d5-4ef2-a331-a9a04a229867
Text: This second step yields the quantized quantization constants
cFP8 2and the second level of quantization constants cFP32 1. We use
8-bit Floats with a blocksize of 256 for the second quantization as no
performance degradation is observed for 8-bit quantization, in line
with results from Dettmers and Zettlemoyer [13]. Since the cFP32 2are
positive...
Score:  0.874

Node 1: Node ID: 3307edf4-dc70-416a-ab5d-a89f195828d9
Text: QL ORA.Using the components described above, we define QLORAfor
a single linear layer in the quantized base model with a single LoRA
adapter as follows: YBF16=XBF16doubleDequant (cFP32 1, ck-bit 2,WNF4)
+XBF16LBF16 1LBF16 2, (5) where doubleDequant (·)is defined as:
doubleDequant (cFP32 1, ck-bit 2,Wk-bit) =dequant (dequant (cFP32 1,
ck-bit 2),W...
Score:  0.874

Node 2: Node ID: 43662513-790f-4953-aaac-6d

In [51]:
query_str = "What is QLoRA?"

# Function to validate and clean nodes
def clean_nodes(nodes):
    cleaned_nodes = []
    for node in nodes:
        if node.metadata is None:
            node.metadata = {}
        cleaned_nodes.append(node)
    return cleaned_nodes

# Loop over rerankers
for rerank_name, reranker in RERANKERS.items():
    print(f"Running Evaluation for Reranker: {rerank_name}")

    query_bundle = QueryBundle(query_str)

    retrieved_nodes = retriever.retrieve(query_bundle)

    # Validate and clean retrieved nodes
    retrieved_nodes = clean_nodes(retrieved_nodes)

    if reranker != "None":
        retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
        # Clean nodes again after reranking
        retrieved_nodes = clean_nodes(retrieved_nodes)
    else:
        retrieved_nodes = clean_nodes(retrieved_nodes)

    print(f"Visualize Retrieved Nodes for Reranker: {rerank_name}")
    for idx, node in enumerate(retrieved_nodes):
        print(f"Node {idx}: {node}")
    # visualize_retrieved_nodes(retrieved_nodes)


Running Evaluation for Reranker: WithoutReranker
Visualize Retrieved Nodes for Reranker: WithoutReranker
Node 0: Node ID: 9e47c8b4-4807-4957-9d48-f30bb5574c11
Text: QLORAhas one low-precision storage data type, in our case
usually 4-bit, and one computation data type that is usually BFloat16.
In practice, this means whenever a QLORAweight tensor is used, we
dequantize the tensor to BFloat16, and then perform a matrix
multiplication in 16-bit. We now discuss the components of QL ORA
followed by a formal defi...
Score:  0.884

Node 1: Node ID: 3307edf4-dc70-416a-ab5d-a89f195828d9
Text: QL ORA.Using the components described above, we define QLORAfor
a single linear layer in the quantized base model with a single LoRA
adapter as follows: YBF16=XBF16doubleDequant (cFP32 1, ck-bit 2,WNF4)
+XBF16LBF16 1LBF16 2, (5) where doubleDequant (·)is defined as:
doubleDequant (cFP32 1, ck-bit 2,Wk-bit) =dequant (dequant (cFP32 1,
ck-bit 2),W...
Score:  0.876

Node 2: Node ID: 2576d513-ff13-428f-8a70-eb

In [28]:
query_str = "What are Paged Optimizers?"

results_df = pd.DataFrame()
# Loop over rerankers
for rerank_name, reranker in RERANKERS.items():
    print(f"Running Evaluation for Reranker: {rerank_name}")

    query_bundle = QueryBundle(query_str)

    retrieved_nodes = retriever.retrieve(query_bundle)

    if reranker != "None":
      retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
    else:
        retrieved_nodes

    print(f"Visualize Retrieved Nodes for Reranker: {rerank_name}")
    visualize_retrieved_nodes(retrieved_nodes)


Running Evaluation for Reranker: WithoutReranker
Visualize Retrieved Nodes for Reranker: WithoutReranker


ValidationError: 1 validation error for TextNode
metadata
  none is not an allowed value (type=type_error.none.not_allowed)

# Evaluation

Now, we will use RetrieverEvaluator to evaluate the quality of any Retriever module.

We specify a set of different evaluation metrics: this includes hit-rate and MRR. For any given question, these will compare the quality of retrieved results from the ground-truth context.

To ease the burden of creating the eval dataset in the first place, we can rely on synthetic data generation.

## Build an Evaluation dataset of (query, context) pairs
Here we build a simple evaluation dataset over the existing text corpus.

We use our generate_question_context_pairs to generate a set of (question, context) pairs over a given unstructured text corpus. This uses the LLM to auto-generate questions from each context chunk.

We will use `Zephr-7B` LLM to generate Question-Context Pairs.

We get back a EmbeddingQAFinetuneDataset object. At a high-level this contains a set of ids mapping to queries and relevant doc chunks, as well as the corpus itself.

In [31]:
# Prompt to generate questions
qa_generate_prompt_tmpl = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. The questions should not contain options, not start with Q1/ Q2. \
Restrict the questions to the context information provided.\
"""

In [37]:
# Evaluator
from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.core.evaluation import RetrieverEvaluator

qa_dataset = generate_question_context_pairs(
    nodes, llm=llm, num_questions_per_chunk=2, qa_generate_prompt_tmpl=qa_generate_prompt_tmpl
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|▏         | 1/78 [00:09<12:10,  9.48s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  3%|▎         | 2/78 [00:14<08:54,  7.03s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  4%|▍         | 3/78 [00:22<09:02,  7.24s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  5%|▌         | 4/78 [00:30<09:13,  7.48s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  6%|▋         | 5/78 [00:37<09:03,  7.45s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  8%|▊         | 6/78 [00:44<08:49,  7.36s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  9%|▉         | 7/78 [00:51<08:26,  7.13s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 10%|█         | 8/78 [00:55<07:19,  6.27s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 12%|█▏        | 9/7

In [38]:
len(qa_dataset.corpus.keys())

78

In [47]:
qa_dataset.corpus.keys[0]

TypeError: 'builtin_function_or_method' object is not subscriptable

In [39]:
# Generated 2 questions for this chunk
qa_dataset.queries['297a0d1c-57d6-4b8b-928c-5c234cbcc02d']

KeyError: '297a0d1c-57d6-4b8b-928c-5c234cbcc02d'

In [41]:
# Extract relevant doc for this chunk
qa_dataset.relevant_docs['297a0d1c-57d6-4b8b-928c-5c234cbcc02d']

KeyError: '297a0d1c-57d6-4b8b-928c-5c234cbcc02d'

In [None]:
# Extract corpus for this relevant doc
qa_dataset.corpus['bc19a32b-006e-4ae2-98ea-6a20b7143f3c']

'Glue: A multi-\ntask benchmark and analysis platform for natural language understanding. arXiv preprint\narXiv:1804.07461 , 2018.\n[59] Y . Wang, Y . Kordi, S. Mishra, A. Liu, N. A. Smith, D. Khashabi, and H. Hajishirzi. Self-instruct:\nAligning language model with self generated instructions. arXiv preprint arXiv:2212.10560 ,\n2022.\n[60] Y . Wang, S. Mishra, P. Alipoormolabashi, Y . Kordi, A. Mirzaei, A. Arunkumar, A. Ashok, A. S.\nDhanasekaran, A. Naik, D. Stap, et al. Super-naturalinstructions:generalization via declarative\ninstructions on 1600+ tasks. In EMNLP , 2022.\n[61] Y . Wang, S. Mishra, P. Alipoormolabashi, Y . Kordi, A. Mirzaei, A. Naik, A. Ashok, A. S.\nDhanasekaran, A. Arunkumar, D. Stap, et al. Super-naturalinstructions: Generalization via\ndeclarative instructions on 1600+ nlp tasks. In Proceedings of the 2022 Conference on Empirical\nMethods in Natural Language Processing , pages 5085–5109, 2022.\n[62] J. Wei, M. Bosma, V . Y . Zhao, K. Guu, A. W. Yu, B. Lester, N.

In [40]:
# try it out on a sample query
sample_id, sample_query = list(qa_dataset.queries.items())[1]
sample_expected = qa_dataset.relevant_docs[sample_id]

retriever_evaluator = RetrieverEvaluator.from_metric_names(
        ["mrr", "hit_rate"], retriever=retriever
    )

eval_result = retriever_evaluator.evaluate(sample_query, sample_expected)
print(eval_result)

Query: What are the results of finetuning more than 1,000 models using QLORA, and how do they compare to previous SoTA models in terms of instruction following and chatbot performance? Additionally, what insights does QLORA provide regarding the trustworthiness of current chatbot benchmarks?
Metrics: {'mrr': 0.5, 'hit_rate': 1.0}



## Try it out on an entire dataset

### Define a function to display results

In [42]:
def display_results(reranker_name, eval_results):
    """Display results from evaluate."""

    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    hit_rate = full_df["hit_rate"].mean()
    mrr = full_df["mrr"].mean()

    metric_df = pd.DataFrame({"Reranker": [reranker_name], "hit_rate": [hit_rate], "mrr": [mrr]})

    return metric_df

In [43]:
query_str = "What are the top features of QLoRA?"

results_df = pd.DataFrame()
# Loop over rerankers
for rerank_name, reranker in RERANKERS.items():
    print(f"Running Evaluation for Reranker: {rerank_name}")

    query_bundle = QueryBundle(query_str)

    retrieved_nodes = retriever.retrieve(query_bundle)

    if reranker != "None":
      retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
    else:
        retrieved_nodes

    retriever_evaluator = RetrieverEvaluator.from_metric_names(
        ["mrr", "hit_rate"], retriever=retriever
    )

    eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

    current_df = display_results(rerank_name, eval_results)
    results_df = pd.concat([results_df, current_df], ignore_index=True)

Running Evaluation for Reranker: WithoutReranker
Running Evaluation for Reranker: CohereRerank
Running Evaluation for Reranker: bge-reranker-base
Running Evaluation for Reranker: bge-reranker-large


In [44]:
results_df

Unnamed: 0,Reranker,hit_rate,mrr
0,WithoutReranker,0.916667,0.708794
1,CohereRerank,0.916667,0.708794
2,bge-reranker-base,0.916667,0.708794
3,bge-reranker-large,0.916667,0.708794


# END