# Improving Retrieval Performance by Reranker models

This notebook showcases how to do a two-stage pass for retrieval. Use `embedding-based` retrieval with a high `top-k` value
in order to maximize recall and get a large set of candidate items. Then, use `LLM-based` retrieval
to dynamically select the nodes that are actually relevant to the query.

In [1]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
#!pip install cohere

In [2]:
!pip install -qqq llama-index 
!pip install -qqq llama-hub 
#!pip install -qqq cohere 
#!pip install -qqq langchain 
#!pip install -qqq openai 
!pip install -qqq accelerate==0.21.0 
!pip install -qqq bitsandbytes==0.40.2 
!pip install -qqq transformers 
!pip install -qqq sentence_transformers 
!pip install -qqq InstructorEmbedding

In [8]:
import nest_asyncio
nest_asyncio.apply()

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

import json
import torch
from pathlib import Path
import pandas as pd
pd.set_option("display.max_colwidth", -1)

from copy import deepcopy

# transformers
from transformers import BitsAndBytesConfig

# llama_index
from llama_index.core.prompts import PromptTemplate
from llama_index.core.llms import HuggingFaceLLM
from llama_index import download_loader, Document, VectorStoreIndex, ServiceContext
from llama_index.node_parser import SentenceSplitter
from llama_index.postprocessor import LLMRerank
from langchain.embeddings import HuggingFaceInstructEmbeddings
from llama_index.response.notebook_utils import display_source_node
from llama_index.query_engine import RetrieverQueryEngine
from IPython.display import Markdown, display, HTML
from llama_index.retrievers import VectorIndexRetriever
from llama_index.schema import QueryBundle
from llama_index.indices.postprocessor import SentenceTransformerRerank

# Re-rank
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.indices.postprocessor import SentenceTransformerRerank

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

  pd.set_option("display.max_colwidth", -1)


ImportError: cannot import name 'HuggingFace' from 'llama_index.core.llms' (/home/renato/Documents/env_default/lib/python3.10/site-packages/llama_index/core/llms/__init__.py)

# Setup

1. In this section we will work with the QLoRA paper and create an initial set of nodes (chunk size 512).
2. We will use Open Source LLM [`zephyr-7b-alpha`](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) and embedding [`hkunlp/instructor-large`](https://huggingface.co/hkunlp/instructor-large)

# Load Data

In [5]:
PDFReader = download_loader("PDFReader")
loader = PDFReader()
docs = loader.load_data(file=Path("QLoRa.pdf"))

In [6]:
from llama_index.node_parser import SimpleNodeParser
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
nodes = node_parser.get_nodes_from_documents(docs)

In [7]:
len(nodes)

78

# Models

## LLM (`zephyr-7b-alpha`)

In [8]:
from google.colab import userdata

# huggingface and cohere api token
hf_token = userdata.get('hf_token')
cohere_api_key = userdata.get('COHERE_API_KEY')

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)


def messages_to_prompt(messages):
  prompt = ""
  for message in messages:
    if message.role == 'system':
      prompt += f"<|system|>\n{message.content}</s>\n"
    elif message.role == 'user':
      prompt += f"<|user|>\n{message.content}</s>\n"
    elif message.role == 'assistant':
      prompt += f"<|assistant|>\n{message.content}</s>\n"

  # ensure we start with a system prompt, insert blank if needed
  if not prompt.startswith("<|system|>\n"):
    prompt = "<|system|>\n</s>\n" + prompt

  # add final assistant prompt
  prompt = prompt + "<|assistant|>\n"

  return prompt


llm = HuggingFaceLLM(
    model_name="HuggingFaceH4/zephyr-7b-alpha",
    tokenizer_name="HuggingFaceH4/zephyr-7b-alpha",
    query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
    context_window=3900,
    max_new_tokens=256,
    model_kwargs={"quantization_config": quantization_config},
    # tokenizer_kwargs={},
    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
    messages_to_prompt=messages_to_prompt,
    device_map="auto",
)

config.json:   0%|          | 0.00/628 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

## Embedding (`hkunlp/instructor-large`)

In [9]:
embed_model = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large", model_kwargs={"device": DEVICE}
)

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.41k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


## Configure Index and Retriever

In [12]:
# ServiceContext
service_context = ServiceContext.from_defaults(llm=llm,
                                               embed_model=embed_model
                                               )

# index
vector_index = VectorStoreIndex(
    nodes, service_context=service_context
)

# configure retriever
retriever = VectorIndexRetriever(
    index=vector_index,
    similarity_top_k=10,
    service_context=service_context)

## Initialize Re-rankers

In [13]:
# Define all embeddings and rerankers
RERANKERS = {
    "WithoutReranker": "None",
    "CohereRerank": CohereRerank(api_key=cohere_api_key, top_n=5),
    "bge-reranker-base": SentenceTransformerRerank(model="BAAI/bge-reranker-base", top_n=5),
    "bge-reranker-large": SentenceTransformerRerank(model="BAAI/bge-reranker-large", top_n=5)
}

config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

## Retrieval Comparisons

In [14]:
def get_retrieved_nodes(
    query_str, reranker
):
    query_bundle = QueryBundle(query_str)

    retrieved_nodes = retriever.retrieve(query_bundle)

    if reranker != "None":
      retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
    else:
        retrieved_nodes

    return retrieved_nodes


def pretty_print(df):
    return display(HTML(df.to_html().replace("\\n", "<br>")))


def visualize_retrieved_nodes(nodes) -> None:
    result_dicts = []
    for node in nodes:
        node = deepcopy(node)
        node.node.metadata = None
        node_text = node.node.get_text()
        node_text = node_text.replace("\n", " ")

        result_dict = {"Score": node.score, "Text": node_text}
        result_dicts.append(result_dict)

    pretty_print(pd.DataFrame(result_dicts))

  pd.set_option("display.max_colwidth", -1)


In [30]:
RERANKERS.items()

dict_items([('WithoutReranker', 'None'), ('CohereRerank', CohereRerank(callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7fbeacade170>, model='rerank-english-v2.0', top_n=5)), ('bge-reranker-base', SentenceTransformerRerank(callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7fbea5994a60>, model='BAAI/bge-reranker-base', top_n=5)), ('bge-reranker-large', SentenceTransformerRerank(callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7fbea59ee740>, model='BAAI/bge-reranker-large', top_n=5))])

In [31]:
query_str = "What are the top features of QLoRA?"

# Loop over rerankers
for rerank_name, reranker in RERANKERS.items():
    print(f"Running Evaluation for Reranker: {rerank_name}")

    query_bundle = QueryBundle(query_str)

    retrieved_nodes = retriever.retrieve(query_bundle)

    if reranker != "None":
      retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
    else:
        retrieved_nodes

    print(f"Visualize Retrieved Nodes for Reranker: {rerank_name}")
    visualize_retrieved_nodes(retrieved_nodes)


Running Evaluation for Reranker: WithoutReranker
Visualize Retrieved Nodes for Reranker: WithoutReranker


Unnamed: 0,Score,Text
0,0.871219,"QL ORA.Using the components described above, we define QLORAfor a single linear layer in the quantized base model with a single LoRA adapter as follows: YBF16=XBF16doubleDequant (cFP32 1, ck-bit 2,WNF4) +XBF16LBF16 1LBF16 2, (5) where doubleDequant (·)is defined as: doubleDequant (cFP32 1, ck-bit 2,Wk-bit) =dequant (dequant (cFP32 1, ck-bit 2),W4bit) =WBF16,(6) We use NF4 for Wand FP8 for c2. We use a blocksize of 64 for Wfor higher quantization precision and a blocksize of 256 for c2to conserve memory. For parameter updates only the gradient with respect to the error for the adapters weights∂E ∂Liare needed, and not for 4-bit weights∂E ∂W. However, the calculation of∂E ∂Lientails the calculation of∂X ∂W which proceeds via equation (5) with dequantization from storage WNF4to computation data type WBF16to calculate the derivative∂X ∂Win BFloat16 precision. To summarize, QLORAhas one storage data type (usually 4-bit NormalFloat) and a computation data type (16-bit BrainFloat). We dequantize the storage data type to the computation data type to perform the forward and backward pass, but we only compute weight gradients for the LoRA parameters which use 16-bit BrainFloat. 4 QLoRA vs. Standard Finetuning We have discussed how QLoRA works and how it can significantly reduce the required memory for finetuning models. The main question now is whether QLoRA can perform as well as full-model finetuning. Furthermore, we want to analyze the components of QLoRA including the impact of NormalFloat4 over standard Float4. The following sections will discuss the experiments that aimed at answering these questions. 3https://docs.nvidia.com/cuda/cuda-c-programming-guide 5"
1,0.869763,"This second step yields the quantized quantization constants cFP8 2and the second level of quantization constants cFP32 1. We use 8-bit Floats with a blocksize of 256 for the second quantization as no performance degradation is observed for 8-bit quantization, in line with results from Dettmers and Zettlemoyer [13]. Since the cFP32 2are positive, we subtract the mean from c2before quantization to center the values around zero and make use of symmetric quantization. On average, for a blocksize of 64, this quantization reduces the memory footprint per parameter from 32/64 = 0 .5bits, to 8/64 + 32 /(64·256) = 0 .127bits, a reduction of 0.373 bits per parameter. Paged Optimizers use the NVIDIA unified memory3feature wich does automatic page-to-page transfers between the CPU and GPU for error-free GPU processing in the scenario where the GPU occasionally runs out-of-memory. The feature works like regular memory paging between CPU RAM and the disk. We use this feature to allocate paged memory for the optimizer states which are then automatically evicted to CPU RAM when the GPU runs out-of-memory and paged back into GPU memory when the memory is needed in the optimizer update step. QL ORA.Using the components described above, we define QLORAfor a single linear layer in the quantized base model with a single LoRA adapter as follows: YBF16=XBF16doubleDequant (cFP32 1, ck-bit 2,WNF4) +XBF16LBF16 1LBF16 2, (5) where doubleDequant (·)is defined as: doubleDequant (cFP32 1, ck-bit 2,Wk-bit) =dequant (dequant (cFP32 1, ck-bit 2),W4bit) =WBF16,(6) We use NF4 for Wand FP8 for c2. We use a blocksize of 64 for Wfor higher quantization precision and a blocksize of 256 for c2to conserve memory."
2,0.868802,"QLORAcan be seen as an equalizing factor that helps to close the resource gap between large corporations and small teams with consumer GPUs. Another potential source of impact is deployment to mobile phones. We believe our QLORAmethod might enable the critical milestone of enabling the finetuning of LLMs on phones and other low resource settings. While 7B models were shown to be able to be run on phones before, QLORAis the first method that would enable the finetuning of such models. We estimate that with an iPhone 12 Plus, QLORAcan finetune 3 million tokens per night while the phone is charging. While finetuned 7B models do not reach the quality of ChatGPT, we believe that the quality is good enough to enable novel applications that have not been possible before due to privacy or LLM quality issues. QLORA can help enable privacy-preserving usage of LLMs, where users can own and manage their own data and models, while simultaneously making LLMs easier to deploy. However, finetuning is a dual-use technology that can be abused to cause harm. Widespread use of LLMs has known dangers [ 8,6], but we believe that equalizing access to a technology that is quickly becoming ubiquitous will allow for better more independent analysis than keeping the power of LLMs in the hands of large corporations that do not release models or source code for auditing. All in all, we believe that QLORAwill have a broadly positive impact making the finetuning of high quality LLMs much more widely and easily accessible. Acknowledgements We thank Aditya Kusupati, Ofir Press, Ashish Sharma, Margaret Li, Raphael Olivier, Zihao Ye, and Evangelia Spiliopoulou for their valuable feedback. Our research was facilitated by the advanced computational, storage, and networking infrastructure of the Hyak supercomputer system at the University of Washington. We thank the Hyak team for ensuring a smooth operation. We thank the beta testers of the bitsandbytes library, in particular Alex Birch and Alyssa Vance. We thank Younes Belkada for help with the integration of our software into the Hugging Face transformers stack. 16"
3,0.868658,"QLORAhas one low-precision storage data type, in our case usually 4-bit, and one computation data type that is usually BFloat16. In practice, this means whenever a QLORAweight tensor is used, we dequantize the tensor to BFloat16, and then perform a matrix multiplication in 16-bit. We now discuss the components of QL ORA followed by a formal definition of QL ORA. 4-bit NormalFloat Quantization The NormalFloat (NF) data type builds on Quantile Quantization [15] which is an information-theoretically optimal data type that ensures each quantization bin has an equal number of values assigned from the input tensor. Quantile quantization works by estimating the quantile of the input tensor through the empirical cumulative distribution function. The main limitation of quantile quantization is that the process of quantile estimation is expensive. Therefore fast quantile approximation algorithms, such as SRAM quantiles [ 15], are used to estimate them. Due to the approximate nature of these quantile estimation algorithms, the data type has large quantization errors for outliers, which are often the most important values. Expensive quantile estimates and approximation errors can be avoided when input tensors come from a distribution fixed up to a quantization constant. In such cases, input tensors have the same quantiles making exact quantile estimation computationally feasible. Since pretrained neural network weights usually have a zero-centered normal distribution with standard deviation σ(see Appendix F), we can transform all weights to a single fixed distribution by scaling σsuch that the distribution fits exactly into the range of our data type. For our data type, we set the arbitrary range [−1,1]. As such, both the quantiles for the data type and the neural network weights need to be normalized into this range."
4,0.856008,"This calls for further investigations of the tradeoffs of simple cross-entropy loss and RLHF training. We hope that QLORA enables such analysis at scale, without the need for overwhelming computational resources. 7 Related Work Quantization of Large Language Models Quantization of LLMs has largely focused on quanti- zation for inference time. Major approaches for preserving 16-bit LLM quality focus on managing outlier features (e.g., SmoothQuant [ 66] and LLM.int8() [ 14]) while others use more sophisticated grouping methods [ 44,69]. Lossy quantization approaches study the trade-offs for regular round- ing [ 13,71,47] or how to optimize rounding decisions to improve quantization precision [ 18]. Besides our work, SwitchBack layers [ 65] is the only work that studies backpropagation through quantized weights at a scale beyond 1B parameters. Finetuning with Adapters While we use Low-rank Adapters [ 28] (LoRA), many other Parameter Efficient FineTuning (PEFT) methods have been proposed such as prompt tuning [ 48,33,34], tuning the embedding layer inputs [ 1], tuning hidden states (IA3) [37], adding full layers [ 27], tuning biases [ 70], learning a mask over weights based on Fisher information [ 54], and a combination of approaches [ 23]. In our work, we show that LoRA adapters are able to reach full 16-bit finetuning performance. We leave it to future work to explore the tradeoffs of other PEFT approaches. Instruction Finetuning To help a pretrained LLM follow the instructions provided in a prompt, instruction finetuning uses input-output pairs of various data sources to finetune a pretrained LLM to generate the output given the input as a prompt. Approaches and datasets include MetaICL [ 40], 14"
5,0.85522,"Parameter Efficient Finetuning (PEFT) method, most of the memory footprint for LLM finetuning comes from activation gradients and not from the learned LoRA parameters. For a 7B LLaMA model trained on FLAN v2 with a batch size of 1, with LoRA weights equivalent to commonly used 0.2% of the original model weights[ 28,37], the LoRA input gradients have a memory footprint of 567 MB while the LoRA parameters take up only 26 MB. With gradient checkpointing [ 9], the input gradients reduce to an average of 18 MB per sequence making them more memory intensive than all LoRA weights combined. In comparison, the 4-bit base model consumes 5,048 MB of memory. This highlights that gradient checkpointing is important but also that aggressively reducing the amount of LoRA parameter yields only minor memory benefits. This means we can use more adapters without significantly increasing the overall training memory footprint (see Appendix G for a detailed breakdown). As discussed later, this is crucial for recovering full 16-bit precision performance. 3 QL ORA Finetuning QLORAachieves high-fidelity 4-bit finetuning via two techniques we propose—4-bit NormalFloat (NF4) quantization and Double Quantization. Additionally, we introduce Paged Optimizers, to prevent memory spikes during gradient checkpointing from causing out-of-memory errors that have traditionally made finetuning on a single machine difficult for large models. QLORAhas one low-precision storage data type, in our case usually 4-bit, and one computation data type that is usually BFloat16. In practice, this means whenever a QLORAweight tensor is used, we dequantize the tensor to BFloat16, and then perform a matrix multiplication in 16-bit. We now discuss the components of QL ORA followed by a formal definition of QL ORA. 4-bit NormalFloat Quantization The NormalFloat (NF) data type builds on Quantile Quantization [15] which is an information-theoretically optimal data type that ensures each quantization bin has an equal number of values assigned from the input tensor. Quantile quantization works by estimating the quantile of the input tensor through the empirical cumulative distribution function."
6,0.85418,"Figure 1: Different finetuning methods and their memory requirements. QLORAimproves over LoRA by quantizing the transformer model to 4-bit precision and using paged optimizers to handle memory spikes. 2 Background Block-wise k-bit Quantization Quantization is the process of discretizing an input from a rep- resentation that holds more information to a representation with less information. It often means taking a data type with more bits and converting it to fewer bits, for example from 32-bit floats to 8-bit Integers. To ensure that the entire range of the low-bit data type is used, the input data type is commonly rescaled into the target data type range through normalization by the absolute maximum of the input elements, which are usually structured as a tensor. For example, quantizing a 32-bit Floating Point (FP32) tensor into a Int8 tensor with range [−127,127]: XInt8=round127 absmax (XFP32)XFP32 =round (cFP32·XFP32), (1) where cis the quantization constant orquantization scale . Dequantization is the inverse: dequant (cFP32,XInt8) =XInt8 cFP32=XFP32(2) The problem with this approach is that if a large magnitude value (i.e., an outlier) occurs in the input tensor, then the quantization bins—certain bit combinations—are not utilized well with few or no numbers quantized in some bins. To prevent the outlier issue, a common approach is to chunk the input tensor into blocks that are independently quantized, each with their own quantization constant c. This can be formalized as follows: We chunk the input tensor X∈Rb×hintoncontiguous blocks of sizeBby flattening the input tensor and slicing the linear segment into n= (b×h)/Bblocks. We quantize these blocks independently with Equation 1 to create a quantized tensor and nquantization constants ci. Low-rank Adapters Low-rank Adapter (LoRA) finetuning [ 28] is a method that reduces memory requirements by using a small set of trainable parameters, often termed adapters, while not updating the full model parameters which remain fixed."
7,0.852811,"Table 1: Elo ratings for a competition between models, averaged for 10,000 random initial order- ings. The winner of a match is determined by GPT-4 which declares which response is better for a given prompt of the the Vicuna benchmark. 95% confidence intervals are shown ( ±). After GPT- 4, Guanaco 33B and 65B win the most matches, while Guanaco 13B scores better than Bard. Model Size Elo GPT-4 - 1348 ±1 Guanaco 65B 41 GB 1022 ±1 Guanaco 33B 21 GB 992 ±1 Vicuna 13B 26 GB 974 ±1 ChatGPT - 966 ±1 Guanaco 13B 10 GB 916 ±1 Bard - 902 ±1 Guanaco 7B 6 GB 879 ±1that are tuned by backpropagating gradients through the quantized weights. QLORAreduces the average memory requirements of finetuning a 65B parameter model from >780GB of GPU memory to <48GB without degrading the runtime or predictive performance compared to a 16- bit fully finetuned baseline. This marks a significant shift in accessibility of LLM finetuning: now the largest publicly available models to date finetunable on a single GPU. Using QLORA, we train the Gua- naco family of models, with the second best model reaching 97.8% of the performance level of ChatGPT on the Vicuna [ 10] benchmark, while being trainable in less than 12 hours on a single consumer GPU; using a single professional GPU over 24 hours we achieve 99.3% with our largest model, essentially closing the gap to ChatGPT on the Vicuna bench- mark. When deployed, our smallest Guanaco model (7B parameters) requires just 5 GB of memory and outperforms a 26 GB Alpaca model by more than 20 percentage points on the Vicuna benchmark (Table 6)."
8,0.851806,"An additional limitation is that we did not evaluate different bit-precisions, such as using 3-bit base models, or different adapter methods. Besides LoRA, there is also a wide variety Parameter Efficient FineTuning (PEFT) methods that have been shown to work well. However, it is unclear if these methods scale to large models. We used LoRA as many results established its robustness but other adapters might yield better performance. Since finetuning after quantization seems to recover most of the information that is lost during quantization this might enable much more aggressive quantization. For example, 3-bit GPTQ quantization of the basemodel with LoRA might also yield 16-bit full finetuning performance after finetuning. 9 Broader Impacts Our QLORAfinetuning method is the first method that enables the finetuning of 33B parameter models on a single consumer GPU and 65B parameter models on a single professional GPU, while not degrading performance relative to a full finetuning baseline. We have demonstrated that our best 33B model trained on the Open Assistant dataset can rival ChatGPT on the Vicuna benchmark. Since instruction finetuning is an essential tool to transform raw pretrained LLMs into ChatGPT-like chatbots, we believe that our method will make finetuning widespread and common in particular for the researchers that have the least resources, a big win for the accessibility of state of the art NLP technology. QLORAcan be seen as an equalizing factor that helps to close the resource gap between large corporations and small teams with consumer GPUs. Another potential source of impact is deployment to mobile phones. We believe our QLORAmethod might enable the critical milestone of enabling the finetuning of LLMs on phones and other low resource settings. While 7B models were shown to be able to be run on phones before, QLORAis the first method that would enable the finetuning of such models. We estimate that with an iPhone 12 Plus, QLORAcan finetune 3 million tokens per night while the phone is charging."
9,0.850268,"This is a multiple-choice benchmark covering 57 tasks including elementary mathematics, US history, computer science, law, and more. We report 5-shot test accuracy. We also test generative language capabilities through both automated and human evaluations. This second set of evaluations relies on queries curated by humans and aims at measuring the quality of model responses. While this is a more realistic testbed for chatbot model performance and is growing in popularity, there is no commonly accepted protocol in the literature. We de- scribe below our proposed setup, using nucleus sampling with p= 0.9and temperature 0.7in all cases. 8"


Running Evaluation for Reranker: CohereRerank
Visualize Retrieved Nodes for Reranker: CohereRerank


Unnamed: 0,Score,Text
0,0.988799,"QLORAhas one low-precision storage data type, in our case usually 4-bit, and one computation data type that is usually BFloat16. In practice, this means whenever a QLORAweight tensor is used, we dequantize the tensor to BFloat16, and then perform a matrix multiplication in 16-bit. We now discuss the components of QL ORA followed by a formal definition of QL ORA. 4-bit NormalFloat Quantization The NormalFloat (NF) data type builds on Quantile Quantization [15] which is an information-theoretically optimal data type that ensures each quantization bin has an equal number of values assigned from the input tensor. Quantile quantization works by estimating the quantile of the input tensor through the empirical cumulative distribution function. The main limitation of quantile quantization is that the process of quantile estimation is expensive. Therefore fast quantile approximation algorithms, such as SRAM quantiles [ 15], are used to estimate them. Due to the approximate nature of these quantile estimation algorithms, the data type has large quantization errors for outliers, which are often the most important values. Expensive quantile estimates and approximation errors can be avoided when input tensors come from a distribution fixed up to a quantization constant. In such cases, input tensors have the same quantiles making exact quantile estimation computationally feasible. Since pretrained neural network weights usually have a zero-centered normal distribution with standard deviation σ(see Appendix F), we can transform all weights to a single fixed distribution by scaling σsuch that the distribution fits exactly into the range of our data type. For our data type, we set the arbitrary range [−1,1]. As such, both the quantiles for the data type and the neural network weights need to be normalized into this range."
1,0.986044,"QLORAcan be seen as an equalizing factor that helps to close the resource gap between large corporations and small teams with consumer GPUs. Another potential source of impact is deployment to mobile phones. We believe our QLORAmethod might enable the critical milestone of enabling the finetuning of LLMs on phones and other low resource settings. While 7B models were shown to be able to be run on phones before, QLORAis the first method that would enable the finetuning of such models. We estimate that with an iPhone 12 Plus, QLORAcan finetune 3 million tokens per night while the phone is charging. While finetuned 7B models do not reach the quality of ChatGPT, we believe that the quality is good enough to enable novel applications that have not been possible before due to privacy or LLM quality issues. QLORA can help enable privacy-preserving usage of LLMs, where users can own and manage their own data and models, while simultaneously making LLMs easier to deploy. However, finetuning is a dual-use technology that can be abused to cause harm. Widespread use of LLMs has known dangers [ 8,6], but we believe that equalizing access to a technology that is quickly becoming ubiquitous will allow for better more independent analysis than keeping the power of LLMs in the hands of large corporations that do not release models or source code for auditing. All in all, we believe that QLORAwill have a broadly positive impact making the finetuning of high quality LLMs much more widely and easily accessible. Acknowledgements We thank Aditya Kusupati, Ofir Press, Ashish Sharma, Margaret Li, Raphael Olivier, Zihao Ye, and Evangelia Spiliopoulou for their valuable feedback. Our research was facilitated by the advanced computational, storage, and networking infrastructure of the Hyak supercomputer system at the University of Washington. We thank the Hyak team for ensuring a smooth operation. We thank the beta testers of the bitsandbytes library, in particular Alex Birch and Alyssa Vance. We thank Younes Belkada for help with the integration of our software into the Hugging Face transformers stack. 16"
2,0.973795,"Table 1: Elo ratings for a competition between models, averaged for 10,000 random initial order- ings. The winner of a match is determined by GPT-4 which declares which response is better for a given prompt of the the Vicuna benchmark. 95% confidence intervals are shown ( ±). After GPT- 4, Guanaco 33B and 65B win the most matches, while Guanaco 13B scores better than Bard. Model Size Elo GPT-4 - 1348 ±1 Guanaco 65B 41 GB 1022 ±1 Guanaco 33B 21 GB 992 ±1 Vicuna 13B 26 GB 974 ±1 ChatGPT - 966 ±1 Guanaco 13B 10 GB 916 ±1 Bard - 902 ±1 Guanaco 7B 6 GB 879 ±1that are tuned by backpropagating gradients through the quantized weights. QLORAreduces the average memory requirements of finetuning a 65B parameter model from >780GB of GPU memory to <48GB without degrading the runtime or predictive performance compared to a 16- bit fully finetuned baseline. This marks a significant shift in accessibility of LLM finetuning: now the largest publicly available models to date finetunable on a single GPU. Using QLORA, we train the Gua- naco family of models, with the second best model reaching 97.8% of the performance level of ChatGPT on the Vicuna [ 10] benchmark, while being trainable in less than 12 hours on a single consumer GPU; using a single professional GPU over 24 hours we achieve 99.3% with our largest model, essentially closing the gap to ChatGPT on the Vicuna bench- mark. When deployed, our smallest Guanaco model (7B parameters) requires just 5 GB of memory and outperforms a 26 GB Alpaca model by more than 20 percentage points on the Vicuna benchmark (Table 6)."
3,0.97007,"This second step yields the quantized quantization constants cFP8 2and the second level of quantization constants cFP32 1. We use 8-bit Floats with a blocksize of 256 for the second quantization as no performance degradation is observed for 8-bit quantization, in line with results from Dettmers and Zettlemoyer [13]. Since the cFP32 2are positive, we subtract the mean from c2before quantization to center the values around zero and make use of symmetric quantization. On average, for a blocksize of 64, this quantization reduces the memory footprint per parameter from 32/64 = 0 .5bits, to 8/64 + 32 /(64·256) = 0 .127bits, a reduction of 0.373 bits per parameter. Paged Optimizers use the NVIDIA unified memory3feature wich does automatic page-to-page transfers between the CPU and GPU for error-free GPU processing in the scenario where the GPU occasionally runs out-of-memory. The feature works like regular memory paging between CPU RAM and the disk. We use this feature to allocate paged memory for the optimizer states which are then automatically evicted to CPU RAM when the GPU runs out-of-memory and paged back into GPU memory when the memory is needed in the optimizer update step. QL ORA.Using the components described above, we define QLORAfor a single linear layer in the quantized base model with a single LoRA adapter as follows: YBF16=XBF16doubleDequant (cFP32 1, ck-bit 2,WNF4) +XBF16LBF16 1LBF16 2, (5) where doubleDequant (·)is defined as: doubleDequant (cFP32 1, ck-bit 2,Wk-bit) =dequant (dequant (cFP32 1, ck-bit 2),W4bit) =WBF16,(6) We use NF4 for Wand FP8 for c2. We use a blocksize of 64 for Wfor higher quantization precision and a blocksize of 256 for c2to conserve memory."
4,0.964589,"QL ORA.Using the components described above, we define QLORAfor a single linear layer in the quantized base model with a single LoRA adapter as follows: YBF16=XBF16doubleDequant (cFP32 1, ck-bit 2,WNF4) +XBF16LBF16 1LBF16 2, (5) where doubleDequant (·)is defined as: doubleDequant (cFP32 1, ck-bit 2,Wk-bit) =dequant (dequant (cFP32 1, ck-bit 2),W4bit) =WBF16,(6) We use NF4 for Wand FP8 for c2. We use a blocksize of 64 for Wfor higher quantization precision and a blocksize of 256 for c2to conserve memory. For parameter updates only the gradient with respect to the error for the adapters weights∂E ∂Liare needed, and not for 4-bit weights∂E ∂W. However, the calculation of∂E ∂Lientails the calculation of∂X ∂W which proceeds via equation (5) with dequantization from storage WNF4to computation data type WBF16to calculate the derivative∂X ∂Win BFloat16 precision. To summarize, QLORAhas one storage data type (usually 4-bit NormalFloat) and a computation data type (16-bit BrainFloat). We dequantize the storage data type to the computation data type to perform the forward and backward pass, but we only compute weight gradients for the LoRA parameters which use 16-bit BrainFloat. 4 QLoRA vs. Standard Finetuning We have discussed how QLoRA works and how it can significantly reduce the required memory for finetuning models. The main question now is whether QLoRA can perform as well as full-model finetuning. Furthermore, we want to analyze the components of QLoRA including the impact of NormalFloat4 over standard Float4. The following sections will discuss the experiments that aimed at answering these questions. 3https://docs.nvidia.com/cuda/cuda-c-programming-guide 5"


Running Evaluation for Reranker: bge-reranker-base
Visualize Retrieved Nodes for Reranker: bge-reranker-base


Unnamed: 0,Score,Text
0,0.728742,"This second step yields the quantized quantization constants cFP8 2and the second level of quantization constants cFP32 1. We use 8-bit Floats with a blocksize of 256 for the second quantization as no performance degradation is observed for 8-bit quantization, in line with results from Dettmers and Zettlemoyer [13]. Since the cFP32 2are positive, we subtract the mean from c2before quantization to center the values around zero and make use of symmetric quantization. On average, for a blocksize of 64, this quantization reduces the memory footprint per parameter from 32/64 = 0 .5bits, to 8/64 + 32 /(64·256) = 0 .127bits, a reduction of 0.373 bits per parameter. Paged Optimizers use the NVIDIA unified memory3feature wich does automatic page-to-page transfers between the CPU and GPU for error-free GPU processing in the scenario where the GPU occasionally runs out-of-memory. The feature works like regular memory paging between CPU RAM and the disk. We use this feature to allocate paged memory for the optimizer states which are then automatically evicted to CPU RAM when the GPU runs out-of-memory and paged back into GPU memory when the memory is needed in the optimizer update step. QL ORA.Using the components described above, we define QLORAfor a single linear layer in the quantized base model with a single LoRA adapter as follows: YBF16=XBF16doubleDequant (cFP32 1, ck-bit 2,WNF4) +XBF16LBF16 1LBF16 2, (5) where doubleDequant (·)is defined as: doubleDequant (cFP32 1, ck-bit 2,Wk-bit) =dequant (dequant (cFP32 1, ck-bit 2),W4bit) =WBF16,(6) We use NF4 for Wand FP8 for c2. We use a blocksize of 64 for Wfor higher quantization precision and a blocksize of 256 for c2to conserve memory."
1,0.560365,"Parameter Efficient Finetuning (PEFT) method, most of the memory footprint for LLM finetuning comes from activation gradients and not from the learned LoRA parameters. For a 7B LLaMA model trained on FLAN v2 with a batch size of 1, with LoRA weights equivalent to commonly used 0.2% of the original model weights[ 28,37], the LoRA input gradients have a memory footprint of 567 MB while the LoRA parameters take up only 26 MB. With gradient checkpointing [ 9], the input gradients reduce to an average of 18 MB per sequence making them more memory intensive than all LoRA weights combined. In comparison, the 4-bit base model consumes 5,048 MB of memory. This highlights that gradient checkpointing is important but also that aggressively reducing the amount of LoRA parameter yields only minor memory benefits. This means we can use more adapters without significantly increasing the overall training memory footprint (see Appendix G for a detailed breakdown). As discussed later, this is crucial for recovering full 16-bit precision performance. 3 QL ORA Finetuning QLORAachieves high-fidelity 4-bit finetuning via two techniques we propose—4-bit NormalFloat (NF4) quantization and Double Quantization. Additionally, we introduce Paged Optimizers, to prevent memory spikes during gradient checkpointing from causing out-of-memory errors that have traditionally made finetuning on a single machine difficult for large models. QLORAhas one low-precision storage data type, in our case usually 4-bit, and one computation data type that is usually BFloat16. In practice, this means whenever a QLORAweight tensor is used, we dequantize the tensor to BFloat16, and then perform a matrix multiplication in 16-bit. We now discuss the components of QL ORA followed by a formal definition of QL ORA. 4-bit NormalFloat Quantization The NormalFloat (NF) data type builds on Quantile Quantization [15] which is an information-theoretically optimal data type that ensures each quantization bin has an equal number of values assigned from the input tensor. Quantile quantization works by estimating the quantile of the input tensor through the empirical cumulative distribution function."
2,0.384644,"QLORAhas one low-precision storage data type, in our case usually 4-bit, and one computation data type that is usually BFloat16. In practice, this means whenever a QLORAweight tensor is used, we dequantize the tensor to BFloat16, and then perform a matrix multiplication in 16-bit. We now discuss the components of QL ORA followed by a formal definition of QL ORA. 4-bit NormalFloat Quantization The NormalFloat (NF) data type builds on Quantile Quantization [15] which is an information-theoretically optimal data type that ensures each quantization bin has an equal number of values assigned from the input tensor. Quantile quantization works by estimating the quantile of the input tensor through the empirical cumulative distribution function. The main limitation of quantile quantization is that the process of quantile estimation is expensive. Therefore fast quantile approximation algorithms, such as SRAM quantiles [ 15], are used to estimate them. Due to the approximate nature of these quantile estimation algorithms, the data type has large quantization errors for outliers, which are often the most important values. Expensive quantile estimates and approximation errors can be avoided when input tensors come from a distribution fixed up to a quantization constant. In such cases, input tensors have the same quantiles making exact quantile estimation computationally feasible. Since pretrained neural network weights usually have a zero-centered normal distribution with standard deviation σ(see Appendix F), we can transform all weights to a single fixed distribution by scaling σsuch that the distribution fits exactly into the range of our data type. For our data type, we set the arbitrary range [−1,1]. As such, both the quantiles for the data type and the neural network weights need to be normalized into this range."
3,0.27373,"This is a multiple-choice benchmark covering 57 tasks including elementary mathematics, US history, computer science, law, and more. We report 5-shot test accuracy. We also test generative language capabilities through both automated and human evaluations. This second set of evaluations relies on queries curated by humans and aims at measuring the quality of model responses. While this is a more realistic testbed for chatbot model performance and is growing in popularity, there is no commonly accepted protocol in the literature. We de- scribe below our proposed setup, using nucleus sampling with p= 0.9and temperature 0.7in all cases. 8"
4,0.272615,"QLORAcan be seen as an equalizing factor that helps to close the resource gap between large corporations and small teams with consumer GPUs. Another potential source of impact is deployment to mobile phones. We believe our QLORAmethod might enable the critical milestone of enabling the finetuning of LLMs on phones and other low resource settings. While 7B models were shown to be able to be run on phones before, QLORAis the first method that would enable the finetuning of such models. We estimate that with an iPhone 12 Plus, QLORAcan finetune 3 million tokens per night while the phone is charging. While finetuned 7B models do not reach the quality of ChatGPT, we believe that the quality is good enough to enable novel applications that have not been possible before due to privacy or LLM quality issues. QLORA can help enable privacy-preserving usage of LLMs, where users can own and manage their own data and models, while simultaneously making LLMs easier to deploy. However, finetuning is a dual-use technology that can be abused to cause harm. Widespread use of LLMs has known dangers [ 8,6], but we believe that equalizing access to a technology that is quickly becoming ubiquitous will allow for better more independent analysis than keeping the power of LLMs in the hands of large corporations that do not release models or source code for auditing. All in all, we believe that QLORAwill have a broadly positive impact making the finetuning of high quality LLMs much more widely and easily accessible. Acknowledgements We thank Aditya Kusupati, Ofir Press, Ashish Sharma, Margaret Li, Raphael Olivier, Zihao Ye, and Evangelia Spiliopoulou for their valuable feedback. Our research was facilitated by the advanced computational, storage, and networking infrastructure of the Hyak supercomputer system at the University of Washington. We thank the Hyak team for ensuring a smooth operation. We thank the beta testers of the bitsandbytes library, in particular Alex Birch and Alyssa Vance. We thank Younes Belkada for help with the integration of our software into the Hugging Face transformers stack. 16"


Running Evaluation for Reranker: bge-reranker-large
Visualize Retrieved Nodes for Reranker: bge-reranker-large


Unnamed: 0,Score,Text
0,0.246923,"This is a multiple-choice benchmark covering 57 tasks including elementary mathematics, US history, computer science, law, and more. We report 5-shot test accuracy. We also test generative language capabilities through both automated and human evaluations. This second set of evaluations relies on queries curated by humans and aims at measuring the quality of model responses. While this is a more realistic testbed for chatbot model performance and is growing in popularity, there is no commonly accepted protocol in the literature. We de- scribe below our proposed setup, using nucleus sampling with p= 0.9and temperature 0.7in all cases. 8"
1,0.231972,"QLORAhas one low-precision storage data type, in our case usually 4-bit, and one computation data type that is usually BFloat16. In practice, this means whenever a QLORAweight tensor is used, we dequantize the tensor to BFloat16, and then perform a matrix multiplication in 16-bit. We now discuss the components of QL ORA followed by a formal definition of QL ORA. 4-bit NormalFloat Quantization The NormalFloat (NF) data type builds on Quantile Quantization [15] which is an information-theoretically optimal data type that ensures each quantization bin has an equal number of values assigned from the input tensor. Quantile quantization works by estimating the quantile of the input tensor through the empirical cumulative distribution function. The main limitation of quantile quantization is that the process of quantile estimation is expensive. Therefore fast quantile approximation algorithms, such as SRAM quantiles [ 15], are used to estimate them. Due to the approximate nature of these quantile estimation algorithms, the data type has large quantization errors for outliers, which are often the most important values. Expensive quantile estimates and approximation errors can be avoided when input tensors come from a distribution fixed up to a quantization constant. In such cases, input tensors have the same quantiles making exact quantile estimation computationally feasible. Since pretrained neural network weights usually have a zero-centered normal distribution with standard deviation σ(see Appendix F), we can transform all weights to a single fixed distribution by scaling σsuch that the distribution fits exactly into the range of our data type. For our data type, we set the arbitrary range [−1,1]. As such, both the quantiles for the data type and the neural network weights need to be normalized into this range."
2,0.181327,"An additional limitation is that we did not evaluate different bit-precisions, such as using 3-bit base models, or different adapter methods. Besides LoRA, there is also a wide variety Parameter Efficient FineTuning (PEFT) methods that have been shown to work well. However, it is unclear if these methods scale to large models. We used LoRA as many results established its robustness but other adapters might yield better performance. Since finetuning after quantization seems to recover most of the information that is lost during quantization this might enable much more aggressive quantization. For example, 3-bit GPTQ quantization of the basemodel with LoRA might also yield 16-bit full finetuning performance after finetuning. 9 Broader Impacts Our QLORAfinetuning method is the first method that enables the finetuning of 33B parameter models on a single consumer GPU and 65B parameter models on a single professional GPU, while not degrading performance relative to a full finetuning baseline. We have demonstrated that our best 33B model trained on the Open Assistant dataset can rival ChatGPT on the Vicuna benchmark. Since instruction finetuning is an essential tool to transform raw pretrained LLMs into ChatGPT-like chatbots, we believe that our method will make finetuning widespread and common in particular for the researchers that have the least resources, a big win for the accessibility of state of the art NLP technology. QLORAcan be seen as an equalizing factor that helps to close the resource gap between large corporations and small teams with consumer GPUs. Another potential source of impact is deployment to mobile phones. We believe our QLORAmethod might enable the critical milestone of enabling the finetuning of LLMs on phones and other low resource settings. While 7B models were shown to be able to be run on phones before, QLORAis the first method that would enable the finetuning of such models. We estimate that with an iPhone 12 Plus, QLORAcan finetune 3 million tokens per night while the phone is charging."
3,0.160098,"Parameter Efficient Finetuning (PEFT) method, most of the memory footprint for LLM finetuning comes from activation gradients and not from the learned LoRA parameters. For a 7B LLaMA model trained on FLAN v2 with a batch size of 1, with LoRA weights equivalent to commonly used 0.2% of the original model weights[ 28,37], the LoRA input gradients have a memory footprint of 567 MB while the LoRA parameters take up only 26 MB. With gradient checkpointing [ 9], the input gradients reduce to an average of 18 MB per sequence making them more memory intensive than all LoRA weights combined. In comparison, the 4-bit base model consumes 5,048 MB of memory. This highlights that gradient checkpointing is important but also that aggressively reducing the amount of LoRA parameter yields only minor memory benefits. This means we can use more adapters without significantly increasing the overall training memory footprint (see Appendix G for a detailed breakdown). As discussed later, this is crucial for recovering full 16-bit precision performance. 3 QL ORA Finetuning QLORAachieves high-fidelity 4-bit finetuning via two techniques we propose—4-bit NormalFloat (NF4) quantization and Double Quantization. Additionally, we introduce Paged Optimizers, to prevent memory spikes during gradient checkpointing from causing out-of-memory errors that have traditionally made finetuning on a single machine difficult for large models. QLORAhas one low-precision storage data type, in our case usually 4-bit, and one computation data type that is usually BFloat16. In practice, this means whenever a QLORAweight tensor is used, we dequantize the tensor to BFloat16, and then perform a matrix multiplication in 16-bit. We now discuss the components of QL ORA followed by a formal definition of QL ORA. 4-bit NormalFloat Quantization The NormalFloat (NF) data type builds on Quantile Quantization [15] which is an information-theoretically optimal data type that ensures each quantization bin has an equal number of values assigned from the input tensor. Quantile quantization works by estimating the quantile of the input tensor through the empirical cumulative distribution function."
4,0.124812,"QLORAcan be seen as an equalizing factor that helps to close the resource gap between large corporations and small teams with consumer GPUs. Another potential source of impact is deployment to mobile phones. We believe our QLORAmethod might enable the critical milestone of enabling the finetuning of LLMs on phones and other low resource settings. While 7B models were shown to be able to be run on phones before, QLORAis the first method that would enable the finetuning of such models. We estimate that with an iPhone 12 Plus, QLORAcan finetune 3 million tokens per night while the phone is charging. While finetuned 7B models do not reach the quality of ChatGPT, we believe that the quality is good enough to enable novel applications that have not been possible before due to privacy or LLM quality issues. QLORA can help enable privacy-preserving usage of LLMs, where users can own and manage their own data and models, while simultaneously making LLMs easier to deploy. However, finetuning is a dual-use technology that can be abused to cause harm. Widespread use of LLMs has known dangers [ 8,6], but we believe that equalizing access to a technology that is quickly becoming ubiquitous will allow for better more independent analysis than keeping the power of LLMs in the hands of large corporations that do not release models or source code for auditing. All in all, we believe that QLORAwill have a broadly positive impact making the finetuning of high quality LLMs much more widely and easily accessible. Acknowledgements We thank Aditya Kusupati, Ofir Press, Ashish Sharma, Margaret Li, Raphael Olivier, Zihao Ye, and Evangelia Spiliopoulou for their valuable feedback. Our research was facilitated by the advanced computational, storage, and networking infrastructure of the Hyak supercomputer system at the University of Washington. We thank the Hyak team for ensuring a smooth operation. We thank the beta testers of the bitsandbytes library, in particular Alex Birch and Alyssa Vance. We thank Younes Belkada for help with the integration of our software into the Hugging Face transformers stack. 16"


In [32]:
query_str = "What are Paged Optimizers?"

results_df = pd.DataFrame()
# Loop over rerankers
for rerank_name, reranker in RERANKERS.items():
    print(f"Running Evaluation for Reranker: {rerank_name}")

    query_bundle = QueryBundle(query_str)

    retrieved_nodes = retriever.retrieve(query_bundle)

    if reranker != "None":
      retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
    else:
        retrieved_nodes

    print(f"Visualize Retrieved Nodes for Reranker: {rerank_name}")
    visualize_retrieved_nodes(retrieved_nodes)


Running Evaluation for Reranker: WithoutReranker
Visualize Retrieved Nodes for Reranker: WithoutReranker


Unnamed: 0,Score,Text
0,0.860933,"This second step yields the quantized quantization constants cFP8 2and the second level of quantization constants cFP32 1. We use 8-bit Floats with a blocksize of 256 for the second quantization as no performance degradation is observed for 8-bit quantization, in line with results from Dettmers and Zettlemoyer [13]. Since the cFP32 2are positive, we subtract the mean from c2before quantization to center the values around zero and make use of symmetric quantization. On average, for a blocksize of 64, this quantization reduces the memory footprint per parameter from 32/64 = 0 .5bits, to 8/64 + 32 /(64·256) = 0 .127bits, a reduction of 0.373 bits per parameter. Paged Optimizers use the NVIDIA unified memory3feature wich does automatic page-to-page transfers between the CPU and GPU for error-free GPU processing in the scenario where the GPU occasionally runs out-of-memory. The feature works like regular memory paging between CPU RAM and the disk. We use this feature to allocate paged memory for the optimizer states which are then automatically evicted to CPU RAM when the GPU runs out-of-memory and paged back into GPU memory when the memory is needed in the optimizer update step. QL ORA.Using the components described above, we define QLORAfor a single linear layer in the quantized base model with a single LoRA adapter as follows: YBF16=XBF16doubleDequant (cFP32 1, ck-bit 2,WNF4) +XBF16LBF16 1LBF16 2, (5) where doubleDequant (·)is defined as: doubleDequant (cFP32 1, ck-bit 2,Wk-bit) =dequant (dequant (cFP32 1, ck-bit 2),W4bit) =WBF16,(6) We use NF4 for Wand FP8 for c2. We use a blocksize of 64 for Wfor higher quantization precision and a blocksize of 256 for c2to conserve memory."
1,0.846524,"Each point represents a run with a different random seed. We improve on the Stanford Alpaca fully finetuned default hyperparameters to construct a strong 16-bit baseline for comparisons. Using LoRA on all transformer layers is critical to match 16-bit performance.While paged optimizers are critical to do 33B/65B QLORAtuning on a single 24/48GB GPU, we do not provide hard measurements for Paged Optimiz- ers since the paging only occurs when processing mini-batches with long sequence lengths, which is rare. We do, however, perform an analysis of the runtime of paged optimizers for 65B models on 48GB GPUs and find that with a batch size of 16, paged optimizers provide the same training speed as regular optimizers. Future work should measure and characterize under what circumstances slow- downs occur from the paging process. Default LoRA hyperparameters do not match 16- bit performance When using the standard prac- tice of applying LoRA to query and value attention projection matrices [ 28], we are not able to replicate full finetuning performance for large base models. As shown in Figure 2 for LLaMA 7B finetuning on Alpaca, we find that the most critical LoRA hyper- parameter is how many LoRA adapters are used in total and that LoRA on all linear transformer block layers are required to match full finetuning perfor- mance. Other LoRA hyperparameters, such as the projection dimension r, do not affect performance (see Appendix A). 1010 1011 T otal model bits 0.60 0.61 0.62 0.63 0.64 0.65 0.66 0.67Mean zeroshot accuracy 4-bit LLaMA Float NFloat NFloat + DQData type Figure 3: Mean zero-shot accuracy over Wino- grande, HellaSwag, PiQA, Arc-Easy, and Arc- Challenge using LLaMA models with different 4-bit data types. The NormalFloat data type significantly improves the bit-for-bit accuracy gains compared to regular 4-bit Floats."
2,0.827598,"ensure a discrete zeropoint of 0and to use all 2kbits for a k-bit datatype, we create an asymmetric data type by estimating the quantiles qiof two ranges qi:2k−1for the negative part and 2k−1+ 1for the positive part and then we unify these sets of qiand remove one of the two zeros that occurs in both sets. We term the resulting data type that has equal expected number of values in each quantization bin k-bit NormalFloat (NFk), since the data type is information-theoretically optimal for zero-centered normally distributed data. The exact values of this data type can be found in Appendix E. Double Quantization We introduce Double Quantization (DQ), the process of quantizing the quantization constants for additional memory savings. While a small blocksize is required for precise 4-bit quantization [ 13], it also has a considerable memory overhead. For example, using 32-bit constants and a blocksize of 64 for W, quantization constants add 32/64 = 0 .5bits per parameter on average. Double Quantization helps reduce the memory footprint of quantization constants. More specifically, Double Quantization treats quantization constants cFP32 2of the first quantization as inputs to a second quantization. This second step yields the quantized quantization constants cFP8 2and the second level of quantization constants cFP32 1. We use 8-bit Floats with a blocksize of 256 for the second quantization as no performance degradation is observed for 8-bit quantization, in line with results from Dettmers and Zettlemoyer [13]. Since the cFP32 2are positive, we subtract the mean from c2before quantization to center the values around zero and make use of symmetric quantization. On average, for a blocksize of 64, this quantization reduces the memory footprint per parameter from 32/64 = 0 .5bits, to 8/64 + 32 /(64·256) = 0 .127bits, a reduction of 0.373 bits per parameter."
3,0.82199,"Deep reinforcement learning from human preferences. Advances in neural information processing systems , 30, 2017. [12] H. W. Chung, L. Hou, S. Longpre, B. Zoph, Y . Tay, W. Fedus, E. Li, X. Wang, M. De- hghani, S. Brahma, et al. Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 , 2022. [13] T. Dettmers and L. Zettlemoyer. The case for 4-bit precision: k-bit inference scaling laws. arXiv preprint arXiv:2212.09720 , 2022. [14] T. Dettmers, M. Lewis, Y . Belkada, and L. Zettlemoyer. LLM.int8(): 8-bit matrix multiplication for transformers at scale. Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022, NeurIPS 2022 , 2022. [15] T. Dettmers, M. Lewis, S. Shleifer, and L. Zettlemoyer. 8-bit optimizers via block-wise quantization. 9th International Conference on Learning Representations, ICLR , 2022. [16] A. E. Elo. The proposed uscf rating system. its development, theory, and applications. Chess Life, 22(8):242–247, 1967. [17] A. E. Elo. The rating of chessplayers, past and present . Arco Pub., 1978. 17"
4,0.817314,"Experimental setup. We consider three architectures (encoder, encoder-decoder, and decoder only) and compare QLoRA with 16-bit adapter-finetuning and with full-finetuning for models up to 3B. Our evaluations include GLUE [ 58] with RoBERTa-large [ 38], Super-NaturalInstructions (TKInstruct) [61] with T5 [ 49], and 5-shot MMLU [ 24] after finetuning LLaMA on Flan v2 [ 39] and Alpaca [55]. To additionally study the advantages of NF4 over other 4-bit data types, we use the setup of Dettmers and Zettlemoyer [13] and measure post-quantization zero-shot accuracy and perplexity across different models (OPT [ 72], LLaMA [ 57], BLOOM [ 52], Pythia [ 7]) for model sizes 125m - 13B. We provide more details in the results section for each particular setup to make the results more readable. Full details in Appendix A. QLoRA-AllQLoRA-FFN QLoRA-AttentionAlpaca (ours) Stanford-Alpaca Model6061626364RougeL bits 4 16 Figure 2: RougeL for LLaMA 7B models on the Alpaca dataset. Each point represents a run with a different random seed. We improve on the Stanford Alpaca fully finetuned default hyperparameters to construct a strong 16-bit baseline for comparisons. Using LoRA on all transformer layers is critical to match 16-bit performance.While paged optimizers are critical to do 33B/65B QLORAtuning on a single 24/48GB GPU, we do not provide hard measurements for Paged Optimiz- ers since the paging only occurs when processing mini-batches with long sequence lengths, which is rare. We do, however, perform an analysis of the runtime of paged optimizers for 65B models on 48GB GPUs and find that with a batch size of 16, paged optimizers provide the same training speed as regular optimizers."
5,0.814126,"Figure 1: Different finetuning methods and their memory requirements. QLORAimproves over LoRA by quantizing the transformer model to 4-bit precision and using paged optimizers to handle memory spikes. 2 Background Block-wise k-bit Quantization Quantization is the process of discretizing an input from a rep- resentation that holds more information to a representation with less information. It often means taking a data type with more bits and converting it to fewer bits, for example from 32-bit floats to 8-bit Integers. To ensure that the entire range of the low-bit data type is used, the input data type is commonly rescaled into the target data type range through normalization by the absolute maximum of the input elements, which are usually structured as a tensor. For example, quantizing a 32-bit Floating Point (FP32) tensor into a Int8 tensor with range [−127,127]: XInt8=round127 absmax (XFP32)XFP32 =round (cFP32·XFP32), (1) where cis the quantization constant orquantization scale . Dequantization is the inverse: dequant (cFP32,XInt8) =XInt8 cFP32=XFP32(2) The problem with this approach is that if a large magnitude value (i.e., an outlier) occurs in the input tensor, then the quantization bins—certain bit combinations—are not utilized well with few or no numbers quantized in some bins. To prevent the outlier issue, a common approach is to chunk the input tensor into blocks that are independently quantized, each with their own quantization constant c. This can be formalized as follows: We chunk the input tensor X∈Rb×hintoncontiguous blocks of sizeBby flattening the input tensor and slicing the linear segment into n= (b×h)/Bblocks. We quantize these blocks independently with Equation 1 to create a quantized tensor and nquantization constants ci. Low-rank Adapters Low-rank Adapter (LoRA) finetuning [ 28] is a method that reduces memory requirements by using a small set of trainable parameters, often termed adapters, while not updating the full model parameters which remain fixed."
6,0.806766,"(3) Paged Optimizers , using NVIDIA unified memory to avoid the gradient checkpointing memory spikes that occur when processing a mini-batch with a long sequence length. We combine these contributions into a better tuned LoRA approach that includes adapters at every network layer and thereby avoids almost all of the accuracy tradeoffs seen in prior work. QLORA’s efficiency enables us to perform an in-depth study of instruction finetuning and chatbot performance on model scales that would be impossible using regular finetuning due to memory overhead. Therefore, we train more than 1,000 models across several instruction tuning datasets, model architectures, and sizes between 80M to 65B parameters. In addition to showing that QLORA recovers 16-bit performance (§4) and training a state-of-the-art chatbot, Guanaco , (§5), we also analyze trends in the trained models. First, we find that data quality is far more important than dataset size, e.g., a 9k sample dataset (OASST1) outperformed a 450k sample dataset (FLAN v2, subsampled) on chatbot performance, even when both are meant to support instruction following generalization. Second, we show that strong Massive Multitask Language Understanding (MMLU) benchmark performance does not imply strong Vicuna chatbot benchmark performance and vice versa—in other words, dataset suitability matters more than size for a given task. Furthermore, we also provide a extensive analysis of chatbot performance that uses both human raters and GPT-4 for evaluation. We use tournament-style benchmarking where models compete against each other in matches to produce the best response for a given prompt. The winner of a match is judged by either GPT-4 or human annotators. The tournament results are aggregated into Elo scores [ 16,17] which determine the ranking of chatbot performance. We find that GPT-4 and human evaluations largely agree on the rank of model performance in the tournaments, but we also find there are instances of strong disagreement. As such, we highlight that model-based evaluation while providing a cheap alternative to human-annotation also has its uncertainties. We augment our chatbot benchmark results with a qualitative analysis of Guanaco models."
7,0.804762,"arXiv preprint arXiv:2209.14375 , 2022. [22] S. Gururangan, S. Swayamdipta, O. Levy, R. Schwartz, S. R. Bowman, and N. A. Smith. Annotation artifacts in natural language inference data. arXiv preprint arXiv:1803.02324 , 2018. [23] J. Henderson, S. Ruder, et al. Compacter: Efficient low-rank hypercomplex adapter layers. In Advances in Neural Information Processing Systems , 2021. [24] D. Hendrycks, C. Burns, S. Basart, A. Zou, M. Mazeika, D. Song, and J. Steinhardt. Mea- suring massive multitask language understanding. In International Conference on Learning Representations , 2020. [25] A. Holtzman, J. Buys, L. Du, M. Forbes, and Y . Choi. The curious case of neural text degeneration. In International Conference on Learning Representations , 2020. [26] O. Honovich, T. Scialom, O. Levy, and T. Schick. Unnatural instructions: Tuning language models with (almost) no human labor. arXiv preprint arXiv:2212.09689 , 2022. [27] N. Houlsby, A. Giurgiu, S. Jastrzebski, B. Morrone, Q. De Laroussilhe, A. Gesmundo, M. At- tariyan, and S. Gelly. Parameter-efficient transfer learning for nlp. In International Conference on Machine Learning , pages 2790–2799. PMLR, 2019. [28] E. J. Hu, Y . Shen, P. Wallis, Z. Allen-Zhu, Y . Li, S. Wang, L. Wang, and W. Chen. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 , 2021. [29] S. Iyer, X. V ."
8,0.804004,"This marks a significant shift in accessibility of LLM finetuning: now the largest publicly available models to date finetunable on a single GPU. Using QLORA, we train the Gua- naco family of models, with the second best model reaching 97.8% of the performance level of ChatGPT on the Vicuna [ 10] benchmark, while being trainable in less than 12 hours on a single consumer GPU; using a single professional GPU over 24 hours we achieve 99.3% with our largest model, essentially closing the gap to ChatGPT on the Vicuna bench- mark. When deployed, our smallest Guanaco model (7B parameters) requires just 5 GB of memory and outperforms a 26 GB Alpaca model by more than 20 percentage points on the Vicuna benchmark (Table 6). QLORAintroduces multiple innovations designed to reduce memory use without sacrificing per- formance: (1) 4-bit NormalFloat , an information theoretically optimal quantization data type for normally distributed data that yields better empirical results than 4-bit Integers and 4-bit Floats. (2)Double Quantization , a method that quantizes the quantization constants, saving an average of about 0.37 bits per parameter (approximately 3 GB for a 65B model). (3) Paged Optimizers , using NVIDIA unified memory to avoid the gradient checkpointing memory spikes that occur when processing a mini-batch with a long sequence length. We combine these contributions into a better tuned LoRA approach that includes adapters at every network layer and thereby avoids almost all of the accuracy tradeoffs seen in prior work. QLORA’s efficiency enables us to perform an in-depth study of instruction finetuning and chatbot performance on model scales that would be impossible using regular finetuning due to memory overhead. Therefore, we train more than 1,000 models across several instruction tuning datasets, model architectures, and sizes between 80M to 65B parameters. In addition to showing that QLORA recovers 16-bit performance (§4) and training a state-of-the-art chatbot, Guanaco , (§5), we also analyze trends in the trained models."
9,0.802683,"Parameter Efficient Finetuning (PEFT) method, most of the memory footprint for LLM finetuning comes from activation gradients and not from the learned LoRA parameters. For a 7B LLaMA model trained on FLAN v2 with a batch size of 1, with LoRA weights equivalent to commonly used 0.2% of the original model weights[ 28,37], the LoRA input gradients have a memory footprint of 567 MB while the LoRA parameters take up only 26 MB. With gradient checkpointing [ 9], the input gradients reduce to an average of 18 MB per sequence making them more memory intensive than all LoRA weights combined. In comparison, the 4-bit base model consumes 5,048 MB of memory. This highlights that gradient checkpointing is important but also that aggressively reducing the amount of LoRA parameter yields only minor memory benefits. This means we can use more adapters without significantly increasing the overall training memory footprint (see Appendix G for a detailed breakdown). As discussed later, this is crucial for recovering full 16-bit precision performance. 3 QL ORA Finetuning QLORAachieves high-fidelity 4-bit finetuning via two techniques we propose—4-bit NormalFloat (NF4) quantization and Double Quantization. Additionally, we introduce Paged Optimizers, to prevent memory spikes during gradient checkpointing from causing out-of-memory errors that have traditionally made finetuning on a single machine difficult for large models. QLORAhas one low-precision storage data type, in our case usually 4-bit, and one computation data type that is usually BFloat16. In practice, this means whenever a QLORAweight tensor is used, we dequantize the tensor to BFloat16, and then perform a matrix multiplication in 16-bit. We now discuss the components of QL ORA followed by a formal definition of QL ORA. 4-bit NormalFloat Quantization The NormalFloat (NF) data type builds on Quantile Quantization [15] which is an information-theoretically optimal data type that ensures each quantization bin has an equal number of values assigned from the input tensor. Quantile quantization works by estimating the quantile of the input tensor through the empirical cumulative distribution function."


Running Evaluation for Reranker: CohereRerank
Visualize Retrieved Nodes for Reranker: CohereRerank


Unnamed: 0,Score,Text
0,0.997469,"(3) Paged Optimizers , using NVIDIA unified memory to avoid the gradient checkpointing memory spikes that occur when processing a mini-batch with a long sequence length. We combine these contributions into a better tuned LoRA approach that includes adapters at every network layer and thereby avoids almost all of the accuracy tradeoffs seen in prior work. QLORA’s efficiency enables us to perform an in-depth study of instruction finetuning and chatbot performance on model scales that would be impossible using regular finetuning due to memory overhead. Therefore, we train more than 1,000 models across several instruction tuning datasets, model architectures, and sizes between 80M to 65B parameters. In addition to showing that QLORA recovers 16-bit performance (§4) and training a state-of-the-art chatbot, Guanaco , (§5), we also analyze trends in the trained models. First, we find that data quality is far more important than dataset size, e.g., a 9k sample dataset (OASST1) outperformed a 450k sample dataset (FLAN v2, subsampled) on chatbot performance, even when both are meant to support instruction following generalization. Second, we show that strong Massive Multitask Language Understanding (MMLU) benchmark performance does not imply strong Vicuna chatbot benchmark performance and vice versa—in other words, dataset suitability matters more than size for a given task. Furthermore, we also provide a extensive analysis of chatbot performance that uses both human raters and GPT-4 for evaluation. We use tournament-style benchmarking where models compete against each other in matches to produce the best response for a given prompt. The winner of a match is judged by either GPT-4 or human annotators. The tournament results are aggregated into Elo scores [ 16,17] which determine the ranking of chatbot performance. We find that GPT-4 and human evaluations largely agree on the rank of model performance in the tournaments, but we also find there are instances of strong disagreement. As such, we highlight that model-based evaluation while providing a cheap alternative to human-annotation also has its uncertainties. We augment our chatbot benchmark results with a qualitative analysis of Guanaco models."
1,0.99315,"This second step yields the quantized quantization constants cFP8 2and the second level of quantization constants cFP32 1. We use 8-bit Floats with a blocksize of 256 for the second quantization as no performance degradation is observed for 8-bit quantization, in line with results from Dettmers and Zettlemoyer [13]. Since the cFP32 2are positive, we subtract the mean from c2before quantization to center the values around zero and make use of symmetric quantization. On average, for a blocksize of 64, this quantization reduces the memory footprint per parameter from 32/64 = 0 .5bits, to 8/64 + 32 /(64·256) = 0 .127bits, a reduction of 0.373 bits per parameter. Paged Optimizers use the NVIDIA unified memory3feature wich does automatic page-to-page transfers between the CPU and GPU for error-free GPU processing in the scenario where the GPU occasionally runs out-of-memory. The feature works like regular memory paging between CPU RAM and the disk. We use this feature to allocate paged memory for the optimizer states which are then automatically evicted to CPU RAM when the GPU runs out-of-memory and paged back into GPU memory when the memory is needed in the optimizer update step. QL ORA.Using the components described above, we define QLORAfor a single linear layer in the quantized base model with a single LoRA adapter as follows: YBF16=XBF16doubleDequant (cFP32 1, ck-bit 2,WNF4) +XBF16LBF16 1LBF16 2, (5) where doubleDequant (·)is defined as: doubleDequant (cFP32 1, ck-bit 2,Wk-bit) =dequant (dequant (cFP32 1, ck-bit 2),W4bit) =WBF16,(6) We use NF4 for Wand FP8 for c2. We use a blocksize of 64 for Wfor higher quantization precision and a blocksize of 256 for c2to conserve memory."
2,0.973894,"Parameter Efficient Finetuning (PEFT) method, most of the memory footprint for LLM finetuning comes from activation gradients and not from the learned LoRA parameters. For a 7B LLaMA model trained on FLAN v2 with a batch size of 1, with LoRA weights equivalent to commonly used 0.2% of the original model weights[ 28,37], the LoRA input gradients have a memory footprint of 567 MB while the LoRA parameters take up only 26 MB. With gradient checkpointing [ 9], the input gradients reduce to an average of 18 MB per sequence making them more memory intensive than all LoRA weights combined. In comparison, the 4-bit base model consumes 5,048 MB of memory. This highlights that gradient checkpointing is important but also that aggressively reducing the amount of LoRA parameter yields only minor memory benefits. This means we can use more adapters without significantly increasing the overall training memory footprint (see Appendix G for a detailed breakdown). As discussed later, this is crucial for recovering full 16-bit precision performance. 3 QL ORA Finetuning QLORAachieves high-fidelity 4-bit finetuning via two techniques we propose—4-bit NormalFloat (NF4) quantization and Double Quantization. Additionally, we introduce Paged Optimizers, to prevent memory spikes during gradient checkpointing from causing out-of-memory errors that have traditionally made finetuning on a single machine difficult for large models. QLORAhas one low-precision storage data type, in our case usually 4-bit, and one computation data type that is usually BFloat16. In practice, this means whenever a QLORAweight tensor is used, we dequantize the tensor to BFloat16, and then perform a matrix multiplication in 16-bit. We now discuss the components of QL ORA followed by a formal definition of QL ORA. 4-bit NormalFloat Quantization The NormalFloat (NF) data type builds on Quantile Quantization [15] which is an information-theoretically optimal data type that ensures each quantization bin has an equal number of values assigned from the input tensor. Quantile quantization works by estimating the quantile of the input tensor through the empirical cumulative distribution function."
3,0.968915,"Each point represents a run with a different random seed. We improve on the Stanford Alpaca fully finetuned default hyperparameters to construct a strong 16-bit baseline for comparisons. Using LoRA on all transformer layers is critical to match 16-bit performance.While paged optimizers are critical to do 33B/65B QLORAtuning on a single 24/48GB GPU, we do not provide hard measurements for Paged Optimiz- ers since the paging only occurs when processing mini-batches with long sequence lengths, which is rare. We do, however, perform an analysis of the runtime of paged optimizers for 65B models on 48GB GPUs and find that with a batch size of 16, paged optimizers provide the same training speed as regular optimizers. Future work should measure and characterize under what circumstances slow- downs occur from the paging process. Default LoRA hyperparameters do not match 16- bit performance When using the standard prac- tice of applying LoRA to query and value attention projection matrices [ 28], we are not able to replicate full finetuning performance for large base models. As shown in Figure 2 for LLaMA 7B finetuning on Alpaca, we find that the most critical LoRA hyper- parameter is how many LoRA adapters are used in total and that LoRA on all linear transformer block layers are required to match full finetuning perfor- mance. Other LoRA hyperparameters, such as the projection dimension r, do not affect performance (see Appendix A). 1010 1011 T otal model bits 0.60 0.61 0.62 0.63 0.64 0.65 0.66 0.67Mean zeroshot accuracy 4-bit LLaMA Float NFloat NFloat + DQData type Figure 3: Mean zero-shot accuracy over Wino- grande, HellaSwag, PiQA, Arc-Easy, and Arc- Challenge using LLaMA models with different 4-bit data types. The NormalFloat data type significantly improves the bit-for-bit accuracy gains compared to regular 4-bit Floats."
4,0.960805,"This marks a significant shift in accessibility of LLM finetuning: now the largest publicly available models to date finetunable on a single GPU. Using QLORA, we train the Gua- naco family of models, with the second best model reaching 97.8% of the performance level of ChatGPT on the Vicuna [ 10] benchmark, while being trainable in less than 12 hours on a single consumer GPU; using a single professional GPU over 24 hours we achieve 99.3% with our largest model, essentially closing the gap to ChatGPT on the Vicuna bench- mark. When deployed, our smallest Guanaco model (7B parameters) requires just 5 GB of memory and outperforms a 26 GB Alpaca model by more than 20 percentage points on the Vicuna benchmark (Table 6). QLORAintroduces multiple innovations designed to reduce memory use without sacrificing per- formance: (1) 4-bit NormalFloat , an information theoretically optimal quantization data type for normally distributed data that yields better empirical results than 4-bit Integers and 4-bit Floats. (2)Double Quantization , a method that quantizes the quantization constants, saving an average of about 0.37 bits per parameter (approximately 3 GB for a 65B model). (3) Paged Optimizers , using NVIDIA unified memory to avoid the gradient checkpointing memory spikes that occur when processing a mini-batch with a long sequence length. We combine these contributions into a better tuned LoRA approach that includes adapters at every network layer and thereby avoids almost all of the accuracy tradeoffs seen in prior work. QLORA’s efficiency enables us to perform an in-depth study of instruction finetuning and chatbot performance on model scales that would be impossible using regular finetuning due to memory overhead. Therefore, we train more than 1,000 models across several instruction tuning datasets, model architectures, and sizes between 80M to 65B parameters. In addition to showing that QLORA recovers 16-bit performance (§4) and training a state-of-the-art chatbot, Guanaco , (§5), we also analyze trends in the trained models."


Running Evaluation for Reranker: bge-reranker-base
Visualize Retrieved Nodes for Reranker: bge-reranker-base


Unnamed: 0,Score,Text
0,0.277108,"Each point represents a run with a different random seed. We improve on the Stanford Alpaca fully finetuned default hyperparameters to construct a strong 16-bit baseline for comparisons. Using LoRA on all transformer layers is critical to match 16-bit performance.While paged optimizers are critical to do 33B/65B QLORAtuning on a single 24/48GB GPU, we do not provide hard measurements for Paged Optimiz- ers since the paging only occurs when processing mini-batches with long sequence lengths, which is rare. We do, however, perform an analysis of the runtime of paged optimizers for 65B models on 48GB GPUs and find that with a batch size of 16, paged optimizers provide the same training speed as regular optimizers. Future work should measure and characterize under what circumstances slow- downs occur from the paging process. Default LoRA hyperparameters do not match 16- bit performance When using the standard prac- tice of applying LoRA to query and value attention projection matrices [ 28], we are not able to replicate full finetuning performance for large base models. As shown in Figure 2 for LLaMA 7B finetuning on Alpaca, we find that the most critical LoRA hyper- parameter is how many LoRA adapters are used in total and that LoRA on all linear transformer block layers are required to match full finetuning perfor- mance. Other LoRA hyperparameters, such as the projection dimension r, do not affect performance (see Appendix A). 1010 1011 T otal model bits 0.60 0.61 0.62 0.63 0.64 0.65 0.66 0.67Mean zeroshot accuracy 4-bit LLaMA Float NFloat NFloat + DQData type Figure 3: Mean zero-shot accuracy over Wino- grande, HellaSwag, PiQA, Arc-Easy, and Arc- Challenge using LLaMA models with different 4-bit data types. The NormalFloat data type significantly improves the bit-for-bit accuracy gains compared to regular 4-bit Floats."
1,0.223023,"Parameter Efficient Finetuning (PEFT) method, most of the memory footprint for LLM finetuning comes from activation gradients and not from the learned LoRA parameters. For a 7B LLaMA model trained on FLAN v2 with a batch size of 1, with LoRA weights equivalent to commonly used 0.2% of the original model weights[ 28,37], the LoRA input gradients have a memory footprint of 567 MB while the LoRA parameters take up only 26 MB. With gradient checkpointing [ 9], the input gradients reduce to an average of 18 MB per sequence making them more memory intensive than all LoRA weights combined. In comparison, the 4-bit base model consumes 5,048 MB of memory. This highlights that gradient checkpointing is important but also that aggressively reducing the amount of LoRA parameter yields only minor memory benefits. This means we can use more adapters without significantly increasing the overall training memory footprint (see Appendix G for a detailed breakdown). As discussed later, this is crucial for recovering full 16-bit precision performance. 3 QL ORA Finetuning QLORAachieves high-fidelity 4-bit finetuning via two techniques we propose—4-bit NormalFloat (NF4) quantization and Double Quantization. Additionally, we introduce Paged Optimizers, to prevent memory spikes during gradient checkpointing from causing out-of-memory errors that have traditionally made finetuning on a single machine difficult for large models. QLORAhas one low-precision storage data type, in our case usually 4-bit, and one computation data type that is usually BFloat16. In practice, this means whenever a QLORAweight tensor is used, we dequantize the tensor to BFloat16, and then perform a matrix multiplication in 16-bit. We now discuss the components of QL ORA followed by a formal definition of QL ORA. 4-bit NormalFloat Quantization The NormalFloat (NF) data type builds on Quantile Quantization [15] which is an information-theoretically optimal data type that ensures each quantization bin has an equal number of values assigned from the input tensor. Quantile quantization works by estimating the quantile of the input tensor through the empirical cumulative distribution function."
2,0.208092,"This marks a significant shift in accessibility of LLM finetuning: now the largest publicly available models to date finetunable on a single GPU. Using QLORA, we train the Gua- naco family of models, with the second best model reaching 97.8% of the performance level of ChatGPT on the Vicuna [ 10] benchmark, while being trainable in less than 12 hours on a single consumer GPU; using a single professional GPU over 24 hours we achieve 99.3% with our largest model, essentially closing the gap to ChatGPT on the Vicuna bench- mark. When deployed, our smallest Guanaco model (7B parameters) requires just 5 GB of memory and outperforms a 26 GB Alpaca model by more than 20 percentage points on the Vicuna benchmark (Table 6). QLORAintroduces multiple innovations designed to reduce memory use without sacrificing per- formance: (1) 4-bit NormalFloat , an information theoretically optimal quantization data type for normally distributed data that yields better empirical results than 4-bit Integers and 4-bit Floats. (2)Double Quantization , a method that quantizes the quantization constants, saving an average of about 0.37 bits per parameter (approximately 3 GB for a 65B model). (3) Paged Optimizers , using NVIDIA unified memory to avoid the gradient checkpointing memory spikes that occur when processing a mini-batch with a long sequence length. We combine these contributions into a better tuned LoRA approach that includes adapters at every network layer and thereby avoids almost all of the accuracy tradeoffs seen in prior work. QLORA’s efficiency enables us to perform an in-depth study of instruction finetuning and chatbot performance on model scales that would be impossible using regular finetuning due to memory overhead. Therefore, we train more than 1,000 models across several instruction tuning datasets, model architectures, and sizes between 80M to 65B parameters. In addition to showing that QLORA recovers 16-bit performance (§4) and training a state-of-the-art chatbot, Guanaco , (§5), we also analyze trends in the trained models."
3,0.122867,"(3) Paged Optimizers , using NVIDIA unified memory to avoid the gradient checkpointing memory spikes that occur when processing a mini-batch with a long sequence length. We combine these contributions into a better tuned LoRA approach that includes adapters at every network layer and thereby avoids almost all of the accuracy tradeoffs seen in prior work. QLORA’s efficiency enables us to perform an in-depth study of instruction finetuning and chatbot performance on model scales that would be impossible using regular finetuning due to memory overhead. Therefore, we train more than 1,000 models across several instruction tuning datasets, model architectures, and sizes between 80M to 65B parameters. In addition to showing that QLORA recovers 16-bit performance (§4) and training a state-of-the-art chatbot, Guanaco , (§5), we also analyze trends in the trained models. First, we find that data quality is far more important than dataset size, e.g., a 9k sample dataset (OASST1) outperformed a 450k sample dataset (FLAN v2, subsampled) on chatbot performance, even when both are meant to support instruction following generalization. Second, we show that strong Massive Multitask Language Understanding (MMLU) benchmark performance does not imply strong Vicuna chatbot benchmark performance and vice versa—in other words, dataset suitability matters more than size for a given task. Furthermore, we also provide a extensive analysis of chatbot performance that uses both human raters and GPT-4 for evaluation. We use tournament-style benchmarking where models compete against each other in matches to produce the best response for a given prompt. The winner of a match is judged by either GPT-4 or human annotators. The tournament results are aggregated into Elo scores [ 16,17] which determine the ranking of chatbot performance. We find that GPT-4 and human evaluations largely agree on the rank of model performance in the tournaments, but we also find there are instances of strong disagreement. As such, we highlight that model-based evaluation while providing a cheap alternative to human-annotation also has its uncertainties. We augment our chatbot benchmark results with a qualitative analysis of Guanaco models."
4,0.117635,"This second step yields the quantized quantization constants cFP8 2and the second level of quantization constants cFP32 1. We use 8-bit Floats with a blocksize of 256 for the second quantization as no performance degradation is observed for 8-bit quantization, in line with results from Dettmers and Zettlemoyer [13]. Since the cFP32 2are positive, we subtract the mean from c2before quantization to center the values around zero and make use of symmetric quantization. On average, for a blocksize of 64, this quantization reduces the memory footprint per parameter from 32/64 = 0 .5bits, to 8/64 + 32 /(64·256) = 0 .127bits, a reduction of 0.373 bits per parameter. Paged Optimizers use the NVIDIA unified memory3feature wich does automatic page-to-page transfers between the CPU and GPU for error-free GPU processing in the scenario where the GPU occasionally runs out-of-memory. The feature works like regular memory paging between CPU RAM and the disk. We use this feature to allocate paged memory for the optimizer states which are then automatically evicted to CPU RAM when the GPU runs out-of-memory and paged back into GPU memory when the memory is needed in the optimizer update step. QL ORA.Using the components described above, we define QLORAfor a single linear layer in the quantized base model with a single LoRA adapter as follows: YBF16=XBF16doubleDequant (cFP32 1, ck-bit 2,WNF4) +XBF16LBF16 1LBF16 2, (5) where doubleDequant (·)is defined as: doubleDequant (cFP32 1, ck-bit 2,Wk-bit) =dequant (dequant (cFP32 1, ck-bit 2),W4bit) =WBF16,(6) We use NF4 for Wand FP8 for c2. We use a blocksize of 64 for Wfor higher quantization precision and a blocksize of 256 for c2to conserve memory."


Running Evaluation for Reranker: bge-reranker-large
Visualize Retrieved Nodes for Reranker: bge-reranker-large


Unnamed: 0,Score,Text
0,0.242676,"Parameter Efficient Finetuning (PEFT) method, most of the memory footprint for LLM finetuning comes from activation gradients and not from the learned LoRA parameters. For a 7B LLaMA model trained on FLAN v2 with a batch size of 1, with LoRA weights equivalent to commonly used 0.2% of the original model weights[ 28,37], the LoRA input gradients have a memory footprint of 567 MB while the LoRA parameters take up only 26 MB. With gradient checkpointing [ 9], the input gradients reduce to an average of 18 MB per sequence making them more memory intensive than all LoRA weights combined. In comparison, the 4-bit base model consumes 5,048 MB of memory. This highlights that gradient checkpointing is important but also that aggressively reducing the amount of LoRA parameter yields only minor memory benefits. This means we can use more adapters without significantly increasing the overall training memory footprint (see Appendix G for a detailed breakdown). As discussed later, this is crucial for recovering full 16-bit precision performance. 3 QL ORA Finetuning QLORAachieves high-fidelity 4-bit finetuning via two techniques we propose—4-bit NormalFloat (NF4) quantization and Double Quantization. Additionally, we introduce Paged Optimizers, to prevent memory spikes during gradient checkpointing from causing out-of-memory errors that have traditionally made finetuning on a single machine difficult for large models. QLORAhas one low-precision storage data type, in our case usually 4-bit, and one computation data type that is usually BFloat16. In practice, this means whenever a QLORAweight tensor is used, we dequantize the tensor to BFloat16, and then perform a matrix multiplication in 16-bit. We now discuss the components of QL ORA followed by a formal definition of QL ORA. 4-bit NormalFloat Quantization The NormalFloat (NF) data type builds on Quantile Quantization [15] which is an information-theoretically optimal data type that ensures each quantization bin has an equal number of values assigned from the input tensor. Quantile quantization works by estimating the quantile of the input tensor through the empirical cumulative distribution function."
1,0.055309,"This marks a significant shift in accessibility of LLM finetuning: now the largest publicly available models to date finetunable on a single GPU. Using QLORA, we train the Gua- naco family of models, with the second best model reaching 97.8% of the performance level of ChatGPT on the Vicuna [ 10] benchmark, while being trainable in less than 12 hours on a single consumer GPU; using a single professional GPU over 24 hours we achieve 99.3% with our largest model, essentially closing the gap to ChatGPT on the Vicuna bench- mark. When deployed, our smallest Guanaco model (7B parameters) requires just 5 GB of memory and outperforms a 26 GB Alpaca model by more than 20 percentage points on the Vicuna benchmark (Table 6). QLORAintroduces multiple innovations designed to reduce memory use without sacrificing per- formance: (1) 4-bit NormalFloat , an information theoretically optimal quantization data type for normally distributed data that yields better empirical results than 4-bit Integers and 4-bit Floats. (2)Double Quantization , a method that quantizes the quantization constants, saving an average of about 0.37 bits per parameter (approximately 3 GB for a 65B model). (3) Paged Optimizers , using NVIDIA unified memory to avoid the gradient checkpointing memory spikes that occur when processing a mini-batch with a long sequence length. We combine these contributions into a better tuned LoRA approach that includes adapters at every network layer and thereby avoids almost all of the accuracy tradeoffs seen in prior work. QLORA’s efficiency enables us to perform an in-depth study of instruction finetuning and chatbot performance on model scales that would be impossible using regular finetuning due to memory overhead. Therefore, we train more than 1,000 models across several instruction tuning datasets, model architectures, and sizes between 80M to 65B parameters. In addition to showing that QLORA recovers 16-bit performance (§4) and training a state-of-the-art chatbot, Guanaco , (§5), we also analyze trends in the trained models."
2,0.048074,"This second step yields the quantized quantization constants cFP8 2and the second level of quantization constants cFP32 1. We use 8-bit Floats with a blocksize of 256 for the second quantization as no performance degradation is observed for 8-bit quantization, in line with results from Dettmers and Zettlemoyer [13]. Since the cFP32 2are positive, we subtract the mean from c2before quantization to center the values around zero and make use of symmetric quantization. On average, for a blocksize of 64, this quantization reduces the memory footprint per parameter from 32/64 = 0 .5bits, to 8/64 + 32 /(64·256) = 0 .127bits, a reduction of 0.373 bits per parameter. Paged Optimizers use the NVIDIA unified memory3feature wich does automatic page-to-page transfers between the CPU and GPU for error-free GPU processing in the scenario where the GPU occasionally runs out-of-memory. The feature works like regular memory paging between CPU RAM and the disk. We use this feature to allocate paged memory for the optimizer states which are then automatically evicted to CPU RAM when the GPU runs out-of-memory and paged back into GPU memory when the memory is needed in the optimizer update step. QL ORA.Using the components described above, we define QLORAfor a single linear layer in the quantized base model with a single LoRA adapter as follows: YBF16=XBF16doubleDequant (cFP32 1, ck-bit 2,WNF4) +XBF16LBF16 1LBF16 2, (5) where doubleDequant (·)is defined as: doubleDequant (cFP32 1, ck-bit 2,Wk-bit) =dequant (dequant (cFP32 1, ck-bit 2),W4bit) =WBF16,(6) We use NF4 for Wand FP8 for c2. We use a blocksize of 64 for Wfor higher quantization precision and a blocksize of 256 for c2to conserve memory."
3,0.045258,"(3) Paged Optimizers , using NVIDIA unified memory to avoid the gradient checkpointing memory spikes that occur when processing a mini-batch with a long sequence length. We combine these contributions into a better tuned LoRA approach that includes adapters at every network layer and thereby avoids almost all of the accuracy tradeoffs seen in prior work. QLORA’s efficiency enables us to perform an in-depth study of instruction finetuning and chatbot performance on model scales that would be impossible using regular finetuning due to memory overhead. Therefore, we train more than 1,000 models across several instruction tuning datasets, model architectures, and sizes between 80M to 65B parameters. In addition to showing that QLORA recovers 16-bit performance (§4) and training a state-of-the-art chatbot, Guanaco , (§5), we also analyze trends in the trained models. First, we find that data quality is far more important than dataset size, e.g., a 9k sample dataset (OASST1) outperformed a 450k sample dataset (FLAN v2, subsampled) on chatbot performance, even when both are meant to support instruction following generalization. Second, we show that strong Massive Multitask Language Understanding (MMLU) benchmark performance does not imply strong Vicuna chatbot benchmark performance and vice versa—in other words, dataset suitability matters more than size for a given task. Furthermore, we also provide a extensive analysis of chatbot performance that uses both human raters and GPT-4 for evaluation. We use tournament-style benchmarking where models compete against each other in matches to produce the best response for a given prompt. The winner of a match is judged by either GPT-4 or human annotators. The tournament results are aggregated into Elo scores [ 16,17] which determine the ranking of chatbot performance. We find that GPT-4 and human evaluations largely agree on the rank of model performance in the tournaments, but we also find there are instances of strong disagreement. As such, we highlight that model-based evaluation while providing a cheap alternative to human-annotation also has its uncertainties. We augment our chatbot benchmark results with a qualitative analysis of Guanaco models."
4,0.034322,"Experimental setup. We consider three architectures (encoder, encoder-decoder, and decoder only) and compare QLoRA with 16-bit adapter-finetuning and with full-finetuning for models up to 3B. Our evaluations include GLUE [ 58] with RoBERTa-large [ 38], Super-NaturalInstructions (TKInstruct) [61] with T5 [ 49], and 5-shot MMLU [ 24] after finetuning LLaMA on Flan v2 [ 39] and Alpaca [55]. To additionally study the advantages of NF4 over other 4-bit data types, we use the setup of Dettmers and Zettlemoyer [13] and measure post-quantization zero-shot accuracy and perplexity across different models (OPT [ 72], LLaMA [ 57], BLOOM [ 52], Pythia [ 7]) for model sizes 125m - 13B. We provide more details in the results section for each particular setup to make the results more readable. Full details in Appendix A. QLoRA-AllQLoRA-FFN QLoRA-AttentionAlpaca (ours) Stanford-Alpaca Model6061626364RougeL bits 4 16 Figure 2: RougeL for LLaMA 7B models on the Alpaca dataset. Each point represents a run with a different random seed. We improve on the Stanford Alpaca fully finetuned default hyperparameters to construct a strong 16-bit baseline for comparisons. Using LoRA on all transformer layers is critical to match 16-bit performance.While paged optimizers are critical to do 33B/65B QLORAtuning on a single 24/48GB GPU, we do not provide hard measurements for Paged Optimiz- ers since the paging only occurs when processing mini-batches with long sequence lengths, which is rare. We do, however, perform an analysis of the runtime of paged optimizers for 65B models on 48GB GPUs and find that with a batch size of 16, paged optimizers provide the same training speed as regular optimizers."


# Evaluation

Now, we will use RetrieverEvaluator to evaluate the quality of any Retriever module.

We specify a set of different evaluation metrics: this includes hit-rate and MRR. For any given question, these will compare the quality of retrieved results from the ground-truth context.

To ease the burden of creating the eval dataset in the first place, we can rely on synthetic data generation.

## Build an Evaluation dataset of (query, context) pairs
Here we build a simple evaluation dataset over the existing text corpus.

We use our generate_question_context_pairs to generate a set of (question, context) pairs over a given unstructured text corpus. This uses the LLM to auto-generate questions from each context chunk.

We will use `Zephr-7B` LLM to generate Question-Context Pairs.

We get back a EmbeddingQAFinetuneDataset object. At a high-level this contains a set of ids mapping to queries and relevant doc chunks, as well as the corpus itself.

In [17]:
# Prompt to generate questions
qa_generate_prompt_tmpl = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. The questions should not contain options, not start with Q1/ Q2. \
Restrict the questions to the context information provided.\
"""

In [18]:
# Evaluator
from llama_index.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.evaluation import RetrieverEvaluator

qa_dataset = generate_question_context_pairs(
    nodes, llm=llm, num_questions_per_chunk=2, qa_generate_prompt_tmpl=qa_generate_prompt_tmpl
)

100%|██████████| 78/78 [10:18<00:00,  7.93s/it]


In [19]:
len(qa_dataset.corpus.keys())

78

In [22]:
# Generated 2 questions for this chunk
qa_dataset.queries['297a0d1c-57d6-4b8b-928c-5c234cbcc02d']

'How does the Glue benchmark and analysis platform for natural language understanding differ from Self-instruct and Super-naturalinstructions in terms of their approach to language model training and task generalization?'

In [23]:
# Extract relevant doc for this chunk
qa_dataset.relevant_docs['297a0d1c-57d6-4b8b-928c-5c234cbcc02d']

['bc19a32b-006e-4ae2-98ea-6a20b7143f3c']

In [24]:
# Extract corpus for this relevant doc
qa_dataset.corpus['bc19a32b-006e-4ae2-98ea-6a20b7143f3c']

'Glue: A multi-\ntask benchmark and analysis platform for natural language understanding. arXiv preprint\narXiv:1804.07461 , 2018.\n[59] Y . Wang, Y . Kordi, S. Mishra, A. Liu, N. A. Smith, D. Khashabi, and H. Hajishirzi. Self-instruct:\nAligning language model with self generated instructions. arXiv preprint arXiv:2212.10560 ,\n2022.\n[60] Y . Wang, S. Mishra, P. Alipoormolabashi, Y . Kordi, A. Mirzaei, A. Arunkumar, A. Ashok, A. S.\nDhanasekaran, A. Naik, D. Stap, et al. Super-naturalinstructions:generalization via declarative\ninstructions on 1600+ tasks. In EMNLP , 2022.\n[61] Y . Wang, S. Mishra, P. Alipoormolabashi, Y . Kordi, A. Mirzaei, A. Naik, A. Ashok, A. S.\nDhanasekaran, A. Arunkumar, D. Stap, et al. Super-naturalinstructions: Generalization via\ndeclarative instructions on 1600+ nlp tasks. In Proceedings of the 2022 Conference on Empirical\nMethods in Natural Language Processing , pages 5085–5109, 2022.\n[62] J. Wei, M. Bosma, V . Y . Zhao, K. Guu, A. W. Yu, B. Lester, N.

In [25]:
# try it out on a sample query
sample_id, sample_query = list(qa_dataset.queries.items())[1]
sample_expected = qa_dataset.relevant_docs[sample_id]

retriever_evaluator = RetrieverEvaluator.from_metric_names(
        ["mrr", "hit_rate"], retriever=retriever
    )

eval_result = retriever_evaluator.evaluate(sample_query, sample_expected)
print(eval_result)

Query: What are the results of finetuning more than 1,000 models using QLORA, and how do they compare to previous SoTA models in terms of instruction following and chatbot performance? Additionally, what insights does QLORA provide regarding the trustworthiness of current chatbot benchmarks?
Metrics: {'mrr': 0.5, 'hit_rate': 1.0}



## Try it out on an entire dataset

### Define a function to display results

In [26]:
def display_results(reranker_name, eval_results):
    """Display results from evaluate."""

    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    hit_rate = full_df["hit_rate"].mean()
    mrr = full_df["mrr"].mean()

    metric_df = pd.DataFrame({"Reranker": [reranker_name], "hit_rate": [hit_rate], "mrr": [mrr]})

    return metric_df

In [27]:
query_str = "What are the top features of QLoRA?"

results_df = pd.DataFrame()
# Loop over rerankers
for rerank_name, reranker in RERANKERS.items():
    print(f"Running Evaluation for Reranker: {rerank_name}")

    query_bundle = QueryBundle(query_str)

    retrieved_nodes = retriever.retrieve(query_bundle)

    if reranker != "None":
      retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
    else:
        retrieved_nodes

    retriever_evaluator = RetrieverEvaluator.from_metric_names(
        ["mrr", "hit_rate"], retriever=retriever
    )

    eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

    current_df = display_results(rerank_name, eval_results)
    results_df = pd.concat([results_df, current_df], ignore_index=True)

Running Evaluation for Reranker: WithoutReranker
Running Evaluation for Reranker: CohereRerank
Running Evaluation for Reranker: bge-reranker-base
Running Evaluation for Reranker: bge-reranker-large


In [29]:
# results_df

# END