# RAG Pipeline


In [1]:
!pip install numpy==1.26.4 # downgrading the numpy version



After installing numpy 1.26.4, you need to click "Restart session" and then re-import numpy so that the downgraded version is the one actually loaded. Strangely, the Colab release notes (https://colab.research.google.com/notebooks/relnotes.ipynb) have not been updated yet.

In [4]:
import numpy
import pandas as pd

In [5]:
!pip install faiss-cpu
!pip install faiss-gpu-cu12 # CUDA 12.x, Python 3.8+
!pip install dotenv
!pip install langchain_chroma
!pip install langchain-community
!pip install langchain_experimental
!pip install langchain_openai
!pip install flashrank

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m70.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
Collecting faiss-gpu-cu12
  Downloading faiss_gpu_cu12-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading faiss_gpu_cu12-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (47.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.9/47.9 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu-cu12
Successfully installed faiss-gpu-cu12-1.10.0
Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting python-dotenv (from dotenv)
  Downloading python_doten

In [6]:
# NEW WITH GEMINI
import os
import torch
import pandas as pd
from tqdm import tqdm
from typing import List
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain.retrievers import ContextualCompressionRetriever
import logging
from google import genai
import time

def call_gemini_with_interval(client, model, contents, config, interval=20, retries=3, backoff_factor=2):
    """
    Call the Google Gemini API, pausing after a success and retrying on quota errors.

    Args:
        client (genai.Client): The Google Gemini client.
        model (str): The model name (e.g., "gemini-2.0-flash").
        contents (str): The input prompt or question.
        config (dict): Configuration for the API call (e.g., tools, temperature, etc.).
        interval (int): Seconds to sleep after a successful request, before returning.
        retries (int): Maximum number of attempts when the quota is exhausted.
        backoff_factor (int): Base of the exponential backoff; the wait before
            retry number ``attempt + 1`` is ``backoff_factor ** attempt`` seconds.

    Returns:
        response: The response from the Google Gemini API.

    Raises:
        Exception: Any error whose text does not contain "RESOURCE_EXHAUSTED" is
            re-raised unchanged. If every attempt hits the quota, a generic
            ``Exception`` is raised, chained to the last quota error.
    """
    last_quota_error = None
    for attempt in range(retries):
        try:
            # Send the API request
            response = client.models.generate_content(
                model=model,
                contents=contents,
                config=config,
            )
        except Exception as e:
            if "RESOURCE_EXHAUSTED" not in str(e):
                # Not a quota problem: propagate with the original traceback.
                raise
            last_quota_error = e
            # Only back off when another attempt will actually follow.
            if attempt < retries - 1:
                delay = backoff_factor ** attempt
                print(f"Quota exceeded. Retrying in {delay} seconds...")
                time.sleep(delay)
            continue

        print(f"Request succeeded. Waiting {interval} seconds before the next request...")
        time.sleep(interval)  # Throttle: keep a fixed gap before the caller's next request
        return response

    # Chain the last quota error so the root cause stays visible in the traceback.
    raise Exception("Max retries exceeded. Quota still exhausted.") from last_quota_error


# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def load_text_files(path: str) -> List[str]:
    """
    Load the contents of ``.txt`` files from a directory or a single file.

    Directory entries are read in sorted file-name order so the returned list
    is the same on every run and platform (``os.listdir`` itself gives no
    ordering guarantee). A file that cannot be read is logged and skipped; the
    remaining files are still loaded.

    Args:
        path (str): A directory containing ``.txt`` files, or one ``.txt`` file.

    Returns:
        list: The text of each file that was read. Empty if ``path`` does not
        exist, is not a ``.txt`` file, or nothing could be read.
    """
    docs: List[str] = []

    # Work out which files to read.
    try:
        if os.path.isdir(path):
            file_paths = [
                os.path.join(path, file_name)
                for file_name in sorted(os.listdir(path))
                if file_name.endswith(".txt")
            ]
        elif os.path.isfile(path) and path.endswith(".txt"):
            file_paths = [path]
        else:
            file_paths = []
    except Exception as e:
        logging.error(f"Error loading text files from {path}: {e}")
        return docs

    # Read each file independently so one bad entry does not abort the rest.
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                docs.append(file.read())
        except Exception as e:
            logging.error(f"Error loading text files from {file_path}: {e}")

    return docs

def format_retrieved_docs(docs: List[str]) -> str:
    """
    Format the retrieved documents, in reverse order, as numbered contexts.

    Each item may be a plain string or an object exposing a ``page_content``
    attribute. For the latter only the text is used, so the object's string
    form (including any metadata) does not end up in the prompt.

    Args:
        docs (list): Retrieved documents (strings or document-like objects).

    Returns:
        str: "Context 1: ..." blocks separated by blank lines, with the last
        input item first. An empty string if ``docs`` is empty or cannot be
        formatted.
    """
    try:
        texts = [getattr(doc, "page_content", doc) for doc in reversed(docs)]
        return "\n\n".join(f"Context {i}: {text}" for i, text in enumerate(texts, start=1))
    except Exception as e:
        logging.error(f"Error formatting retrieved documents: {e}")
        return ""

def rerank_docs(query: str, retriever, rerank_model_name: str, k: int = 3) -> List[str]:
    """
    Retrieve documents for ``query`` and rerank them with Flashrank.

    The base retriever is wrapped in a ``ContextualCompressionRetriever`` whose
    compressor is a ``FlashrankRerank`` configured with ``top_n=k``.

    Args:
        query (str): The query to retrieve and rerank documents for.
        retriever: The base retriever object.
        rerank_model_name (str): The name of the Flashrank model.
        k (int): Passed to the reranker as ``top_n``.

    Returns:
        list: The result of invoking the reranking retriever, or an empty list
        if anything fails (the error is logged).
    """
    try:
        reranking_retriever = ContextualCompressionRetriever(
            base_compressor=FlashrankRerank(top_n=k, model=rerank_model_name),
            base_retriever=retriever,
        )
        reranked = reranking_retriever.invoke(query)
    except Exception as e:
        logging.error(f"Error reranking documents for query '{query}': {e}")
        reranked = []
    return reranked

def get_hypo_doc(query: str, generation_pipe) -> str:
    """
    Ask the language model for a hypothetical answer document for ``query``.

    Args:
        query (str): The question to write a hypothetical document for.
        generation_pipe: Callable text-generation pipeline; it is called with a
            chat-style message list, ``max_new_tokens=100`` and
            ``return_full_text=False``.

    Returns:
        str: The generated text. Falls back to the original ``query`` when the
        generated text starts with "Unavailable" or when generation fails.
    """
    template = """Imagine you are an expert providing a detailed and factual explanation in response to the query '{query}'.
    Your response should include all key points that would be found in a top search result, without adding any personal opinions, commentary, or experiences.
    Do not include any subjective phrases such as 'I think', 'I believe', or 'I am not sure'. Do not apologize, hedge, or express uncertainty.
    The response should be structured as an objective, factual explanation only, without any conversational elements or chatting.
    If you are truly uncertain and cannot provide an accurate answer, simply respond with: 'Unavailable: {query}'.
    Otherwise, answer confidently with only the relevant information.
    """

    chat = [{"role": "user", "content": template.format(query=query)}]

    try:
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = generation_pipe(chat, max_new_tokens=100, return_full_text=False)
            hypo_doc = outputs[0]["generated_text"]
        logging.info(f"Generated hypothesis document for query: {query}")
        if not hypo_doc.startswith("Unavailable"):
            return hypo_doc
        # The model declined to answer; retrieval falls back to the raw query.
        logging.warning(f"Hypothesis unavailable for query: {query}")
    except Exception as e:
        logging.error(f"Error generating hypothesis document for query '{query}': {e}")
    return query

def answer_generation(
    qa_df: pd.DataFrame,
    output_file: str,
    retriever,
    generation_pipe=None,
    client=None,
    prompt=None,
    rerank: bool = False,
    rerank_model_name: str = "",
    hypo: bool = False,
    top_k_rerank: int = 3,
    model_name: str = "llama"
):
    """
    Generate an answer for every question in ``qa_df`` and append it to a CSV file.

    For each row the function optionally rewrites the query with a hypothetical
    document, retrieves (and optionally reranks) context, builds the prompt,
    calls either Google Gemini or the Hugging Face pipeline, and appends the
    row plus a ``Generated_Answer`` column to ``output_file``.

    The output file doubles as a checkpoint: if it already exists, the number
    of data records in it is counted and that many leading rows of ``qa_df``
    are skipped. Rows are matched by position, not by index label, so resuming
    also works for sampled or filtered frames.
    NOTE: rows that fail (retrieval or generation error) are logged and not
    written, so after such failures the resume point no longer lines up
    exactly with ``qa_df``.

    Args:
        qa_df (pd.DataFrame): DataFrame with a "Question" column plus any metadata.
        output_file (str): Path of the CSV file to create or append to.
        retriever: A retriever object to retrieve documents.
        generation_pipe: A pipeline object for text generation (used for LLaMA).
        client: A Google Gemini client object (used for Gemini).
        prompt: A ChatPromptTemplate object for generating prompts. When falsy,
            a simple "Context/Question/Answer" prompt is used instead.
        rerank (bool): Whether to rerank retrieved documents.
        rerank_model_name (str): The name of the rerank model.
        hypo (bool): Whether to retrieve with a generated hypothesis document
            instead of the raw question.
        top_k_rerank (int): Number of top documents to keep after reranking.
        model_name (str): Must contain "gemini" or "llama" (case-insensitive);
            for Gemini it is also sent as the API model name.
    """
    import csv  # local import: only needed to count already-written CSV records

    logging.info("Starting answer generation...")

    # Make sure the destination directory exists before creating the file.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if not os.path.exists(output_file):
        # Fresh run: write the header row.
        with open(output_file, 'w') as f_out:
            f_out.write(",".join(list(qa_df.columns) + ["Generated_Answer"]) + "\n")
        start_idx = 0
    else:
        # Resume: count CSV records rather than physical lines, because a
        # quoted answer may itself contain newlines.
        with open(output_file, 'r', newline='') as f_out:
            num_rows = sum(1 for _ in csv.reader(f_out))
        start_idx = max(num_rows - 1, 0)  # minus the header row

    # newline='' is what pandas expects for a handle passed to to_csv.
    with open(output_file, 'a', newline='') as f_out:
        for position, (_, row) in enumerate(tqdm(qa_df.iterrows(), total=len(qa_df))):
            if position < start_idx:
                continue

            query = row["Question"]
            if hypo:
                query = get_hypo_doc(query, generation_pipe)

            # Retrieve documents
            try:
                if rerank:
                    logging.info(f"Reranking documents for query: {query}")
                    retrieved_docs = rerank_docs(query, retriever, rerank_model_name, k=top_k_rerank)
                else:
                    retrieved_docs = retriever.invoke(query)
            except Exception as e:
                logging.error(f"Error retrieving documents for query '{query}': {e}")
                continue

            # Format the documents
            context = format_retrieved_docs(retrieved_docs)

            # Create the full prompt (always from the original question, even
            # when a hypothesis document was used for retrieval)
            if prompt:
                prompt_messages = prompt.format_messages(context=context, question=row["Question"])
                full_prompt = "\n".join(message.content for message in prompt_messages)
            else:
                full_prompt = f"Context: {context}\nQuestion: {row['Question']}\nAnswer:"

            # Generate the answer
            try:
                if "gemini" in model_name.lower():
                    # Use Google Gemini for answer generation
                    if client is None:
                        raise ValueError("Google Gemini client is not provided.")
                    response = call_gemini_with_interval(
                        client,
                        model=model_name,  # Use the provided Gemini model name
                        contents=full_prompt,
                        config={},  # {"tools": [{"google_search": {}}]},
                        interval=4,  # Wait 4 seconds between requests
                        retries=3,  # Retry up to 3 times if quota is exceeded
                        backoff_factor=2  # Exponential backoff for retries
                    )
                    llm_output = response.text
                elif "llama" in model_name.lower():
                    # Use Hugging Face LLaMA for answer generation
                    if generation_pipe is None:
                        raise ValueError("Hugging Face generation pipeline is not provided.")
                    messages = [{"role": "user", "content": full_prompt}]
                    with torch.no_grad():
                        llm_output = generation_pipe(
                            messages, max_new_tokens=20, return_full_text=False
                        )[0]["generated_text"]
                else:
                    raise ValueError("Invalid model_name. Please ensure it contains 'gemini' or 'llama'.")

                row["Generated_Answer"] = llm_output
                pd.DataFrame([row]).to_csv(f_out, header=False, index=False)
                # Push the row to disk right away so an interrupted run can resume from it.
                f_out.flush()
            except Exception as e:
                logging.error(f"Error generating answer for query '{query}': {e}")
                continue

            # Clear cache
            del retrieved_docs, context, full_prompt, llm_output
            torch.cuda.empty_cache()

# Constants
# Answer-generation prompt. It has two placeholders, {context} and {question},
# which are the input variables declared where the PromptTemplate is built.
# The string is not raw, so each `\n` below is an escape that becomes a real
# newline in the template text.
PROMPT_TEMPLATE = """
You are an expert assistant answering factual questions about Pittsburgh or Carnegie Mellon University (CMU).
Use the retrieved information to give a detailed and helpful answer. If the provided context does not contain the answer, leverage your pretraining knowledge to provide the correct answer.
If you truly do not know, just say "I don't know."

Important Instructions:
- Answer concisely without repeating the question.
- Use the provided context if relevant; otherwise, rely on your pretraining knowledge.
- Do **not** use complete sentences. Provide only the word, name, date, or phrase that directly answers the question. For example, given the question "When was Carnegie Mellon University founded?", you should only answer "1900".

Examples:
Question: Who is Pittsburgh named after?
Answer: William Pitt
Question: What famous machine learning venue had its first conference in Pittsburgh in 1980?
Answer: ICML
Question: What musical artist is performing at PPG Arena on October 13?
Answer: Billie Eilish

Context: \n\n {context} \n\n
Question: {question} \n\n
Answer:
"""

In [None]:
# NEW WITH GEMINI
import os
import torch
import pandas as pd
from tqdm import tqdm
import faiss
import numpy as np
import pickle
import random
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from langchain_chroma import Chroma
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    PromptTemplate
)


# ========================================
# Helper Functions
# ========================================
def str2bool(value):
    """
    Convert a yes/no style string to a bool.

    Booleans are returned unchanged. Strings are compared case-insensitively
    against the accepted spellings listed below.

    Raises:
        ValueError: If the string is not one of the accepted spellings.
    """
    if isinstance(value, bool):
        return value

    token = value.lower()
    spellings_by_result = (
        (True, ('yes', 'true', 't', 'y', '1')),
        (False, ('no', 'false', 'f', 'n', '0')),
    )
    for result, spellings in spellings_by_result:
        if token in spellings:
            return result
    raise ValueError('Boolean value expected.')

# ========================================
# Main Function for Jupyter/IPython
# ========================================
def run_RAG(
    model_name = "meta-llama/Llama-3.1-8B-Instruct",
    dtype = "float16", # or torch.bfloat16
    embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2",
    embedding_dim = 384,
    splitter_type = "recursive", # or "character", "token", "semantic"
    chunk_size = 1000,
    chunk_overlap = 200,
    text_files_path = "./data/scraped/scraped_all",
    sublink_files_path = "./data/scraped/scraped_text_data",
    sublink_files_nums = 0,
    retriever_type = "FAISS", # or "CHROMA"
    retriever_algorithm = "similarity", # or "mmr"
    rerank = False,
    rerank_model_name = "ms-marco-MultiBERT-L-12",
    top_k_search = 3,
    top_k_rerank = 3,
    hypo = False,
    qes_file_path = "./data/annotated/QA_pairs_1.csv",
    output_file = "./output/results.json",
    qa_nums = 100
):
    """
    Run the full RAG pipeline: set up the LLM, build the retriever, answer the QA set.

    Steps: load environment variables, initialise either a Google Gemini client
    or a Hugging Face LLaMA pipeline (chosen by ``model_name``), load the QA
    CSV, split and embed the corpus (or load the cached splits/embeddings),
    build a FAISS or Chroma retriever, and call ``answer_generation``.

    Args:
        model_name: Must contain "gemini" or "llama" (case-insensitive).
        dtype: "float16" selects ``torch.float16``; any other string selects
            ``torch.bfloat16``; a ``torch.dtype`` instance is used as given.
            Only used for the LLaMA model and pipeline.
        embedding_model_name: Hugging Face name of the embedding model.
        embedding_dim: Ignored; the dimensionality is read from the embedding
            model. Kept for backward compatibility.
        splitter_type: "recursive", "character", "token" or "semantic".
        chunk_size, chunk_overlap: Splitter settings (divided by 4 for the
            token splitter; unused by the semantic splitter).
        text_files_path: Directory (or file) with the main ``.txt`` corpus.
        sublink_files_path: Directory with the additional "sublink" corpus.
        sublink_files_nums: How many sublink documents to sample; 0 disables them.
        retriever_type: "FAISS" or "CHROMA".
        retriever_algorithm: Passed to ``as_retriever`` as ``search_type``.
        rerank, rerank_model_name, top_k_rerank, hypo: Forwarded to ``answer_generation``.
        top_k_search: Number of documents the base retriever returns.
        qes_file_path: CSV file with the questions.
        output_file: File the generated answers are written to (CSV content).
        qa_nums: Number of questions to sample from the QA file.

    Raises:
        ValueError: For an unsupported ``model_name``, ``splitter_type`` or
            ``retriever_type``, or when a Gemini model is requested but
            ``GEMINI_API_KEY`` is not set.
    """
    # Step 0: Load environment variables
    load_dotenv()

    # LangSmith tracing needs an API key. Only switch it on when the key is
    # present; assigning a missing key (None) to os.environ raises TypeError.
    if os.getenv("LANGCHAIN_API_KEY"):
        os.environ["LANGCHAIN_TRACING_V2"] = "true"
        os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
        os.environ["LANGCHAIN_PROJECT"] = "RAGmodel"
    else:
        print("LANGCHAIN_API_KEY is not set; LangSmith tracing is not enabled.")
    os.environ["USER_AGENT"] = "LangChain/1.0 (+https://www.langchain.com)"

    # Log in to the Hugging Face Hub only with a real token; a placeholder
    # string can never authenticate.
    hf_token = os.getenv("HUGGINGFACE_TOKEN")
    if hf_token:
        login(token=hf_token)
    else:
        print("HUGGINGFACE_TOKEN is not set; skipping Hugging Face login.")

    # Resolve the requested precision to a torch dtype
    if isinstance(dtype, torch.dtype):
        torch_dtype = dtype
    else:
        torch_dtype = torch.float16 if dtype == "float16" else torch.bfloat16
    random.seed(42)

    # Check if rerank is set to True
    if rerank:
        print("Reranking is set to True.")

    # Step 1: Initialize the model as your LLM
    # Exactly one of these is set below, depending on the model name.
    gemini_client = None
    generation_pipe = None
    if "gemini" in model_name.lower():
        # Initialize the Gemini client
        print("Initializing Google Gemini client...")
        gemini_api_key = os.getenv("GEMINI_API_KEY")
        if not gemini_api_key:
            raise ValueError("GEMINI_API_KEY is not set; cannot initialize the Google Gemini client.")
        gemini_client = genai.Client(api_key=gemini_api_key)
        print(f"Loading Gemini model: {model_name}")
    elif "llama" in model_name.lower():
        print("Initializing Hugging Face LLaMA model...")

        # Load the model and tokenizer
        print(f"Loading LLaMA model: {model_name}")
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch_dtype, device_map="auto")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"

        # Initialize the text generation pipeline
        generation_pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch_dtype
        )

    else:
        raise ValueError("Invalid model_name. Please ensure it contains 'gemini' or 'llama'.")
    print("Model initialized successfully!")

    # Step 2: Load the Sentence Transformers model for embeddings
    # The cache file names encode every setting that influences the splits.
    docs_length = f"main160_sublink{sublink_files_nums}"
    model_name_str = embedding_model_name.split('/')[-1]
    embeddings_file_path = f"./data/embeddings/embeddings_{model_name_str}_{docs_length}_{splitter_type}_{retriever_type}_{chunk_size}_{chunk_overlap}.npy"
    splits_file_path = f"./data/embeddings/splits_{model_name_str}_{docs_length}_{splitter_type}_{retriever_type}_{chunk_size}_{chunk_overlap}.pkl"
    embeddings = None
    splits = None
    embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
    print(f"Start loading QA from {qes_file_path}")
    qa_test_data_path = qes_file_path
    qa_df = pd.read_csv(qa_test_data_path)
    print(len(qa_df))
    # NOTE(review): 574 looks like the size of a QA set that is meant to be
    # run in full rather than sampled - confirm and promote to a named constant.
    if len(qa_df) != 574:
        # Never request more rows than exist, and renumber the sample 0..n-1
        # so index labels match row positions.
        qa_df = qa_df.sample(min(qa_nums, len(qa_df)), random_state=221).reset_index(drop=True)
    print(f"Loaded {len(qa_df)} QAs")

    # Dynamically determine embedding dimensionality
    embedding_dim = embedding_model.client.get_sentence_embedding_dimension()

    # Both cache files are needed to skip the split/embed work.
    cache_available = os.path.exists(embeddings_file_path) and os.path.exists(splits_file_path)
    if not cache_available:
        # The cache directory may not exist on a fresh checkout.
        os.makedirs(os.path.dirname(embeddings_file_path), exist_ok=True)

        # Step 3: Load the text files for building the index
        print(f"Start loading texts from {text_files_path}")
        # Wrap text strings in Document objects
        docs = load_text_files(path=text_files_path)
        documents = [Document(page_content=text) for text in tqdm(docs, desc="Wrapping text in Document objects")]
        del docs

        if sublink_files_nums != 0:
            sublink_file_store_path = "./data/embeddings/sublink_docs.pkl"
            if os.path.exists(sublink_file_store_path):
                print(f"Start loading sublink files from {sublink_file_store_path}")
                # SECURITY: pickle.load executes arbitrary code; only load cache files you created yourself.
                with open(sublink_file_store_path, "rb") as f:
                    all_sublink_docs = pickle.load(f)
            else:
                print(f"Start reading all sublink files")
                all_sublink_docs = load_text_files(path=sublink_files_path)
                print(f"Finish loading {len(all_sublink_docs)} sublinks, now store it")
                with open(sublink_file_store_path, 'wb') as f:
                    pickle.dump(all_sublink_docs, f)
                print(f"Store all sublink file in {sublink_file_store_path}")

            sampled_sublink_docs = random.sample(all_sublink_docs, sublink_files_nums)
            documents.extend([Document(page_content=text) for text in tqdm(sampled_sublink_docs, desc="Wrapping text in Document objects")])
            del sampled_sublink_docs
            del all_sublink_docs

        # Step 4: Split the documents into smaller chunks
        if splitter_type == "recursive":
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        elif splitter_type == "character":
            text_splitter = CharacterTextSplitter(separator=" ", chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        elif splitter_type == "token":
            text_splitter = TokenTextSplitter(chunk_size=int(chunk_size / 4), chunk_overlap=int(chunk_overlap / 4))
        elif splitter_type == "semantic":
            text_splitter = SemanticChunker(
                embeddings=embedding_model,
                breakpoint_threshold_type="percentile",
                breakpoint_threshold_amount=80
            )
        else:
            raise ValueError("Invalid splitter type. Please choose between recursive, character, token, or semantic.")

        splits = text_splitter.split_documents(documents)
        del documents
        print(f"End splitting texts -- Number of splits: {len(splits)}")

        # Step 5: Embed the chunks and cache embeddings and splits on disk
        embeddings = embedding_model.embed_documents([doc.page_content for doc in tqdm(splits, desc="Embedding texts")])
        print(f"Generated embeddings with dimensionality: {embedding_dim}")
        print(f"End embedding texts")

        # Free GPU cache after generating embeddings
        torch.cuda.empty_cache()
        print(f"Start saving embeddings and splits")
        np.save(embeddings_file_path, embeddings)
        with open(splits_file_path, 'wb') as f:
            pickle.dump(splits, f)
        print(f"Embeddings saved in {embeddings_file_path}, splits saved in {splits_file_path}")
    else:
        print(f"Embeddings already exist! Loading embeddings with dimensionality: {embedding_dim}")
        embeddings = np.load(embeddings_file_path)
        # SECURITY: pickle.load executes arbitrary code; only load cache files you created yourself.
        with open(splits_file_path, 'rb') as f:
            splits = pickle.load(f)
        print("End loading")

    # Step 6: Create the RAG prompting pipeline
    prompt_template = PromptTemplate(
        input_variables=['context', 'question'],
        template=PROMPT_TEMPLATE
    )

    # Wrap the template as the single human message of a chat prompt
    human_message_template = HumanMessagePromptTemplate(prompt=prompt_template)
    chat_prompt_template = ChatPromptTemplate(
        input_variables=['context', 'question'],
        messages=[human_message_template]
    )
    prompt = chat_prompt_template


    # Step 7: Build the retriever
    # NOTE(review): both vector stores are built from `splits` and embed the
    # texts again; the cached `embeddings` array is not used here.
    print("Building the vectorstore ", retriever_type, "...")
    if retriever_type == "CHROMA":
        retriever = Chroma.from_documents(documents=splits, embedding=embedding_model, collection_name="collectionChroma").as_retriever(search_type=retriever_algorithm, search_kwargs={'k': top_k_search})
    elif retriever_type == "FAISS":
        retriever = FAISS.from_documents(splits, embedding_model).as_retriever(search_type=retriever_algorithm, search_kwargs={"k": top_k_search})
    else:
        raise ValueError("Invalid retriever type. Please choose between FAISS or CHROMA.")

    print("Retriever built successfully!")
    # Free GPU cache after building the vector store
    torch.cuda.empty_cache()
    del splits

    # Step 8: Generate answers for the questions.
    # answer_generation picks the backend from model_name; the unused backend
    # object is simply None.
    answer_generation(
        qa_df=qa_df,
        output_file=output_file,
        retriever=retriever,
        generation_pipe=generation_pipe,  # Hugging Face pipeline (LLaMA) or None
        client=gemini_client,  # Google Gemini client or None
        prompt=prompt,
        rerank=rerank,
        rerank_model_name=rerank_model_name,
        hypo=hypo,
        top_k_rerank=top_k_rerank,
        model_name=model_name
    )

    print(f"QA evaluation completed! Results saved to {output_file}")


# run_RAG(
#     model_name="meta-llama/Llama-3.2-3B-Instruct", #"gemini-2.0-flash-thinking-exp-01-21",#
#     dtype="float16",
#     embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
#     embedding_dim = 384,
#     text_files_path="./data/scraped/scraped_all",
#     qes_file_path="./data/annotated/QA_pairs_1.csv",
#     output_file="./output/results.json",
#     qa_nums=100
# )


In [None]:
def run_rag_with_models():
    """
    Sweep `run_RAG` over every model, once with reranking and once without.

    Each (model, rerank) pair writes to its own CSV under ./output/, named
    after the last path component of the model name.
    """
    # Reranker used for the rerank=True runs
    rerank_model_name = "ms-marco-MiniLM-L-12-v2"

    models = [
        "meta-llama/Llama-3.2-3B-Instruct",
        "meta-llama/Llama-3.1-8B-Instruct",
        "gemini-2.0-flash-thinking-exp-01-21",
        "gemini-2.0-flash",
    ]

    for model_name in models:
        short_name = model_name.split('/')[-1]
        for rerank in (True, False):
            # The file name records which reranker was used, or "false" for none.
            suffix = rerank_model_name if rerank else "false"
            output_file = f"./output/{short_name}_rerank_{suffix}.csv"

            print(f"Running RAG for model: {model_name} with rerank={rerank}")

            settings = dict(
                model_name=model_name,
                dtype="float16",
                embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
                embedding_dim=384,
                splitter_type="recursive",
                chunk_size=1000,
                chunk_overlap=200,
                text_files_path="./data/scraped/scraped_all",
                qes_file_path="./data/annotated/QA_pairs_1.csv",
                qa_nums=100,
                retriever_type="FAISS",
                top_k_search=10,
                top_k_rerank=3,
                rerank=rerank,
                rerank_model_name=rerank_model_name,
                output_file=output_file,
            )
            run_RAG(**settings)

            print(f"Completed RAG for model: {model_name} with rerank={rerank}")

In [9]:
run_rag_with_models()

Running RAG for model: gemini-2.0-flash with rerank=True
Reranking is set to True.
Initializing Google Gemini client...
Loading Gemini model: gemini-2.0-flash
Model initialized successfully!


  embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Embeddings already exist! Loading embeddings with dimensionality: 384
End loading
Building the vectorstore  FAISS ...
Retriever built successfully!


  0%|          | 0/100 [00:00<?, ?it/s]
ms-marco-MiniLM-L-12-v2.zip:   0%|          | 0.00/21.6M [00:00<?, ?iB/s][A
ms-marco-MiniLM-L-12-v2.zip: 100%|██████████| 21.6M/21.6M [00:00<00:00, 185MiB/s]


Request succeeded. Waiting 4 seconds before the next request...


  1%|          | 1/100 [00:05<09:45,  5.92s/it]

Request succeeded. Waiting 4 seconds before the next request...


  2%|▏         | 2/100 [00:10<08:50,  5.41s/it]

Request succeeded. Waiting 4 seconds before the next request...


  3%|▎         | 3/100 [00:16<08:41,  5.38s/it]

Request succeeded. Waiting 4 seconds before the next request...


  4%|▍         | 4/100 [00:21<08:29,  5.30s/it]

Request succeeded. Waiting 4 seconds before the next request...


  5%|▌         | 5/100 [00:26<08:14,  5.21s/it]

Request succeeded. Waiting 4 seconds before the next request...


  6%|▌         | 6/100 [00:31<08:02,  5.13s/it]

Request succeeded. Waiting 4 seconds before the next request...


  7%|▋         | 7/100 [00:36<07:55,  5.12s/it]

Request succeeded. Waiting 4 seconds before the next request...


  8%|▊         | 8/100 [00:41<07:49,  5.10s/it]

Request succeeded. Waiting 4 seconds before the next request...


  9%|▉         | 9/100 [00:46<07:44,  5.10s/it]

Request succeeded. Waiting 4 seconds before the next request...


 10%|█         | 10/100 [00:51<07:38,  5.09s/it]

Request succeeded. Waiting 4 seconds before the next request...


 11%|█         | 11/100 [00:57<07:36,  5.13s/it]

Request succeeded. Waiting 4 seconds before the next request...


 12%|█▏        | 12/100 [01:02<07:36,  5.19s/it]

Request succeeded. Waiting 4 seconds before the next request...


 13%|█▎        | 13/100 [01:07<07:28,  5.15s/it]

Request succeeded. Waiting 4 seconds before the next request...


 14%|█▍        | 14/100 [01:12<07:24,  5.17s/it]

Request succeeded. Waiting 4 seconds before the next request...


 15%|█▌        | 15/100 [01:17<07:17,  5.14s/it]

Request succeeded. Waiting 4 seconds before the next request...


 16%|█▌        | 16/100 [01:22<07:06,  5.08s/it]

Request succeeded. Waiting 4 seconds before the next request...


 17%|█▋        | 17/100 [01:27<07:00,  5.07s/it]

Request succeeded. Waiting 4 seconds before the next request...


 18%|█▊        | 18/100 [01:32<06:56,  5.08s/it]

Request succeeded. Waiting 4 seconds before the next request...


 19%|█▉        | 19/100 [01:37<06:51,  5.08s/it]

Request succeeded. Waiting 4 seconds before the next request...


 20%|██        | 20/100 [01:43<06:55,  5.19s/it]

Request succeeded. Waiting 4 seconds before the next request...


 21%|██        | 21/100 [01:48<06:46,  5.15s/it]

Request succeeded. Waiting 4 seconds before the next request...


 22%|██▏       | 22/100 [01:53<06:45,  5.20s/it]

Request succeeded. Waiting 4 seconds before the next request...


 23%|██▎       | 23/100 [01:58<06:37,  5.17s/it]

Request succeeded. Waiting 4 seconds before the next request...


 24%|██▍       | 24/100 [02:03<06:32,  5.16s/it]

Request succeeded. Waiting 4 seconds before the next request...


 25%|██▌       | 25/100 [02:09<06:25,  5.15s/it]

Request succeeded. Waiting 4 seconds before the next request...


 26%|██▌       | 26/100 [02:14<06:16,  5.09s/it]

Request succeeded. Waiting 4 seconds before the next request...


 27%|██▋       | 27/100 [02:19<06:11,  5.09s/it]

Request succeeded. Waiting 4 seconds before the next request...


 28%|██▊       | 28/100 [02:24<06:05,  5.07s/it]

Request succeeded. Waiting 4 seconds before the next request...


 29%|██▉       | 29/100 [02:29<06:02,  5.11s/it]

Request succeeded. Waiting 4 seconds before the next request...


 30%|███       | 30/100 [02:34<05:54,  5.06s/it]

Request succeeded. Waiting 4 seconds before the next request...


 31%|███       | 31/100 [02:39<05:48,  5.05s/it]

Request succeeded. Waiting 4 seconds before the next request...


 32%|███▏      | 32/100 [02:44<05:49,  5.14s/it]

Request succeeded. Waiting 4 seconds before the next request...


 33%|███▎      | 33/100 [02:50<05:53,  5.28s/it]

Request succeeded. Waiting 4 seconds before the next request...


 34%|███▍      | 34/100 [02:55<05:45,  5.23s/it]

Request succeeded. Waiting 4 seconds before the next request...


 35%|███▌      | 35/100 [03:00<05:40,  5.24s/it]

Request succeeded. Waiting 4 seconds before the next request...


 36%|███▌      | 36/100 [03:05<05:36,  5.25s/it]

Request succeeded. Waiting 4 seconds before the next request...


 37%|███▋      | 37/100 [03:11<05:27,  5.20s/it]

Request succeeded. Waiting 4 seconds before the next request...


 38%|███▊      | 38/100 [03:15<05:16,  5.10s/it]

Request succeeded. Waiting 4 seconds before the next request...


 39%|███▉      | 39/100 [03:20<05:09,  5.08s/it]

Request succeeded. Waiting 4 seconds before the next request...


 40%|████      | 40/100 [03:25<05:03,  5.06s/it]

Request succeeded. Waiting 4 seconds before the next request...


 41%|████      | 41/100 [03:31<05:05,  5.18s/it]

Request succeeded. Waiting 4 seconds before the next request...


 42%|████▏     | 42/100 [03:36<04:57,  5.13s/it]

Request succeeded. Waiting 4 seconds before the next request...


 43%|████▎     | 43/100 [03:41<04:50,  5.09s/it]

Request succeeded. Waiting 4 seconds before the next request...


 44%|████▍     | 44/100 [03:46<04:46,  5.11s/it]

Request succeeded. Waiting 4 seconds before the next request...


 45%|████▌     | 45/100 [03:51<04:39,  5.08s/it]

Request succeeded. Waiting 4 seconds before the next request...


 46%|████▌     | 46/100 [03:56<04:34,  5.08s/it]

Request succeeded. Waiting 4 seconds before the next request...


 47%|████▋     | 47/100 [04:01<04:28,  5.07s/it]

Request succeeded. Waiting 4 seconds before the next request...


 48%|████▊     | 48/100 [04:06<04:26,  5.13s/it]

Request succeeded. Waiting 4 seconds before the next request...


 49%|████▉     | 49/100 [04:11<04:16,  5.03s/it]

Request succeeded. Waiting 4 seconds before the next request...


 50%|█████     | 50/100 [04:16<04:11,  5.04s/it]

Request succeeded. Waiting 4 seconds before the next request...


 51%|█████     | 51/100 [04:22<04:11,  5.13s/it]

Request succeeded. Waiting 4 seconds before the next request...


 52%|█████▏    | 52/100 [04:27<04:08,  5.17s/it]

Request succeeded. Waiting 4 seconds before the next request...


 53%|█████▎    | 53/100 [04:32<04:04,  5.20s/it]

Request succeeded. Waiting 4 seconds before the next request...


 54%|█████▍    | 54/100 [04:37<03:57,  5.17s/it]

Request succeeded. Waiting 4 seconds before the next request...


 55%|█████▌    | 55/100 [04:43<03:53,  5.19s/it]

Request succeeded. Waiting 4 seconds before the next request...


 56%|█████▌    | 56/100 [04:48<03:48,  5.19s/it]

Request succeeded. Waiting 4 seconds before the next request...


 57%|█████▋    | 57/100 [04:53<03:39,  5.11s/it]

Request succeeded. Waiting 4 seconds before the next request...


 58%|█████▊    | 58/100 [04:58<03:33,  5.08s/it]

Request succeeded. Waiting 4 seconds before the next request...


 59%|█████▉    | 59/100 [05:03<03:25,  5.02s/it]

Request succeeded. Waiting 4 seconds before the next request...


 60%|██████    | 60/100 [05:08<03:22,  5.05s/it]

Request succeeded. Waiting 4 seconds before the next request...


 61%|██████    | 61/100 [05:13<03:23,  5.22s/it]

Request succeeded. Waiting 4 seconds before the next request...


 62%|██████▏   | 62/100 [05:18<03:15,  5.15s/it]

Request succeeded. Waiting 4 seconds before the next request...


 63%|██████▎   | 63/100 [05:23<03:09,  5.12s/it]

Request succeeded. Waiting 4 seconds before the next request...


 64%|██████▍   | 64/100 [05:29<03:07,  5.20s/it]

Request succeeded. Waiting 4 seconds before the next request...


 65%|██████▌   | 65/100 [05:34<02:59,  5.13s/it]

Request succeeded. Waiting 4 seconds before the next request...


 66%|██████▌   | 66/100 [05:39<02:53,  5.09s/it]

Request succeeded. Waiting 4 seconds before the next request...


 67%|██████▋   | 67/100 [05:44<02:46,  5.05s/it]

Request succeeded. Waiting 4 seconds before the next request...


 68%|██████▊   | 68/100 [05:49<02:43,  5.12s/it]

Request succeeded. Waiting 4 seconds before the next request...


 69%|██████▉   | 69/100 [05:54<02:39,  5.16s/it]

Request succeeded. Waiting 4 seconds before the next request...


 70%|███████   | 70/100 [06:00<02:36,  5.21s/it]

Request succeeded. Waiting 4 seconds before the next request...


 71%|███████   | 71/100 [06:05<02:31,  5.22s/it]

Request succeeded. Waiting 4 seconds before the next request...


 72%|███████▏  | 72/100 [06:10<02:27,  5.27s/it]

Request succeeded. Waiting 4 seconds before the next request...


 73%|███████▎  | 73/100 [06:15<02:18,  5.13s/it]

Request succeeded. Waiting 4 seconds before the next request...


 74%|███████▍  | 74/100 [06:20<02:11,  5.05s/it]

Request succeeded. Waiting 4 seconds before the next request...


 75%|███████▌  | 75/100 [06:25<02:09,  5.18s/it]

Request succeeded. Waiting 4 seconds before the next request...


 76%|███████▌  | 76/100 [06:30<02:03,  5.13s/it]

Request succeeded. Waiting 4 seconds before the next request...


 77%|███████▋  | 77/100 [06:35<01:57,  5.09s/it]

Request succeeded. Waiting 4 seconds before the next request...


 78%|███████▊  | 78/100 [06:41<01:53,  5.16s/it]

Request succeeded. Waiting 4 seconds before the next request...


 79%|███████▉  | 79/100 [06:46<01:48,  5.15s/it]

Request succeeded. Waiting 4 seconds before the next request...


 80%|████████  | 80/100 [06:51<01:43,  5.16s/it]

Request succeeded. Waiting 4 seconds before the next request...


 81%|████████  | 81/100 [06:56<01:36,  5.11s/it]

Request succeeded. Waiting 4 seconds before the next request...


 82%|████████▏ | 82/100 [07:01<01:32,  5.13s/it]

Request succeeded. Waiting 4 seconds before the next request...


 83%|████████▎ | 83/100 [07:06<01:26,  5.08s/it]

Request succeeded. Waiting 4 seconds before the next request...


 84%|████████▍ | 84/100 [07:11<01:21,  5.10s/it]

Request succeeded. Waiting 4 seconds before the next request...


 85%|████████▌ | 85/100 [07:16<01:16,  5.08s/it]

Request succeeded. Waiting 4 seconds before the next request...


 86%|████████▌ | 86/100 [07:21<01:11,  5.08s/it]

Request succeeded. Waiting 4 seconds before the next request...


 87%|████████▋ | 87/100 [07:26<01:05,  5.07s/it]

Request succeeded. Waiting 4 seconds before the next request...


 88%|████████▊ | 88/100 [07:32<01:01,  5.10s/it]

Request succeeded. Waiting 4 seconds before the next request...


 89%|████████▉ | 89/100 [07:37<00:55,  5.08s/it]

Request succeeded. Waiting 4 seconds before the next request...


 90%|█████████ | 90/100 [07:42<00:50,  5.09s/it]

Request succeeded. Waiting 4 seconds before the next request...


 91%|█████████ | 91/100 [07:47<00:46,  5.20s/it]

Request succeeded. Waiting 4 seconds before the next request...


 92%|█████████▏| 92/100 [07:52<00:40,  5.09s/it]

Request succeeded. Waiting 4 seconds before the next request...


 93%|█████████▎| 93/100 [07:57<00:36,  5.16s/it]

Request succeeded. Waiting 4 seconds before the next request...


 94%|█████████▍| 94/100 [08:02<00:30,  5.11s/it]

Request succeeded. Waiting 4 seconds before the next request...


 95%|█████████▌| 95/100 [08:07<00:25,  5.08s/it]

Request succeeded. Waiting 4 seconds before the next request...


 96%|█████████▌| 96/100 [08:12<00:20,  5.08s/it]

Request succeeded. Waiting 4 seconds before the next request...


 97%|█████████▋| 97/100 [08:17<00:15,  5.03s/it]

Request succeeded. Waiting 4 seconds before the next request...


 98%|█████████▊| 98/100 [08:22<00:10,  5.08s/it]

Request succeeded. Waiting 4 seconds before the next request...


 99%|█████████▉| 99/100 [08:27<00:05,  5.06s/it]

Request succeeded. Waiting 4 seconds before the next request...


100%|██████████| 100/100 [08:32<00:00,  5.13s/it]


QA evaluation completed! Results saved to ./output/gemini-2.0-flash_rerank_ms-marco-MiniLM-L-12-v2.csv
Completed RAG for model: gemini-2.0-flash with rerank=True
Running RAG for model: gemini-2.0-flash with rerank=False
Initializing Google Gemini client...
Loading Gemini model: gemini-2.0-flash
Model initialized successfully!
Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Embeddings already exist! Loading embeddings with dimensionality: 384
End loading
Building the vectorstore  FAISS ...
Retriever built successfully!


  0%|          | 0/100 [00:00<?, ?it/s]

Request succeeded. Waiting 4 seconds before the next request...


  1%|          | 1/100 [00:04<07:36,  4.61s/it]

Request succeeded. Waiting 4 seconds before the next request...


  2%|▏         | 2/100 [00:09<07:20,  4.50s/it]

Request succeeded. Waiting 4 seconds before the next request...


  3%|▎         | 3/100 [00:13<07:09,  4.43s/it]

Request succeeded. Waiting 4 seconds before the next request...


  4%|▍         | 4/100 [00:17<07:04,  4.42s/it]

Request succeeded. Waiting 4 seconds before the next request...


  5%|▌         | 5/100 [00:22<07:06,  4.49s/it]

Request succeeded. Waiting 4 seconds before the next request...


  6%|▌         | 6/100 [00:27<07:06,  4.53s/it]

Request succeeded. Waiting 4 seconds before the next request...


  7%|▋         | 7/100 [00:31<07:02,  4.54s/it]

Request succeeded. Waiting 4 seconds before the next request...


  8%|▊         | 8/100 [00:36<06:59,  4.56s/it]

Request succeeded. Waiting 4 seconds before the next request...


  9%|▉         | 9/100 [00:40<06:58,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 10%|█         | 10/100 [00:45<06:46,  4.52s/it]

Request succeeded. Waiting 4 seconds before the next request...


 11%|█         | 11/100 [00:49<06:43,  4.53s/it]

Request succeeded. Waiting 4 seconds before the next request...


 12%|█▏        | 12/100 [00:54<06:43,  4.58s/it]

Request succeeded. Waiting 4 seconds before the next request...


 13%|█▎        | 13/100 [00:58<06:33,  4.53s/it]

Request succeeded. Waiting 4 seconds before the next request...


 14%|█▍        | 14/100 [01:03<06:31,  4.55s/it]

Request succeeded. Waiting 4 seconds before the next request...


 15%|█▌        | 15/100 [01:08<06:29,  4.58s/it]

Request succeeded. Waiting 4 seconds before the next request...


 16%|█▌        | 16/100 [01:12<06:27,  4.61s/it]

Request succeeded. Waiting 4 seconds before the next request...


 17%|█▋        | 17/100 [01:17<06:23,  4.62s/it]

Request succeeded. Waiting 4 seconds before the next request...


 18%|█▊        | 18/100 [01:21<06:13,  4.55s/it]

Request succeeded. Waiting 4 seconds before the next request...


 19%|█▉        | 19/100 [01:26<06:08,  4.55s/it]

Request succeeded. Waiting 4 seconds before the next request...


 20%|██        | 20/100 [01:30<06:01,  4.52s/it]

Request succeeded. Waiting 4 seconds before the next request...


 21%|██        | 21/100 [01:35<05:54,  4.49s/it]

Request succeeded. Waiting 4 seconds before the next request...


 22%|██▏       | 22/100 [01:40<05:57,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 23%|██▎       | 23/100 [01:44<05:57,  4.64s/it]

Request succeeded. Waiting 4 seconds before the next request...


 24%|██▍       | 24/100 [01:49<05:47,  4.57s/it]

Request succeeded. Waiting 4 seconds before the next request...


 25%|██▌       | 25/100 [01:53<05:42,  4.57s/it]

Request succeeded. Waiting 4 seconds before the next request...


 26%|██▌       | 26/100 [01:58<05:34,  4.52s/it]

Request succeeded. Waiting 4 seconds before the next request...


 27%|██▋       | 27/100 [02:02<05:27,  4.48s/it]

Request succeeded. Waiting 4 seconds before the next request...


 28%|██▊       | 28/100 [02:06<05:20,  4.45s/it]

Request succeeded. Waiting 4 seconds before the next request...


 29%|██▉       | 29/100 [02:11<05:21,  4.53s/it]

Request succeeded. Waiting 4 seconds before the next request...


 30%|███       | 30/100 [02:16<05:17,  4.54s/it]

Request succeeded. Waiting 4 seconds before the next request...


 31%|███       | 31/100 [02:20<05:14,  4.56s/it]

Request succeeded. Waiting 4 seconds before the next request...


 32%|███▏      | 32/100 [02:25<05:10,  4.57s/it]

Request succeeded. Waiting 4 seconds before the next request...


 33%|███▎      | 33/100 [02:30<05:07,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 34%|███▍      | 34/100 [02:34<05:02,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 35%|███▌      | 35/100 [02:39<04:58,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 36%|███▌      | 36/100 [02:43<04:55,  4.62s/it]

Request succeeded. Waiting 4 seconds before the next request...


 37%|███▋      | 37/100 [02:48<04:52,  4.64s/it]

Request succeeded. Waiting 4 seconds before the next request...


 38%|███▊      | 38/100 [02:53<04:46,  4.61s/it]

Request succeeded. Waiting 4 seconds before the next request...


 39%|███▉      | 39/100 [02:57<04:40,  4.60s/it]

Request succeeded. Waiting 4 seconds before the next request...


 40%|████      | 40/100 [03:02<04:35,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 41%|████      | 41/100 [03:07<04:32,  4.62s/it]

Request succeeded. Waiting 4 seconds before the next request...


 42%|████▏     | 42/100 [03:11<04:24,  4.56s/it]

Request succeeded. Waiting 4 seconds before the next request...


 43%|████▎     | 43/100 [03:16<04:20,  4.57s/it]

Request succeeded. Waiting 4 seconds before the next request...


 44%|████▍     | 44/100 [03:20<04:16,  4.58s/it]

Request succeeded. Waiting 4 seconds before the next request...


 45%|████▌     | 45/100 [03:25<04:12,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 46%|████▌     | 46/100 [03:29<04:07,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 47%|████▋     | 47/100 [03:34<04:03,  4.60s/it]

Request succeeded. Waiting 4 seconds before the next request...


 48%|████▊     | 48/100 [03:39<03:58,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 49%|████▉     | 49/100 [03:43<03:54,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 50%|█████     | 50/100 [03:47<03:46,  4.53s/it]

Request succeeded. Waiting 4 seconds before the next request...


 51%|█████     | 51/100 [03:52<03:43,  4.55s/it]

Request succeeded. Waiting 4 seconds before the next request...


 52%|█████▏    | 52/100 [03:57<03:38,  4.55s/it]

Request succeeded. Waiting 4 seconds before the next request...


 53%|█████▎    | 53/100 [04:01<03:31,  4.50s/it]

Request succeeded. Waiting 4 seconds before the next request...


 54%|█████▍    | 54/100 [04:06<03:28,  4.54s/it]

Request succeeded. Waiting 4 seconds before the next request...


 55%|█████▌    | 55/100 [04:10<03:22,  4.50s/it]

Request succeeded. Waiting 4 seconds before the next request...


 56%|█████▌    | 56/100 [04:15<03:20,  4.55s/it]

Request succeeded. Waiting 4 seconds before the next request...


 57%|█████▋    | 57/100 [04:19<03:16,  4.56s/it]

Request succeeded. Waiting 4 seconds before the next request...


 58%|█████▊    | 58/100 [04:24<03:11,  4.56s/it]

Request succeeded. Waiting 4 seconds before the next request...


 59%|█████▉    | 59/100 [04:28<03:07,  4.57s/it]

Request succeeded. Waiting 4 seconds before the next request...


 60%|██████    | 60/100 [04:33<03:03,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 61%|██████    | 61/100 [04:38<02:58,  4.57s/it]

Request succeeded. Waiting 4 seconds before the next request...


 62%|██████▏   | 62/100 [04:42<02:53,  4.56s/it]

Request succeeded. Waiting 4 seconds before the next request...


 63%|██████▎   | 63/100 [04:47<02:49,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 64%|██████▍   | 64/100 [04:51<02:45,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 65%|██████▌   | 65/100 [04:56<02:41,  4.61s/it]

Request succeeded. Waiting 4 seconds before the next request...


 66%|██████▌   | 66/100 [05:00<02:34,  4.56s/it]

Request succeeded. Waiting 4 seconds before the next request...


 67%|██████▋   | 67/100 [05:05<02:30,  4.57s/it]

Request succeeded. Waiting 4 seconds before the next request...


 68%|██████▊   | 68/100 [05:09<02:24,  4.52s/it]

Request succeeded. Waiting 4 seconds before the next request...


 69%|██████▉   | 69/100 [05:14<02:19,  4.50s/it]

Request succeeded. Waiting 4 seconds before the next request...


 70%|███████   | 70/100 [05:18<02:13,  4.45s/it]

Request succeeded. Waiting 4 seconds before the next request...


 71%|███████   | 71/100 [05:23<02:08,  4.44s/it]

Request succeeded. Waiting 4 seconds before the next request...


 72%|███████▏  | 72/100 [05:27<02:06,  4.50s/it]

Request succeeded. Waiting 4 seconds before the next request...


 73%|███████▎  | 73/100 [05:32<02:02,  4.55s/it]

Request succeeded. Waiting 4 seconds before the next request...


 74%|███████▍  | 74/100 [05:37<01:58,  4.55s/it]

Request succeeded. Waiting 4 seconds before the next request...


 75%|███████▌  | 75/100 [05:41<01:54,  4.56s/it]

Request succeeded. Waiting 4 seconds before the next request...


 76%|███████▌  | 76/100 [05:46<01:50,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 77%|███████▋  | 77/100 [05:50<01:44,  4.53s/it]

Request succeeded. Waiting 4 seconds before the next request...


 78%|███████▊  | 78/100 [05:55<01:39,  4.53s/it]

Request succeeded. Waiting 4 seconds before the next request...


 79%|███████▉  | 79/100 [05:59<01:34,  4.50s/it]

Request succeeded. Waiting 4 seconds before the next request...


 80%|████████  | 80/100 [06:04<01:30,  4.51s/it]

Request succeeded. Waiting 4 seconds before the next request...


 81%|████████  | 81/100 [06:08<01:25,  4.52s/it]

Request succeeded. Waiting 4 seconds before the next request...


 82%|████████▏ | 82/100 [06:13<01:22,  4.56s/it]

Request succeeded. Waiting 4 seconds before the next request...


 83%|████████▎ | 83/100 [06:17<01:16,  4.52s/it]

Request succeeded. Waiting 4 seconds before the next request...


 84%|████████▍ | 84/100 [06:22<01:11,  4.48s/it]

Request succeeded. Waiting 4 seconds before the next request...


 85%|████████▌ | 85/100 [06:26<01:07,  4.53s/it]

Request succeeded. Waiting 4 seconds before the next request...


 86%|████████▌ | 86/100 [06:31<01:03,  4.51s/it]

Request succeeded. Waiting 4 seconds before the next request...


 87%|████████▋ | 87/100 [06:36<00:59,  4.58s/it]

Request succeeded. Waiting 4 seconds before the next request...


 88%|████████▊ | 88/100 [06:40<00:55,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 89%|████████▉ | 89/100 [06:45<00:50,  4.61s/it]

Request succeeded. Waiting 4 seconds before the next request...


 90%|█████████ | 90/100 [06:49<00:46,  4.61s/it]

Request succeeded. Waiting 4 seconds before the next request...


 91%|█████████ | 91/100 [06:54<00:41,  4.62s/it]

Request succeeded. Waiting 4 seconds before the next request...


 92%|█████████▏| 92/100 [06:59<00:36,  4.57s/it]

Request succeeded. Waiting 4 seconds before the next request...


 93%|█████████▎| 93/100 [07:03<00:32,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 94%|█████████▍| 94/100 [07:08<00:27,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 95%|█████████▌| 95/100 [07:12<00:22,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 96%|█████████▌| 96/100 [07:17<00:18,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 97%|█████████▋| 97/100 [07:22<00:13,  4.59s/it]

Request succeeded. Waiting 4 seconds before the next request...


 98%|█████████▊| 98/100 [07:26<00:09,  4.61s/it]

Request succeeded. Waiting 4 seconds before the next request...


 99%|█████████▉| 99/100 [07:31<00:04,  4.61s/it]

Request succeeded. Waiting 4 seconds before the next request...


100%|██████████| 100/100 [07:36<00:00,  4.56s/it]

QA evaluation completed! Results saved to ./output/gemini-2.0-flash_rerank_false.csv
Completed RAG for model: gemini-2.0-flash with rerank=False



