In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# change the working directory to the Drive root
%cd /content/drive/My\ Drive/Colab\ Notebooks/nlp

/content/drive/My Drive/Colab Notebooks/nlp


# RAG Pipeline


In [1]:
!pip install numpy==1.26.4 # downgrading the numpy version

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m96.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
Successfully installed numpy-1.26.4


After installing 1.26.4, you need to "Restart Session" and re-import numpy. It's strange that they haven't updated their release notes yet (https://colab.research.google.com/notebooks/relnotes.ipynb)

In [3]:
import numpy
import pandas as pd

In [4]:
!pip install faiss-cpu
!pip install faiss-gpu-cu12 # CUDA 12.x, Python 3.8+
!pip install dotenv
!pip install langchain_chroma
!pip install langchain-community
!pip install langchain_experimental
!pip install langchain_openai
!pip install flashrank

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
Collecting faiss-gpu-cu12
  Downloading faiss_gpu_cu12-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading faiss_gpu_cu12-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (47.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.9/47.9 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu-cu12
Successfully installed faiss-gpu-cu12-1.10.0
Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting python-dotenv (from dotenv)
  Downloading python_doten

In [5]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from typing import List
from flashrank import Ranker
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain.retrievers import ContextualCompressionRetriever
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def load_text_files(path: str) -> List[str]:
    """
    Load text files from the given path.

    Args:
        path (str): The path to the directory or file containing the text files.

    Returns:
        list: A list of text documents.
    """
    docs = []

    try:
        if os.path.isdir(path):
            # Iterate over files in the directory
            for file_name in os.listdir(path):
                if file_name.endswith(".txt"):
                    file_path = os.path.join(path, file_name)
                    with open(file_path, 'r', encoding='utf-8') as file:
                        docs.append(file.read())
        elif os.path.isfile(path) and path.endswith(".txt"):
            # If the path is a file, directly read it
            with open(path, 'r', encoding='utf-8') as file:
                docs.append(file.read())
    except Exception as e:
        logging.error(f"Error loading text files from {path}: {e}")

    return docs

def format_retrieved_docs(docs: List[str]) -> str:
    """
    Format the retrieved documents in reverse order.

    Args:
        docs (list): A list of documents.

    Returns:
        str: Formatted string with contexts.
    """
    try:
        docs = reversed(docs)
        return "\n\n".join([f"Context {i+1}: {doc}" for i, doc in enumerate(docs)])
    except Exception as e:
        logging.error(f"Error formatting retrieved documents: {e}")
        return ""

def rerank_docs(query: str, retriever, rerank_model_name: str, k: int = 3) -> List[str]:
    """
    Rerank the retrieved documents based on the query using Flashrank.

    Args:
        query (str): The query to rerank documents for.
        retriever: The base retriever object.
        rerank_model_name (str): The name of the rerank model.
        k (int): The number of top documents to rerank.

    Returns:
        list: A list of reranked documents.
    """
    try:
        ranker = Ranker(model_name=rerank_model_name)
        compressor = FlashrankRerank(top_n=k, model=rerank_model_name)
        compression_retriever = ContextualCompressionRetriever(
            base_compressor=compressor, base_retriever=retriever
        )
        return compression_retriever.invoke(query)
    except Exception as e:
        logging.error(f"Error reranking documents for query '{query}': {e}")
        return []

def get_hypo_doc(query: str, generation_pipe) -> str:
    """
    Generate a hypothesis document for the given query using the language model.

    Args:
        query (str): The query to generate a hypothesis for.
        generation_pipe: The language model pipeline.

    Returns:
        str: The hypothesis document or the original query if unavailable.
    """
    template = """Imagine you are an expert providing a detailed and factual explanation in response to the query '{query}'.
    Your response should include all key points that would be found in a top search result, without adding any personal opinions, commentary, or experiences.
    Do not include any subjective phrases such as 'I think', 'I believe', or 'I am not sure'. Do not apologize, hedge, or express uncertainty.
    The response should be structured as an objective, factual explanation only, without any conversational elements or chatting.
    If you are truly uncertain and cannot provide an accurate answer, simply respond with: 'Unavailable: {query}'.
    Otherwise, answer confidently with only the relevant information.
    """

    messages = [{"role": "user", "content": template.format(query=query)}]

    try:
        with torch.no_grad():
            hypo_doc = generation_pipe(messages, max_new_tokens=100, return_full_text=False)[0]["generated_text"]
        logging.info(f"Generated hypothesis document for query: {query}")
        # print("Question:", query)
        # print("Hypothesis Document:", hypo_doc)
        if hypo_doc.startswith("Unavailable"):
            logging.warning(f"Hypothesis unavailable for query: {query}")
            return query
        return hypo_doc
    except Exception as e:
        logging.error(f"Error generating hypothesis document for query '{query}': {e}")
        return query

def answer_generation(
    qa_df: pd.DataFrame, output_file: str, retriever, generation_pipe,
    prompt, rerank: bool, rerank_model_name: str, hypo: bool, top_k_rerank: int = 3
):
    """
    Generate answers for the given questions using the retriever and the generation pipeline.

    Args:
        qa_df (pd.DataFrame): DataFrame containing questions and other metadata.
        output_file (str): Path to save the generated answers.
        retriever: A retriever object to retrieve documents.
        generation_pipe: A pipeline object for text generation.
        prompt: A ChatPromptTemplate object for generating prompts.
        rerank (bool): Whether to rerank retrieved documents.
        rerank_model_name (str): The name of the rerank model.
        hypo (bool): Whether to generate a hypothesis document.
        top_k_rerank (int): Number of top documents to rerank.
    """
    logging.info("Starting answer generation...")

    # Check if the output file exists
    if not os.path.exists(output_file):
        with open(output_file, 'w') as f_out:
            f_out.write(",".join(list(qa_df.columns) + ["Generated_Answer"]) + "\n")
            start_idx = 0
    else:
        # Calculate the number of rows in the output file
        with open(output_file, 'r') as f_out:
            num_rows = sum(1 for line in f_out)
            start_idx = num_rows - 1

    # Iterate over the DataFrame
    with open(output_file, 'a') as f_out:
        for idx, row in tqdm(qa_df.iterrows(), total=len(qa_df)):
            if idx < start_idx:
                continue

            query = row["Question"]
            if hypo:
                query = get_hypo_doc(query, generation_pipe)

            # Retrieve documents
            try:
                if rerank:
                    logging.info(f"Reranking documents for query: {query}")
                    retrieved_docs = rerank_docs(query, retriever, rerank_model_name, k=top_k_rerank)
                else:
                    retrieved_docs = retriever.invoke(query)
            except Exception as e:
                logging.error(f"Error retrieving documents for query '{query}': {e}")
                continue

            # Format the documents
            context = format_retrieved_docs(retrieved_docs)

            # Create the full prompt
            prompt_messages = prompt.format_messages(context=context, question=row["Question"])
            full_prompt = "\n".join(message.content for message in prompt_messages)

            # Generate the answer
            try:
                messages = [{"role": "user", "content": full_prompt}]
                with torch.no_grad():
                    llm_output = generation_pipe(
                        messages, max_new_tokens=20, return_full_text=False
                    )[0]["generated_text"]

                row["Generated_Answer"] = llm_output
                pd.DataFrame([row]).to_csv(f_out, header=False, index=False)
            except Exception as e:
                logging.error(f"Error generating answer for query '{query}': {e}")
                continue

            # Clear cache
            del retrieved_docs, context, prompt_messages, full_prompt, messages, llm_output
            torch.cuda.empty_cache()


# Constants
PROMPT_TEMPLATE = """
You are an expert assistant answering factual questions about Pittsburgh or Carnegie Mellon University (CMU).
Use the retrieved information to give a detailed and helpful answer. If the provided context does not contain the answer, leverage your pretraining knowledge to provide the correct answer.
If you truly do not know, just say "I don't know."

Important Instructions:
- Answer concisely without repeating the question.
- Use the provided context if relevant; otherwise, rely on your pretraining knowledge.
- Do **not** use complete sentences. Provide only the word, name, date, or phrase that directly answers the question. For example, given the question "When was Carnegie Mellon University founded?", you should only answer "1900".

Examples:
Question: Who is Pittsburgh named after?
Answer: William Pitt
Question: What famous machine learning venue had its first conference in Pittsburgh in 1980?
Answer: ICML
Question: What musical artist is performing at PPG Arena on October 13?
Answer: Billie Eilish

Context: \n\n {context} \n\n
Question: {question} \n\n
Answer:
"""

In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm
import faiss
import numpy as np
import pickle
import random
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from langchain_chroma import Chroma
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    PromptTemplate
)

# ========================================
# Helper Functions
# ========================================
def str2bool(value):
    if isinstance(value, bool):
        return value
    if value.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif value.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise ValueError('Boolean value expected.')

# ========================================
# Main Function for Jupyter/IPython
# ========================================
def run_RAG(
    model_name = "meta-llama/Llama-3.1-8B-Instruct",
    dtype = "float16", # or torch.bfloat16
    embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2",
    embedding_dim = 384,
    splitter_type = "recursive", # or "character", "token", "semantic"
    chunk_size = 1000,
    chunk_overlap = 200,
    text_files_path = "./data/scraped/scraped_all",
    sublink_files_path = "./data/scraped/scraped_text_data",
    sublink_files_nums = 0,
    retriever_type = "FAISS", # or "CHROMA"
    retriever_algorithm = "similarity", # or "mmr"
    rerank = False,
    rerank_model_name = "ms-marco-MultiBERT-L-12",
    top_k_search = 3,
    top_k_rerank = 3,
    hypo = False,
    qes_file_path = "./data/annotated/QA_pairs_1.csv",
    output_file = "./output/results.json",
    qa_nums = 100
):
    # Step 0: Load environment variables
    load_dotenv()

    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_API_KEY"] = os.getenv('LANGCHAIN_API_KEY') # os.getenv('LANGCHAIN_API_KEY')
    os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
    os.environ["LANGCHAIN_PROJECT"] = "RAGmodel"
    os.environ["USER_AGENT"] = "LangChain/1.0 (+https://www.langchain.com)"

    login(token=os.getenv('HUGGINGFACE_TOKEN')) # os.getenv('HUGGINGFACE_TOKEN')

    # Set model name, precision, and other parameters
    dtype = torch.float16 if dtype == "float16" else torch.bfloat16
    random.seed(42)

    # Check if rerank is set to True
    if rerank:
        print("Reranking is set to True.")

    # Step 1: Initialize the Hugging Face model as your LLM
    print("Initializing the Hugging Face model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=dtype, device_map="cuda:0"
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    generation_pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=dtype
    )
    print("Model initialized successfully!")

    # Step 2: Load the Sentence Transformers model for embeddings
    docs_length = f"main160_sublink{sublink_files_nums}"
    model_name_str = embedding_model_name.split('/')[-1]
    embeddings_file_path = f"./data/embeddings/embeddings_{model_name_str}_{docs_length}_{splitter_type}_{retriever_type}_{chunk_size}_{chunk_overlap}.npy"
    splits_file_path = f"./data/embeddings/splits_{model_name_str}_{docs_length}_{splitter_type}_{retriever_type}_{chunk_size}_{chunk_overlap}.pkl"
    embeddings = None
    splits = None
    embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
    print(f"Start loading QA from {qes_file_path}")
    qa_test_data_path = qes_file_path
    qa_df = pd.read_csv(qa_test_data_path)
    print(len(qa_df))
    if len(qa_df) != 574:
        qa_df = qa_df.sample(qa_nums, random_state=221)
    print(f"Loaded {len(qa_df)} QAs")

    # Dynamically determine embedding dimensionality
    embedding_dim = embedding_model.client.get_sentence_embedding_dimension()

    if not os.path.exists(embeddings_file_path):
        # Step 3: Load the text files for building the index and QA evaluation
        print(f"Start loading texts from {text_files_path}")
        # Step 4: Split the documents into smaller chunks
        # Wrap text strings in Document objects
        docs = load_text_files(path=text_files_path)
        documents = [Document(page_content=text) for text in tqdm(docs, desc="Wrapping text in Document objects")]
        del docs

        if sublink_files_nums != 0:
            sublink_file_store_path = "./data/embeddings/sublink_docs.pkl"
            if os.path.exists(sublink_file_store_path):
                print(f"Start loading sublink files from {sublink_file_store_path}")
                with open(sublink_file_store_path, "rb") as f:
                    all_sublink_docs = pickle.load(f)
            else:
                print(f"Start reading all sublink files")
                all_sublink_docs = load_text_files(path=sublink_files_path)
                print(f"Finish loading {len(all_sublink_docs)} sublinks, now store it")
                with open(sublink_file_store_path, 'wb') as f:
                    pickle.dump(all_sublink_docs, f)
                print(f"Store all sublink file in {sublink_file_store_path}")

            sampled_sublink_docs = random.sample(all_sublink_docs, sublink_files_nums)
            documents.extend([Document(page_content=text) for text in tqdm(sampled_sublink_docs, desc="Wrapping text in Document objects")])
            del sampled_sublink_docs
            del all_sublink_docs

        if splitter_type == "recursive":
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        elif splitter_type == "character":
            text_splitter = CharacterTextSplitter(separator=" ", chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        elif splitter_type == "token":
            text_splitter = TokenTextSplitter(chunk_size=int(chunk_size / 4), chunk_overlap=int(chunk_overlap / 4))
        elif splitter_type == "semantic":
            text_splitter = SemanticChunker(
                embeddings=embedding_model,
                breakpoint_threshold_type="percentile",
                breakpoint_threshold_amount=80
            )
        else:
            raise ValueError("Invalid splitter type. Please choose between recursive, character, token, or semantic.")

        splits = text_splitter.split_documents(documents)
        del documents
        print(f"End splitting texts -- Number of splits: {len(splits)}")

        # Step 5: Create Chroma vectorstore with embeddings from Sentence Transformers
        # Generate embeddings with the correct dimensionality
        embeddings = embedding_model.embed_documents([doc.page_content for doc in tqdm(splits, desc="Embedding texts")])
        print(f"Generated embeddings with dimensionality: {embedding_dim}")
        print(f"End embedding texts")

        # Free GPU cache after generating embeddings
        torch.cuda.empty_cache()
        print(f"Start saving embeddings and splits")
        np.save(embeddings_file_path, embeddings)
        with open(splits_file_path, 'wb') as f:
            pickle.dump(splits, f)
        print(f"Embeddings saved in {embeddings_file_path}, splits saved in {splits_file_path}")
    else:
        print(f"Embeddings already exist! Loading embeddings with dimensionality: {embedding_dim}")
        # Step 1: Load embeddings from the saved NumPy file
        embeddings = np.load(embeddings_file_path)
        with open(splits_file_path, 'rb') as f:
            splits = pickle.load(f)
        # Step 2: Load document metadata if needed
        # doc_metadata = np.load("doc_metadata.npy", allow_pickle=True)
        print("End loading")

    # Step 6: Create the RAG prompting pipeline
    prompt_template = PromptTemplate(
        input_variables=['context', 'question'],
        template=PROMPT_TEMPLATE
    )

    # Update the HumanMessagePromptTemplate with the new PromptTemplate
    human_message_template = HumanMessagePromptTemplate(prompt=prompt_template)

    # Update the ChatPromptTemplate with the modified message
    chat_prompt_template = ChatPromptTemplate(
        input_variables=['context', 'question'],
        messages=[human_message_template]
    )
    prompt = chat_prompt_template


    # Step 7: Generate answers for the questions
    print("Building the vectorstore ", retriever_type, "...")
    if retriever_type == "CHROMA":
        retriever = Chroma.from_documents(documents=dsplits, embeding=embedding_model, collection_name="collectionChroma").as_retriever(search_type=retriever_algorithm, search_kwargs={'k': top_k_search})
    elif retriever_type == "FAISS":
        # embeddings_np = np.array(embeddings).astype("float32")
        retriever = FAISS.from_documents(splits, embedding_model).as_retriever(search_type=retriever_algorithm, search_kwargs={"k": top_k_search})
    else:
        raise ValueError("Invalid retriever type. Please choose between FAISS or CHROMA.")

    print("Retriever built successfully!")
    # Free GPU cache after generating embeddings
    torch.cuda.empty_cache()
    del splits

    answer_generation(
        qa_df, output_file, retriever,
        generation_pipe, prompt, rerank, rerank_model_name, hypo, top_k_rerank=top_k_rerank
    )

    print(f"QA evaluation completed! Results saved to {output_file}")



# The all-* models were trained on all available training data (more than 1 billion training pairs)
# and are designed as general purpose models.
# The all-mpnet-base-v2 model provides the best quality, while all-MiniLM-L6-v2 is 5 times faster and still offers good quality.

# run_RAG(
#     model_name="meta-llama/Llama-3.2-3B-Instruct",
#     dtype="float16",
#     embedding_model_name="sentence-transformers/all-mpnet-base-v2",
#     embedding_dim = 768,
#     text_files_path="./data/scraped/scraped_all",
#     qes_file_path="./data/annotated/QA_pairs_1.csv",
#     output_file="./output/results.json",
#     qa_nums=100
# )


In [None]:
# Don't Run This One
def run_rag_strategy(strategy):
    """
    Run the RAG pipeline with different strategies.

    Args:
        strategy (int): The strategy number to execute.
    """
    if strategy == 0:
        # Strategy 0: Default setting
        run_RAG(
            model_name="meta-llama/Llama-3.2-3B-Instruct",
            dtype="float16",
            embedding_model_name="sentence-transformers/all-MiniLM-L12-v2",
            embedding_dim=384,
            splitter_type="recursive",
            chunk_size=1000,
            chunk_overlap=200,
            text_files_path="./data/scraped/scraped_all",
            retriever_type="CHROMA",
            top_k_search=3,
            output_file="./output/llama3_recursive_chroma_top3.csv"
        )

    elif strategy == 1:
        # Strategy 1: Recursive splitter, CHROMA retriever, tune chunk size
        run_RAG(
            model_name="meta-llama/Llama-3.2-3B-Instruct",
            dtype="float16",
            embedding_model_name="sentence-transformers/all-MiniLM-L12-v2",
            embedding_dim=384,
            splitter_type="recursive",
            chunk_size=1500,
            chunk_overlap=200,
            text_files_path="./data/scraped/scraped_all",
            retriever_type="CHROMA",
            top_k_search=3,
            output_file="./output/llama3_recursive_chunk1500_chroma_top3_sample100.csv"
        )

    elif strategy == 2:
        # Strategy 2: CHROMA retriever, tune splitter
        run_RAG(
            model_name="meta-llama/Llama-3.2-3B-Instruct",
            dtype="float16",
            embedding_model_name="sentence-transformers/all-MiniLM-L12-v2",
            embedding_dim=384,
            splitter_type="semantic",
            chunk_size=1000,
            chunk_overlap=200,
            text_files_path="./data/scraped/scraped_all",
            retriever_type="CHROMA",
            top_k_search=3,
            output_file="./output/llama3_semantic_chroma_top3_sample100.csv"
        )

    elif strategy == 3:
        # Strategy 3: Change retriever to FAISS
        run_RAG(
            model_name="meta-llama/Llama-3.2-3B-Instruct",
            dtype="float16",
            embedding_model_name="sentence-transformers/all-MiniLM-L12-v2",
            embedding_dim=384,
            splitter_type="recursive",
            chunk_size=1000,
            chunk_overlap=200,
            text_files_path="./data/scraped/scraped_all",
            retriever_type="FAISS",
            top_k_search=3,
            output_file="./output/tests/llama3_faiss_test_reverseprompt.csv"
        )

    elif strategy == 4:
        # Strategy 4: Tune reranking using FAISS
        run_RAG(
            model_name="meta-llama/Llama-3.2-3B-Instruct",
            dtype="float16",
            embedding_model_name="sentence-transformers/all-MiniLM-L12-v2",
            embedding_dim=384,
            splitter_type="recursive",
            chunk_size=1000,
            chunk_overlap=200,
            text_files_path="./data/scraped/scraped_all",
            qes_file_path="./data/test/test_questions.csv",
            retriever_type="FAISS",
            top_k_search=10,
            top_k_rerank=3,
            rerank=True,
            rerank_model_name="ms-marco-MiniLM-L-12-v2",
            output_file="./output/submission/llama3_faiss_rerank.csv"
        )

    elif strategy == 5:
        # Strategy 5: Add hypothesis generation for better retrieval
        run_RAG(
            model_name="meta-llama/Llama-3.2-3B-Instruct",
            dtype="float16",
            embedding_model_name="sentence-transformers/all-MiniLM-L12-v2",
            embedding_dim=384,
            splitter_type="recursive",
            chunk_size=1000,
            chunk_overlap=200,
            text_files_path="./data/scraped/scraped_all",
            retriever_type="FAISS",
            top_k_search=3,
            rerank_model_name="ms-marco-MiniLM-L-12-v2",
            hypo=True,
            output_file="./output/llama3_faiss_test_hypo_promptENG3.csv"
        )

    else:
        raise ValueError(f"Invalid strategy number: {strategy}")



# Run a specific strategy by passing the strategy number
# strategy_number = 2  # Change this to 1, 2, 3, 4, or 5 to test other strategies
# run_rag_strategy(strategy_number)

NameError: name 'run_RAG' is not defined

In [7]:
def run_rag_strategy(strategy):
    """
    Run the RAG pipeline with different strategies.

    Args:
        strategy (int): The strategy number to execute.
    """
    if strategy == 0:
        # Strategy 0: Default setting
        print(f"\n==============================")
        print(f"Running Strategy 0 Default setting.")
        print(f"==============================\n")
        run_RAG(
            model_name="meta-llama/Llama-3.2-3B-Instruct",
            dtype="float16",
            embedding_model_name="sentence-transformers/all-MiniLM-L12-v2",
            embedding_dim=384,
            splitter_type="recursive",
            chunk_size=1000,
            chunk_overlap=200,
            text_files_path="./data/scraped/scraped_all",
            retriever_type="CHROMA",
            top_k_search=3,
            output_file="./output/llama3_recursive_chroma_top3.csv"
        )

    elif strategy == 1:
        # Strategy 1: Compare different chunk sizes
        print(f"\n==============================")
        print(f"Running Strategy 1 Compare different chunk sizes.")
        print(f"==============================\n")
        chunk_sizes = [500, 700, 1000, 1500, 2000]
        for chunk_size in chunk_sizes:
            output_file = f"./output/llama3_recursive_chunk{chunk_size}_chroma_top3.csv"
            run_RAG(
                model_name="meta-llama/Llama-3.2-3B-Instruct",
                dtype="float16",
                embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
                embedding_dim=384,
                splitter_type="recursive",
                chunk_size=chunk_size,
                chunk_overlap=200,
                text_files_path="./data/scraped/scraped_all",
                retriever_type="CHROMA",
                top_k_search=3,
                output_file=output_file
            )

    elif strategy == 2:
        # Strategy 2: Compare different splitter types
        print(f"\n==============================")
        print(f"Running Strategy 2 Compare different splitter types.")
        print(f"==============================\n")
        splitter_types = ["recursive", "semantic", "token", "character"]
        for splitter_type in splitter_types:
            output_file = f"./output/llama3_{splitter_type}_chroma_top3.csv"
            run_RAG(
                model_name="meta-llama/Llama-3.2-3B-Instruct",
                dtype="float16",
                embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
                embedding_dim=384,
                splitter_type=splitter_type,
                chunk_size=1000,
                chunk_overlap=200,
                text_files_path="./data/scraped/scraped_all",
                retriever_type="CHROMA",
                top_k_search=3,
                output_file=output_file
            )

    elif strategy == 3:
        # Strategy 3 A: Compare retriever types and embedding models
        print(f"\n==============================")
        print(f"Running Strategy 3A Compare retriever types and embedding models.")
        print(f"==============================\n")
        retriever_types = ["FAISS", "CHROMA"]
        for retriever_type in retriever_types:
            output_file = f"./output/llama3_{retriever_type}_all-MiniLM-L6-v2_top3.csv"
            print(f"Running with retriever_type={retriever_type}, embedding_model=all-MiniLM-L6-v2, saving to {output_file}")
            run_RAG(
                model_name="meta-llama/Llama-3.2-3B-Instruct",
                dtype="float16",
                embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
                embedding_dim=384,
                splitter_type="recursive",
                chunk_size=1000,
                chunk_overlap=200,
                text_files_path="./data/scraped/scraped_all",
                retriever_type=retriever_type,
                top_k_search=3,
                output_file=output_file
            )

    elif strategy == 4:
        # Strategy 3 B: Compare retriever types and embedding models
        print(f"\n==============================")
        print(f"Running Strategy 3B Compare retriever types and embedding models.")
        print(f"==============================\n")
        retriever_types = ["FAISS", "CHROMA"]
        for retriever_type in retriever_types:
            output_file = f"./output/llama3_{retriever_type}_all-mpnet-base-v2_top3.csv"
            print(f"Running with retriever_type={retriever_type}, embedding_model=all-mpnet-base-v2, saving to {output_file}")
            run_RAG(
                model_name="meta-llama/Llama-3.2-3B-Instruct",
                dtype="float16",
                embedding_model_name="sentence-transformers/all-mpnet-base-v2",
                embedding_dim=768,
                splitter_type="recursive",
                chunk_size=1000,
                chunk_overlap=200,
                text_files_path="./data/scraped/scraped_all",
                retriever_type=retriever_type,
                top_k_search=3,
                output_file=output_file
            )

    elif strategy == 5:
        # Strategy 4: Compare reranking models and no reranking
        print(f"\n==============================")
        print(f"Running Strategy 4 Compare reranking models and no reranking.")
        print(f"==============================\n")

        rerank_options = [True, False]
        rerank_models = ["ms-marco-MiniLM-L-12-v2", "ms-marco-MultiBERT-L-12"]

        for rerank in rerank_options:
            if rerank:
                # When rerank=True, iterate over the rerank models
                for rerank_model_name in rerank_models:
                    output_file = f"./output/llama3_faiss_rerank_{rerank_model_name}.csv"
                    print(f"Running with rerank=True, rerank_model={rerank_model_name}, saving to {output_file}")
                    run_RAG(
                        model_name="meta-llama/Llama-3.2-3B-Instruct",
                        dtype="float16",
                        embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
                        embedding_dim=384,
                        splitter_type="recursive",
                        chunk_size=1000,
                        chunk_overlap=200,
                        text_files_path="./data/scraped/scraped_all",
                        qes_file_path="./data/test/test_questions.csv",
                        retriever_type="FAISS",
                        top_k_search=10,
                        top_k_rerank=3,
                        rerank=True,
                        rerank_model_name=rerank_model_name,
                        output_file=output_file
                    )
            else:
                # When rerank=False, no rerank model is used
                output_file = "./output/llama3_faiss_no_rerank.csv"
                print(f"Running with rerank=False, saving to {output_file}")
                run_RAG(
                    model_name="meta-llama/Llama-3.2-3B-Instruct",
                    dtype="float16",
                    embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
                    embedding_dim=384,
                    splitter_type="recursive",
                    chunk_size=1000,
                    chunk_overlap=200,
                    text_files_path="./data/scraped/scraped_all",
                    qes_file_path="./data/test/test_questions.csv",
                    retriever_type="FAISS",
                    top_k_search=10,
                    rerank=False,
                    output_file=output_file
                )

    elif strategy == 6:
        # Strategy 5: Compare retriever algorithms
        print(f"\n==============================")
        print(f"Running Strategy 5 Compare retriever algorithms.")
        print(f"==============================\n")
        retriever_algorithms = ["similarity", "mmr"]
        for retriever_algorithm in retriever_algorithms:
            output_file = f"./output/llama3_faiss_{retriever_algorithm}_top3.csv"
            run_RAG(
                model_name="meta-llama/Llama-3.2-3B-Instruct",
                dtype="float16",
                embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
                embedding_dim=384,
                splitter_type="recursive",
                chunk_size=1000,
                chunk_overlap=200,
                text_files_path="./data/scraped/scraped_all",
                retriever_type="FAISS",
                top_k_search=3,
                retriever_algorithm=retriever_algorithm,
                output_file=output_file
            )

    elif strategy == 7:
        # Strategy 6: Compare with/without hypothesis generation
        print(f"\n==============================")
        print(f"Running Strategy 6 Compare with/without hypothesis generation.")
        print(f"==============================\n")
        output_file = f"./output/llama3_faiss_test_hypo.csv"
        run_RAG(
            model_name="meta-llama/Llama-3.2-3B-Instruct",
            dtype="float16",
            embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
            embedding_dim=384,
            splitter_type="recursive",
            chunk_size=1000,
            chunk_overlap=200,
            text_files_path="./data/scraped/scraped_all",
            retriever_type="FAISS",
            top_k_search=3,
            rerank_model_name="ms-marco-MiniLM-L-12-v2",
            hypo=True,
            output_file=output_file
        )

    else:
        raise ValueError(f"Invalid strategy number: {strategy}")




In [None]:
strategy_number = 1  # Change this to 0, 1, 2, 3, 4, 5, or 6 to test other strategies
run_rag_strategy(strategy_number)

Initializing the Hugging Face model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Device set to use cuda:0
  embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)


Model initialized successfully!


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Start loading texts from ./data/scraped/scraped_all


Wrapping text in Document objects: 100%|██████████| 172/172 [00:00<00:00, 199287.37it/s]


End splitting texts -- Number of splits: 14099


Embedding texts: 100%|██████████| 14099/14099 [00:00<00:00, 2116820.31it/s]


Generated embeddings with dimensionality: 384
End embedding texts
Start saving embeddings and splits
Embeddings saved in ./data/embeddings/embeddings_all-MiniLM-L6-v2_main160_sublink0_recursive_500_200.npy, splits saved in ./data/embeddings/splits_all-MiniLM-L6-v2_main160_sublink0_recursive_500_200.pkl
Building the vectorstore  CHROMA ...
Retriever built successfully!


 11%|█         | 11/100 [00:04<00:24,  3.58it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 100/100 [00:24<00:00,  4.03it/s]


QA evaluation completed! Results saved to ./output/llama3_recursive_chunk500_chroma_top3.csv
Initializing the Hugging Face model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model initialized successfully!
Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Start loading texts from ./data/scraped/scraped_all


Wrapping text in Document objects: 100%|██████████| 172/172 [00:00<00:00, 305040.29it/s]


End splitting texts -- Number of splits: 8491


Embedding texts: 100%|██████████| 8491/8491 [00:00<00:00, 2237331.03it/s]


Generated embeddings with dimensionality: 384
End embedding texts
Start saving embeddings and splits
Embeddings saved in ./data/embeddings/embeddings_all-MiniLM-L6-v2_main160_sublink0_recursive_700_200.npy, splits saved in ./data/embeddings/splits_all-MiniLM-L6-v2_main160_sublink0_recursive_700_200.pkl
Building the vectorstore  CHROMA ...
Retriever built successfully!


100%|██████████| 100/100 [00:23<00:00,  4.18it/s]


QA evaluation completed! Results saved to ./output/llama3_recursive_chunk700_chroma_top3.csv
Initializing the Hugging Face model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model initialized successfully!
Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Embeddings already exist! Loading embeddings with dimensionality: 384
End loading
Building the vectorstore  CHROMA ...
Retriever built successfully!


100%|██████████| 100/100 [00:24<00:00,  4.09it/s]


QA evaluation completed! Results saved to ./output/llama3_recursive_chunk1000_chroma_top3.csv
Initializing the Hugging Face model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model initialized successfully!
Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Start loading texts from ./data/scraped/scraped_all


Wrapping text in Document objects: 100%|██████████| 172/172 [00:00<00:00, 197163.24it/s]


End splitting texts -- Number of splits: 3306


Embedding texts: 100%|██████████| 3306/3306 [00:00<00:00, 2251399.42it/s]


Generated embeddings with dimensionality: 384
End embedding texts
Start saving embeddings and splits
Embeddings saved in ./data/embeddings/embeddings_all-MiniLM-L6-v2_main160_sublink0_recursive_1500_200.npy, splits saved in ./data/embeddings/splits_all-MiniLM-L6-v2_main160_sublink0_recursive_1500_200.pkl
Building the vectorstore  CHROMA ...
Retriever built successfully!


100%|██████████| 100/100 [00:24<00:00,  4.15it/s]


QA evaluation completed! Results saved to ./output/llama3_recursive_chunk1500_chroma_top3.csv
Initializing the Hugging Face model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model initialized successfully!
Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Start loading texts from ./data/scraped/scraped_all


Wrapping text in Document objects: 100%|██████████| 172/172 [00:00<00:00, 295421.90it/s]


End splitting texts -- Number of splits: 2413


Embedding texts: 100%|██████████| 2413/2413 [00:00<00:00, 1588083.41it/s]


Generated embeddings with dimensionality: 384
End embedding texts
Start saving embeddings and splits
Embeddings saved in ./data/embeddings/embeddings_all-MiniLM-L6-v2_main160_sublink0_recursive_2000_200.npy, splits saved in ./data/embeddings/splits_all-MiniLM-L6-v2_main160_sublink0_recursive_2000_200.pkl
Building the vectorstore  CHROMA ...
Retriever built successfully!


100%|██████████| 100/100 [00:24<00:00,  4.07it/s]

QA evaluation completed! Results saved to ./output/llama3_recursive_chunk2000_chroma_top3.csv





In [None]:
strategy_number = 2  # Change this to 0, 1, 2, 3, 4, 5, or 6 to test other strategies
run_rag_strategy(strategy_number)

Initializing the Hugging Face model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model initialized successfully!
Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Embeddings already exist! Loading embeddings with dimensionality: 384
End loading
Building the vectorstore  CHROMA ...
Retriever built successfully!


100%|██████████| 100/100 [00:23<00:00,  4.33it/s]


QA evaluation completed! Results saved to ./output/llama3_recursive_chroma_top3.csv
Initializing the Hugging Face model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model initialized successfully!
Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Start loading texts from ./data/scraped/scraped_all


Wrapping text in Document objects: 100%|██████████| 172/172 [00:00<00:00, 283131.98it/s]


End splitting texts -- Number of splits: 4423


Embedding texts: 100%|██████████| 4423/4423 [00:00<00:00, 1585590.31it/s]


Generated embeddings with dimensionality: 384
End embedding texts
Start saving embeddings and splits
Embeddings saved in ./data/embeddings/embeddings_all-MiniLM-L6-v2_main160_sublink0_semantic_1000_200.npy, splits saved in ./data/embeddings/splits_all-MiniLM-L6-v2_main160_sublink0_semantic_1000_200.pkl
Building the vectorstore  CHROMA ...
Retriever built successfully!


100%|██████████| 100/100 [00:24<00:00,  4.14it/s]


QA evaluation completed! Results saved to ./output/llama3_semantic_chroma_top3.csv
Initializing the Hugging Face model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model initialized successfully!
Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Start loading texts from ./data/scraped/scraped_all


Wrapping text in Document objects: 100%|██████████| 172/172 [00:00<00:00, 291600.76it/s]


End splitting texts -- Number of splits: 6470


Embedding texts: 100%|██████████| 6470/6470 [00:00<00:00, 1946152.24it/s]


Generated embeddings with dimensionality: 384
End embedding texts
Start saving embeddings and splits
Embeddings saved in ./data/embeddings/embeddings_all-MiniLM-L6-v2_main160_sublink0_token_1000_200.npy, splits saved in ./data/embeddings/splits_all-MiniLM-L6-v2_main160_sublink0_token_1000_200.pkl
Building the vectorstore  CHROMA ...
Retriever built successfully!


100%|██████████| 100/100 [00:23<00:00,  4.17it/s]


QA evaluation completed! Results saved to ./output/llama3_token_chroma_top3.csv
Initializing the Hugging Face model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model initialized successfully!
Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Start loading texts from ./data/scraped/scraped_all


Wrapping text in Document objects: 100%|██████████| 172/172 [00:00<00:00, 297247.75it/s]


End splitting texts -- Number of splits: 4941


Embedding texts: 100%|██████████| 4941/4941 [00:00<00:00, 1984872.72it/s]


Generated embeddings with dimensionality: 384
End embedding texts
Start saving embeddings and splits
Embeddings saved in ./data/embeddings/embeddings_all-MiniLM-L6-v2_main160_sublink0_character_1000_200.npy, splits saved in ./data/embeddings/splits_all-MiniLM-L6-v2_main160_sublink0_character_1000_200.pkl
Building the vectorstore  CHROMA ...
Retriever built successfully!


100%|██████████| 100/100 [00:24<00:00,  4.12it/s]

QA evaluation completed! Results saved to ./output/llama3_character_chroma_top3.csv





In [None]:
strategy_number = 3  # Change this to 0, 1, 2, 3, 4, 5, or 6 to test other strategies
run_rag_strategy(strategy_number)

Strategy 3 A: Compare retriever types and embedding models.
Running with retriever_type=FAISS, embedding_model=all-MiniLM-L6-v2, saving to ./output/llama3_FAISS_all-MiniLM-L6-v2_top3.csv
Initializing the Hugging Face model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model initialized successfully!
Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Embeddings already exist! Loading embeddings with dimensionality: 384
End loading
Building the vectorstore  FAISS ...
Retriever built successfully!


 11%|█         | 11/100 [00:03<00:22,  3.97it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 100/100 [00:23<00:00,  4.29it/s]


QA evaluation completed! Results saved to ./output/llama3_FAISS_all-MiniLM-L6-v2_top3.csv
Running with retriever_type=CHROMA, embedding_model=all-MiniLM-L6-v2, saving to ./output/llama3_CHROMA_all-MiniLM-L6-v2_top3.csv
Initializing the Hugging Face model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model initialized successfully!
Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Embeddings already exist! Loading embeddings with dimensionality: 384
End loading
Building the vectorstore  CHROMA ...
Retriever built successfully!


100%|██████████| 100/100 [00:23<00:00,  4.31it/s]

QA evaluation completed! Results saved to ./output/llama3_CHROMA_all-MiniLM-L6-v2_top3.csv





To resolve the dimension mismatch error and use multiple embedding models with a single ChromaDB collection, you need to ensure that the embeddings from different models have the same dimensionality.

* all-MiniLM-L6-v2  dim: 384
* all-mpnet-base-v2 dim: 768

If using Jupyter Notebook, simply restart the kernel.

In [None]:
strategy_number = 4  # Change this to 0, 1, 2, 3, 4, 5, or 6 to test other strategies
run_rag_strategy(strategy_number)

Strategy 3 B: Compare retriever types and embedding models.
Running with retriever_type=FAISS, embedding_model=all-mpnet-base-v2, saving to ./output/llama3_FAISS_all-mpnet-base-v2_top3.csv
Initializing the Hugging Face model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
  embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)


Model initialized successfully!


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Embeddings already exist! Loading embeddings with dimensionality: 768
End loading
Building the vectorstore  FAISS ...
Retriever built successfully!


 11%|█         | 11/100 [00:02<00:18,  4.79it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 100/100 [00:22<00:00,  4.46it/s]


QA evaluation completed! Results saved to ./output/llama3_FAISS_all-mpnet-base-v2_top3.csv
Running with retriever_type=CHROMA, embedding_model=all-mpnet-base-v2, saving to ./output/llama3_CHROMA_all-mpnet-base-v2_top3.csv
Initializing the Hugging Face model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model initialized successfully!
Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Embeddings already exist! Loading embeddings with dimensionality: 768
End loading
Building the vectorstore  CHROMA ...
Retriever built successfully!


100%|██████████| 100/100 [00:24<00:00,  4.01it/s]

QA evaluation completed! Results saved to ./output/llama3_CHROMA_all-mpnet-base-v2_top3.csv





In [8]:
strategy_number = 5  # Change this to 0, 1, 2, 3, 4, 5, or 6 to test other strategies
run_rag_strategy(strategy_number)


Running Strategy 4 Compare reranking models and no reranking.

Running with rerank=True, rerank_model=ms-marco-MiniLM-L-12-v2, saving to ./output/llama3_faiss_rerank_ms-marco-MiniLM-L-12-v2.csv
Reranking is set to True.
Initializing the Hugging Face model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Device set to use cuda:0
  embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)


Model initialized successfully!


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Start loading QA from ./data/test/test_questions.csv
574
Loaded 574 QAs
Embeddings already exist! Loading embeddings with dimensionality: 384
End loading
Building the vectorstore  FAISS ...
Retriever built successfully!


100%|██████████| 574/574 [00:00<00:00, 30792.74it/s]


QA evaluation completed! Results saved to ./output/llama3_faiss_rerank_ms-marco-MiniLM-L-12-v2.csv
Running with rerank=True, rerank_model=ms-marco-MultiBERT-L-12, saving to ./output/llama3_faiss_rerank_ms-marco-MultiBERT-L-12.csv
Reranking is set to True.
Initializing the Hugging Face model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model initialized successfully!
Start loading QA from ./data/test/test_questions.csv
574
Loaded 574 QAs
Embeddings already exist! Loading embeddings with dimensionality: 384
End loading
Building the vectorstore  FAISS ...
Retriever built successfully!


100%|██████████| 574/574 [00:00<00:00, 28822.35it/s]


QA evaluation completed! Results saved to ./output/llama3_faiss_rerank_ms-marco-MultiBERT-L-12.csv
Running with rerank=False, saving to ./output/llama3_faiss_no_rerank.csv
Initializing the Hugging Face model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model initialized successfully!
Start loading QA from ./data/test/test_questions.csv
574
Loaded 574 QAs
Embeddings already exist! Loading embeddings with dimensionality: 384
End loading
Building the vectorstore  FAISS ...
Retriever built successfully!


  2%|▏         | 10/574 [00:03<02:35,  3.62it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 574/574 [03:06<00:00,  3.08it/s]

QA evaluation completed! Results saved to ./output/llama3_faiss_no_rerank.csv





In [None]:
strategy_number = 6  # Change this to 0, 1, 2, 3, 4, 5, or 6 to test other strategies
run_rag_strategy(strategy_number)

Initializing the Hugging Face model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model initialized successfully!
Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Embeddings already exist! Loading embeddings with dimensionality: 384
End loading
Building the vectorstore  FAISS ...
Retriever built successfully!


100%|██████████| 100/100 [00:24<00:00,  4.16it/s]


QA evaluation completed! Results saved to ./output/llama3_faiss_similarity_top3.csv
Initializing the Hugging Face model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model initialized successfully!
Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Embeddings already exist! Loading embeddings with dimensionality: 384
End loading
Building the vectorstore  FAISS ...
Retriever built successfully!


100%|██████████| 100/100 [00:24<00:00,  4.11it/s]

QA evaluation completed! Results saved to ./output/llama3_faiss_mmr_top3.csv





In [None]:
strategy_number = 7  # Change this to 0, 1, 2, 3, 4, 5, or 6 to test other strategies
run_rag_strategy(strategy_number)

Strategy 6: Compare with/without hypothesis generation.
Initializing the Hugging Face model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Device set to use cuda:0
  embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)


Model initialized successfully!


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Start loading QA from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 QAs
Embeddings already exist! Loading embeddings with dimensionality: 384
End loading
Building the vectorstore  FAISS ...
Retriever built successfully!


  6%|▌         | 6/100 [00:18<04:49,  3.08s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 100/100 [04:34<00:00,  2.75s/it]

QA evaluation completed! Results saved to ./output/llama3_faiss_test_hypo.csv





In [None]:
total_strategies = 8  # Total number of strategies (0 to 7)

for strategy in range(total_strategies):
  print(f"\n==============================")
  print(f"Running Strategy {strategy}...")
  print(f"==============================\n")
  run_rag_strategy(strategy)

In [3]:
!pip install ipywidgets


Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


In [4]:
!pip install nbclassic




In [7]:
!pip install nbstripout

Collecting nbstripout
  Downloading nbstripout-0.8.1-py2.py3-none-any.whl.metadata (19 kB)
Downloading nbstripout-0.8.1-py2.py3-none-any.whl (16 kB)
Installing collected packages: nbstripout
Successfully installed nbstripout-0.8.1


In [6]:
!jq -M 'del(.metadata.widgets)' /content/drive/My\ Drive/Colab\ Notebooks/nlp/pipeline/baseline_no_rag.ipynb > baseline_no_rag.ipynb

/bin/bash: line 1: jq: command not found
