In [3]:
# Mount Google Drive inside the Colab runtime at /content/drive so the notebook
# can read and write files stored on Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Change the working directory to the project folder on Drive
# (My Drive/Colab Notebooks/nlp); the relative ./data/... paths used later in
# this notebook are resolved against it.
%cd /content/drive/My\ Drive/Colab\ Notebooks/nlp

/content/drive/My Drive/Colab Notebooks/nlp


# RAG Pipeline


In [3]:
!pip install faiss-cpu
!pip install faiss-gpu-cu12 # CUDA 12.x, Python 3.8+
!pip install dotenv
!pip install langchain_chroma

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
[31mERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu[0m[31m
[0mCollecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting python-dotenv (from dotenv)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, dotenv
Successfully installed dotenv-0.9.9 python-dotenv-1.0.1
Collecting 

In [5]:
!pip install langchain-community
!pip install langchain_experimental
!pip install langchain_openai

Collecting langchain-community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-no

In [6]:
def load_text_files(path):
    """
    Load text files from the given path.

    Args:
    path (str or os.PathLike): Path to a directory containing ``.txt`` files,
        or to a single ``.txt`` file.

    Returns:
    list: The content of each text file as a string. For a directory, files are
        read in sorted file-name order and sub-directories are not searched.
        An empty list is returned when the path does not exist, is not a
        ``.txt`` file, or the directory holds no ``.txt`` files.
    """
    # Accept pathlib.Path as well as str; the suffix checks below need a str.
    path = os.fspath(path)

    docs = []

    if os.path.isdir(path):
        # os.listdir returns entries in arbitrary order; sorting makes the
        # document order (and everything derived from it) reproducible.
        for file_name in sorted(os.listdir(path)):
            file_path = os.path.join(path, file_name)
            # Skip non-.txt entries and directories whose name ends in ".txt".
            if file_name.endswith(".txt") and os.path.isfile(file_path):
                with open(file_path, 'r', encoding='utf-8') as file:
                    docs.append(file.read())
    elif os.path.isfile(path) and path.endswith(".txt"):
        # The path is a single text file: read it directly.
        with open(path, 'r', encoding='utf-8') as file:
            docs.append(file.read())

    return docs

def format_retreived_docs(docs):
    """
    Format the retrieved documents by the following format:
    Context 1: <Document 1>
    Context 2: <Document 2>
    ...

    The documents are numbered in reverse retrieval order (the last retrieved
    document becomes "Context 1").

    Args:
    docs (iterable): Retrieved documents. Each item is either a plain string or
        an object with a ``page_content`` attribute (e.g. a LangChain Document).

    Returns:
    str: The formatted contexts separated by blank lines ("" for no documents).
    """
    # Use the document text itself: formatting a Document object directly
    # would embed its "page_content='...' metadata=..." string form in the prompt.
    texts = [getattr(doc, "page_content", doc) for doc in reversed(list(docs))]
    return "\n\n".join(
        f"Context {number}: {text}" for number, text in enumerate(texts, start=1))


def rerank_docs(query, retriever, rerank_model_name, k=3):
    """
    Rerank the retrieved documents based on the query using Flashrank.

    Args:
    query (str): The query used both for retrieval and for reranking.
    retriever: Base retriever whose results are reranked.
    rerank_model_name (str): Flashrank model name, e.g. "ms-marco-MultiBERT-L-12".
    k (int): Number of documents to keep after reranking.

    Returns:
    The documents returned by the compression retriever for ``query``.
    """
    # Imported here because the notebook's import cell does not provide these
    # names; without them this function raises NameError when rerank is enabled.
    # NOTE(review): FlashrankRerank presumably also needs the `flashrank`
    # package installed, which no install cell in this notebook covers - confirm.
    from langchain.retrievers import ContextualCompressionRetriever
    from langchain_community.document_compressors import FlashrankRerank

    # DEFAULT_MODEL_NAME = "ms-marco-MultiBERT-L-12"
    compressor = FlashrankRerank(top_n=k, model=rerank_model_name)
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=retriever)

    compressed_docs = compression_retriever.invoke(query)

    return compressed_docs

def get_hypo_doc(query, generation_pipe):
    """
    Generate a hypothesis document for the given query using the language model.

    Args:
    query (str): The original question.
    generation_pipe: Callable text-generation pipeline. It is called with a list
        of chat messages and must return a list whose first item is a dict with
        a "generated_text" string.

    Returns:
    str: The generated hypothesis document, or the original ``query`` when the
        model signals that it cannot answer (output beginning with "Unavailable").
    """
    template = """Imagine you are an expert providing a detailed and factual explanation in response to the query '{query}'.
    Your response should include all key points that would be found in a top search result, without adding any personal opinions, commentary, or experiences.
    Do not include any subjective phrases such as 'I think', 'I believe', or 'I am not sure'. Do not apologize, hedge, or express uncertainty.
    The response should be structured as an objective, factual explanation only, without any conversational elements or chatting.
    If you are truly uncertain and cannot provide an accurate answer, simply respond with: 'Unavailable: {query}'.
    Otherwise, answer confidently with only the relevant information.
    """

    messages = [
        {"role": "user", "content": template.format(query=query)}
    ]

    with torch.no_grad():
        hypo_doc = generation_pipe(messages, max_new_tokens=100, return_full_text=False)[0]["generated_text"]

    print("Question:", query)
    print("Hypothesis Document:", hypo_doc)

    # The prompt shows the sentinel wrapped in quotes, and generations may start
    # with whitespace, so ignore leading whitespace/quotes before checking for it.
    if hypo_doc.lstrip(" \t\r\n'\"").startswith("Unavailable"):
        print("Using the original query.")
        return query
    return hypo_doc


def answer_generation(
    qa_df, output_file, retriever, generation_pipe,
    prompt, rerank, rerank_model_name, hypo, top_k_rerank=3):
    """
    Generate an answer for every question in ``qa_df`` and append the results
    to a CSV file, one row per question.

    The run is resumable: if ``output_file`` already holds results, the number
    of records already written is counted and that many leading rows of
    ``qa_df`` are skipped.

    Args:
    qa_df (pandas.DataFrame): Questions to answer; must have a "Question" column.
    output_file (str): Path of the CSV file results are appended to. Its columns
        are the columns of ``qa_df`` followed by "Generated_Answer".
    retriever: Retriever object; ``retriever.invoke(query)`` returns documents.
    generation_pipe: Callable text-generation pipeline invoked with chat messages.
    prompt: Prompt template; ``prompt.format_messages(context=..., question=...)``
        must return messages that expose ``.content``.
    rerank (bool): Whether to rerank the retrieved documents with ``rerank_docs``.
    rerank_model_name (str): Model name passed to ``rerank_docs``.
    hypo (bool): Whether to retrieve with a generated hypothesis document
        (``get_hypo_doc``) instead of the raw question.
    top_k_rerank (int): Number of documents kept after reranking.

    Returns:
    None. Results are written to ``output_file``.
    """
    import csv  # only needed here, to count already-written CSV records

    print("Generating answers for the questions...")

    header = list(qa_df.columns) + ["Generated_Answer"]

    # Work out how many rows were already answered by a previous run. Records
    # are counted with the csv module rather than by physical lines, because a
    # quoted field (e.g. a multi-line answer) can span several lines.
    rows_done = 0
    needs_header = True
    if os.path.exists(output_file):
        with open(output_file, 'r', newline='', encoding='utf-8') as f_in:
            num_records = sum(1 for record in csv.reader(f_in) if record)
        if num_records > 0:
            needs_header = False
            rows_done = num_records - 1  # the first record is the header

    with open(output_file, 'a', newline='', encoding='utf-8') as f_out:
        if needs_header:
            # Write the header with pandas so it is quoted like the data rows.
            pd.DataFrame(columns=header).to_csv(f_out, index=False)

        # Skip by position, not by index label: the labels of a sampled or
        # filtered DataFrame are arbitrary and unrelated to the written-row count.
        for position, (_, row) in enumerate(tqdm(qa_df.iterrows(), total=len(qa_df))):
            if position < rows_done:
                continue
            query = row["Question"]

            if hypo:
                query = get_hypo_doc(query, generation_pipe)

            # Retrieve documents based on the (possibly rewritten) query
            if rerank:
                print("Reranking documents...")
                retrieved_docs = rerank_docs(query, retriever, rerank_model_name, k=top_k_rerank)
            else:
                retrieved_docs = retriever.invoke(query)

            # Format the documents
            context = format_retreived_docs(retrieved_docs)

            # The final prompt always uses the original question, even when a
            # hypothesis document was used for retrieval.
            prompt_messages = prompt.format_messages(context=context, question=row["Question"])
            full_prompt = "\n".join(message.content for message in prompt_messages)

            messages = [
                {"role": "user", "content": full_prompt},
            ]
            with torch.no_grad():
                llm_output = generation_pipe(
                    messages, max_new_tokens=20, return_full_text=False)[0]["generated_text"]

            answered_row = row.copy()
            answered_row["Generated_Answer"] = llm_output
            pd.DataFrame([answered_row]).to_csv(f_out, header=False, index=False)
            # Flush after every row so an interrupted run can be resumed.
            f_out.flush()

            # Clear cache after generation
            del retrieved_docs, context, prompt_messages, full_prompt, messages, llm_output
            torch.cuda.empty_cache()

# ========================================
# Constants and Configuration
# ========================================

# Prompt used for the answer-generation step. The main script wraps it in a
# PromptTemplate with input variables 'context' and 'question', so {context}
# and {question} are the only placeholders. The string is not a raw string, so
# every "\n" written below becomes an actual newline in the rendered prompt.
# NOTE(review): the opening paragraph asks for a "detailed and helpful answer"
# while the instructions demand only a short word or phrase - confirm which
# behaviour is intended.
PROMPT_TEMPLATE = """
You are an expert assistant answering factual questions about Pittsburgh or Carnegie Mellon University (CMU).
Use the retrieved information to give a detailed and helpful answer. If the provided context does not contain the answer, leverage your pretraining knowledge to provide the correct answer.
If you truly do not know, just say "I don't know."

Important Instructions:
- Answer concisely without repeating the question.
- Use the provided context if relevant; otherwise, rely on your pretraining knowledge.
- Do **not** use complete sentences. Provide only the word, name, date, or phrase that directly answers the question. For example, given the question "When was Carnegie Mellon University founded?", you should only answer "1900".

Examples:
Question: Who is Pittsburgh named after?
Answer: William Pitt
Question: What famous machine learning venue had its first conference in Pittsburgh in 1980?
Answer: ICML
Question: What musical artist is performing at PPG Arena on October 13?
Answer: Billie Eilish

Context: \n\n {context} \n\n
Question: {question} \n\n
Answer:
"""

In [7]:
import os
import torch
import pandas as pd
from tqdm import tqdm

import faiss
import numpy as np
import pickle, random

from dotenv import load_dotenv
from huggingface_hub import login
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    PromptTemplate
)

# ========================================
# Vars that can be set and read from another var file.
# ========================================

def str2bool(value):
    """
    Interpret a yes/no style string as a boolean.

    Args:
    value (bool or str): A bool (returned unchanged) or a case-insensitive
        string such as "yes", "true", "t", "y", "1" / "no", "false", "f", "n", "0".

    Returns:
    bool: The boolean the value stands for.

    Raises:
    ValueError: If the string is not one of the recognised spellings.
    """
    if isinstance(value, bool):
        return value

    truthy = ('yes', 'true', 't', 'y', '1')
    falsy = ('no', 'false', 'f', 'n', '0')

    token = value.lower()
    if token not in truthy + falsy:
        raise ValueError('Boolean value expected.')
    return token in truthy

# ========================================
# Main Script Execution
# ========================================
if __name__ == "__main__":

    # Step 0: Load environment variables
    load_dotenv()

    # os.environ only accepts strings, so assigning a missing key (None) would
    # raise TypeError. Enable LangSmith tracing only when a key is available.
    langchain_api_key = os.getenv('LANGCHAIN_API_KEY')
    if langchain_api_key:
        os.environ["LANGCHAIN_TRACING_V2"] = "true"
        os.environ["LANGCHAIN_API_KEY"] = langchain_api_key
        os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
        os.environ["LANGCHAIN_PROJECT"] = "RAGmodel"
    else:
        print("LANGCHAIN_API_KEY is not set; LangSmith tracing is disabled.")
    os.environ["USER_AGENT"] = "LangChain/1.0 (+https://www.langchain.com)"

    login(token = os.getenv('HUGGINGFACE_TOKEN'))

    # Set model name, precision, and other parameters directly
    model_name = "meta-llama/Llama-3.1-8B-Instruct"
    dtype = torch.float16  # or torch.bfloat16
    embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embedding_dim = 384
    splitter_type = "recursive"  # or "character", "token", "semantic"
    chunk_size = 1000
    chunk_overlap = 200
    text_files_path = "./data/scraped/scraped_all"
    sublink_files_path = "./data/scraped/scraped_text_data"
    sublink_files_nums = 0
    qes_file_path = "./data/annotated/QA_pairs_1.csv"
    top_k_search = 3
    retriever_type = "FAISS"  # or "CHROMA"
    retriever_algorithm = "similarity"  # or "mmr"
    rerank = False
    top_k_rerank = 3
    rerank_model_name = "ms-marco-MultiBERT-L-12"
    hypo = False
    output_file = "output_results.txt"  # Set your desired output file path
    qa_nums = 100
    random.seed(42)

    # Validate the configuration before any expensive work (model download,
    # embedding) so that a typo fails immediately with a clear error instead of
    # a NameError much later.
    if splitter_type not in ("recursive", "character", "token", "semantic"):
        raise ValueError(
            "Invalid splitter type. Please choose between recursive, character, token, or semantic.")
    if retriever_type not in ("FAISS", "CHROMA"):
        raise ValueError("Invalid retriever type. Please choose between FAISS or CHROMA.")

    # check if rerank is set to True
    if rerank:
        print("Reranking is set to True.")

    # Step 1: Initialize the Hugging Face model as your LLM
    print("Initializing the Hugging Face model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=dtype, device_map="cuda:0")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    generation_pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=dtype
    )
    print("Model initialized successfully!")

    # Step 2: Load the Sentence Transformers model for embeddings
    # The cache file names encode every setting that affects the chunks, so a
    # different configuration never reuses stale embeddings.
    docs_length = f"main160_sublink{sublink_files_nums}"
    model_name_str = embedding_model_name.split('/')[-1]
    embeddings_file_path = f"./data/embeddings/embeddings_{model_name_str}_{docs_length}_{splitter_type}_{chunk_size}_{chunk_overlap}.npy"
    splits_file_path = f"./data/embeddings/splits_{model_name_str}_{docs_length}_{splitter_type}_{chunk_size}_{chunk_overlap}.pkl"
    embeddings = None
    splits = None
    embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
    print(f"Start loading QA from {qes_file_path}")
    qa_test_data_path = qes_file_path
    qa_df = pd.read_csv(qa_test_data_path)
    print(len(qa_df))
    # NOTE(review): 574 is presumably the size of the full evaluation set, which
    # is answered in its entirety; any other file is down-sampled - confirm.
    if len(qa_df) != 574:
        # Never request more rows than exist (sample() would raise ValueError).
        qa_df = qa_df.sample(min(qa_nums, len(qa_df)), random_state=221)
    print(f"Loaded {len(qa_df)} QAs")

    # The cache is usable only when both the embeddings and the splits exist.
    cache_ready = os.path.exists(embeddings_file_path) and os.path.exists(splits_file_path)
    if not cache_ready:
        # Make sure the cache directory exists before anything is written to it.
        os.makedirs(os.path.dirname(embeddings_file_path), exist_ok=True)

        # Step 3: load the text files for building the index and qa evaluation
        print(f"Start loading texts from {text_files_path}")
        docs = load_text_files(path=text_files_path)
        # Step 4: Split the documents into smaller chunks
        # Wrap text strings in Document objects
        documents = []
        for text in tqdm(docs, desc="wrapping text in Document objects"):
            documents.append(Document(page_content=text))
        del docs

        if sublink_files_nums != 0:
            all_sublink_docs = None
            sublink_file_store_path = "./data/embeddings/sublink_docs.pkl"
            if os.path.exists(sublink_file_store_path):
                print(f"Start loading sublink files from {sublink_file_store_path}")
                # NOTE: pickle.load executes arbitrary code; only load cache
                # files that this notebook wrote itself.
                with open(sublink_file_store_path, "rb") as f:
                    all_sublink_docs = pickle.load(f)
            else:
                print(f"Start reading all sublink files")
                all_sublink_docs = load_text_files(path=sublink_files_path)
                print(f"Finish loading {len(all_sublink_docs)} sublinks, now store it")
                with open(sublink_file_store_path, 'wb') as f:
                    pickle.dump(all_sublink_docs, f)
                print(f"Store all sublink file in {sublink_file_store_path}")

            # Never request more documents than exist (random.sample would raise).
            sampled_sublink_docs = random.sample(
                all_sublink_docs, min(sublink_files_nums, len(all_sublink_docs)))
            for text in tqdm(sampled_sublink_docs, desc="wrapping text in Document objects"):
                documents.append(Document(page_content=text))
            del sampled_sublink_docs
            del all_sublink_docs

        if splitter_type == "recursive":
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        elif splitter_type == "character":
            text_splitter = CharacterTextSplitter(
                separator=" ",
                chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        elif splitter_type == "token":
            # chunk_size / chunk_overlap are given in characters; the token
            # splitter gets a quarter of each value.
            text_splitter = TokenTextSplitter(
                chunk_size=int(chunk_size / 4),
                chunk_overlap=int(chunk_overlap / 4))
        else:  # "semantic" - splitter_type was validated above
            text_splitter = SemanticChunker(
                embeddings=embedding_model,
                breakpoint_threshold_type="percentile",
                breakpoint_threshold_amount=80)

        splits = text_splitter.split_documents(documents)
        del documents
        print(f"End Spliting texts -- Number of splits: {len(splits)}")

        # Step 5: Embed every chunk with the Sentence Transformers model
        embeddings = embedding_model.embed_documents(
            [doc.page_content for doc in tqdm(splits, desc="Embedding texts")])
        print(f"End Embedding texts")
        # Free GPU cache after generating embeddings
        torch.cuda.empty_cache()
        print(f"Start Saving embeddings and splits")
        np.save(embeddings_file_path, embeddings)
        with open(splits_file_path, 'wb') as f:
            pickle.dump(splits, f)
        doc_metadata = [doc.metadata for doc in splits]  # Save metadata for documents
        np.save(f"./data/embeddings/metadata_{docs_length}_{splitter_type}_{chunk_size}_{chunk_overlap}.npy", doc_metadata)
        print(f"embeddings saved in {embeddings_file_path}, splits saved in {splits_file_path}")
    else:
        print("Embeddings already exists! loading embeddings")
        # Load the cached embeddings and the chunks they were computed from
        embeddings = np.load(embeddings_file_path)
        # NOTE: pickle.load executes arbitrary code; only load cache files that
        # this notebook wrote itself.
        with open(splits_file_path, 'rb') as f:
            splits = pickle.load(f)
        print("end loading")

    # Step 6: Create the RAG prompting pipeline
    prompt_template = PromptTemplate(
        input_variables=['context', 'question'],
        template=PROMPT_TEMPLATE
    )

    # Wrap the template as the single human message of a chat prompt
    human_message_template = HumanMessagePromptTemplate(prompt=prompt_template)

    chat_prompt_template = ChatPromptTemplate(
        input_variables=['context', 'question'],
        messages=[human_message_template]
    )

    prompt = chat_prompt_template

    # Step 7: Build the retriever and generate answers for the questions
    print("Building the vectorstore...")
    if retriever_type == "CHROMA":
        print("Building the vectorstore Chroma...")
        vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model, collection_name="collectionChroma")
        chroma_retriever = vectorstore.as_retriever(search_type=retriever_algorithm, search_kwargs={'k': top_k_search})
        retriever = chroma_retriever
    else:  # "FAISS" - retriever_type was validated above
        print("Building FAISS...")
        if len(embeddings) != len(splits):
            raise ValueError(
                f"Cached embeddings ({len(embeddings)}) and splits ({len(splits)}) are out of sync; "
                f"delete {embeddings_file_path} and {splits_file_path} to rebuild them.")
        # Build the index from the embeddings computed/loaded above instead of
        # embedding every chunk a second time.
        text_embedding_pairs = zip((doc.page_content for doc in splits), embeddings)
        vectorstore = FAISS.from_embeddings(
            text_embeddings=text_embedding_pairs,
            embedding=embedding_model,
            metadatas=[doc.metadata for doc in splits])
        faiss_retriever = vectorstore.as_retriever(search_type=retriever_algorithm, search_kwargs={"k": top_k_search})
        retriever = faiss_retriever

    print("Retriever built successfully!")
    torch.cuda.empty_cache()
    del splits

    answer_generation(
        qa_df, output_file, retriever,
        generation_pipe, prompt, rerank, rerank_model_name, hypo, top_k_rerank=top_k_rerank)

    print(f"QA evaluation completed! Results saved to {output_file}")

Initializing the Hugging Face model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Device set to use cuda:0
  embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)


Model initialized successfully!


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Start loading qa from ./data/annotated/QA_pairs_1.csv
3938
Loaded 100 qas
Start loading texts from ./data/scraped/scraped_all


wrapping text in Document objects: 100%|██████████| 172/172 [00:00<00:00, 145125.79it/s]


End Spliting texts -- Number of splits: 5326


Embedding texts: 100%|██████████| 5326/5326 [00:00<00:00, 2470293.39it/s]


End Embedding texts
Start Saving embeddings and splits
embeddings saved in ./data/embeddings/embeddings_all-MiniLM-L6-v2_main160_sublink0_recursive_1000_200.npy, splits saved in ./data/embeddings/splits_all-MiniLM-L6-v2_main160_sublink0_recursive_1000_200.pkl
Building the vectorstore...
Building FAISS...
Retriever built successfully!
Generating answers for the questions...


 10%|█         | 10/100 [00:03<00:24,  3.67it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 100/100 [00:31<00:00,  3.21it/s]

QA evaluation completed! Results saved to output_results.txt



