# RAG Pipeline
#### Steps
1. Import, process, and split document base
2. Load, instantiate, and apply embedding model
3. Create vector store with index
4. Prompt the model and feed context.
5. Create and implement retriever (with optional reranking)
6. Parse responses

In [3]:
!pip install -q torch transformers transformers sentence-transformers tqdm openpyxl pandas datasets ragatouille
!pip install -q langchain langchain-huggingface langchain-community langchain-huggingface langchain-openai

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-cloud-bigquery 2.34.4 requires packaging<22.0dev,>=14.3, but you have packaging 24.2 which is incompatible.
jupyterlab 4.2.5 requires jupyter-lsp>=2.0.0, but you have jupyter-lsp 1.5.1 which is incompatible.
jupyterlab-lsp 5.1.0 requires jupyter-lsp>=2.0.0, but you have jupyter-lsp 1.5.1 which is incompatible.
kfp 2.5.0 requires google-cloud-storage<3,>=2.2.1, but you have google-cloud-storage 1.44.0 which is incompatible.
kfp 2.5.0 requires requests-toolbelt<1,>=0.8.0, but you have requests-toolbelt 1.0.0 which is incompatible.
libpysal 4.9.2 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.9.1 which is incompatible.
thinc 8.3.2 requires numpy<2.1.0,>=2.0.0; python_version >= "3.9", but you have numpy 1.26.4

In [4]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets
import os
import re
import torch

from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_community.llms import HuggingFaceHub
from langchain_huggingface import HuggingFaceEndpoint
from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import VectorStore
from langchain_core.language_models.llms import LLM
from langchain.document_loaders import PyPDFLoader, UnstructuredMarkdownLoader, UnstructuredHTMLLoader  # Assumes both loaders exist
from langchain.schema import Document
from langchain_core.language_models import BaseChatModel

from sklearn.model_selection import ParameterGrid

In [5]:
import transformers

In [6]:
#print(transformers.__version__)
#print(document_loaders.__version__)

In [7]:
# Read the Hugging Face API token from the notebook's Kaggle secrets so it
# never has to be written into the notebook itself.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# HF_TOKEN is passed to the hosted reader model further down.
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

In [8]:
def load_documents(file_path):
    """Load a PDF, Markdown, or HTML file with the matching LangChain loader.

    The loader is chosen from the file extension (case-insensitive).

    Args:
        file_path: Path to the document to load.

    Returns:
        Whatever the selected loader's ``load()`` returns (a list of
        documents for the PDF loader, as the notebook output shows).

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.md`` or ``.html``.
    """
    # Normalise once so ".PDF" and ".pdf" are treated alike.
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == '.pdf':
        loader = PyPDFLoader(file_path)
        print("Loading PDF document...")
    elif file_extension == '.md':
        loader = UnstructuredMarkdownLoader(file_path)
        print("Loading Markdown document...")
    elif file_extension == '.html':
        loader = UnstructuredHTMLLoader(file_path)
        print("Loading HTML document...")
    else:
        # Name every supported format so the message matches the branches above.
        raise ValueError(
            f"Unsupported file format {file_extension!r}. "
            "Please provide a PDF, Markdown, or HTML file."
        )

    # Debug output: which loader was picked and what it produced.
    print(f'{type(loader)}')
    documents = loader.load()
    print(f'{type(documents)}')
    return documents

In [9]:
# Smoke test: load the syllabus PDF to confirm the loader works end to end.
test = load_documents('/kaggle/input/course-bot-data/documents/rbain_syllabus.pdf')

Loading PDF document...
<class 'langchain_community.document_loaders.pdf.PyPDFLoader'>
<class 'list'>


In [10]:
# Display the first loaded document to eyeball its metadata and extracted text.
test[0]

Document(metadata={'source': '/kaggle/input/course-bot-data/documents/rbain_syllabus.pdf', 'page': 0}, page_content='I.\nCourse\nDescription\n1.\nCourse\nSummary\na.\nPHY\n161/PHYS\n215\nGeneral\nPhysics\nI\nis\nan\nalgebra-based\nintroduction\nto\nmechanics,\nthermodynamics,\nand\nwaves.\nTopics\ninclude\nmotion\nin\none\nand\ntwo\ndimensions,\nNewton’s\nlaws\nof\nmotion,\nequilibrium,\nwork,\nenergy,\nmomentum,\nrotational\nmotion,\ngravity,\nheat,\nwaves,\nand\nsound.\nExamples\nfrom\nmedicine\nand\nbiology\nwill\nbe\nincluded\nwhenever\npossible.\n2.\nCollege\nCredit\nHours\n(Dual-Enrollment)\na.\nThis\ncourse\nis\ndual\nenrolled\nwith\nPHYS\n215\nGeneral\nPhysics\nI\nat\nFrancis\nMarion\nUniversity\n(FMU)\nand\ntaught\nby\na\nGSSM\ninstructor.\nStudents\nwill\neach\nhave\na\nFMU\ntranscript\nwith\ntheir\noverall\ngrade\nearned\nin\nthis\ncourse.\nStudents\nmay\nearn\nup\nto\n4\ncollege\ncredit\nhours\ndepending\non\ntheir\ngrade\nand\nthe\ntransfer\npolicies\nof\ntheir\ncollege/un

In [11]:
# Split the knowledge base into tokenizer-sized chunks and drop repeated chunks
def split_documents(
    chunk_size: int,
    knowledge_base: List[Document],
    tokenizer_name: str,
) -> List[Document]:
    """Chunk every document in ``knowledge_base`` and de-duplicate the chunks.

    Args:
        chunk_size: Maximum chunk length handed to the splitter; the overlap
            between neighbouring chunks is a tenth of this value.
        knowledge_base: Documents to split.
        tokenizer_name: Hugging Face tokenizer used to build the splitter.

    Returns:
        The chunks in their original order, keeping only the first chunk for
        any given ``page_content``.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        # Try the coarsest separator first, falling back to finer ones.
        separators=["\n\n", "\n", ".", " ", ""],
    )

    # Split one source document at a time, collecting all chunks in order.
    all_chunks = []
    for source_doc in knowledge_base:
        all_chunks.extend(splitter.split_documents([source_doc]))

    # Keep the first occurrence of each distinct chunk text.
    seen_texts = set()
    unique_chunks = []
    for chunk in all_chunks:
        text = chunk.page_content
        if text in seen_texts:
            continue
        seen_texts.add(text)
        unique_chunks.append(chunk)

    return unique_chunks

In [12]:
# Method to load embeddings and create (or reload) the vector store
def load_embeddings(
    langchain_docs,
    chunk_size,
    embedding_model_name: Optional[str] = "thenlper/gte-small",
) -> FAISS:
    """Return a FAISS index for the documents, reusing an on-disk copy if present.

    Args:
        langchain_docs: Documents to index when no saved index exists.
        chunk_size: Chunk size passed to ``split_documents``; also part of
            the saved index's folder name.
        embedding_model_name: Hugging Face model used both for embedding and
            as the tokenizer name for splitting.

    Returns:
        The FAISS index, either loaded from ``./data/indexes/...`` or freshly
        built and saved there.
    """
    embedder = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        multi_process=True,
        # model_kwargs={"device": "cuda"} can be added here to pin a device
        encode_kwargs={"normalize_embeddings": True},  # set True to compute cosine similarity
    )

    # The folder name encodes chunk size and model only.
    # NOTE(review): it does not encode the document set, so a saved index is
    # reused even if langchain_docs changed - confirm that is intended.
    model_tag = embedding_model_name.replace('/', '~')
    index_dir = f"./data/indexes/index_chunk:{chunk_size}_embeddings:{model_tag}/"

    if not os.path.isdir(index_dir):
        print("Index not found, generating it...")
        chunks = split_documents(chunk_size, langchain_docs, embedding_model_name)
        built_index = FAISS.from_documents(
            chunks,
            embedder,
            distance_strategy=DistanceStrategy.COSINE,
        )
        built_index.save_local(index_dir)
        return built_index

    # Loading opts in to "dangerous deserialization": only point this at
    # index folders this notebook wrote itself.
    return FAISS.load_local(
        index_dir,
        embedder,
        distance_strategy=DistanceStrategy.COSINE,
        allow_dangerous_deserialization=True,
    )

In [13]:
# Prompt sent to the reader model. It has a system turn with the answering
# rules, a user turn carrying the retrieved context and the question, and an
# open assistant turn for the model to complete. The {context} and {question}
# placeholders are filled with str.format in answer_with_rag.
# NOTE(review): the <|system|>/<|user|>/<|assistant|> tags and </s> terminators
# look like the Zephyr chat format - confirm they suit whichever reader model
# is active.
RAG_PROMPT_TEMPLATE = """
<|system|>
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
</s>
<|assistant|>
"""

In [14]:
# Alternative reader: Zephyr 7B beta, published under the HuggingFaceH4 org
# (the original note credits Mistral AI - presumably for the base model; verify
# on the model card).
#repo_id = "HuggingFaceH4/zephyr-7b-beta"
#READER_MODEL_NAME = "zephyr-7b-beta"

# Active reader in this cell: Llama 3.1 8B Instruct from Meta
repo_id = "meta-llama/Llama-3.1-8B-Instruct"
READER_MODEL_NAME = "Llama-3.1-8B-Instruct"

# Generation parameters for the RAG Q&A LLM
reader_model_params = {
    "max_new_tokens": 512,
    "top_k": 30,
    "temperature": 0.1,
    "repetition_penalty": 1.03,
}

# Hosted text-generation model, authenticated with the Kaggle-stored token.
reader_llm = HuggingFaceHub(
    repo_id=repo_id,
    task="text-generation",
    huggingfacehub_api_token=HF_TOKEN,
    model_kwargs=reader_model_params,
)

  reader_llm = HuggingFaceHub(


In [15]:
def answer_with_rag(question: str, llm: LLM, knowledge_index: VectorStore, 
                    reranker: Optional[RAGPretrainedModel] = None, num_retrieved_docs: int = 30,
                    num_docs_final: int = 7,) -> Tuple[str, List[str]]:
    """Answer ``question`` with retrieval-augmented generation.

    Args:
        question: The user question.
        llm: Reader model; called through ``invoke`` with the final prompt.
        knowledge_index: Vector store queried with ``similarity_search``.
        reranker: Optional reranker; when given, its ``rerank`` output
            replaces the retrieval order.
        num_retrieved_docs: Number of chunks fetched from the vector store.
        num_docs_final: Maximum number of chunks placed in the prompt.

    Returns:
        ``(answer, relevant_docs)`` where ``relevant_docs`` is the list of
        chunk texts (plain strings) that were put into the prompt.
    """
    # Gather documents with the retriever and keep only their text
    retrieved = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in retrieved]

    # Optionally rerank results using the RAGatouille ColBERT model
    # (skipped when retrieval came back empty: there is nothing to rerank).
    if reranker is not None and relevant_docs:
        reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in reranked]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt. Documents are newline-separated so that one
    # document's text does not run straight into the next "Document i:::" header.
    context = "\nExtracted documents:\n"
    context += "\n".join(
        f"Document {i}:::\n{doc}" for i, doc in enumerate(relevant_docs)
    )

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Generate the answer; invoke() replaces the deprecated direct call llm(...)
    answer = llm.invoke(final_prompt)

    return answer, relevant_docs

In [16]:
# Method to run RAG on a set of questions, checkpointing results to JSON
def run_rag_tests(eval_dataset, llm, knowledge_index, output_file,
                  reranker: Optional[RAGPretrainedModel] = None, verbose: Optional[bool] = True,
                  test_settings: Optional[str] = None,):
    """Answer every question in ``eval_dataset`` and store the results as JSON.

    Results already present in ``output_file`` are kept, and their questions
    are skipped, so an interrupted run can be resumed.

    Args:
        eval_dataset: Iterable of examples with "question", "answer" and
            "source_doc" entries.
        llm: Reader model forwarded to ``answer_with_rag``.
        knowledge_index: Vector store forwarded to ``answer_with_rag``.
        output_file: Path of the JSON results file (read, then rewritten
            after every new answer).
        reranker: Optional reranker forwarded to ``answer_with_rag``.
        verbose: When truthy, print each question with both answers.
        test_settings: Optional description of the pipeline settings, stored
            with every new result.
    """
    # Load previous generations if they exist. A missing, unreadable or
    # malformed file simply means starting from scratch; anything else
    # (e.g. KeyboardInterrupt) is no longer swallowed.
    try:
        with open(output_file, "r", encoding="utf-8") as f:
            outputs = json.load(f)
    except (OSError, ValueError):
        outputs = []

    # Build the lookup of answered questions once instead of on every iteration.
    answered_questions = {output["question"] for output in outputs}

    # Loop over the q&a pairs in the eval dataset
    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in answered_questions:
            continue

        answer, relevant_docs = answer_with_rag(question, llm, knowledge_index, reranker=reranker)
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer,
            "retrieved_docs": list(relevant_docs),
        }
        # Optionally include the details about the model settings throughout the pipeline
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)
        answered_questions.add(question)

        # Rewrite the file after every answer so progress survives a crash.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(outputs, f)

In [17]:
# Knowledge base for the pipeline: the documents loaded from the syllabus PDF.
RAW_KNOWLEDGE_BASE = load_documents('/kaggle/input/course-bot-data/documents/rbain_syllabus.pdf')

Loading PDF document...
<class 'langchain_community.document_loaders.pdf.PyPDFLoader'>
<class 'list'>


In [18]:
from langchain_community.llms import HuggingFaceHub
from langchain_huggingface import HuggingFaceEndpoint

# Active reader in this cell: Zephyr 7B beta, published under the HuggingFaceH4
# org (the original note credits Mistral AI - presumably for the base model;
# verify on the model card). Running this cell rebinds repo_id,
# READER_MODEL_NAME and reader_llm.
repo_id = "HuggingFaceH4/zephyr-7b-beta"
READER_MODEL_NAME = "zephyr-7b-beta"

# Alternative reader: Llama 3.1 8B Instruct from Meta
#repo_id = 'meta-llama/Llama-3.1-8B-Instruct'
#READER_MODEL_NAME = 'Llama-3.1-8B-Instruct'

# Generation parameters for the RAG Q&A LLM
reader_model_params = {
    "max_new_tokens": 512,
    "top_k": 30,
    "temperature": 0.1,
    "repetition_penalty": 1.03,
}

# Hosted text-generation model, authenticated with the Kaggle-stored token.
reader_llm = HuggingFaceHub(
    repo_id=repo_id,
    task="text-generation",
    huggingfacehub_api_token=HF_TOKEN,
    model_kwargs=reader_model_params,
)

In [19]:
# Prepare data set of questions
questions = pd.read_json('/kaggle/input/course-bot-data/documents/rbain_syllabus_questions.json')
question = questions.iloc[0].question
question

'What topics are included in this course?'

In [20]:
# Disabled draft of the hyperparameter grid, kept as a bare string literal so
# it does not run; evaluating the cell only echoes the string.
'''
hyperparameters = {
    "chunk_size": [200],  # Add other chunk sizes as needed
    "embeddings": ["thenlper/gte-small"],  # Add other embeddings as needed
    "rerank": [True, False],
}

# Generate all combinations of hyperparameters
for params in ParameterGrid(hyperparameters):
    chunk_size = params["chunk_size"]
    embeddings = params["embeddings"]
    rerank = params["rerank"]
'''

'\nhyperparameters = {\n    "chunk_size": [200],  # Add other chunk sizes as needed\n    "embeddings": ["thenlper/gte-small"],  # Add other embeddings as needed\n    "rerank": [True, False],\n}\n\n# Generate all combinations of hyperparameters\nfor params in ParameterGrid(hyperparameters):\n    chunk_size = params["chunk_size"]\n    embeddings = params["embeddings"]\n    rerank = params["rerank"]\n'

In [21]:
def parse_question_answer(rag_output):
    """Extract the generated answer from the reader model's raw output.

    When the output echoes the prompt, the answer is everything after the
    first line that starts with ``<|assistant|>`` (text on the marker line
    itself included). When there is no marker, the whole output is treated
    as the answer. Lines are stripped and joined with single spaces, and
    blank lines are dropped.

    Args:
        rag_output: Raw text returned by the reader model.

    Returns:
        The answer as a single-line string, or ``None`` if ``rag_output`` is
        empty or ``None``.
    """
    if not rag_output:
        return None

    marker = "<|assistant|>"
    lines = rag_output.splitlines()

    # Use enumerate for the marker position: looking the stripped line up
    # with list.index() fails when the raw line carries extra whitespace.
    for position, raw_line in enumerate(lines):
        stripped = raw_line.strip()
        if stripped.startswith(marker):
            answer_parts = [stripped[len(marker):]] + lines[position + 1:]
            break
    else:
        # No prompt echo: the model returned only its completion.
        answer_parts = lines

    # Return the answer - the value the caller stores under "answer".
    return " ".join(part.strip() for part in answer_parts if part.strip())

In [23]:
def process_questions_with_answers(json_file_path, output_file_path, llm, knowledge_index,
                                   reranker=None):
    """Answer every question in a JSON file and write the results to a new file.

    Args:
        json_file_path: Path to a JSON list of objects with a "question" key.
        output_file_path: Where the same list, with an "answer" added to each
            object that has a question, is written.
        llm: Reader model forwarded to ``answer_with_rag``.
        knowledge_index: Vector store forwarded to ``answer_with_rag``.
        reranker: Optional reranker forwarded to ``answer_with_rag``. The
            default ``None`` keeps the previous behaviour (no reranking).
    """
    with open(json_file_path, "r", encoding="utf-8") as file:
        questions = json.load(file)

    for question_obj in questions:
        question = question_obj.get("question", "")
        if not question:
            # Entries without a question are written back unchanged.
            continue
        # answer_with_rag returns (answer, retrieved_docs); only the answer is kept.
        raw_answer = answer_with_rag(question, llm, knowledge_index, reranker=reranker)[0]
        question_obj["answer"] = parse_question_answer(raw_answer)

    with open(output_file_path, "w", encoding="utf-8") as file:
        json.dump(questions, file, indent=4)


In [None]:
# Make sure the results folder exists (no error if it is already there).
os.makedirs("./output", exist_ok=True)

# Question set answered for every hyperparameter combination.
input_file = "/kaggle/input/course-bot-data/documents/rbain_syllabus_questions.json"  # Replace with your input file path

hyperparameters = {
    "chunk_size": [200],  # Add other chunk sizes (in tokens) as needed
    "embeddings": ["thenlper/gte-small"],  # Add other embeddings as needed
    "rerank": [True, False],
}

# ParameterGrid yields every combination of the hyperparameters above.
for params in ParameterGrid(hyperparameters):
    chunk_size = params["chunk_size"]
    embeddings = params["embeddings"]
    rerank = params["rerank"]

    settings_name = f"chunk:{chunk_size}_embeddings:{embeddings.replace('/', '~')}_rerank:{rerank}_reader-model:{READER_MODEL_NAME}"
    output_file_name = f"./output/rag_{settings_name}.json"

    print(f"Running evaluation for {settings_name}:")

    print("Creating embeddings...")
    knowledge_index = load_embeddings(
        RAW_KNOWLEDGE_BASE,
        chunk_size=chunk_size,
        embedding_model_name=embeddings,
    )

    print("Implementing RAG...")
    reranker = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0") if rerank else None
    # NOTE(review): the reranker is built but not forwarded by the call below,
    # so the two "rerank" settings are answered the same way - forward it when
    # the call supports a reranker argument.

    # Write one results file per configuration, so a later configuration no
    # longer overwrites the answers of an earlier one.
    output_file = output_file_name

    process_questions_with_answers(input_file, output_file, reader_llm, knowledge_index)
    #answer_with_rag(question, reader_llm, knowledge_index)
    #run_rag_tests(
    #    eval_dataset=eval_dataset,
    #    llm=reader_llm,
    #    knowledge_index=knowledge_index,
    #    output_file=output_file_name,
    #    reranker=reranker,
    #    verbose=False,
    #    test_settings=settings_name,
    #)