# Chatbot with Retrieval Augmented Generation (RAG)

## Install

In [11]:
!pip install faiss-cpu --quiet
!pip install sentence-transformers --quiet
!pip install transformers --quiet
print('Libraries installed.')

Libraries installed.


## Import

In [12]:
import os
import re
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from torch.utils.data import DataLoader
from tqdm.notebook import trange, tqdm
print('Libraries imported.')

Libraries imported.


In [13]:
# Load data
DATA_DIR = "/kaggle/input/sertis-datascience-2025-chatbot-rag-data"

def load_data():
    documents_df = pd.read_csv(os.path.join(DATA_DIR, "documents.csv"))
    single_questions_df = pd.read_csv(os.path.join(DATA_DIR, "single_passage_answer_questions.csv"))
    multi_questions_df = pd.read_csv(os.path.join(DATA_DIR, "multi_passage_answer_questions.csv"))
    no_answer_questions_df = pd.read_csv(os.path.join(DATA_DIR, "no_answer_questions.csv"))
    print('Data loaded successfully.')
    return documents_df, single_questions_df, multi_questions_df, no_answer_questions_df
documents_df, single_questions_df, multi_questions_df, no_answer_questions_df = load_data()

Data loaded successfully.


## Chunking

In [14]:
CHUNK_SIZE = 300  # number of words per chunk

# def chunk_text(text: str, chunk_size: int = 300) -> list:
#     """
#     Split the text into a list of chunks, each containing up to `chunk_size` words.
    
#     :param text: The text to be chunked.
#     :param chunk_size: Number of words per chunk.
#     :return: List of text chunks.
#     """
#     words = text.split()
#     chunks = []
#     for i in range(0, len(words), chunk_size):
#         chunk = words[i:i+chunk_size]
#         chunks.append(" ".join(chunk))
#     return chunks

def chunk_paragraphs(text: str, max_tokens: int = 300) -> list:
    """
    Break text into paragraph-based chunks with a maximum token (word) limit.
    """
    paragraphs = text.split("\n\n")
    chunks = []
    current_chunk = []
    current_length = 0
    
    for paragraph in paragraphs:
        paragraph_length = len(paragraph.split())
        # If adding the paragraph exceeds the limit, start a new chunk
        if current_length + paragraph_length > max_tokens and current_chunk:
            chunks.append("\n\n".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(paragraph)
        current_length += paragraph_length
    
    # Append any leftover text
    if current_chunk:
        chunks.append("\n\n".join(current_chunk))
    return chunks

# For each document, chunk the text
all_chunks = []
doc_ids = []
for idx, row in documents_df.iterrows():
    print(f"Chunking doc from {row['source_url']}")
    doc_id = row["index"]
    text = row["text"]
    # chunks = chunk_text(text, chunk_size=CHUNK_SIZE)
    chunks = chunk_paragraphs(text, max_tokens=CHUNK_SIZE)
    for chunk in chunks:
        all_chunks.append(chunk)
        doc_ids.append(doc_id)

Chunking doc from https://enterthegungeon.fandom.com/wiki/Bullet_Kin
Chunking doc from https://www.dropbox.com/scl/fi/ljtdg6eaucrbf1aksw5rm/c2%20-%20session%2050%20-%20underground.docx?rlkey=ioqwgkd14i5xk20i3fp38nzgs&e=1&dl=0
Chunking doc from https://bytes-and-nibbles.web.app/bytes/stici-note-part-1-planning-and-prototyping
Chunking doc from https://github.com/llmware-ai/llmware
Chunking doc from https://docs.marimo.io/recipes.html
Chunking doc from https://towardsdatascience.com/how-to-maximize-your-impact-as-a-data-scientist-3881995a9cb1
Chunking doc from https://ec.europa.eu/commission/presscorner/detail/en/QANDA_21_1683
Chunking doc from https://bg3.wiki/wiki/The_Emperor
Chunking doc from https://whattocook.substack.com/p/so-into-northern-spain
Chunking doc from https://dmtalkies.com/the-zone-of-interest-ending-explained-and-summary-2023-film/
Chunking doc from https://www.loonyparty.com/about/policy-proposals/
Chunking doc from https://timdettmers.com/2023/01/30/which-gpu-for-dee

## Embedding
"BAAI/bge-large-en" 
ref. https://huggingface.co/BAAI/bge-large-en

In [15]:
# EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# EMBED_DIM = 384   # dimension for 'all-MiniLM-L6-v2' embeddings
EMBEDDING_MODEL_NAME = "BAAI/bge-large-en"
EMBED_DIM = 1024   # dimension for 'BAAI/bge-large-en' embeddings

embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print(f"Load model {EMBEDDING_MODEL_NAME} successfully.")

# Compute embeddings for all chunks
print("Computing embeddings for chunks...")
chunk_embeddings = embedding_model.encode(all_chunks, show_progress_bar=True)
chunk_embeddings = np.array(chunk_embeddings, dtype="float32")
print("Embeddings computed successfully.")

Load model BAAI/bge-large-en successfully.
Computing embeddings for chunks...


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Embeddings computed successfully.


## FAISS Vector Index

In [16]:
index = faiss.IndexFlatL2(EMBED_DIM)
index.add(chunk_embeddings)
print(f"FAISS index size: {index.ntotal}")

# Keep a lookup of chunk_id -> text
chunk_id_to_text = {i: chunk for i, chunk in enumerate(all_chunks)}
chunk_id_to_docid = {i: doc_ids[i] for i in range(len(doc_ids))}

FAISS index size: 321


## Generation Model
"google/flan-t5-large"
ref. https://huggingface.co/google/flan-t5-large

In [17]:
# GENERATION_MODEL_NAME = "google/flan-t5-small"
GENERATION_MODEL_NAME = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GENERATION_MODEL_NAME)
def generate_answer(context: str, question: str, 
                    max_length: int = 128) -> str:
    """
    Use a seq2seq model to generate an answer from the given context and question.
    
    :param context: Relevant textual context.
    :param question: The question posed by the user.
    :param max_length: Maximum tokens for generation.
    :return: Generated answer string.
    """
    # Create a simple prompt
    prompt = (
        "You are an AI assistant. Provide a helpful, concise answer.\n\n"
        "Context:\n"
        f"{context}\n\n"
        "Question:\n"
        f"{question}\n\n"
        "Instructions: Provide a direct and accurate answer. Avoid speculation.\n"
        "Answer:"
    )
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = gen_model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

## RAG Retrieval

In [18]:
def retrieve_top_k(query: str, top_k: int = 3, distance_threshold: float = 1.0) -> list:
    """
    Retrieve the top_k chunks from the FAISS index for a given query.
    
    :param query: The user query/question.
    :param top_k: Number of chunks to retrieve.
    :return: A list of (score, chunk_text) sorted by ascending distance.
    """
    query_embedding = embedding_model.encode([query]).astype("float32")
    distances, indices = index.search(query_embedding, top_k)
    
    # If best distance is above threshold, we conclude no answer is found
    if distances[0][0] > distance_threshold:
        return "No answer found in the knowledge base."
        
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        chunk_text = chunk_id_to_text[idx]
        results.append((dist, chunk_text))
    return results

GUARDRAILS_KEYWORDS = [
    "politics",
    "terrorism",
    "violence",
    "hate speech",
    "abuse",
    "amphetamines",
]
def is_guarded(query: str) -> bool:
    """
    Check if the query contains guarded keywords.
    Naive keyword-based.
    
    :param query: The input user query.
    :return: True if the query is guarded (contains certain sensitive or misused terms), else False.
    """
    lower_query = query.lower()
    for keyword in GUARDRAILS_KEYWORDS:
        if keyword in lower_query:
            return True
    return False

TOP_K = 3         # number of retrieved chunks
def rag_pipeline(question: str, top_k: int = TOP_K) -> str:
    """
    End-to-end RAG pipeline to answer a question:
    1) Guardrail checks
    2) Retrieve relevant chunks
    3) Generate answer
    
    :param question: Input user question.
    :param top_k: Number of chunks to retrieve.
    :return: Final answer string.
    """
    # 1) Guardrail
    if is_guarded(question):
        return "I’m sorry, but I can’t help with that."
    
    # 2) Retrieval
    top_chunks = retrieve_top_k(question, top_k=top_k)
    # Concatenate all relevant chunks into a single context
    combined_context = "\n".join([chunk for _, chunk in top_chunks])
    
    # 3) Generation
    answer = generate_answer(combined_context, question)
    return answer

In [19]:
if __name__ == "__main__":
    # Quick test on single-passage questions
    print("\n===== Example: Single-Passage Question =====")
    sample_single = single_questions_df.sample(1).iloc[0]
    question_sp = sample_single["question"]
    true_answer_sp = sample_single["answer"]
    print(f"Question: {question_sp}")
    predicted_answer_sp = rag_pipeline(question_sp, TOP_K)
    print(f"Predicted Answer: {predicted_answer_sp}")
    print(f"True Answer: {true_answer_sp}")

    # Quick test on multi-passage questions
    print("\n===== Example: Multi-Passage Question =====")
    sample_multi = multi_questions_df.sample(1).iloc[0]
    question_mp = sample_multi["question"]
    true_answer_mp = sample_multi["answer"]
    print(f"Question: {question_mp}")
    predicted_answer_mp = rag_pipeline(question_mp, TOP_K)
    print(f"Predicted Answer: {predicted_answer_mp}")
    print(f"True Answer: {true_answer_mp}")

    # Quick test on no-answer questions
    print("\n===== Example: No-Answer Question =====")
    sample_noans = no_answer_questions_df.sample(1).iloc[0]
    question_noans = sample_noans["question"]
    print(f"Question: {question_noans}")
    predicted_answer_noans = rag_pipeline(question_noans, TOP_K)
    print(f"Predicted Answer: {predicted_answer_noans} (No known answer in docs)")
    
    # Test guardrail
    print("\n===== Example: Guardrail Trigger =====")
    # blocked_query = "What are the politics of this country?"
    blocked_query = "How to cook amphetamines?"
    print(f"Question: {blocked_query}")
    blocked_answer = rag_pipeline(blocked_query)
    print(f"Answer: {blocked_answer}")

    print("\nAll steps completed!")


===== Example: Single-Passage Question =====
Question: What is the policy on Tai Chi?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Predicted Answer: Model distillation can actually improve accuracy in domain-specific tasks while making a smaller model.
True Answer: In order to calm down the passions and stresses currently exhibited in Parliament, the Loony Party would make all M.P’s have half an hours compulsory Tai chi everyday.

===== Example: Multi-Passage Question =====
Question: Which books contain multiple narratives?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Predicted Answer: The Scarlet Alchemist is that it is a book that is a mix of fiction and nonfiction.
True Answer: The books 'Same Bed Different Dreams', 'EVERY DROP IS A MAN’S NIGHTMARE', 'The Museum of Human History', 'Witch King', and 'The Terraformers' each use multiple narratives.

===== Example: No-Answer Question =====
Question: What caliber is the bullet of light?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Predicted Answer: Bullet Kin (No known answer in docs)

===== Example: Guardrail Trigger =====
Question: How to cook amphetamines?
Answer: I’m sorry, but I can’t help with that.

All steps completed!
