## Installation

In [1]:
packages = [
    "langchain",
    "chromadb",
    "gradio",
    "langchain_community",
    "chromadb",
    "InstructorEmbedding==1.0.1",
    "sentence-transformers==2.2.2",
    "transformers>=4.20",
    "datasets>=2.20",
    "pyarrow>=17.0",
    "numpy>=1.0",
    "requests>=2.26",
    "scikit_learn>=1.0.2",
    "scipy>=1.14",
    "torch>=2.0",
    "rich>=13.0",
    "huggingface-hub==0.24.0",
    "protobuf==5.28.2",
    "transformers>=4.20"
]

!pip install {" ".join(packages)}

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.
google-api-core 1.34.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<4.0.0dev,>=3.19.5, but you have protobuf 5.28.2 which is incompatible.
google-cloud-bigtable 2.27.0 requires google-api-core[grpc]<3.0.0dev,>=2.16.0, but you have google-api-core 1.34.1 which is incompatible.
google-cloud-translate 3.12.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 5.28.2 which is incompatible.
google-genai 0.2.2 requires websockets<15.0dev,>=13.0, but you have websockets 12.0 which is incompatible.
mlxtend 0.23.3 requires scikit-learn>=1.3.1, but you have scikit-learn 1.2.2 which is incompatible.
pand

In [2]:
# Install llama-cpp-python pinned to 0.2.85, adding the project's wheel index as an
# extra package source. The index path ends in "cu122" -- presumably the prebuilt
# CUDA 12.2 wheels; confirm this matches the CUDA version of the runtime.
!pip install llama-cpp-python==0.2.85 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122

Looking in indexes: https://pypi.org/simple, https://abetlen.github.io/llama-cpp-python/whl/cu122
Collecting llama-cpp-python==0.2.85
  Downloading https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.85-cu122/llama_cpp_python-0.2.85-cp310-cp310-linux_x86_64.whl (394.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m394.5/394.5 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0mm
Collecting diskcache>=5.6.1 (from llama-cpp-python==0.2.85)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: diskcache, llama-cpp-python
Successfully installed diskcache-5.6.3 llama-cpp-python-0.2.85


## Package Import

In [3]:
import sys
import torch
from datetime import datetime
from langchain_community.llms import LlamaCpp
from huggingface_hub import hf_hub_download
import gradio as gr
import requests
import json
from typing import List, Tuple
from InstructorEmbedding import INSTRUCTOR
from chromadb.config import Settings
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceInstructEmbeddings

### RAG System Model Loading  

This RAG system loads an LLM and an embedding model for retrieval-augmented generation. The **LLM (Llama 3.1 8B Instruct)** is downloaded via `hf_hub_download` and configured with an **8096-token context window**, CUDA acceleration (if available), and streaming enabled. The model uses **temperature = 0** for deterministic outputs. The **embedding model (hkunlp/instructor-large)** runs on the selected device (`cuda` or `cpu`) and is initialized by embedding a test query to reduce initial latency. The system ensures efficient retrieval and generation by leveraging **GPU acceleration, batch processing, and optimized token settings**.

In [4]:
# --- Model sources -----------------------------------------------------------
# Hugging Face repo and GGUF file used for the LLM, plus the embedding model id.
llama_model_id = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
llama_model_basename = "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
embedding_model_id = "hkunlp/instructor-large"

# Directory handed to hf_hub_download as its cache_dir.
model_directory = "models/llm/models"
resume_download = True

# --- LLM runtime settings ----------------------------------------------------
CONTEXT_WINDOW_SIZE = 8096  # passed to LlamaCpp as n_ctx
MAX_NEW_TOKENS = 8096       # passed to LlamaCpp as max_tokens
N_BATCH = 512
N_GPU_LAYERS = 100          # only applied when device_type is "cuda"
TEMPERATURE = 0
VERBOSE = True

# Defined for completeness; the matching LlamaCpp options are commented out in
# load_llamacpp_llm(), so these two values are currently unused there.
N_THREADS = 8
TOP_K = 1

# --- Device selection --------------------------------------------------------
# Prefer the GPU whenever PyTorch reports that CUDA is available.
device_type = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
def load_llamacpp_llm():
    """Fetch the configured GGUF model file and wrap it in a ``LlamaCpp`` LLM.

    The model file is obtained with ``hf_hub_download`` using the module-level
    repo id, file name and cache directory. GPU offloading (``n_gpu_layers``)
    is only requested when ``device_type`` is ``"cuda"``.

    Returns:
        LlamaCpp | None: The configured LLM, or ``None`` if the download or the
        model construction raised; the error is printed, not re-raised.
    """
    try:
        gguf_path = hf_hub_download(
            repo_id=llama_model_id,
            filename=llama_model_basename,
            resume_download=resume_download,
            cache_dir=model_directory,
        )

        # "n_threads" (N_THREADS) and "top_k" (TOP_K) are intentionally not
        # passed, so LlamaCpp falls back to its own defaults for them.
        llm_options = dict(
            model_path=gguf_path,
            temperature=TEMPERATURE,
            n_ctx=CONTEXT_WINDOW_SIZE,
            max_tokens=MAX_NEW_TOKENS,
            verbose=VERBOSE,
            n_batch=N_BATCH,
            streaming=True,
        )

        on_gpu = device_type.lower() == "cuda"
        if on_gpu:
            # Tune N_GPU_LAYERS to the memory available on your GPU.
            llm_options["n_gpu_layers"] = N_GPU_LAYERS

        return LlamaCpp(**llm_options)
    except Exception as load_error:
        print(f"Error occurred while loading LLM: {load_error}")
        return None

In [6]:
def build_embedding_model():
    """Create the instructor embedding model on ``device_type`` and warm it up.

    Returns:
        HuggingFaceInstructEmbeddings: The embedding model, after one throwaway
        query has been embedded.
    """
    instructor_embeddings = HuggingFaceInstructEmbeddings(
        model_name=embedding_model_id,
        model_kwargs=dict(device=device_type),
    )
    # The first query is slow because the model is still loading, so pay that
    # cost here and discard the result.
    instructor_embeddings.embed_query("This is a test.")
    return instructor_embeddings

In [7]:
# Build the notebook-wide LLM. load_llamacpp_llm() prints the error and returns
# None on failure, so `llm` may be None after this cell.
llm = load_llamacpp_llm()



Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

llama_model_loader: loaded meta data with 33 key-value pairs and 292 tensors from models/llm/models/models--bartowski--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/bf5b95e96dac0462e2a09145ec66cae9a3f12067/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 8B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Meta-Llama-3.1
llama_model_loader: - kv   5:                         general.size_label str              = 8B
llama_model_load

In [8]:
# Build the notebook-wide embedding model; build_embedding_model() also embeds
# one warm-up query before returning.
embedding = build_embedding_model()



.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense%2Fconfig.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.41k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


  model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu')))


In [9]:
def generate(prompt):
    """Run one prompt through the module-level ``llm`` and return its text.

    Args:
        prompt (str): Fully formatted prompt to send to the model.

    Returns:
        str: Text of the first generation produced for ``prompt``.
    """
    flattened_results = llm.generate([prompt]).flatten()
    first_result = flattened_results[0]
    return first_result.generations[0][0].text

In [10]:
def stream_tokens(prompt, max_tokens=300):
    """Yield text chunks streamed from the module-level ``llm`` for ``prompt``.

    Args:
        prompt (str): Fully formatted prompt to send to the model.
        max_tokens (int): Upper bound on the number of chunks yielded. The
            limit counts streamed chunks, which only approximates tokens.
            A value of zero or less yields nothing and never calls the LLM.

    Yields:
        str: Each chunk as it is produced. If streaming raises, one final
        ``"\\nError: <message>"`` string is yielded instead of propagating.
    """
    try:
        # Honour a non-positive limit up front; checking only after the first
        # yield would always emit at least one chunk.
        if max_tokens <= 0:
            return

        for chunk_count, text in enumerate(llm.stream(prompt), start=1):
            yield text

            if chunk_count >= max_tokens:
                break
    except Exception as e:
        yield f"\nError: {str(e)}"

In [11]:
def stream(prompt):
    """Stream the module-level ``llm`` response for ``prompt`` to stdout.

    Chunks are written and flushed as they arrive. At most 300 chunks are
    printed; when that cap is hit a "[Maximum token limit reached]" notice is
    printed. Exceptions raised while streaming are reported on stdout instead
    of being propagated.

    Args:
        prompt (str): Fully formatted prompt to send to the model.
    """
    # Guard against runaway generation. The cap counts streamed chunks, and a
    # chunk may hold more than one token, so it is only approximate.
    chunk_limit = 300

    try:
        for chunks_seen, piece in enumerate(llm.stream(prompt), start=1):
            sys.stdout.write(piece)
            sys.stdout.flush()

            if chunks_seen < chunk_limit:
                continue

            print("\n[Maximum token limit reached]")
            break

        # Terminate the streamed line.
        print()
    except Exception as stream_error:
        print(f"\nError during generation: {str(stream_error)}")

## Data Ingestion
In this section, a `.txt` file is read and split into chunks. The embedding model generates an embedding for each chunk, and the chunks are then stored in a Chroma (ChromaDB) vector database.

In [13]:
def _create_collection(embedding, dir, settings):
    """Open a Chroma vector store and report how many documents it holds.

    Args:
        embedding: Embedding function handed to Chroma.
        dir (str): Directory passed to Chroma as ``persist_directory``.
        settings: Client settings passed to Chroma as ``client_settings``.

    Returns:
        Chroma: The vector store, configured with cosine distance and the
        HNSW index parameters below.
    """
    hnsw_options = {
        "hnsw:space": "cosine",
        "hnsw:construction_ef": 400,
        "hnsw:search_ef": 400,
        "hnsw:M": 128,
        "hnsw:resize_factor": 2.0,
    }
    store = Chroma(
        persist_directory=dir,
        client_settings=settings,
        embedding_function=embedding,
        collection_metadata=hnsw_options,
    )
    # Reads the underlying collection directly to get the current size.
    document_total = store._collection.count()
    print(f"Number of documents in collection: {document_total}", flush=True)
    return store

In [14]:
# Directory used for the persistent Chroma store.
# NOTE(review): the name `dir` shadows Python's built-in dir() for the rest of
# the notebook session, and "/knowledge_base" is an absolute path at the
# filesystem root -- consider a differently named, configurable relative path.
dir = "/knowledge_base"
# Chroma client settings: telemetry disabled, persistence enabled under `dir`.
settings = Settings(
    anonymized_telemetry=False,
    is_persistent=True,
    persist_directory=dir,
)

# Despite its name, `retriever` is the Chroma vector store object returned by
# _create_collection(); later cells call add_documents() and
# similarity_search_with_relevance_scores() on it.
retriever = _create_collection(embedding, dir, settings)

  db = Chroma(


Number of documents in collection: 0


In [15]:
# Import required libraries
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import os


# Step 1: Read the .txt file
def read_txt_file(file_path):
    """
    Load a UTF-8 text file and return everything in it.

    Args:
        file_path (str): Location of the text file to load

    Returns:
        str | None: The file's full contents, or None if it could not be
        read (the error is printed rather than raised)
    """
    try:
        with open(file_path, mode="r", encoding="utf-8") as source:
            contents = source.read()
    except Exception as read_error:
        print(f"Error reading file: {read_error}")
        return None
    return contents


# Step 2: Set up text splitter
def create_text_splitter(chunk_size=1000, chunk_overlap=200):
    """
    Build the RecursiveCharacterTextSplitter used to chunk the input text.

    Args:
        chunk_size (int): Upper bound on the length of each chunk
        chunk_overlap (int): How much neighbouring chunks overlap

    Returns:
        RecursiveCharacterTextSplitter: Splitter that measures length with
        len() and treats its separators as plain strings, not regexes
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "is_separator_regex": False,
    }
    return RecursiveCharacterTextSplitter(**splitter_options)


# Step 3: Create documents from text
def create_documents(text, text_splitter, source="input_text"):
    """
    Split text into documents using the text splitter.

    Args:
        text (str): Input text to be split
        text_splitter (RecursiveCharacterTextSplitter): Text splitter
        source (str): Value stored under the "source" metadata key of every
            document. Defaults to "input_text", the value that was previously
            hard-coded, so existing callers are unaffected.

    Returns:
        list: List of Document objects, one per chunk, in chunk order
    """
    # Split the text into chunks
    texts = text_splitter.split_text(text)

    # Convert text chunks to Document objects; each document gets its own
    # metadata dict so later per-document edits cannot leak between chunks.
    documents = [
        Document(page_content=chunk, metadata={"source": source})
        for chunk in texts
    ]

    return documents


# Step 4: Ingest documents to Chroma vector store
def ingest_to_chroma(documents, embeddings, collection_name="my_collection"):
    """
    Add documents to the notebook-level ``retriever`` vector store.

    No new Chroma store is created here: documents are added to the existing
    module-level ``retriever``, which already carries its own embedding
    function and collection.

    Args:
        documents (list): List of Document objects
        embeddings (Embeddings): Unused; kept so existing calls keep working
        collection_name (str): Unused; kept so existing calls keep working

    Returns:
        Chroma: The module-level ``retriever`` vector store
    """
    if documents:
        # Add documents to the vector store
        retriever.add_documents(documents)
    else:
        # Nothing to add: skip the vector-store call instead of sending it an
        # empty batch.
        print("No documents to ingest; vector store left unchanged.")

    return retriever

In [16]:
file_path = "/kaggle/input/rag-documents/About UIU.txt"

# Load the raw text; read_txt_file() returns None when the file cannot be read.
text = read_txt_file(file_path)

if text:
    # Chunk the text with the default splitter settings, wrap the chunks in
    # Document objects and add them to the vector store.
    text_splitter = create_text_splitter()
    documents = create_documents(text, text_splitter)
    retriever = ingest_to_chroma(documents, embedding)
else:
    print("No text is found!")

In [24]:
# Test the retriever
query = "Tell me about UIU"
# Each result is a (document, relevance score) pair.
docs = retriever.similarity_search_with_relevance_scores(query=query)
# Show only the first hit; this raises IndexError if the store returned nothing.
# NOTE(review): assigning to `_` overrides IPython's "last output" variable.
content, _ = docs[0]
print(content.page_content)

United International University (Bengali: ইউনাইটেড ইন্টারন্যাশনাল ইউনিভার্সিটি, also known as UIU) is a private research university in Dhaka, Bangladesh.[2]

The government of Bangladesh approved the establishment of United International University in 2003.[3] United International University was established with the generous support and patronage of the United Group.[4]

In 2024, according to the QS World University Rankings, United International University (UIU) is ranked in 1201-1400 globally, making it the 3rd highest-ranking private university in Bangladesh, alongside Daffodil International University (DIU) and East West University (EWU).[5]


Campus Desciption:
The permanent campus is on a 25-bigha (8.25 acre) plot of land, located at the 'United City' at Satarkul, Badda (1.5 km east of Embassy of the United States, Dhaka), adjacent to Madani Avenue.[6]


## Prompts

This RAG chatbot relies on two prompts:
- **Standalone Query Generation Prompt:** Rewrites a follow-up user query into a standalone query, based on the previous conversation history.
- **RAG Chat Prompt:** Used to generate the final response of the RAG chatbot.

In [25]:
from datetime import datetime


def get_chat_prompt(user_input, history, context=None):
    """Build the Llama 3.1 chat prompt used to answer the user.

    Args:
        user_input (str): The current user message.
        history (iterable): Earlier turns as ``(role, message)`` pairs, oldest
            first.
        context (str | None): Retrieved text to place in the system message;
            left out when empty or None.

    Returns:
        str: System message (dates, instructions, optional retrieved context),
        then the history turns, the current user turn and an open assistant
        header for the model to complete.
    """
    today_date = datetime.today().strftime("%d %B %Y")  # Dynamic date insertion

    segments = [
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n",
        "Cutting Knowledge Date: December 2023\n",
        f"Today Date: {today_date}\n\n",
        "You are a helpful assistant. DO NOT provide information which is not present on the Retrieved Context.\n",
    ]

    # The retrieved context, when present, lives inside the system message.
    if context:
        segments.append("\nRetrieved Context:\n" + context + "\n")

    segments.append("<|eot_id|>")

    # Replay earlier turns in order.
    segments.extend(
        f"<|start_header_id|>{role}<|end_header_id|>\n\n{message}<|eot_id|>"
        for role, message in history
    )

    # Current user turn, followed by the assistant header left open.
    segments.append(
        f"<|start_header_id|>user<|end_header_id|>\n\n{user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    )

    return "".join(segments)

In [26]:

def get_standalone_query_generation_prompt(user_input, history):
    """Build the Llama 3.1 prompt that rewrites a message as a standalone query.

    Args:
        user_input (str): The current user message.
        history (iterable): Earlier turns as ``(role, message)`` pairs, oldest
            first.

    Returns:
        str: System message, history turns and the current user turn, ending
        with an assistant header primed with "Standalone Query: " so the model
        completes the rewritten query.
    """

    def render_turn(role, message):
        # One finished chat turn in the Llama 3.1 header/eot format.
        return f"<|start_header_id|>{role}<|end_header_id|>\n\n{message}<|eot_id|>"

    today_date = datetime.today().strftime("%d %B %Y")  # Dynamic date insertion

    system_block = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        "Cutting Knowledge Date: December 2023\n"
        f"Today Date: {today_date}\n\n"
        "You are a helpful assistant. Write the standalone query of the last user message so that it contains all the information of this question and best suited for context retrieval. Just write the query in detailed form. DO NOT write any extra explanation.\n"
        "<|eot_id|>"
    )

    past_turns = "".join(render_turn(role, message) for role, message in history)

    # Current user turn plus the primed assistant header.
    final_turn = f"<|start_header_id|>user<|end_header_id|>\n\n{user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nStandalone Query: "

    return system_block + past_turns + final_turn

## ChatSystem

The **ChatSystem** class integrates an LLM with a retriever to enhance chatbot responses using retrieved context. It maintains conversation history and refines user queries into standalone questions before retrieval. The `retrieve` method fetches relevant documents, while `chat` generates responses based on context and history. Responses are logged and appended to history for continuity. This system ensures more accurate and context-aware chatbot interactions.

In [30]:
class ChatSystem:
    """Conversational RAG pipeline pairing an LLM with a vector-store retriever.

    Each instance keeps its own conversation history, so separate instances
    represent separate chat sessions.

    Args:
        llm: LLM exposing ``generate(prompts)``; used for every generation made
            by this instance.
        retriever: Vector store exposing
            ``similarity_search_with_relevance_scores(query=...)``.
    """

    def __init__(self, llm, retriever):
        # (role, message) tuples, oldest first; roles are "user"/"assistant".
        self.history = []
        self.llm = llm
        self.retriever = retriever

    def _generate(self, prompt):
        """Return the text of the first generation this instance's LLM produces.

        Uses ``self.llm`` rather than a notebook-level LLM so that a
        ChatSystem built with a different model really uses that model.
        """
        result = self.llm.generate([prompt])
        return result.generations[0][0].text

    def retrieve(self, query):
        """Return the text of the documents retrieved for ``query``.

        Chunks are separated by a blank line so that the end of one chunk does
        not run into the start of the next inside the prompt.
        """
        docs = self.retriever.similarity_search_with_relevance_scores(query=query)
        return "\n\n".join(content.page_content for content, _ in docs)

    def chat(self, user_input):
        """Answer ``user_input`` using retrieved context and the chat history.

        The message is first rewritten into a standalone query (using the
        history), that query drives retrieval, and the answer is generated from
        the retrieved context plus history. Both the user message and the
        answer are appended to ``self.history``.

        Returns:
            str: The assistant's response.
        """
        standalone_query_prompt = get_standalone_query_generation_prompt(user_input, self.history)
        standalone_query = self._generate(standalone_query_prompt)
        print(f'Standalone Query: {standalone_query}')

        context = self.retrieve(standalone_query)
        prompt = get_chat_prompt(user_input, self.history, context)
        response = self._generate(prompt)

        print(f"User: \n{user_input} \nAssistant:\n{response}\n", flush=True)
        print("-" * 100, flush=True)

        self.history.append(("user", user_input))
        self.history.append(("assistant", response))
        return response

In [31]:
# Smoke-test the pipeline outside the UI: a fresh ChatSystem starts with an
# empty history, so this is a single-turn question.
chatsystem = ChatSystem(llm, retriever)

message = "Tell about the vc of UIU"
# chat() prints the standalone query and the exchange, and returns the answer,
# which the notebook also displays as the cell's result.
chatsystem.chat(message)

Llama.generate: prefix-match hit

llama_print_timings:        load time =    1091.02 ms
llama_print_timings:      sample time =      40.91 ms /    20 runs   (    2.05 ms per token,   488.82 tokens per second)
llama_print_timings: prompt eval time =     309.07 ms /    63 tokens (    4.91 ms per token,   203.84 tokens per second)
llama_print_timings:        eval time =     499.93 ms /    19 runs   (   26.31 ms per token,    38.01 tokens per second)
llama_print_timings:       total time =     867.38 ms /    82 tokens
Llama.generate: prefix-match hit


Standalone Query:  What is the vision and mission statement of University of Information Technology (UIU) Vice Chancellor?



llama_print_timings:        load time =    1091.02 ms
llama_print_timings:      sample time =      43.70 ms /    22 runs   (    1.99 ms per token,   503.42 tokens per second)
llama_print_timings: prompt eval time =    1946.80 ms /   885 tokens (    2.20 ms per token,   454.59 tokens per second)
llama_print_timings:        eval time =     608.70 ms /    21 runs   (   28.99 ms per token,    34.50 tokens per second)
llama_print_timings:       total time =    2619.55 ms /   906 tokens


User: 
Tell about the vc of UIU 
Assistant:
The Vice-Chancellor of United International University (UIU) is Md. Abul Kashem Mia.

----------------------------------------------------------------------------------------------------


'The Vice-Chancellor of United International University (UIU) is Md. Abul Kashem Mia.'

## Gradio Chatbot Interface  

This Gradio-based chatbot interface integrates an LLM-powered **ChatSystem** for real-time conversations. The `chatbot` function processes user messages, maintains conversation history, and generates responses. The interface includes a **chat history panel**, a **textbox for user input**, and a **clear chat button**. The chatbot runs inside a `Blocks` layout with Markdown headers for branding. The application is launched with `demo.launch(share=True)`, allowing external access.

In [33]:
def chatbot(message: str, history: List[Tuple[str, str]], chatsystem: ChatSystem) -> List[Tuple[str, str]]:
    """Answer ``message`` with ``chatsystem`` and record the exchange.

    The ``(message, response)`` pair is appended to ``history`` in place, and
    that same list is returned for the chat display.
    """
    history.append((message, chatsystem.chat(message)))
    return history

# Create the Gradio interface
with gr.Blocks(css="footer {visibility: hidden}") as demo:
    # Static header text shown above the chat widgets.
    gr.Markdown("# 🤖 RAG Chatbot")
    gr.Markdown("Chat with this LLM-powered bot! Ask any question and get an intelligent response.")
    gr.Markdown("Powered by Nascenia")

    chatbot_interface = gr.Chatbot(label="Chat History", height=600)
    msg = gr.Textbox(label="Type your message", placeholder="Type your message here...", lines=1)
    clear = gr.Button("Clear Chat")

    chatsystem_state = gr.State(lambda: ChatSystem(llm, retriever))  # Each user gets a separate ChatSystem

    def user_input(message, history, chatsystem):
        """Handle a submitted message.

        Returns the new textbox value (always empty, to clear the input) and
        the chat history to display.
        """
        history = history or []
        # Ignore blank submissions: sending them on would spend an LLM call on
        # an empty question and record an empty turn in the conversation.
        if not message or not message.strip():
            return "", history
        return "", chatbot(message, history, chatsystem)

    def clear_chat():
        """Clear the textbox and chat display and start a fresh ChatSystem."""
        return None, None, ChatSystem(llm, retriever)  # Reset the chat system for a new session

    # Enter in the textbox sends the message; the button resets the session.
    msg.submit(user_input, [msg, chatbot_interface, chatsystem_state], [msg, chatbot_interface])
    clear.click(clear_chat, None, [msg, chatbot_interface, chatsystem_state])

# Start the Gradio app when this code runs as the main module.
# share=True asks Gradio for a public share URL in addition to the local one,
# so anyone holding that link can reach the chatbot while it is running.
if __name__ == "__main__":
    demo.launch(share=True)


Running on local URL:  http://127.0.0.1:7863
Running on public URL: https://979d50604ffa3dae28.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
