## Installation

In [4]:
packages = [
    "langchain",
    "chromadb",
    "gradio",
    "langchain_community",
    "chromadb",
    "InstructorEmbedding==1.0.1",
    "sentence-transformers==2.2.2",
    "transformers>=4.20",
    "datasets>=2.20",
    "pyarrow>=17.0",
    "numpy>=1.0",
    "requests>=2.26",
    "scikit_learn>=1.0.2",
    "scipy>=1.14",
    "torch>=2.0",
    "rich>=13.0",
    "huggingface-hub==0.24.0",
    "protobuf==5.28.2",
    "transformers>=4.20"
]

!pip install {" ".join(packages)}

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.
google-genai 0.2.2 requires websockets<15.0dev,>=13.0, but you have websockets 12.0 which is incompatible.
mlxtend 0.23.3 requires scikit-learn>=1.3.1, but you have scikit-learn 1.2.2 which is incompatible.
plotnine 0.14.4 requires matplotlib>=3.8.0, but you have matplotlib 3.7.5 which is incompatible.[0m[31m
[0m

In [5]:
!pip install llama-cpp-python==0.2.85 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122

Looking in indexes: https://pypi.org/simple, https://abetlen.github.io/llama-cpp-python/whl/cu122


## Package Import

In [21]:
import sys
import torch
from datetime import datetime
from langchain_community.llms import LlamaCpp
from huggingface_hub import hf_hub_download
import gradio as gr
import requests
import json
from typing import List, Tuple
from InstructorEmbedding import INSTRUCTOR
from chromadb.config import Settings
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceInstructEmbeddings

## Load Models

In [7]:
# LLM Configurations
CONTEXT_WINDOW_SIZE = 8096
MAX_NEW_TOKENS = 8096
N_GPU_LAYERS = 100
N_BATCH = 512
N_THREADS = 8
TEMPERATURE = 0
VERBOSE = True
TOP_K = 1
resume_download = True
llama_model_id = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
llama_model_basename = "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"

# Embedding Model Configurations
embedding_model_id = "hkunlp/instructor-large"

model_directory = "models/llm/models"


if torch.cuda.is_available():
    device_type = "cuda"
else:
    device_type = "cpu"

In [8]:
def load_llamacpp_llm():
    try:
        model_path = hf_hub_download(
            repo_id=llama_model_id,
            filename=llama_model_basename,
            resume_download=resume_download,
            cache_dir=model_directory,
        )
        kwargs = {
            "model_path": model_path,
            "temperature": TEMPERATURE,
            # "n_threads": N_THREADS,
            "n_ctx": CONTEXT_WINDOW_SIZE,
            "max_tokens": MAX_NEW_TOKENS,
            "verbose": VERBOSE,
            "n_batch": N_BATCH,
            "streaming": True
            # "top_k": TOP_K
        }

        if device_type.lower() == "cuda":
            # set this based on your GPU
            kwargs["n_gpu_layers"] = N_GPU_LAYERS

        return LlamaCpp(**kwargs)
    except Exception as e:
        print(f"Error occurred while loading LLM: {e}")
        return None

In [9]:
def build_embedding_model():
    embeddings = HuggingFaceInstructEmbeddings(
        model_name=embedding_model_id,
        model_kwargs={"device": device_type},
    )
    # Embedding model takes time to load on first query
    _ = embeddings.embed_query("This is a test.")
    return embeddings

In [10]:
llm = load_llamacpp_llm()



Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

llama_model_loader: loaded meta data with 33 key-value pairs and 292 tensors from models/llm/models/models--bartowski--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/bf5b95e96dac0462e2a09145ec66cae9a3f12067/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 8B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Meta-Llama-3.1
llama_model_loader: - kv   5:                         general.size_label str              = 8B
llama_model_load

In [11]:
embedding = build_embedding_model()



.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense%2Fconfig.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.41k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


  model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu')))


In [12]:
def generate(prompt):
    response = llm.generate([prompt])
    text = response.flatten()
    generated_text = text[0].generations[0][0].text
    return generated_text

In [13]:
def stream_tokens(prompt, max_tokens=300):
    """Generate tokens one by one from the LLM"""
    token_count = 0
    
    try:
        for text in llm.stream(prompt):
            yield text
            token_count += 1
            
            if token_count >= max_tokens:
                break
    except Exception as e:
        yield f"\nError: {str(e)}"

In [14]:

def stream(prompt):
    # Safety mechanism to prevent infinite generation
    max_tokens = 300
    token_count = 0
    
    try:
        for text in llm.stream(prompt):
            sys.stdout.write(text)
            sys.stdout.flush()
            
            # Increment token counter (roughly - each text chunk may contain multiple tokens)
            token_count += 1
            
            # Safety check
            if token_count >= max_tokens:
                print("\n[Maximum token limit reached]")
                break
                
        print()  # Final newline
    except Exception as e:
        print(f"\nError during generation: {str(e)}")

In [15]:
prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 23 July 2024

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

print("Tokens generated one by one:")
for i, token in enumerate(stream_tokens(llm, prompt)):
    print(f"Token {i}: {repr(token)}")  # repr() shows whitespace characters clearly


Tokens generated one by one:
Token 0: "\nError: Invalid input type <class 'langchain_community.llms.llamacpp.LlamaCpp'>. Must be a PromptValue, str, or list of BaseMessages."


## Data Ingestion

In [26]:
def _create_collection(embedding, dir, settings):
        db = Chroma(
            persist_directory=dir,
            client_settings=settings,
            embedding_function=embedding,
            collection_metadata={
                "hnsw:space": "cosine",
                "hnsw:construction_ef": 400,
                "hnsw:search_ef": 400,
                "hnsw:M": 128,
                "hnsw:resize_factor": 2.0,
            }
        )        
        print(f"Number of documents in collection: {db._collection.count()}", flush=True)
        return db

In [27]:
dir = '/knowledge_base'
settings = Settings(
            anonymized_telemetry=False,
            is_persistent=True,
            persist_directory=dir,
        )

retriever = _create_collection(embedding, dir, settings)

Number of documents in collection: 0


In [32]:
# Import required libraries
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import os

# Step 1: Read the .txt file
def read_txt_file(file_path):
    """
    Read the contents of a text file.
    
    Args:
        file_path (str): Path to the text file
    
    Returns:
        str: Contents of the text file
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        print(f"Error reading file: {e}")
        return None

# Step 2: Set up text splitter
def create_text_splitter(chunk_size=1000, chunk_overlap=200):
    """
    Create a RecursiveCharacterTextSplitter.
    
    Args:
        chunk_size (int): Maximum size of each text chunk
        chunk_overlap (int): Number of characters to overlap between chunks
    
    Returns:
        RecursiveCharacterTextSplitter: Configured text splitter
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False
    )
    return text_splitter

# Step 3: Create documents from text
def create_documents(text, text_splitter):
    """
    Split text into documents using the text splitter.
    
    Args:
        text (str): Input text to be split
        text_splitter (RecursiveCharacterTextSplitter): Text splitter
    
    Returns:
        list: List of Document objects
    """
    # Split the text into chunks
    texts = text_splitter.split_text(text)
    
    # Convert text chunks to Document objects
    documents = [
        Document(
            page_content=chunk, 
            metadata={"source": "input_text"}
        ) for chunk in texts
    ]
    
    return documents

# Step 4: Ingest documents to Chroma vector store
def ingest_to_chroma(documents, embeddings, collection_name="my_collection"):
    """
    Create a Chroma vector store and add documents.
    
    Args:
        documents (list): List of Document objects
        embeddings (Embeddings): Embedding model
        collection_name (str): Name of the Chroma collection
    
    Returns:
        Chroma: Populated Chroma vector store
    """
    # Add documents to the vector store
    retriever.add_documents(documents)
    
    return retriever


file_path = "/kaggle/input/rag-documents/About UIU.txt"

# Read the text file
text = read_txt_file(file_path)

if not text:
    print(f'No text is found!')
else:
    # Create text splitter
    text_splitter = create_text_splitter()
    
    # Create documents
    documents = create_documents(text, text_splitter)
    
    
    # Ingest to Chroma vector store
    retriever = ingest_to_chroma(documents, embedding)

In [38]:
query = "Tell me the VC"
docs = retriever.similarity_search_with_relevance_scores(
                        query=query
                    )
content, _= docs[0]
print(content.page_content)

Computer Laboratory:
- Software Engineering Laboratory
- Network Laboratory
- Multimedia Laboratory
- Hardware Laboratory


Library and Documentation Center:
UIU Central library has a collection of 40,293 items of information materials. Among the materials, 86,200 and 12,458 are books and bound periodicals respectively. Besides, 141 titles are in the current subscription list of journals. Every year, 500 volumes are added to the main reading room of the central library.


Research Center:
- Center for Energy Research (CER)
- Biomedical Engineering Center
- Center for Emerging Networks and Technologies Research (CENTeR)
- Brain-Computer Interface(BCI) Research Lab


Events:
- International Career Summit
- Photography Festival
- BANMUN
- Tech Quest '18
- Tech Quest '16


List of vice-chancellors:
- Md. Abul Kashem Mia


## Prompts

In [41]:
from datetime import datetime

def get_chat_prompt(user_input, history, context=None):
    today_date = datetime.today().strftime("%d %B %Y")  # Dynamic date insertion

    prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"Cutting Knowledge Date: December 2023\n"
        f"Today Date: {today_date}\n\n"
        "You are a helpful assistant. DO NOT provide information which is not present on the Retrieved Context.\n"
    )

    # Add retrieved context if available
    if context:
        prompt += "\nRetrieved Context:\n" + context + "\n"

    prompt += "<|eot_id|>"

    # Append chat history
    for role, message in history:
        prompt += f"<|start_header_id|>{role}<|end_header_id|>\n\n{message}<|eot_id|>"

    # Append current user input
    prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

    return prompt

## Chain

In [None]:
history = []

## Conversation

In [None]:
while True:
    user_input = input("Enter your message: ").strip()
    if user_input.lower() == "exit":
        break

    prompt = get_chat_prompt(user_input, history)
    response = generate(prompt)

    print(f'User: \n{user_input} \nAssistant:\n{response}\n', flush=True)
    print('-'*100, flush=True)
    
    history.append(("user", user_input))
    history.append(("assistant", response))
    

## Gradio

In [46]:
class ChatSystem:
    def __init__(self, llm, retriever):
        self.history = []
        self.llm = llm
        self.retriever = retriever

    def retrieve(self, query):
        docs = self.retriever.similarity_search_with_relevance_scores(query=query)
        
        context = ""
        for doc in docs:
            content, _ = doc
            context += content.page_content
        return context

    
    def chat(self, user_input):
        context = self.retrieve(user_input)
        prompt = get_chat_prompt(user_input, self.history, context)
        response = generate(prompt)
    
        print(f'User: \n{user_input} \nAssistant:\n{response}\n', flush=True)
        print('-'*100, flush=True)
        
        self.history.append(("user", user_input))
        self.history.append(("assistant", response))
        return response

In [47]:
chatsystem = ChatSystem(llm, retriever)

message = "Tell about the vc of UIU"
chatsystem.chat(message)



llama_print_timings:        load time =    1103.21 ms
llama_print_timings:      sample time =      43.33 ms /    22 runs   (    1.97 ms per token,   507.68 tokens per second)
llama_print_timings: prompt eval time =    2194.95 ms /  1005 tokens (    2.18 ms per token,   457.87 tokens per second)
llama_print_timings:        eval time =     631.21 ms /    21 runs   (   30.06 ms per token,    33.27 tokens per second)
llama_print_timings:       total time =    2898.11 ms /  1026 tokens


User: 
Tell about the vc of UIU 
Assistant:
The Vice-Chancellor of United International University (UIU) is Md. Abul Kashem Mia.

----------------------------------------------------------------------------------------------------
<__main__.ChatSystem object at 0x795e002ca3b0>


In [48]:
chatsystem = ChatSystem(llm, retriever)

def chatbot(message: str, history: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Main chatbot function that processes messages and maintains conversation history"""

    # Get response from API
    bot_response = chatsystem.chat(message)

    # Return the updated history
    history = history or []
    history.append((message, bot_response))

    return history


# Create the Gradio interface
with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.Markdown("# 🤖 Rokomari Chatbot")
    gr.Markdown("Chat with this LLM-powered bot! Ask any question and get an intelligent response.")
    gr.Markdown("Powered by Nascenia")

    chatbot_interface = gr.Chatbot(
        label="Chat History",
        height=600
    )

    msg = gr.Textbox(
        label="Type your message",
        placeholder="Type your message here...",
        lines=1
    )

    clear = gr.Button("Clear Chat")


    def user_input(message, history):
        history = history or []
        return "", chatbot(message, history)


    def clear_chat():        
        return None, None


    msg.submit(
        user_input,
        [msg, chatbot_interface],
        [msg, chatbot_interface]
    )

    clear.click(
        clear_chat,
        None,
        [msg, chatbot_interface]
    )

if __name__ == "__main__":
    
    demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://d2067b5706933fcc13.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1103.21 ms
llama_print_timings:      sample time =      36.68 ms /    20 runs   (    1.83 ms per token,   545.33 tokens per second)
llama_print_timings: prompt eval time =    1735.70 ms /   748 tokens (    2.32 ms per token,   430.95 tokens per second)
llama_print_timings:        eval time =     545.68 ms /    19 runs   (   28.72 ms per token,    34.82 tokens per second)
llama_print_timings:       total time =    2345.52 ms /   767 tokens


User: 
Who is the VC? 
Assistant:
The Vice-Chancellor (VC) of UIU is Md. Abul Kashem Mia.

----------------------------------------------------------------------------------------------------


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1103.21 ms
llama_print_timings:      sample time =      49.59 ms /    26 runs   (    1.91 ms per token,   524.33 tokens per second)
llama_print_timings: prompt eval time =     617.19 ms /   188 tokens (    3.28 ms per token,   304.61 tokens per second)
llama_print_timings:        eval time =     717.28 ms /    25 runs   (   28.69 ms per token,    34.85 tokens per second)
llama_print_timings:       total time =    1417.46 ms /   213 tokens


User: 
Tell me something about him  
Assistant:
I do not have any information about Md. Abul Kashem Mia, apart from being the Vice Chancellor of UIU.

----------------------------------------------------------------------------------------------------


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1103.21 ms
llama_print_timings:      sample time =     383.04 ms /   192 runs   (    1.99 ms per token,   501.26 tokens per second)
llama_print_timings: prompt eval time =    2268.23 ms /  1030 tokens (    2.20 ms per token,   454.10 tokens per second)
llama_print_timings:        eval time =    5713.48 ms /   191 runs   (   29.91 ms per token,    33.43 tokens per second)
llama_print_timings:       total time =    8626.82 ms /  1221 tokens


User: 
Tell me about UIU  
Assistant:
United International University (UIU) is a private research university in Dhaka, Bangladesh.

Here are some key points about UIU:

1. **Established**: UIU was established in 2003.
2. **Location**: The permanent campus of UIU is located at the 'United City' at Satarkul, Badda (Dhaka).
3. **Accreditation**: UIU is accredited by the Institution of Engineers, Bangladesh and ACBSP.
4. **School types**: UIU is a private university that offers co-education.
5. **Motto**: The motto of UIU is "Quest for Excellence".
6. **Affiliations**: UIU is affiliated with the University Grants Commission (UGC).
7. **Ranking**: According to the QS World University Rankings, UIU is ranked in 1201-1400 globally.

These are some key points about United International University (UIU).

----------------------------------------------------------------------------------------------------


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1103.21 ms
llama_print_timings:      sample time =     356.64 ms /   182 runs   (    1.96 ms per token,   510.32 tokens per second)
llama_print_timings: prompt eval time =    2299.11 ms /  1032 tokens (    2.23 ms per token,   448.87 tokens per second)
llama_print_timings:        eval time =    5411.81 ms /   181 runs   (   29.90 ms per token,    33.45 tokens per second)
llama_print_timings:       total time =    8294.65 ms /  1213 tokens


User: 
explain point 2 in details. 
Assistant:
**Location of United International University (UIU)**

The permanent campus of UIU is located at the 'United City' at Satarkul, Badda (Dhaka).

Here are some key points about the location:

1. **Satarkul**: The university's campus is situated in Satarkul, a locality in Dhaka.
2. **Badda**: Satarkul is part of the Badda area, which is located in the northern part of Dhaka.
3. **Dhaka**: The city of Dhaka is the capital and largest city of Bangladesh. It is a major economic, cultural, and educational hub in South Asia.

The location of UIU's campus at Satarkul, Badda (Dhaka) provides easy access to various parts of the city, making it an ideal location for students, faculty members, and staff.

----------------------------------------------------------------------------------------------------


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1103.21 ms
llama_print_timings:      sample time =      45.29 ms /    22 runs   (    2.06 ms per token,   485.79 tokens per second)
llama_print_timings: prompt eval time =    3304.53 ms /  1417 tokens (    2.33 ms per token,   428.81 tokens per second)
llama_print_timings:        eval time =     648.61 ms /    21 runs   (   30.89 ms per token,    32.38 tokens per second)
llama_print_timings:       total time =    4027.85 ms /  1438 tokens


User: 
who established this varsity? 
Assistant:
United International University (UIU) was established with the generous support and patronage of the United Group.

----------------------------------------------------------------------------------------------------
