<a href="https://colab.research.google.com/github/shahidaryan/agentic-rag-chatbot/blob/main/Coding_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
!pip install -q faiss-cpu sentence-transformers python-docx python-pptx PyMuPDF pandas ipywidgets transformers

In [40]:
import io
import faiss
import fitz
import pandas as pd
import docx
from pptx import Presentation
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import ipywidgets as widgets
from IPython.display import display
import uuid

class MCPMessage:
    """Envelope for a message exchanged between two agents.

    The envelope is a plain dict kept on ``self.message`` with the keys
    ``sender``, ``receiver``, ``type``, ``trace_id`` and ``payload``
    (in that order).
    """

    def __init__(self, sender, receiver, type_, payload, trace_id=None):
        """Assemble the envelope.

        Args:
            sender: Stored under ``"sender"``.
            receiver: Stored under ``"receiver"``.
            type_: Stored under ``"type"``.
            payload: Stored under ``"payload"`` as-is (not copied).
            trace_id: Stored under ``"trace_id"``; any falsy value
                (``None``, ``""``) is replaced by a fresh UUID4 string.
        """
        if not trace_id:
            trace_id = str(uuid.uuid4())
        self.message = dict(
            sender=sender,
            receiver=receiver,
            type=type_,
            trace_id=trace_id,
            payload=payload,
        )

    def get(self):
        """Return the envelope dict itself (not a copy)."""
        return self.message

In [41]:
class IngestionAgent:
    """Parses uploaded files into a list of ``(index, chunk_text)`` pairs."""

    @staticmethod
    def split_text(text, max_chunk_length=500):
        """Pack the sentences of ``text`` into chunks.

        Sentences are split after ``.``, ``!`` or ``?`` followed by one or more
        spaces, then appended to the current chunk while the running length
        stays within ``max_chunk_length`` characters.

        Args:
            text: The text to split.
            max_chunk_length: Length budget per chunk, in characters. A single
                sentence longer than the budget is kept whole as its own chunk.

        Returns:
            A list of stripped, non-empty chunk strings; ``[]`` when ``text``
            is empty or whitespace-only.
        """
        sentences = re.split(r'(?<=[.!?]) +', text)
        chunks, current = [], ""
        for sentence in sentences:
            if len(current) + len(sentence) <= max_chunk_length:
                current += sentence + " "
            else:
                # Flush only real content: when the very first sentence is
                # already over budget, `current` is still empty here and must
                # not become a chunk.
                flushed = current.strip()
                if flushed:
                    chunks.append(flushed)
                current = sentence + " "
        # `current` always ends with the trailing space added above, so test the
        # stripped value; otherwise empty input would produce one "" chunk.
        tail = current.strip()
        if tail:
            chunks.append(tail)
        return chunks

    @staticmethod
    def parse_file(file_obj):
        """Extract the text of an uploaded file and split it into chunks.

        The format is chosen from the lower-cased file name extension:
        ``.pdf``, ``.docx``, ``.pptx``, ``.csv``, ``.txt`` and ``.md``.

        Args:
            file_obj: Binary file-like object exposing ``read()`` and a
                ``name`` attribute.

        Returns:
            ``[(i, chunk), ...]`` where ``i`` counts chunks from 0 within this
            file. Any other extension yields the placeholder text
            ``"Unsupported file format."`` as the content.
        """
        name = file_obj.name.lower()
        content = []

        if name.endswith(".pdf"):
            # The context manager closes the PyMuPDF document once the page
            # text has been read.
            with fitz.open(stream=file_obj.read(), filetype="pdf") as doc:
                for page in doc:
                    content.append(page.get_text())
        elif name.endswith(".docx"):
            d = docx.Document(file_obj)
            content.append("\n".join(p.text for p in d.paragraphs))
        elif name.endswith(".pptx"):
            prs = Presentation(file_obj)
            slides = []
            for slide in prs.slides:
                for shape in slide.shapes:
                    # Only shapes that expose a `text` attribute contribute.
                    if hasattr(shape, "text"):
                        slides.append(shape.text)
            content.append("\n".join(slides))
        elif name.endswith(".csv"):
            df = pd.read_csv(file_obj)
            content.append(df.to_string())
        elif name.endswith(".txt") or name.endswith(".md"):
            # "utf-8-sig" drops a leading BOM; errors="replace" keeps one stray
            # non-UTF-8 byte from aborting ingestion of the whole file.
            content.append(file_obj.read().decode("utf-8-sig", errors="replace"))
        else:
            # NOTE(review): this placeholder is returned as ordinary content, so
            # a caller that indexes the result will index this sentence too.
            content.append("Unsupported file format.")

        all_text = "\n".join(content)
        return list(enumerate(IngestionAgent.split_text(all_text)))

In [42]:
class RetrievalAgent:
    """Embeds text chunks and answers nearest-neighbour queries over them."""

    def __init__(self):
        """Load the sentence-embedding model and start with no index."""
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.index = None
        self.indexed_chunks = []

    def build_index(self, indexed_chunks):
        """Embed every chunk and (re)build the FAISS L2 index.

        Args:
            indexed_chunks: Iterable of ``(index, chunk_text)`` pairs. The
                pairs are stored so ``retrieve`` can map FAISS row numbers
                back to them.

        An empty input clears the index instead of failing on the missing
        embedding dimension.
        """
        # Snapshot the pairs: FAISS row i must keep corresponding to
        # self.indexed_chunks[i] even if the caller later mutates its list.
        self.indexed_chunks = list(indexed_chunks)
        if not self.indexed_chunks:
            self.index = None
            return

        chunk_texts = [chunk for (_, chunk) in self.indexed_chunks]
        # FAISS expects a contiguous float32 matrix.
        embeddings = np.ascontiguousarray(self.model.encode(chunk_texts), dtype="float32")
        dim = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dim)
        self.index.add(embeddings)

    def retrieve(self, query, k=5):
        """Return up to ``k`` stored pairs nearest to ``query``.

        Args:
            query: Query text to embed.
            k: Maximum number of neighbours to return.

        Returns:
            A list of ``(index, chunk_text)`` pairs in the order reported by
            the index search; ``[]`` when nothing has been indexed.
        """
        if self.index is None or not self.indexed_chunks:
            return []

        # Never ask for more neighbours than there are chunks: FAISS pads the
        # missing slots with -1, which Python would read as "last element".
        k = min(k, len(self.indexed_chunks))
        if k <= 0:
            return []

        query_vec = np.ascontiguousarray(self.model.encode([query]), dtype="float32")
        _, neighbour_ids = self.index.search(query_vec, k)

        # Keep the -1 filter as a second line of defence against padding.
        return [self.indexed_chunks[int(i)] for i in neighbour_ids[0] if i >= 0]


In [43]:
class LLMResponseAgent:
    """Turns a retrieval result message into an LLM-generated answer."""

    @staticmethod
    def generate(mcp_msg):
        """Build a prompt from the retrieved context and query, and run the model.

        Args:
            mcp_msg: Message dict whose ``"payload"`` holds
                ``"retrieved_context"`` (a list of chunk strings) and
                ``"query"`` (the question).

        Returns:
            ``{"answer": str, "sources": list}``. ``sources`` is the
            de-duplicated context in its original order. If the model call
            raises, the answer is a fixed failure message and ``sources`` is
            empty.
        """
        payload = mcp_msg["payload"]
        # De-duplicate while keeping first-seen order. A set would scramble the
        # order the caller supplied, and differently on each kernel start.
        unique_chunks = list(dict.fromkeys(payload["retrieved_context"]))
        context = "\n".join(unique_chunks)
        query = payload["query"]

        prompt = (
            "You are an expert assistant. The following context includes multiple machine learning topics.\n"
            "For the given question, extract and summarize all relevant concepts in a clean, organized, and concise answer.\n"
            "If multiple algorithms or terms are present, list and explain them clearly without repeating text.\n\n"
            f"Context:\n{context}\n\nQuestion:\n{query}\n\nAnswer:"
        )

        try:
            print("🚀 Sending prompt to model...")
            # NOTE(review): `hf_pipe` is not defined in the cells visible here;
            # it must exist as a global before this runs. If it does not, the
            # resulting NameError is caught below and reported as a model error.
            result = hf_pipe(prompt, max_new_tokens=256, do_sample=False)[0]['generated_text']
            print("✅ LLM responded.")
        except Exception as e:
            print("❌ Hugging Face model error:", e)
            return {
                "answer": "LLM generation failed due to an error.",
                "sources": []
            }

        # Keep only what follows the last "Answer:" marker, in case the
        # generated text echoes the prompt.
        answer = result.split("Answer:")[-1].strip()
        return {
            "answer": answer,
            "sources": unique_chunks
        }


In [44]:
# --- UI controls -----------------------------------------------------------
# File picker limited to the extensions the ingestion step understands.
upload_widget = widgets.FileUpload(
    accept='.pdf,.docx,.pptx,.csv,.txt,.md',
    multiple=True,
)

# Free-text question box.
query_input = widgets.Textarea(
    placeholder="Ask a question...",
    layout=widgets.Layout(width='600px', height='60px'),
)

# Button that triggers a query, and the area its handler prints into.
submit_button = widgets.Button(description="Submit", button_style='success')
output = widgets.Output()

# Render the controls stacked vertically, in the order listed.
display(
    widgets.VBox(
        [
            upload_widget,
            query_input,
            submit_button,
            output,
        ]
    )
)

# One retriever instance for the session; its index is rebuilt by the click handler.
retriever = RetrievalAgent()

def handle_click(_):
    """Answer the current question against the currently uploaded files.

    Steps: validate the inputs, parse every upload into chunks, rebuild the
    retriever index, fetch the nearest chunks, hand them to the LLM agent in
    document order and print the answer with its source chunks. Everything is
    printed into ``output``.

    Args:
        _: The clicked button (unused).
    """
    output.clear_output()
    with output:
        print("✅ Button clicked!")

        if not upload_widget.value:
            print("❗ Please upload at least one file.")
            return

        query = query_input.value.strip()
        if not query:
            print("❗ Please enter a question.")
            return

        # ipywidgets 7 exposes uploads as {filename: {"metadata": {...}, "content": bytes}};
        # ipywidgets 8 exposes a tuple of {"name": ..., "content": memoryview, ...}.
        uploads = upload_widget.value
        uploaded_files = list(uploads.values()) if isinstance(uploads, dict) else list(uploads)

        indexed_chunks = []  # (global index, chunk_text)
        for f in uploaded_files:
            file_name = f["metadata"]["name"] if "metadata" in f else f["name"]
            file_obj = io.BytesIO(bytes(f["content"]))
            file_obj.name = file_name
            try:
                file_chunks = IngestionAgent.parse_file(file_obj)  # returns (index, text)
            except Exception as e:
                # One unreadable upload should not abort the whole query.
                print(f"⚠️ Could not parse {file_name}: {e}")
                continue
            # parse_file numbers chunks from 0 per file. Renumber across all
            # files so sorting by index below restores a single document order
            # instead of interleaving files. Empty chunks are dropped.
            for _local_idx, text in file_chunks:
                if text.strip():
                    indexed_chunks.append((len(indexed_chunks), text))

        print(f"📄 Parsed {len(indexed_chunks)} chunks from {len(uploaded_files)} file(s).")

        if not indexed_chunks:
            print("❗ No text could be extracted from the uploaded file(s).")
            return

        retriever.build_index(indexed_chunks)

        # Never request more neighbours than there are chunks.
        retrieved_pairs = retriever.retrieve(query, k=min(5, len(indexed_chunks)))

        # Present the context to the model in document order, not similarity order.
        sorted_chunks = [text for (idx, text) in sorted(retrieved_pairs, key=lambda x: x[0])]

        mcp_msg = MCPMessage(
            sender="RetrievalAgent",
            receiver="LLMResponseAgent",
            type_="RETRIEVAL_RESULT",
            payload={"retrieved_context": sorted_chunks, "query": query}
        ).get()

        print("🤖 Generating answer...")
        result = LLMResponseAgent.generate(mcp_msg)

        print("\n📌 Answer:\n", result["answer"])
        print("\n📚 Source Chunks:")
        for i, chunk in enumerate(result["sources"]):
            print(f"\n🔹 Chunk {i+1}:\n{chunk[:300]}...")
submit_button.on_click(handle_click)

VBox(children=(FileUpload(value={}, accept='.pdf,.docx,.pptx,.csv,.txt,.md', description='Upload', multiple=Tr…