In [12]:
!pip install faiss-cpu langchain PyMuPDF python-docx python-pptx pandas google-generativeai unstructured tiktoken openai streamlit -q

In [13]:
!pip install -U langchain-google-genai langchain langchain-community google-generativeai -q

from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain_google_genai.llms import GoogleGenerativeAI
from langchain_core.documents import Document as LCDocument



In [14]:
import os, json
from getpass import getpass

import google.generativeai as genai

# SECURITY: never hardcode the API key in the notebook. A key that was ever
# committed in plain text must be treated as leaked and revoked.
# The key is taken from the environment when present; otherwise it is asked
# for interactively (getpass does not echo the value into the cell output).
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# List the models that can be used for text generation.
models = genai.list_models()
for m in models:
    # Only show models supporting generateContent
    if "generateContent" in m.supported_generation_methods:
        print(m.name)




models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-2.5-pro-preview-03-25
models/gemini-2.5-flash-preview-04-17
models/gemini-2.5-flash-preview-05-20
models/gemini-2.5-flash
models/gemini-2.5-flash-preview-04-17-thinking
models/gemini-2.5-flash-lite-preview-06-17
models/gemini-2.5-pro-preview-05-06
models/gemini-2.5-pro-preview-06-05
models/gemini-2.5-pro
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-preview-image-generation
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp

In [15]:
from langchain_google_genai import GoogleGenerativeAI
# Smoke test: send one prompt to the Gemini model and print the reply.
llm = GoogleGenerativeAI(model="models/gemini-2.5-flash")
response = llm.invoke(input="Tell me about Paris?")
print(response)


Paris, often called the **"City of Lights" (La Ville Lumière)** and the **"City of Love,"** is the capital and most populous city of France. It's globally renowned for its art, fashion, gastronomy, culture, and iconic landmarks. More than just a city, Paris is an experience – a captivating blend of history, romance, and modern vibrancy.

Here's a breakdown of what makes Paris so special:

1.  **Iconic Landmarks:**
    *   **Eiffel Tower:** The undisputed symbol of Paris, offering breathtaking panoramic views of the city.
    *   **Louvre Museum:** Home to thousands of works of art, including the Mona Lisa, Venus de Milo, and countless masterpieces. It's the world's largest art museum.
    *   **Notre Dame Cathedral:** A stunning Gothic masterpiece, currently undergoing restoration after the 2019 fire, but still an awe-inspiring sight.
    *   **Arc de Triomphe & Champs-Élysées:** The grandest avenue in Paris, leading up to the monumental Arc de Triomphe, which commemorates French victo

In [16]:
def load_text_from_pdf(file):
    """Extract the text of every page of a PDF and join the pages with newlines.

    Args:
        file: Binary file-like object; its remaining bytes are read and parsed
            as a PDF.

    Returns:
        str: The text of each page, in page order, separated by "\n".
    """
    # Imported here because no visible cell imports `fitz`; without this the
    # function raises NameError on a fresh kernel.
    import fitz  # PyMuPDF

    # The context manager closes the document once the text has been collected.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text() for page in doc)

def load_text_from_csv(file):
    """Render a CSV file as a plain-text table.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        str: The table formatted by ``DataFrame.to_string(index=False)``, or
        an empty string when the file contains no data at all.
    """
    # Imported here because no visible cell imports pandas as `pd`; without
    # this the function raises NameError on a fresh kernel.
    import pandas as pd

    try:
        df = pd.read_csv(file)
    except pd.errors.EmptyDataError:
        # A completely empty upload has no text to contribute; do not abort
        # ingestion of the remaining files because of it.
        return ""
    return df.to_string(index=False)

def load_text_from_pptx(file):
    """Collect the text of every shape on every slide of a presentation.

    Args:
        file: Path or file-like object handed to ``pptx.Presentation``.

    Returns:
        str: The ``text`` of each shape that has one, in slide order then
        shape order, joined with "\n".
    """
    # Imported here because no visible cell imports `Presentation`; without
    # this the function raises NameError on a fresh kernel.
    from pptx import Presentation

    prs = Presentation(file)
    # Shapes without a `text` attribute are skipped.
    return "\n".join(
        shape.text
        for slide in prs.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )



from io import BytesIO
from docx import Document

def load_text_from_docx(file):
    """Return the paragraph text of a .docx upload as one newline-joined string.

    The stream is rewound, its bytes are copied into an in-memory buffer, and
    that buffer is parsed with ``Document``.
    """
    file.seek(0)
    buffer = BytesIO(file.read())
    parsed = Document(buffer)

    lines = []
    for paragraph in parsed.paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)



def load_text_from_txt(file):
    """Read a plain-text (or Markdown) upload and return it as a string.

    Args:
        file: Binary file-like object; its remaining bytes are read.

    Returns:
        str: The decoded text. A leading UTF-8 byte-order mark is dropped, and
        bytes that are not valid UTF-8 are replaced with U+FFFD rather than
        raising, so a single stray byte cannot abort ingestion.
    """
    return file.read().decode("utf-8-sig", errors="replace")

def extract_text(file):
    """Pick a loader from the file's extension and return the extracted text.

    The extension check is case-insensitive (the name is lower-cased first).
    Files whose extension matches none of the known loaders yield "".
    """
    lowered = file.name.lower()

    if lowered.endswith(".pdf"):
        return load_text_from_pdf(file)
    if lowered.endswith(".csv"):
        return load_text_from_csv(file)
    if lowered.endswith(".pptx"):
        return load_text_from_pptx(file)
    if lowered.endswith(".docx"):
        return load_text_from_docx(file)
    if lowered.endswith((".txt", ".md")):
        return load_text_from_txt(file)

    # Unsupported extension: nothing to extract.
    return ""


In [17]:
class IngestionAgent:
    """Turns uploaded files into text chunks and indexes them in a FAISS store.

    Attributes:
        embedding_model: Embedding model passed to ``FAISS.from_documents``.
        text_splitter: ``CharacterTextSplitter`` with chunk_size=500 and
            chunk_overlap=50.
        vectorstore: The FAISS store built by the last successful ``handle``
            call; ``None`` until then.
    """

    def __init__(self, embedding_model):
        self.embedding_model = embedding_model
        self.text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        # Defined up front so that reading the attribute before any ingestion
        # yields None instead of raising AttributeError.
        self.vectorstore = None

    def handle(self, mcp_msg):
        """Ingest the files carried by an UPLOAD message.

        Args:
            mcp_msg: Dict with a "trace_id" and a "payload" whose "files" entry
                is an iterable of seekable file objects with a ``name``.

        Returns:
            dict: A DOCUMENT_INGESTED message addressed to the RetrievalAgent,
            echoing the trace id and reporting the number of chunks indexed.

        Raises:
            ValueError: If no text chunk could be produced from any file, so
                there is nothing to index.
        """
        docs = []

        for file in mcp_msg["payload"]["files"]:
            # Rewind in case the stream was already read.
            file.seek(0)
            text = extract_text(file)
            splits = self.text_splitter.split_text(text)
            docs.extend(LCDocument(page_content=chunk) for chunk in splits)

        if not docs:
            # Building a FAISS index from zero documents fails with an obscure
            # low-level error; report the real cause instead.
            raise ValueError(
                "No text could be extracted from the uploaded files; nothing to index."
            )

        self.vectorstore = FAISS.from_documents(docs, self.embedding_model)

        return {
            "sender": "IngestionAgent",
            "receiver": "RetrievalAgent",
            "type": "DOCUMENT_INGESTED",
            "trace_id": mcp_msg["trace_id"],
            "payload": {"doc_count": len(docs)}
        }


class RetrievalAgent:
    """Looks up the chunks most similar to a question in a vectorstore.

    Args:
        vectorstore: Object exposing ``similarity_search(query, k=...)`` whose
            results have a ``page_content`` attribute.
        k: Number of chunks to request per question (default 5).
    """

    def __init__(self, vectorstore, k=5):
        self.vectorstore = vectorstore
        self.k = k

    def handle(self, mcp_msg):
        """Answer a QUESTION message with the retrieved context.

        Args:
            mcp_msg: Dict with a "trace_id" and a "payload" containing "query".

        Returns:
            dict: A RETRIEVAL_RESULT message addressed to the LLMResponseAgent
            carrying the list of retrieved chunk texts and the original query.
        """
        query = mcp_msg["payload"]["query"]
        docs = self.vectorstore.similarity_search(query, k=self.k)
        context = [doc.page_content for doc in docs]
        return {
            "sender": "RetrievalAgent",
            "receiver": "LLMResponseAgent",
            "type": "RETRIEVAL_RESULT",
            "trace_id": mcp_msg["trace_id"],
            "payload": {
                "retrieved_context": context,
                "query": query
            }
        }


class LLMResponseAgent:
    """Asks the LLM to answer a question from retrieved context.

    Args:
        model: Model name passed to ``GoogleGenerativeAI``.
    """

    def __init__(self, model="models/gemini-1.5-flash-latest"):
        self.llm = GoogleGenerativeAI(model=model)

    def handle(self, mcp_msg):
        """Build the prompt from a RETRIEVAL_RESULT message and query the LLM.

        Args:
            mcp_msg: Dict with a "trace_id" and a "payload" containing
                "retrieved_context" (list of chunk strings) and "query".

        Returns:
            dict: An ANSWER message addressed to the UI with the LLM response
            and, as "sources", the first two retrieved chunks.
        """
        chunks = mcp_msg["payload"]["retrieved_context"]
        context = "\n".join(chunks)
        query = mcp_msg["payload"]["query"]

        prompt = f"""Use the following context to answer the user's question.

Context:
{context}

Question:
{query}

Answer:"""

        response = self.llm.invoke(prompt)
        return {
            "sender": "LLMResponseAgent",
            "receiver": "UI",
            "type": "ANSWER",
            "trace_id": mcp_msg["trace_id"],
            "payload": {
                "answer": response,
                # Slice the chunk list, not the joined string: slicing the
                # string would return only its first two characters.
                "sources": list(chunks[:2])
            }
        }


In [18]:
# End-to-end run: upload -> ingest -> retrieve -> answer, then an interactive loop.
import uuid

from google.colab import files


def build_question_msg(query, trace_id):
    """Build the QUESTION message the UI sends to the RetrievalAgent."""
    return {
        "sender": "UI",
        "receiver": "RetrievalAgent",
        "type": "QUESTION",
        "trace_id": trace_id,
        "payload": {"query": query}
    }


# 1. Upload file(s) through the Colab file picker
uploaded = files.upload()

# 2. Build embedding model
embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# 3. Initialize ingestion agent
ingestion_agent = IngestionAgent(embedding_model)

# 4. Create the MCP message for the ingestion agent.
# The files are opened right before ingestion so they can be closed as soon
# as it finishes, whether or not it succeeds.
trace_id = str(uuid.uuid4())
uploaded_files = [open(name, "rb") for name in uploaded]
ingest_msg = {
    "sender": "UI",
    "receiver": "IngestionAgent",
    "type": "UPLOAD",
    "trace_id": trace_id,
    "payload": {"files": uploaded_files}
}

# 5. Run ingestion (build FAISS in memory)
try:
    ingest_response = ingestion_agent.handle(ingest_msg)
finally:
    for fh in uploaded_files:
        fh.close()

# 6. Use the in-memory vectorstore for retrieval
retrieval_agent = RetrievalAgent(ingestion_agent.vectorstore)

# 7. Setup LLM Agent
llm_agent = LLMResponseAgent()

# 8. Ask a first question via MCP (same trace id as the ingestion)
query = "What are the key ideas in the uploaded document?"
retrieval_msg = build_question_msg(query, trace_id)
retrieval_response = retrieval_agent.handle(retrieval_msg)

# 9. Get final LLM response
llm_response = llm_agent.handle(retrieval_response)
print("💬 Answer:\n", llm_response["payload"]["answer"])

# 10. Interactive loop: every question gets its own trace id
while True:
    query = input("Ask a question (or type 'exit'): ")
    if query.strip().lower() == "exit":
        break
    if not query.strip():
        # Nothing to search for; ask again instead of sending a blank query.
        continue

    trace_id = str(uuid.uuid4())
    retrieval_msg = build_question_msg(query, trace_id)

    retrieval_response = retrieval_agent.handle(retrieval_msg)
    llm_response = llm_agent.handle(retrieval_response)

    print("\n💬 Answer:", llm_response["payload"]["answer"])





Saving all in 1.docx to all in 1 (1).docx
💬 Answer:
 The document showcases a portfolio of data science and software engineering projects.  Key ideas include:

* **Expertise in various machine learning models:**  The author demonstrates proficiency in diverse models, including Random Forest, LightGBM, LSTM, and GAN-CNN-LSTM, applied to different tasks like GST compliance prediction, spam detection, and mental health condition forecasting.

* **Natural Language Processing (NLP) capabilities:**  A significant project focuses on building a GPT-2 based text generation platform using LangChain and HuggingFace Transformers.

* **Full-stack development skills:** The portfolio includes projects demonstrating front-end skills (React.js, JavaScript, Bootstrap) and back-end skills (Flask).

* **Data visualization and analysis:**  Projects involve data exploration (EDA), feature engineering, and the creation of visualizations using libraries like Seaborn and Matplotlib.

* **Deployment and accessi

In [19]:
# Persist the in-memory FAISS vectorstore to the local "faiss_store" folder
ingestion_agent.vectorstore.save_local(folder_path="faiss_store")


In [20]:
 %%writefile app.py
import streamlit as st
import os
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings, GoogleGenerativeAI

# SECURITY: the Gemini API key must come from the environment, never from the
# source file. A key that was ever committed in plain text must be revoked.
if not os.environ.get("GOOGLE_API_KEY"):
    st.error("GOOGLE_API_KEY is not set. Set it in the environment before starting the app.")
    st.stop()


@st.cache_resource
def load_resources():
    """Create the embedding model, FAISS store and LLM once per server process.

    Streamlit re-executes this script on every interaction; caching avoids
    reloading the index and rebuilding the clients each time.

    Returns:
        tuple: (embedding_model, vectorstore, llm)
    """
    embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    # NOTE(review): allow_dangerous_deserialization opts in to unpickling the
    # saved store. Only load a "faiss_store" folder you created yourself.
    vectorstore = FAISS.load_local(
        "faiss_store", embeddings=embedding_model, allow_dangerous_deserialization=True
    )

    llm = GoogleGenerativeAI(model="models/gemini-1.5-flash")
    return embedding_model, vectorstore, llm


embedding_model, vectorstore, llm = load_resources()

# Streamlit UI
st.title("📄 Gemini RAG Chatbot")

query = st.text_input("Ask a question about your document:")

if query:
    # Retrieve the five most similar chunks and use them as the only context.
    docs = vectorstore.similarity_search(query, k=5)
    context = "\n\n".join([doc.page_content for doc in docs])

    prompt = f"""Answer the question using only the context below.

    Context:
    {context}

    Question:
    {query}
    """

    response = llm.invoke(prompt)
    st.markdown("### 💬 Answer:")
    st.write(response)





Overwriting app.py


In [21]:
# Start the Streamlit app in the background (`&`), then run localtunnel on
# port 8501, the port Streamlit reports in its Local URL.
!streamlit run app.py & npx localtunnel --port 8501


[1G[0K⠙
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.74.176.134:8501[0m
[0m
[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0Kyour url is: https://twelve-states-juggle.loca.lt
[34m  Stopping...[0m
^C
