In [4]:
!pip install -U langchain langchain-community langchain-groq langchain-huggingface
!pip install faiss-cpu chromadb sentence-transformers pypdf

Collecting langchain-huggingface
  Downloading langchain_huggingface-1.2.0-py3-none-any.whl.metadata (2.8 kB)
Downloading langchain_huggingface-1.2.0-py3-none-any.whl (30 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-1.2.0


In [2]:
# Install reportlab
!pip install reportlab

from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

# Build a one-page A4 PDF that the rest of the notebook uses as its sample
# document.
file_name = "sample.pdf"
page_width, page_height = A4
pdf_canvas = canvas.Canvas(file_name, pagesize=A4)

# Text cursor starts 40pt from the left edge and 50pt below the top edge.
text_block = pdf_canvas.beginText(40, page_height - 50)
text_block.setFont("Helvetica", 11)

# One entry per rendered line; empty strings produce blank lines.
content = [
    "AI POLICY AND ACADEMIC QUALITY DOCUMENT",
    "",
    "1. Introduction",
    "This document describes the academic quality framework adopted by higher",
    "education institutions to improve teaching, learning, and research outcomes.",
    "",
    "2. Objectives",
    "The key objectives of this policy are:",
    "- Improving academic quality",
    "- Strengthening industry–academia collaboration",
    "- Promoting outcome-based education (OBE)",
    "- Enhancing research and innovation culture",
    "",
    "3. Industry Collaboration",
    "Institutions are encouraged to partner with industries for internships,",
    "projects, curriculum design, and faculty training programs.",
    "",
    "4. Outcome-Based Education",
    "OBE focuses on measurable learning outcomes, continuous assessment,",
    "and alignment of COs, POs, and PSOs.",
    "",
    "5. Conclusion",
    "This policy aims to ensure continuous improvement in higher education."
]

for pdf_line in content:
    text_block.textLine(pdf_line)

# Render the accumulated text, close the page, and write the file to disk.
pdf_canvas.drawText(text_block)
pdf_canvas.showPage()
pdf_canvas.save()

print("sample.pdf created successfully!")

Collecting reportlab
  Downloading reportlab-4.4.7-py3-none-any.whl.metadata (1.7 kB)
Downloading reportlab-4.4.7-py3-none-any.whl (2.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/2.0 MB[0m [31m10.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/2.0 MB[0m [31m31.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.4.7
sample.pdf created successfully!


In [None]:
import getpass
import hashlib
import os

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS, Chroma
from langchain_core.messages import HumanMessage
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# -----------------------------
# CONFIGURATION
# -----------------------------
PDF_PATH = "sample.pdf"
CHROMA_DIR = "./chroma_store"

# Secrets must never be hard-coded in a notebook: use GROQ_API_KEY from the
# environment when it is already set, otherwise prompt for it without echoing.
# NOTE: a real key was previously committed in this cell; treat it as leaked
# and revoke it in the Groq console.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your GROQ_API_KEY: ")

# -----------------------------
# STEP 1: LOAD & SPLIT DOCUMENT
# -----------------------------
# Splitter settings, measured in characters (the splitter's default length
# function is len()).
CHUNK_SIZE = 800
CHUNK_OVERLAP = 100

print("Loading document...")
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)
docs = splitter.split_documents(documents)

print(f"Document split into {len(docs)} chunks")

# A PDF with no extractable text (e.g. scanned images) yields zero chunks;
# stop here with a clear message instead of failing later while indexing.
if not docs:
    raise ValueError(
        f"No extractable text found in {PDF_PATH!r}; cannot build a vector index."
    )

# -----------------------------
# STEP 2: EMBEDDINGS
# -----------------------------
# Sentence-transformers model used to embed both the chunks and the queries.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# -----------------------------
# STEP 3: VECTOR STORES
# -----------------------------
# Both stores index the same chunks with the same embedding model.
print("Creating FAISS index...")
faiss_db = FAISS.from_documents(docs, embedding)

print("Creating ChromaDB index...")
# Chroma is persisted under CHROMA_DIR, so re-running this cell with
# auto-generated ids would append another copy of every chunk. Deterministic
# ids (position + content hash) let the store overwrite existing entries
# instead. The position prefix keeps ids unique even if two chunks share text.
chunk_ids = [
    f"{position}-{hashlib.sha256(doc.page_content.encode('utf-8')).hexdigest()}"
    for position, doc in enumerate(docs)
]
chroma_db = Chroma.from_documents(
    docs,
    embedding,
    ids=chunk_ids,
    persist_directory=CHROMA_DIR
)

# -----------------------------
# STEP 4: HYBRID RETRIEVER
# -----------------------------
def hybrid_retrieve(query, k=3):
    """Retrieve the chunks most similar to ``query`` from both vector stores.

    Runs ``similarity_search`` with the same ``k`` against the module-level
    ``faiss_db`` and ``chroma_db`` and merges the hits, FAISS results first.
    Because both stores index the same chunks, a passage is frequently
    returned by both; hits whose text has already been seen are dropped so
    the same passage is not repeated in the result.

    Args:
        query: Search text passed to each store's ``similarity_search``.
        k: Number of hits requested from each store.

    Returns:
        A list of documents with distinct ``page_content``, in first-seen
        order, containing at most ``2 * k`` items.
    """
    candidates = (
        faiss_db.similarity_search(query, k=k)
        + chroma_db.similarity_search(query, k=k)
    )

    seen_texts = set()
    unique_docs = []
    for doc in candidates:
        if doc.page_content in seen_texts:
            continue
        seen_texts.add(doc.page_content)
        unique_docs.append(doc)
    return unique_docs

# -----------------------------
# STEP 5: LLM (ChatGroq)
# -----------------------------
# Chat model used to answer questions; temperature 0 is passed to the client.
llm_settings = {
    "model": "llama-3.1-8b-instant",
    "temperature": 0,
}
llm = ChatGroq(**llm_settings)

# -----------------------------
# STEP 6: AI DOCUMENT AGENT
# -----------------------------
def document_qa_agent(question):
    """Answer ``question`` using only text retrieved from the indexed document.

    Retrieves chunks with ``hybrid_retrieve``, joins their text (separated by
    blank lines) into a context section, embeds the context and the question
    in a fixed instruction prompt, and sends that prompt to ``llm`` as a
    single human message.

    Args:
        question: The user's question as a string.

    Returns:
        The ``content`` attribute of the model's response.
    """
    passages = (chunk.page_content for chunk in hybrid_retrieve(question))
    context = "\n\n".join(passages)

    # Assembled piecewise; the resulting text (including the leading and
    # trailing newline) is what the model receives.
    prompt = (
        "\n"
        "You are an AI document analysis agent.\n"
        "Answer ONLY from the provided context.\n"
        'If the answer is not found, say "Not available in the document".\n'
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        "Question:\n"
        f"{question}\n"
        "\n"
        "Answer:\n"
    )

    reply = llm.invoke([HumanMessage(content=prompt)])
    return reply.content

# -----------------------------
# STEP 7: INTERACTIVE MODE
# -----------------------------
print("\nAI Document QA Agent Ready!")
print("Type your question (or type 'exit' to stop)\n")

# Read questions until the user types 'exit', stdin is closed, or the kernel
# is interrupted.
while True:
    try:
        query = input("Question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # No more input available (or the user interrupted): leave quietly
        # instead of surfacing a traceback.
        print()
        break

    # Ignore blank submissions rather than spending an LLM call on them.
    if not query:
        continue
    # Surrounding whitespace was stripped above, so " exit " also stops.
    if query.lower() == "exit":
        break

    answer = document_qa_agent(query)
    print("\nAnswer:", answer)
    print("-" * 60)

Loading document...
Document split into 1 chunks


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating FAISS index...
Creating ChromaDB index...

AI Document QA Agent Ready!
Type your question (or type 'exit' to stop)

Question: What are the key objectives mentioned in the policy?

Answer: The key objectives of this policy are:
- Improving academic quality
- Strengthening industry–academia collaboration
- Promoting outcome-based education (OBE)
- Enhancing research and innovation culture
------------------------------------------------------------
