In [None]:
import os
import torch
import chromadb
from dotenv import load_dotenv
from google import generativeai as genai
from transformers import AutoTokenizer, AutoModel
from chromadb.utils.embedding_functions import EmbeddingFunction

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

In [3]:
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY environment variable not set.")
if not HUGGINGFACEHUB_API_TOKEN:
    raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable not set.")

In [None]:
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="google/embeddinggemma-300m", 
    local_dir="./sentence-transformers/all-mpnet-base-v2",
    token=HUGGINGFACEHUB_API_TOKEN)

Fetching 19 files: 100%|██████████| 19/19 [00:00<00:00, 142.39it/s]


'D:\\Programs\\Python\\vector database\\sentence-transformers\\all-mpnet-base-v2'

In [None]:
class LocalHuggingFaceEmbeddingFunction(EmbeddingFunction):
    def __init__(self, model_path, device=None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModel.from_pretrained(model_path).to(self.device)
        self.model.eval()

    def __call__(self, texts):
        if isinstance(texts, str):
            texts = [texts]
        
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(self.device)
        
        with torch.no_grad():
            outputs = self.model(**inputs)

        embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        return embeddings

In [7]:
current_dir = os.getcwd()
data_folder_path = os.path.join(current_dir,"data")

client = chromadb.PersistentClient(path=data_folder_path)
model_path = "./sentence-transformers/all-mpnet-base-v2"

In [8]:
local_ef = LocalHuggingFaceEmbeddingFunction(model_path=model_path)
collection = client.get_or_create_collection(name='test_collection',embedding_function=local_ef)

Using device: cuda


In [9]:
collections = client.list_collections()
print(collections)

[Collection(name=test_collection)]


While retriving an collection we should specify same embedding function that we used to create a collection

In [10]:
collection = client.get_collection(name='test_collection',embedding_function=local_ef)


collection.add(ids=['id1', 'id2'],documents=['this is doc1','this is doc2'])

collection.count()

collection.get(ids=['id1'],include=['embeddings','documents'])

query = "example text to search"
results = collection.query(query_texts=[query], n_results=2)
print(results)

query = "Explain the content of document one"
results = collection.query(query_texts=[query], n_results=3, include=['documents', 'distances'])
retrieved_docs = results['documents'][0]  
retrieved_docs

prompt = "You are a helpful assistant. Use the following documents to answer the query."

for i, doc in enumerate(retrieved_docs, 1):
    prompt += f"Document {i}: {doc} "

prompt += f'\n Query: {query} \n Answer:'
prompt

client = genai.configure(api_key=GEMINI_API_KEY)

model = genai.GenerativeModel('models/gemma-3-27b-it')

response = model.generate_content(
    contents=prompt
)
print(response.text)

# Integration with Mongodb

In [11]:
from pymongo import MongoClient

# Connect to your MongoDB instance
mongo_client = MongoClient("mongodb://localhost:27017/")
db = mongo_client["student-mvp"]
collection_mongo = db["pdf_content"]


In [None]:
mongo_docs = list(collection_mongo.find({}, {"_id": 1, "content": 1}))
ids = [str(doc["_id"]) for doc in mongo_docs]
texts = [doc["content"] for doc in mongo_docs if "content" in doc]

In [None]:
collection.add(
    ids=ids,
    documents=texts
)
print("Added MongoDB documents to ChromaDB successfully!")


✅ Added MongoDB documents to ChromaDB successfully!


In [19]:
query = "What does it talk about?"
results = collection.query(query_texts=[query], n_results=3, include=['documents', 'distances'])
retrieved_docs = results['documents'][0]  
retrieved_docs


# for doc, score in zip(results['documents'][0], results['distances'][0]):
#     print(f"🔹 Score: {score:.4f} | Document: {doc[:100]}...")


['\n\n--- Page 1 ---\nUNIT Il\nPART -B\n\nStructure of the Project Report: (Part 1) Framing a Title — Content — Acknowledgement —\nDetails -Abstract — Introduction — Aim of the Study — Background - Writing the research question\n- Need of the Study/Project Significance, Relevance — Determining the feasibility — Theoretical\n\nFramework.\n\nImportance of a well-framed title in a project report and strategies\n\nA well-framed title is crucial in a project report as it serves as the first point of contact for\nreaders, including supervisors, peers, and stakeholders. The title sets the tone and provides a\nsuccinct summary of the report\'s content, influencing the reader\'s initial perception and\nengagement. An effective title should accurately reflect the essence and scope of the research,\ndrawing in the intended audience and setting clear expectations for the report\'s findings and\n\ncontributions.\n\nKey Functions of a Well-Framed Title\n\n1. Summary of Content:\n\no The title should

In [21]:
for i, doc in enumerate(retrieved_docs):
    response = model.generate_content(
        f"Summarize the following document in 150 words:\n{doc}"
    )
    retrieved_docs[i] = response.text

In [22]:
prompt = "You are a helpful assistant. Use the following documents to answer the query."

for i, doc in enumerate(retrieved_docs, 1):
    prompt += f"Document {i}: {doc} "

prompt += f'\n Query: {query} \n Answer:'
prompt

'You are a helpful assistant. Use the following documents to answer the query.Document 1: This document (Pages 1-42) provides a comprehensive overview of project report structure, ethical considerations, and the impact of globalization on project reporting. Key topics include:\n\n*   **Title & Content:** Framing effective titles, essential content sections (Abstract, Introduction, Methodology, Results, Discussion, etc.), and the role of the Table of Contents.\n*   **Ethics:** The importance of acknowledgements, disclosure of funding details, and ethical conduct related to human subjects, including informed consent, privacy, and vulnerability.\n*   **Essential Components:** Guide on writing concise abstracts and effective introductions.\n*   **Research Elements:** Clear aims, formulating specific research objectives, and the roles of background information, research questions, and significance of the study.\n*   **Other Elements** Determining feasibility, theoretical frameworks, and the

In [23]:
client = genai.configure(api_key=GEMINI_API_KEY)

model = genai.GenerativeModel('models/gemini-2.0-flash-lite')

response = model.generate_content(
    contents=prompt
)
print(response.text)

Based on the provided information, Document 1 discusses the following:

*   **Project Report Structure:** It details the essential components of a project report, including the title, abstract, introduction, methodology, results, discussion, and table of contents.
*   **Ethics:** It emphasizes ethical considerations such as acknowledgements, funding disclosure, informed consent, privacy, and responsible conduct when dealing with human subjects.
*   **Research Elements:** It covers formulating clear aims, specific research objectives, the importance of background information, research questions, and the study's significance.
*   **Other Elements:** It touches on aspects like feasibility, theoretical frameworks, and the impact of interdisciplinary perspectives.
*   **Globalization's Impact:** It highlights incorporating diverse cultural perspectives, promoting international collaboration, and managing ethical considerations in a global context.

Document 2 and 3 have not been provided, s