<a href="https://colab.research.google.com/github/pranav-j/Simple-RAG/blob/main/Simple_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# These are the required packages.

!pip install --no-cache-dir google-generativeai PyPDF2 chromadb


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.21.0-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.

In [6]:
# Here ChromaDB is used as the vector database.

import os
import google.generativeai as genai
import PyPDF2
import chromadb
import numpy as np
from google.colab import files
import requests
import json

# Set your Google API key
# NOTE(review): "GEMNI_API_KEY" looks like a placeholder rather than a real
# key — replace it with your own Gemini API key before running, and avoid
# committing the real value to version control.
GOOGLE_API_KEY = "GEMNI_API_KEY"
# The key is both exported to the process environment and passed explicitly
# to the SDK.
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
genai.configure(api_key=GOOGLE_API_KEY)

# Initialize ChromaDB client
# NOTE(review): chromadb.Client() is presumably the default in-memory client,
# so the index would not survive a runtime restart — confirm in the Chroma docs.
chroma_client = chromadb.Client()
# get_or_create=True presumably reuses an existing "pdf_documents" collection,
# so re-running this cell does not fail on the already-created collection.
collection = chroma_client.create_collection(name="pdf_documents", get_or_create=True)

# Function to upload PDF file
def upload_pdf():
    """Ask the user to upload a file and return the first uploaded name.

    Calls ``files.upload()`` (from ``google.colab``) and returns the first key
    of the mapping it returns. Only the first entry is used, even if several
    files were uploaded.

    Raises:
        IndexError: If the returned mapping is empty.
    """
    uploaded_files = files.upload()
    uploaded_names = list(uploaded_files.keys())
    return uploaded_names[0]

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string with the extracted text of all pages joined without
        any separator (empty if the PDF has no pages or no extractable text).
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Join once instead of growing a string with += per page. The
        # `or ""` is a defensive guard in case a page yields no text object,
        # which would otherwise make the concatenation raise TypeError.
        # The join runs inside the `with` so the file is still open while
        # pages are being read.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Function to split text into overlapping chunks of at most chunk_size characters (1000 by default)
def split_text(text, chunk_size=1000, overlap=200):
    """Split *text* into overlapping chunks of at most *chunk_size* characters.

    Each chunk except the last is cut right after a sentence terminator
    ('.', '!', '?' or a newline) when one occurs within the last *overlap*
    characters of the chunk window; otherwise it is cut at exactly
    *chunk_size* characters. The next chunk starts *overlap* characters
    before the end of the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk. Must be positive.
        overlap: Number of characters shared between consecutive chunks.
            Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings in document order; empty if *text* is empty.

    Raises:
        ValueError: If *chunk_size* or *overlap* is out of range. Such values
            could otherwise keep the loop from ever advancing.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    sentence_endings = ('.', '!', '?', '\n')
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        hard_end = start + chunk_size
        if hard_end >= text_length:
            # The remainder fits in one chunk: emit it and stop.
            chunks.append(text[start:])
            break

        # Earliest acceptable cut. The second term guarantees that the next
        # start (end - overlap) lies strictly after the current start, even
        # when overlap is more than half of chunk_size.
        earliest_end = max(hard_end - overlap, start + overlap)

        # Walk backwards looking for a terminator. text[end - 1] is the last
        # character of the candidate chunk, so the terminator stays inside it.
        end = hard_end
        while end > earliest_end and text[end - 1] not in sentence_endings:
            end -= 1
        if end <= earliest_end:  # No good breaking point: use the full window
            end = hard_end

        chunks.append(text[start:end])
        start = end - overlap

    return chunks

# Function to get embeddings from Gemini API
def get_embedding(text, task_type="retrieval_document", model="models/embedding-001"):
    """Embed *text* with the Gemini embedding API.

    Args:
        text: The content to embed.
        task_type: Embedding task type passed to ``genai.embed_content``.
            Defaults to ``"retrieval_document"``, which suits corpus chunks;
            pass ``"retrieval_query"`` when embedding a search query.
        model: Name of the embedding model to use.

    Returns:
        The value stored under the ``"embedding"`` key of the API response,
        or ``None`` if the request (or reading the response) fails. Failures
        are reported on stdout rather than raised, so callers must check for
        ``None``.
    """
    try:
        response = genai.embed_content(
            model=model,
            content=text,
            task_type=task_type,
        )
        return response["embedding"]
    except Exception as e:
        # Best-effort boundary: callers treat None as "skip this item".
        print(f"Error generating embedding: {e}")
        return None

# Function to index PDF content
def index_pdf(pdf_path):
    """Extract, chunk, embed and store the contents of a PDF in the collection.

    Args:
        pdf_path: Path of the PDF file to index.

    Returns:
        The number of chunks actually stored. Chunks whose embedding could
        not be generated are skipped and not counted.
    """
    # Extract text from PDF
    text = extract_text_from_pdf(pdf_path)

    # Split into chunks
    chunks = split_text(text)

    # Generate embeddings and store them in ChromaDB
    indexed_count = 0
    for i, chunk in enumerate(chunks):
        embedding = get_embedding(chunk)
        if not embedding:
            print(f"Skipping chunk {i} of {pdf_path}: no embedding was generated")
            continue

        # IDs include the source path so chunks from different PDFs do not
        # collide, and upsert makes re-indexing the same file overwrite its
        # previous entries instead of clashing with the existing IDs.
        collection.upsert(
            documents=[chunk],
            embeddings=[embedding],
            metadatas=[{"source": pdf_path, "chunk_id": i}],
            ids=[f"{pdf_path}_doc_{i}"],
        )
        indexed_count += 1

    print(f"Indexed {indexed_count} chunks from {pdf_path}")
    return indexed_count

# Function to search for relevant content
def search_documents(query, k=3):
    """Look up the stored chunks closest to *query*.

    Args:
        query: The user's question.
        k: Number of results requested from the collection.

    Returns:
        Whatever ``collection.query`` returns for the query embedding, or an
        empty list if no embedding could be produced for the query.

    NOTE(review): the query is embedded with ``get_embedding``'s default
    settings, i.e. the same way as document chunks. Gemini also offers a
    dedicated ``retrieval_query`` task type — consider using it here.
    """
    query_vector = get_embedding(query)
    if query_vector:
        return collection.query(query_embeddings=[query_vector], n_results=k)
    return []

# Function to generate response using Gemini
def generate_response(query, context):
    """Ask Gemini to answer *query* using *context* as grounding material.

    Args:
        query: The user's question.
        context: Text of the retrieved chunks to place in the prompt.

    Returns:
        The ``text`` attribute of the model's response.
    """
    # Assemble the prompt line by line; joined with newlines it is the same
    # text as a single multi-line template.
    prompt_lines = [
        f"Based on the following context, please answer this question: {query}",
        "",
        "Context:",
        f"{context}",
        "",
        "Answer:",
    ]
    prompt = "\n".join(prompt_lines)

    llm = genai.GenerativeModel('gemini-2.0-flash')
    return llm.generate_content(prompt).text

# Main function to run the RAG system
def run_rag_system():
    """Run the interactive RAG session: upload a PDF, index it, answer queries.

    Prompts for a PDF upload, indexes it, then loops reading queries from
    stdin. For each query the closest chunks are retrieved and passed to the
    model as context; the answer is printed. Typing ``exit`` ends the loop.
    """
    print("Upload your PDF file:")
    pdf_path = upload_pdf()

    print(f"Processing {pdf_path}...")
    chunk_count = index_pdf(pdf_path)

    print(f"Successfully indexed {chunk_count} chunks. The RAG system is ready for queries!")

    while True:
        # Strip so that " exit " quits and whitespace-only input is ignored.
        query = input("\nEnter your query (or type 'exit' to quit): ").strip()
        if query.lower() == 'exit':
            break
        if not query:
            # Nothing to search for; avoid a pointless embedding request.
            continue

        # Search for relevant documents
        results = search_documents(query)

        # The result holds one list of documents per query embedding, so the
        # hits for our single query are in the first inner list. Checking
        # only the outer list would treat "no hits" ([[]]) as a match.
        documents = results['documents'] if results else None
        matches = documents[0] if documents else []
        if not matches:
            print("No relevant information found.")
            continue

        # Combine relevant documents as context
        context = "\n\n".join(matches)

        # Generate response
        response = generate_response(query, context)
        print("\nAnswer:")
        print(response)

# Run the system
# The guard starts the interactive session only when this code is executed
# directly (where __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    run_rag_system()

Upload your PDF file:


Saving RS900 communication protocol for application developer mod4-1.pdf to RS900 communication protocol for application developer mod4-1.pdf
Processing RS900 communication protocol for application developer mod4-1.pdf...




Indexed 14 chunks from RS900 communication protocol for application developer mod4-1.pdf
Successfully indexed 14 chunks. The RAG system is ready for queries!

Enter your query (or type 'exit' to quit): What is this document about?

Answer:
This document describes the communication protocol between a "Device" and a "Host." It details the command structure, data format, and operational modes (Command and Work) used for sending commands and data between the two. Key aspects include how the Device enters different modes, how commands are structured and acknowledged, the data format in Work mode (binary with Header, DATA, and Footer), and timing considerations for sending commands and data.


Enter your query (or type 'exit' to quit): It is actually for a scanning sonar.

Answer:
Please provide the question you would like me to answer. I have the context you provided about the scanning sonar, and I'm ready to use it to answer your question.


Enter your query (or type 'exit' to quit): What 