<a href="https://colab.research.google.com/github/shyamgsundhar/RAG-Implementation-Updated-/blob/main/Rag_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import faiss
import google.generativeai as genai
import PyPDF2
import numpy as np
import pickle
import os

# Gemini API key: read it from the environment only. A real key must never be
# committed as a fallback default; any key that was previously hard-coded here
# should be treated as leaked and revoked.
_gemini_api_key = os.getenv("GEMINI_API_KEY")
if not _gemini_api_key:
    print("⚠️ GEMINI_API_KEY is not set; set it before calling the Gemini API.")
genai.configure(api_key=_gemini_api_key)


def _split_into_chunks(page_texts, chunk_size=500, stride=400):
    """Split the concatenated page texts into overlapping chunks.

    Args:
        page_texts: List of per-page strings, in page order (empty string for
            pages without text).
        chunk_size: Maximum number of characters per chunk.
        stride: Distance between consecutive chunk starts; a stride smaller
            than ``chunk_size`` makes neighbouring chunks overlap.

    Returns:
        A ``(text, chunks, chunk_metadata)`` tuple. ``text`` is the joined
        page text, ``chunks`` the list of chunk strings and ``chunk_metadata``
        a parallel list of dicts with ``'start_pos'`` and ``'estimated_page'``.
    """
    from bisect import bisect_right

    # Offset of each page's first character inside the joined text.
    page_starts = []
    offset = 0
    for page_text in page_texts:
        page_starts.append(offset)
        offset += len(page_text)
    text = ''.join(page_texts)

    chunks = []
    chunk_metadata = []
    for start in range(0, len(text), stride):
        chunks.append(text[start:start + chunk_size])
        chunk_metadata.append({
            'start_pos': start,
            # 1-based page holding the chunk's first character. The key keeps
            # its historical name because readers of chunks.pkl look it up.
            'estimated_page': bisect_right(page_starts, start),
        })
    return text, chunks, chunk_metadata


def _embed_chunks(chunks, requested_dim=1536):
    """Embed every chunk with Gemini, using zero vectors for failed chunks.

    Raises:
        RuntimeError: If no chunk could be embedded, or if the service
            returned vectors of differing lengths.
    """
    vectors = []
    for i, chunk in enumerate(chunks):
        print(f"Processing {i + 1}/{len(chunks)}")
        try:
            response = genai.embed_content(
                model="models/embedding-001",
                content=chunk,
                output_dimensionality=requested_dim
            )
            vectors.append(response['embedding'])
        except Exception as e:
            # Best effort: keep going and fill the gap once the real
            # embedding size is known.
            print(f"Error embedding chunk {i}: {e}")
            vectors.append(None)

    # Size the placeholders from what was actually returned: the returned
    # length is not guaranteed to equal ``requested_dim``, and a mismatched
    # placeholder would make the matrix ragged.
    dims = {len(vector) for vector in vectors if vector is not None}
    if not dims:
        raise RuntimeError("Could not embed any chunk; vector database not created.")
    if len(dims) > 1:
        raise RuntimeError(f"Inconsistent embedding sizes returned: {sorted(dims)}")
    dim = dims.pop()
    return np.array([v if v is not None else [0.0] * dim for v in vectors])


def pdf_to_vectors(pdf_path):
    """Build a FAISS vector database from a PDF and save it to disk.

    The PDF text is split into overlapping 500-character chunks (400-character
    stride), each chunk is embedded with Gemini, and the vectors are stored in
    an inner-product FAISS index. Writes ``vectors.index`` and ``chunks.pkl``
    to the current working directory.

    Args:
        pdf_path: Path of the PDF file to index.

    Returns:
        A ``(embeddings, chunks)`` tuple: the embedding matrix as a NumPy
        array with one row per chunk, and the list of chunk strings.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        ValueError: If no text could be extracted from the PDF.
        RuntimeError: If embedding failed for every chunk.
    """
    # Read PDF
    print(f"📄 Reading PDF: {pdf_path}")
    with open(pdf_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        total_pages = len(pdf_reader.pages)
        # Pages without extractable text contribute an empty string.
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]

    text, chunks, chunk_metadata = _split_into_chunks(page_texts)
    if not text:
        # Fail early with a clear message instead of a later division by
        # zero or an IndexError on an empty embedding matrix.
        raise ValueError(f"No extractable text found in {pdf_path!r}")

    print(f"📊 Total pages: {total_pages}")
    print(f"📊 Total text length: {len(text):,} characters")
    print(f"📊 Average characters per page: {len(text) // total_pages:,}")
    print(f"Created {len(chunks)} chunks")

    # Get embeddings from Gemini
    print("🔄 Getting embeddings from Gemini...")
    embeddings = _embed_chunks(chunks)

    # Create FAISS index
    print("Creating FAISS index...")
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings.astype('float32'))

    # Save to files
    print("Saving to files...")
    faiss.write_index(index, "vectors.index")
    with open("chunks.pkl", "wb") as f:
        pickle.dump({
            'chunks': chunks,
            'metadata': chunk_metadata,
            'total_pages': total_pages
        }, f)

    print("Vector database created successfully!")
    print("Files saved: vectors.index, chunks.pkl")
    print(f"Vector shape: {embeddings.shape}")
    print(f"🔢 Sample vector (first 5 dims): {embeddings[0][:5]}")

    return embeddings, chunks


# Script entry point: index the default PDF when this cell/file is run directly.
if __name__ == "__main__":
    # Path of the PDF to index; change to your PDF file.
    pdf_file = "doc.pdf"
    embeddings, chunks = pdf_to_vectors(pdf_file)

    print(
        "\n🎉 Setup complete! Now you can run 'ask_questions.py' to chat with your PDF!"
    )

📄 Reading PDF: doc.pdf
📊 Total pages: 15
📊 Total text length: 10,955 characters
📊 Average characters per page: 730
Created 28 chunks
🔄 Getting embeddings from Gemini...
Processing 1/28
Processing 2/28
Processing 3/28
Processing 4/28
Processing 5/28
Processing 6/28
Processing 7/28
Processing 8/28
Processing 9/28
Processing 10/28
Processing 11/28
Processing 12/28
Processing 13/28
Processing 14/28
Processing 15/28
Processing 16/28
Processing 17/28
Processing 18/28
Processing 19/28
Processing 20/28
Processing 21/28
Processing 22/28
Processing 23/28
Processing 24/28
Processing 25/28
Processing 26/28
Processing 27/28
Processing 28/28
Creating FAISS index...
Saving to files...
Vector database created successfully!
Files saved: vectors.index, chunks.pkl
Vector shape: (28, 768)
🔢 Sample vector (first 5 dims): [ 0.03128196 -0.06782899 -0.07025576  0.00207815  0.03874438]

🎉 Setup complete! Now you can run 'ask_questions.py' to chat with your PDF!


In [3]:
def ask_question(question):
    """Answer a question about the indexed PDF using retrieved chunks.

    Loads ``vectors.index`` and ``chunks.pkl`` from the current working
    directory, embeds the question with Gemini, retrieves up to three of the
    most similar chunks and asks ``gemini-1.5-flash`` to answer from them.

    Args:
        question: The user's question as a string.

    Returns:
        The model's answer text, or ``None`` when the database is missing,
        nothing could be retrieved, or any step fails (the reason is printed).
    """
    # Check if vector files exist
    if not os.path.exists("vectors.index") or not os.path.exists("chunks.pkl"):
        print("Error: Vector database not found!")
        print("run 'pdf_to_vectors.py' first to create the database.")
        return None

    try:
        # Load saved data
        index = faiss.read_index("vectors.index")
        # NOTE(security): pickle.load can execute arbitrary code from the
        # file; only load a chunks.pkl that this project created.
        with open("chunks.pkl", "rb") as f:
            data = pickle.load(f)

        chunks = data['chunks']
        metadata = data['metadata']
        total_pages = data['total_pages']

        # Get question embedding from Gemini
        response = genai.embed_content(
            model="models/embedding-001",
            content=question,
            output_dimensionality=1536
        )
        query_vector = np.array(response['embedding'], dtype='float32').reshape(1, -1)

        if query_vector.shape[1] != index.d:
            print(
                f"Error: query embedding has {query_vector.shape[1]} dimensions "
                f"but the index expects {index.d}. Rebuild the database."
            )
            return None

        # Never ask for more neighbours than the index holds: FAISS pads
        # missing results with index -1, which would silently select the
        # last chunk through negative indexing.
        top_k = min(3, index.ntotal)
        if top_k == 0:
            print("Error: Vector database is empty!")
            return None

        # Search similar chunks
        scores, indices = index.search(query_vector, top_k)
        hits = [
            (float(score), int(idx))
            for score, idx in zip(scores[0], indices[0])
            if 0 <= idx < len(chunks)
        ]
        if not hits:
            print("Error: No relevant chunks found!")
            return None

        # Report the hits and build the context in a single pass
        print(f"Found {len(hits)} relevant chunks:")
        context_parts = []
        for rank, (score, idx) in enumerate(hits, start=1):
            page_num = metadata[idx]['estimated_page']
            print(f"   Chunk {rank}: Score {score:.3f} (≈Page {page_num})")
            context_parts.append(f"[Page {page_num}]: {chunks[idx]}")

        context = '\n\n'.join(context_parts)

        # Ask Gemini to answer with context
        model = genai.GenerativeModel("gemini-1.5-flash")
        prompt = f"""
You are answering questions about a {total_pages}-page document.
When possible, mention the page numbers where the answer is found.

Context:
{context}

Question: {question}

Answer based on the context:
"""
        response = model.generate_content(prompt)
        return response.text

    except Exception as e:
        # Top-level boundary for the interactive loop: report and carry on.
        print(f"Error processing question: {str(e)}")
        return None


def main():
    """Run the interactive question-answering loop over the saved database.

    Requires ``vectors.index`` and ``chunks.pkl`` in the current working
    directory; prints setup instructions and returns if either is missing or
    cannot be loaded. Inside the loop, ``info`` prints database statistics
    and ``bye``/``quit``/``exit``/``q`` (or EOF / Ctrl+C) ends the session.

    Returns:
        None.
    """
    # Check if database exists
    if not os.path.exists("vectors.index") or not os.path.exists("chunks.pkl"):
        print("Vector database not found!")
        print("Please run 'pdf_to_vectors.py' first to create the database.")
        print("Steps:")
        print("   1. Run: python pdf_to_vectors.py")
        print("   2. Then run: python ask_questions.py")
        return

    # Load database info
    try:
        index = faiss.read_index("vectors.index")
        # NOTE(security): pickle.load can execute arbitrary code from the
        # file; only load a chunks.pkl that this project created.
        with open("chunks.pkl", "rb") as f:
            data = pickle.load(f)

        chunks = data['chunks']
        total_pages = data['total_pages']
        # Report the index's real dimensionality instead of a fixed number.
        vector_dim = index.d

        print(f"✅ Database loaded: {len(chunks)} chunks from {total_pages} pages")
    except Exception as e:
        print(f"Error loading database: {str(e)}")
        return

    # Interactive Q&A loop
    print("\n" + "=" * 60)
    print("🤖 RAG System Ready! Ask me questions about your PDF")
    print("💡 Type 'bye', 'quit', 'exit', or 'q' to exit")
    print("🔢 Type 'info' to see database statistics")
    print("=" * 60)

    exit_words = {'bye', 'quit', 'exit', 'q'}

    while True:
        try:
            question = input("\nYour question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl+C ends the session without a traceback.
            print("\nGoodbye! Thanks for using the RAG system!")
            break

        command = question.lower()

        if command in exit_words:
            print("Goodbye! Thanks for using the RAG system!")
            break

        if command == 'info':
            print("📊 Database Info:")
            print(f"   • Total pages: {total_pages}")
            print(f"   • Total chunks: {len(chunks)}")
            print(f"   • Vector dimensions: {vector_dim}")
            # Skip the derived lines when the database is empty, rather than
            # dividing by zero or indexing an empty list.
            if total_pages:
                print(f"   • Average chunks per page: {len(chunks) / total_pages:.1f}")
            if chunks:
                print(f"   • Sample chunk: {chunks[0][:100]}...")
            continue

        if not question:
            print("Please enter a question!")
            continue

        print("Searching and generating answer...")
        answer = ask_question(question)

        if answer:
            print(f"Answer: {answer}")
        else:
            print("Sorry, I couldn't generate an answer. Please try again.")


# Start the interactive Q&A session only when run directly, not on import.
if __name__ == "__main__":
    main()


✅ Database loaded: 28 chunks from 15 pages

🤖 Gemini RAG System Ready! Ask me questions about your PDF
💡 Type 'bye', 'quit', 'exit', or 'q' to exit
🔢 Type 'info' to see database statistics

Your question: whats the document about
Searching and generating answer...
Found 3 relevant chunks:
   Chunk 1: Score 0.578 (≈Page 10)
   Chunk 2: Score 0.576 (≈Page 11)
   Chunk 3: Score 0.563 (≈Page 10)
Answer: Based on pages 10 and 11, the document appears to describe a dataset and a proposed technology for classifying cardiovascular disease (CVD).  The dataset includes patient information such as weight, gender, blood pressure, cholesterol levels, glucose levels, smoking status, alcohol consumption, and physical activity (page 10).  The proposed technology is a fuzzy-hybrid classification algorithm that uses this data to predict the presence or absence of CVD (pages 10-11).


Your question: who is the author
Searching and generating answer...
Found 3 relevant chunks:
   Chunk 1: Score 0.527 (≈Pa