In [72]:
!pip install -q transformers torch sentence-transformers faiss-cpu pypdf2 langchain openai python-dotenv chromadb python-docx tiktoken google-generativeai

In [73]:
import os
import re
import json
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, clear_output
import ipywidgets as widgets
# Core libraries
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
from sentence_transformers import SentenceTransformer
import faiss

# OpenAI integration
import openai
from openai import OpenAI
import tiktoken
import google.generativeai as genai
# Document processing
import PyPDF2
from docx import Document
from io import BytesIO

# Google Drive integration
from google.colab import drive, files
import zipfile

# Vector database
import chromadb
from chromadb.config import Settings
from typing import Any, Dict

# Display and visualization
from IPython.display import display, Markdown, HTML
import matplotlib.pyplot as plt

print("✅ All packages installed successfully!")

✅ All packages installed successfully!


In [74]:
def mount_drive():
    """Mount Google Drive"""
    try:
        drive.mount('/content/drive')
        print("✅ Google Drive mounted successfully!")
        return True
    except Exception as e:
        print(f"❌ Error mounting Google Drive: {e}")
        return False

def upload_files():
    """Upload files directly to Colab"""
    print("📁 Please upload your documents (PDF, DOCX, TXT files)")
    uploaded = files.upload()

    # Create documents directory
    os.makedirs("documents", exist_ok=True)

    # Move uploaded files to documents directory
    for filename in uploaded.keys():
        os.rename(filename, f"documents/{filename}")
        print(f"✅ Moved {filename} to documents/")

    return list(uploaded.keys())

def list_drive_files(drive_path="/content/drive/MyDrive"):
    """List files in Google Drive"""
    try:
        files_found = []
        for root, dirs, files in os.walk(drive_path):
            for file in files:
                if file.lower().endswith(('.pdf', '.docx', '.txt', '.doc')):
                    full_path = os.path.join(root, file)
                    rel_path = os.path.relpath(full_path, drive_path)
                    files_found.append((file, full_path, rel_path))
        return files_found
    except Exception as e:
        print(f"❌ Error listing drive files: {e}")
        return []

def copy_from_drive(file_paths):
    """Copy selected files from Google Drive to local documents folder"""
    os.makedirs("documents", exist_ok=True)
    copied_files = []

    for file_path in file_paths:
        try:
            filename = os.path.basename(file_path)
            destination = f"documents/{filename}"

            # Copy file
            with open(file_path, 'rb') as src, open(destination, 'wb') as dst:
                dst.write(src.read())

            copied_files.append(filename)
            print(f"✅ Copied: {filename}")

        except Exception as e:
            print(f"❌ Error copying {file_path}: {e}")

    return copied_files

# Document Upload Interface
print("📂 DOCUMENT UPLOAD OPTIONS:")
print("1. Mount Google Drive and select files")
print("2. Upload files directly")
print("3. Skip (if documents already in documents/ folder)")

choice = input("\nEnter your choice (1/2/3): ").strip()

uploaded_files = []

if choice == "1":
    if mount_drive():
        print("\n📁 Scanning Google Drive for documents...")
        drive_files = list_drive_files()

        if drive_files:
            print(f"\n📋 Found {len(drive_files)} documents in Google Drive:")
            for i, (filename, full_path, rel_path) in enumerate(drive_files, 1):
                print(f"{i}. {filename} ({rel_path})")

            # Let user select files
            selection = input(f"\nEnter file numbers to use (e.g., 1,3,5) or 'all' for all files: ").strip()

            if selection.lower() == 'all':
                selected_paths = [full_path for _, full_path, _ in drive_files]
            else:
                try:
                    indices = [int(x.strip()) - 1 for x in selection.split(',')]
                    selected_paths = [drive_files[i][1] for i in indices if 0 <= i < len(drive_files)]
                except:
                    print("❌ Invalid selection. Using all files.")
                    selected_paths = [full_path for _, full_path, _ in drive_files]

            uploaded_files = copy_from_drive(selected_paths)
        else:
            print("❌ No documents found in Google Drive")

elif choice == "2":
    uploaded_files = upload_files()

elif choice == "3":
    # Check if documents folder exists and has files
    if os.path.exists("documents"):
        uploaded_files = [f for f in os.listdir("documents")
                         if f.lower().endswith(('.pdf', '.docx', '.txt', '.doc'))]
        if uploaded_files:
            print(f"✅ Found {len(uploaded_files)} documents in documents/ folder")
        else:
            print("❌ No documents found in documents/ folder")
    else:
        print("❌ documents/ folder not found")

else:
    print("❌ Invalid choice")

if not uploaded_files:
    print("⚠️ No documents available. Please upload documents first.")
else:
    print(f"\n🎉 Ready to process {len(uploaded_files)} documents!")
    for file in uploaded_files:
        print(f"  📄 {file}")

📂 DOCUMENT UPLOAD OPTIONS:
1. Mount Google Drive and select files
2. Upload files directly
3. Skip (if documents already in documents/ folder)

Enter your choice (1/2/3): 3
✅ Found 3 documents in documents/ folder

🎉 Ready to process 3 documents!
  📄 2005.11401v4.pdf
  📄 2005.14165v4.pdf
  📄 1706.03762v7.pdf


In [75]:
@dataclass
class DocumentChunk:
    """Represents a chunk of text from a document"""
    text: str
    source: str
    page_number: int
    chunk_id: str
    metadata: Dict

class DocumentProcessor:
    """Handles document loading, cleaning, and chunking"""

    def __init__(self, chunk_size: int = 1000, overlap: int = 200):
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.supported_formats = ['.pdf', '.docx', '.txt', '.doc']

    def extract_text_from_pdf(self, pdf_path: str) -> List[Tuple[str, int]]:
        """Extract text from PDF file"""
        text_pages = []

        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)

                for page_num, page in enumerate(pdf_reader.pages):
                    text = page.extract_text()
                    if text.strip():
                        text_pages.append((text, page_num + 1))

        except Exception as e:
            print(f"❌ Error reading PDF {pdf_path}: {str(e)}")

        return text_pages

    def extract_text_from_docx(self, docx_path: str) -> List[Tuple[str, int]]:
        """Extract text from DOCX file"""
        try:
            doc = Document(docx_path)
            text_pages = []
            current_text = ""

            for paragraph in doc.paragraphs:
                current_text += paragraph.text + "\n"

            if current_text.strip():
                text_pages.append((current_text, 1))

            return text_pages

        except Exception as e:
            print(f"❌ Error reading DOCX {docx_path}: {str(e)}")
            return []

    def extract_text_from_txt(self, txt_path: str) -> List[Tuple[str, int]]:
        """Extract text from TXT file"""
        try:
            with open(txt_path, 'r', encoding='utf-8') as file:
                text = file.read()
                return [(text, 1)] if text.strip() else []
        except UnicodeDecodeError:
            try:
                with open(txt_path, 'r', encoding='latin-1') as file:
                    text = file.read()
                    return [(text, 1)] if text.strip() else []
            except Exception as e:
                print(f"❌ Error reading TXT {txt_path}: {str(e)}")
                return []
        except Exception as e:
            print(f"❌ Error reading TXT {txt_path}: {str(e)}")
            return []

    def extract_text_from_file(self, file_path: str) -> List[Tuple[str, int]]:
        """Extract text from any supported file format"""
        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext == '.pdf':
            return self.extract_text_from_pdf(file_path)
        elif file_ext in ['.docx', '.doc']:
            return self.extract_text_from_docx(file_path)
        elif file_ext == '.txt':
            return self.extract_text_from_txt(file_path)
        else:
            print(f"❌ Unsupported file format: {file_ext}")
            return []

    def clean_text(self, text: str) -> str:
        """Clean and normalize text"""
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s.,!?;:()\-\'\""]', '', text)
        # Fix common PDF extraction issues
        text = re.sub(r'(\w)-\s+(\w)', r'\1\2', text)  # Fix hyphenated words

        return text.strip()

    def create_chunks(self, text: str, source: str, page_number: int) -> List[DocumentChunk]:
        """Create overlapping chunks from text"""
        chunks = []
        words = text.split()

        if len(words) < 10:  # Skip very short texts
            return chunks

        for i in range(0, len(words), self.chunk_size - self.overlap):
            chunk_words = words[i:i + self.chunk_size]
            chunk_text = ' '.join(chunk_words)

            if len(chunk_text.strip()) > 50:  # Only keep substantial chunks
                chunk_id = f"{source}_page_{page_number}_chunk_{len(chunks)}"

                chunk = DocumentChunk(
                    text=chunk_text,
                    source=source,
                    page_number=page_number,
                    chunk_id=chunk_id,
                    metadata={
                        'word_count': len(chunk_words),
                        'char_count': len(chunk_text),
                        'file_type': os.path.splitext(source)[1]
                    }
                )
                chunks.append(chunk)

        return chunks

    def process_documents(self, documents_folder: str = "documents") -> List[DocumentChunk]:
        """Process all documents in folder and return chunks"""
        all_chunks = []

        if not os.path.exists(documents_folder):
            print(f"❌ Documents folder '{documents_folder}' not found")
            return all_chunks

        files = [f for f in os.listdir(documents_folder)
                if os.path.splitext(f)[1].lower() in self.supported_formats]

        if not files:
            print(f"❌ No supported documents found in '{documents_folder}'")
            return all_chunks

        print(f"🔄 Processing {len(files)} documents...")

        for filename in files:
            print(f"📄 Processing: {filename}")

            file_path = os.path.join(documents_folder, filename)

            # Extract text from file
            text_pages = self.extract_text_from_file(file_path)

            if not text_pages:
                print(f"⚠️ No text extracted from {filename}")
                continue

            # Process each page
            doc_chunks = 0
            for text, page_num in text_pages:
                cleaned_text = self.clean_text(text)
                if cleaned_text:
                    chunks = self.create_chunks(cleaned_text, filename, page_num)
                    all_chunks.extend(chunks)
                    doc_chunks += len(chunks)

            print(f"✅ Created {doc_chunks} chunks from {filename}")

        return all_chunks

# Process documents
print("\n🔄 Processing your documents...")
processor = DocumentProcessor(chunk_size=800, overlap=150)
document_chunks = processor.process_documents("documents")

if document_chunks:
    print(f"\n📊 Processing Summary:")
    print(f"Total chunks created: {len(document_chunks)}")

    # Group by source
    source_counts = {}
    for chunk in document_chunks:
        source_counts[chunk.source] = source_counts.get(chunk.source, 0) + 1

    for source, count in source_counts.items():
        print(f"  📄 {source}: {count} chunks")
else:
    print("❌ No chunks created. Please check your documents.")


🔄 Processing your documents...
🔄 Processing 3 documents...
📄 Processing: 2005.11401v4.pdf
✅ Created 22 chunks from 2005.11401v4.pdf
📄 Processing: 2005.14165v4.pdf
✅ Created 93 chunks from 2005.14165v4.pdf
📄 Processing: 1706.03762v7.pdf
✅ Created 15 chunks from 1706.03762v7.pdf

📊 Processing Summary:
Total chunks created: 130
  📄 2005.11401v4.pdf: 22 chunks
  📄 2005.14165v4.pdf: 93 chunks
  📄 1706.03762v7.pdf: 15 chunks


In [76]:
class EmbeddingModel:
    """Handles text embeddings using sentence transformers"""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        print(f"🔄 Loading embedding model: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.dimension = self.model.get_sentence_embedding_dimension()
        print(f"✅ Embedding model loaded. Dimension: {self.dimension}")

    def encode(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """Encode texts to embeddings"""
        return self.model.encode(texts, batch_size=batch_size, show_progress_bar=True)

    def encode_single(self, text: str) -> np.ndarray:
        """Encode single text to embedding"""
        return self.model.encode([text])[0]

class VectorStore:
    """FAISS-based vector store for efficient similarity search"""

    def __init__(self, embedding_model: EmbeddingModel):
        self.embedding_model = embedding_model
        self.index = None
        self.chunks = []
        self.embeddings = None

    def build_index(self, chunks: List[DocumentChunk]) -> None:
        """Build FAISS index from document chunks"""
        if not chunks:
            print("❌ No chunks provided for indexing")
            return

        print("🔄 Building vector index...")

        self.chunks = chunks
        texts = [chunk.text for chunk in chunks]

        # Generate embeddings
        print("🔄 Generating embeddings...")
        self.embeddings = self.embedding_model.encode(texts)

        # Build FAISS index
        self.index = faiss.IndexFlatIP(self.embedding_model.dimension)  # Inner product for cosine similarity

        # Normalize embeddings for cosine similarity
        faiss.normalize_L2(self.embeddings)
        self.index.add(self.embeddings)

        print(f"✅ Vector index built with {len(chunks)} chunks")

    def search(self, query: str, k: int = 5) -> List[Tuple[DocumentChunk, float]]:
        """Search for most relevant chunks"""
        if self.index is None:
            raise ValueError("Index not built. Call build_index first.")

        # Encode query
        query_embedding = self.embedding_model.encode_single(query).reshape(1, -1)
        faiss.normalize_L2(query_embedding)

        # Search
        scores, indices = self.index.search(query_embedding, k)

        results = []
        for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
            if idx < len(self.chunks):
                results.append((self.chunks[idx], float(score)))

        return results

In [77]:
def setup_gemini():
    """Setup Gemini API key"""
    print("🔑 Setting up Gemini API...")

    # Method 1: Try to get from environment
    api_key = os.getenv('GOOGLE_API_KEY')

    if not api_key:
        print("\n📝 Gemini API Key Setup:")
        print("You can get your API key from: https://makersuite.google.com/app/apikey")
        print("Or from Google Cloud Console: https://console.cloud.google.com/")
        api_key = input("Enter your Google Gemini API key: ").strip()

        if api_key:
            os.environ['GOOGLE_API_KEY'] = api_key
            print("✅ API key set successfully!")
        else:
            print("⚠️ No API key provided. Will use local models as fallback.")
            return None

    try:
        # Configure Gemini
        genai.configure(api_key=api_key)

        # Test the API by listing models
        models = list(genai.list_models())
        print(f"✅ Gemini API connection successful! Found {len(models)} available models.")
        return True
    except Exception as e:
        print(f"❌ Gemini API setup failed: {e}")
        print("Will use local models as fallback.")
        return None

# Setup Gemini (optional)
gemini_available = setup_gemini()

# Initialize embedding model and vector store (only if we have chunks)
if document_chunks:
    embedding_model = EmbeddingModel("all-MiniLM-L6-v2")
    vector_store = VectorStore(embedding_model)
    vector_store.build_index(document_chunks)
    print("✅ Vector store ready!")
else:
    print("⚠️ Skipping vector store creation - no document chunks available")

🔑 Setting up Gemini API...
✅ Gemini API connection successful! Found 53 available models.
🔄 Loading embedding model: all-MiniLM-L6-v2
✅ Embedding model loaded. Dimension: 384
🔄 Building vector index...
🔄 Generating embeddings...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

✅ Vector index built with 130 chunks
✅ Vector store ready!


In [78]:
class GeminiAnswerGenerator:
    """Answer generator using Google's Gemini models"""

    def __init__(self, model_name: str = "gemini-1.5-flash"):
        self.model_name = model_name
        try:
            self.model = genai.GenerativeModel(model_name)
            print(f"✅ Gemini Answer Generator initialized with {model_name}")
        except Exception as e:
            print(f"❌ Failed to initialize Gemini model: {e}")
            raise

    def count_tokens(self, text: str) -> int:
        """Estimate token count (approximate for Gemini)"""
        # Rough estimation: 1 token ≈ 4 characters for most languages
        return len(text) // 4

    def truncate_context(self, context: str, max_chars: int = 30000) -> str:
        """Truncate context to fit within character limits"""
        if len(context) <= max_chars:
            return context
        return context[:max_chars] + "..."

    def generate_answer(self, query: str, context_chunks: List[DocumentChunk]) -> str:
        """Generate answer using Gemini API"""

        if not context_chunks:
            return "No relevant context found to answer this question."

        # Prepare context from chunks
        context_parts = []
        for i, chunk in enumerate(context_chunks[:5], 1):  # Use top 5 chunks
            source_name = os.path.splitext(chunk.source)[0]
            context_parts.append(f"Document {i} ({source_name}, Page {chunk.page_number}):\n{chunk.text}\n")

        context = "\n".join(context_parts)

        # Truncate context if too long
        context = self.truncate_context(context)

        # Create prompt for Gemini
        prompt = f"""You are a helpful AI assistant that answers questions based on provided document contexts.

Instructions:
1. Answer the question using only the information provided in the context
2. Be accurate and cite specific documents when possible
3. If the context doesn't contain enough information, say so clearly
4. Provide comprehensive answers when possible
5. Use a professional and informative tone

Context from documents:
{context}

Question: {query}

Please provide a detailed answer based on the context above."""

        try:
            # Generate response using Gemini
            response = self.model.generate_content(
                prompt,
                generation_config=genai.types.GenerationConfig(
                    temperature=0.3,  # Lower temperature for more focused answers
                    max_output_tokens=500,
                    top_p=0.9,
                    top_k=40
                )
            )

            if response.text:
                return response.text.strip()
            else:
                return self._fallback_answer(query, context_chunks)

        except Exception as e:
            print(f"❌ Gemini API error: {e}")
            return self._fallback_answer(query, context_chunks)

    def _fallback_answer(self, query: str, context_chunks: List[DocumentChunk]) -> str:
        """Fallback answer generation"""
        relevant_sentences = []
        query_words = set(query.lower().split())

        for chunk in context_chunks[:2]:
            sentences = re.split(r'[.!?]+', chunk.text)
            for sentence in sentences:
                if len(sentence.strip()) > 20:
                    sentence_words = set(sentence.lower().split())
                    overlap = len(query_words.intersection(sentence_words))

                    if overlap > 0:
                        relevant_sentences.append((sentence.strip(), overlap))

        relevant_sentences.sort(key=lambda x: x[1], reverse=True)

        if relevant_sentences:
            answer_parts = [sent[0] for sent in relevant_sentences[:3]]
            return ". ".join(answer_parts) + "."
        else:
            return "Based on the available context, I cannot provide a specific answer to this question."

class LocalAnswerGenerator:
    """Fallback local answer generator using transformers"""

    def __init__(self, model_name: str = "distilgpt2"):
        print(f"🔄 Loading local answer generation model: {model_name}")

        self.generator = pipeline(
            "text-generation",
            model=model_name,
            tokenizer=model_name,
            device=0 if torch.cuda.is_available() else -1
        )

        print("✅ Local answer generation model loaded")

    def generate_answer(self, query: str, context_chunks: List[DocumentChunk]) -> str:
        """Generate answer using local model"""

        if not context_chunks:
            return "No relevant context found to answer this question."

        # Prepare context
        context_texts = []
        for chunk in context_chunks[:3]:
            source_name = os.path.splitext(chunk.source)[0]
            context_texts.append(f"[{source_name}] {chunk.text[:300]}...")

        context = "\n\n".join(context_texts)

        # Create prompt
        prompt = f"""Based on the following document excerpts, answer the question accurately:

Context:
{context}

Question: {query}

Answer:"""

        try:
            response = self.generator(
                prompt,
                max_length=len(prompt.split()) + 100,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.generator.tokenizer.eos_token_id
            )

            generated_text = response[0]['generated_text']
            answer = generated_text.split("Answer:")[-1].strip()

            # Clean up the answer
            answer = re.sub(r'^[^\w]*', '', answer)

            return answer if answer else self._fallback_answer(query, context_chunks)

        except Exception as e:
            print(f"Error generating answer: {e}")
            return self._fallback_answer(query, context_chunks)

    def _fallback_answer(self, query: str, context_chunks: List[DocumentChunk]) -> str:
        """Simple extractive answer"""
        relevant_sentences = []
        query_words = set(query.lower().split())

        for chunk in context_chunks[:2]:
            sentences = re.split(r'[.!?]+', chunk.text)
            for sentence in sentences:
                if len(sentence.strip()) > 20:
                    sentence_words = set(sentence.lower().split())
                    overlap = len(query_words.intersection(sentence_words))

                    if overlap > 0:
                        relevant_sentences.append((sentence.strip(), overlap))

        relevant_sentences.sort(key=lambda x: x[1], reverse=True)

        if relevant_sentences:
            answer_parts = [sent[0] for sent in relevant_sentences[:3]]
            return ". ".join(answer_parts) + "."
        else:
            return "Based on the available context, I cannot provide a specific answer to this question."

# Initialize answer generator
if document_chunks:
    if gemini_available:
        print("🔄 Initializing Gemini answer generator...")
        try:
            answer_generator = GeminiAnswerGenerator("gemini-1.5-flash")
            print("✅ Using Google Gemini 1.5 Flash for answer generation!")
        except Exception as e:
            print(f"❌ Failed to initialize Gemini: {e}")
            print("🔄 Falling back to local model...")
            answer_generator = LocalAnswerGenerator()
    else:
        print("🔄 Initializing local answer generator...")
        answer_generator = LocalAnswerGenerator()
        print("✅ Using local model for answer generation")
else:
    answer_generator = None

🔄 Initializing Gemini answer generator...
✅ Gemini Answer Generator initialized with gemini-1.5-flash
✅ Using Google Gemini 1.5 Flash for answer generation!


In [79]:
class RAGSystem:
    """Complete RAG system for question answering"""

    def __init__(self, vector_store: Any, answer_generator: Any):
        self.vector_store = vector_store
        self.answer_generator = answer_generator

    def answer_question(self, query: str, k: int = 5) -> Dict:
        """Answer a question using RAG approach"""

        # Step 1: Retrieve relevant chunks
        retrieved_chunks = self.vector_store.search(query, k=k)

        # Step 2: Generate answer
        chunks_only = [chunk for chunk, score in retrieved_chunks]
        answer = self.answer_generator.generate_answer(query, chunks_only)

        # Step 3: Prepare response with source attribution
        sources = []
        for chunk, score in retrieved_chunks:
            sources.append({
                'document': chunk.source,
                'page': chunk.page_number,
                'relevance_score': score,
                'file_type': chunk.metadata.get('file_type', 'unknown'),
                'excerpt': chunk.text[:200] + "..." if len(chunk.text) > 200 else chunk.text
            })

        return {
            'query': query,
            'answer': answer,
            'sources': sources,
            'num_sources': len(sources)
        }

    def display_answer(self, result: Dict) -> None:
        """Display answer in a formatted way"""

        display(Markdown(f"## 🤔 Question: {result['query']}"))
        display(Markdown(f"## 💡 Answer:\n{result['answer']}"))

        display(Markdown("## 📚 Sources:"))
        for i, source in enumerate(result['sources'], 1):
            display(Markdown(f"""
**{i}. {source['document']}**
- Page: {source['page']}
- File Type: {source['file_type']}
- Relevance Score: {source['relevance_score']:.3f}
- Excerpt: *{source['excerpt']}*
            """))

# Initialize complete RAG system (only if we have everything)
if document_chunks and 'vector_store' in locals() and 'answer_generator' in locals():
    rag_system = RAGSystem(vector_store, answer_generator)
    print("🎉 RAG System initialized successfully!")

🎉 RAG System initialized successfully!


In [80]:
def interactive_query():
    """Start a question-answer loop with the RAG system"""
    print("\n🧠 Ask me anything about your documents!")
    print("Type 'quit' to exit, or 'docs' to see the list of documents.")
    print("=" * 60)

    # Display unique document sources
    unique_sources = set(chunk.source for chunk in document_chunks)

    while True:
        query = input("\n❓ Your question: ").strip()

        if query.lower() == 'quit':
            print("👋 Exiting. Happy learning!")
            break
        elif query.lower() == 'docs':
            print(f"\n📄 Available documents ({len(unique_sources)}):")
            for doc in sorted(unique_sources):
                chunks_count = len([c for c in document_chunks if c.source == doc])
                print(f"  • {doc} ({chunks_count} chunks)")
            continue
        elif not query:
            print("⚠️ Please enter a valid question.")
            continue

        try:
            print("\n🔍 Getting your answer...")
            result = rag_system.answer_question(query)

            print(f"\n✅ **Answer:**\n{result['answer']}")

            print(f"\n📚 **Sources ({len(result['sources'])}):**")
            for i, source in enumerate(result['sources'][:3], 1):
                print(f"{i}. {source['document']} (Page {source['page']}, Score: {source['relevance_score']:.3f})")

        except Exception as e:
            print(f"❌ Error: {e}")


In [81]:
print(f"""
🎯 RAG SYSTEM IS READY!
═════════════════════════════════════════════════════════════════════

📊 SYSTEM STATUS:
• Documents loaded: {len(set(chunk.source for chunk in document_chunks))}
• Total chunks: {len(document_chunks)}
• Vector index: ✅ Ready
• LLM backend: {"🌟 Google Gemini 1.5 Flash" if isinstance(answer_generator, GeminiAnswerGenerator) else "🏠 Local DistilGPT-2"}
• Interface: Interactive Q&A enabled

🧠 HOW TO USE:
• Ask your questions using `interactive_query()`
• Or call `rag_system.answer_question("Your question here")` in code
• Use `rag_system.display_answer(result)` for full formatted output

🚀 START BY ASKING:
Try: `interactive_query()` and ask questions like:
→ What are the key findings?
→ What methods were used in the study?
→ Summarize document XYZ.

🧪 TESTING SYSTEM WITH A SAMPLE QUESTION:
""")

sample_question = "What are the main topics covered in these documents?"
print(f"📝 Sample Question: {sample_question}\n" + "=" * 50)

try:
    result = rag_system.answer_question(sample_question)
    rag_system.display_answer(result)
except Exception as e:
    print(f"❌ Error in sample query: {e}")

print("\n🎉 All set! Start exploring your documents by running `interactive_query()`")



🎯 RAG SYSTEM IS READY!
═════════════════════════════════════════════════════════════════════

📊 SYSTEM STATUS:
• Documents loaded: 3
• Total chunks: 130
• Vector index: ✅ Ready
• LLM backend: 🌟 Google Gemini 1.5 Flash
• Interface: Interactive Q&A enabled

🧠 HOW TO USE:
• Ask your questions using `interactive_query()`
• Or call `rag_system.answer_question("Your question here")` in code
• Use `rag_system.display_answer(result)` for full formatted output

🚀 START BY ASKING:
Try: `interactive_query()` and ask questions like:
→ What are the key findings?
→ What methods were used in the study?
→ Summarize document XYZ.

🧪 TESTING SYSTEM WITH A SAMPLE QUESTION:

📝 Sample Question: What are the main topics covered in these documents?


## 🤔 Question: What are the main topics covered in these documents?

## 💡 Answer:
The provided documents cover several topics related to natural language processing (NLP) and large language models.  Specifically:

* **Document 1 (2005.11401v4, Page 7):** This document focuses on the performance evaluation of a Retrieval Augmented Generation (RAG) model.  It discusses the model's accuracy in various tasks, including Jeopardy question generation, and compares its performance to other models like BART.  The document also analyzes the model's retrieval mechanism and its ability to generate diverse and factually accurate responses.  It mentions the works of Ernest Hemingway, specifically *A Farewell to Arms* and *The Sun Also Rises*, as examples used in the model's evaluation.

* **Document 2 (2005.14165v4, Page 66):** This document appears to present results from various NLP tasks, including common sense reasoning, question answering, reading comprehension, and natural language inference (ANLI).  However, the provided text only shows figure references (Figures H.6-H.9) and lacks specific details about the results.

* **Document 3 (2005.14165v4, Page 50):** This document describes a dataset used for evaluating a model's ability to understand conversational nuances across different cultures.  It provides examples of questions and answers related to appropriate conversation topics in various cultures (Latin America, France, the United States, Japan, China, Korea, and the Middle East).

* **Document 4 (2005.14165v4, Page 72):** This document lists numerous citations of publications related to natural language processing.  These citations cover a wide range of topics within NLP, including word embeddings, question answering, commonsense reasoning, and bias in language models.

* **Document 5 (2005.14165v4, Page 44):** This document discusses the methodology used to assess data contamination in a large language model training dataset.  It describes techniques for identifying and filtering overlapping content between training and test sets to prevent overfitting.  The document also analyzes the impact of data contamination on model performance and discusses challenges in accurately identifying contaminated data.

## 📚 Sources:


**1. 2005.11401v4.pdf**
- Page: 7
- File Type: .pdf
- Relevance Score: 0.320
- Excerpt: *Document 1 : his works are considered classics of American literature ... His wartime experiences formed the basis for his novel A Farewell to Arms (1929) ... Document 2 : ... artists of the 1920s Los...*
            


**2. 2005.14165v4.pdf**
- Page: 66
- File Type: .pdf
- Relevance Score: 0.313
- Excerpt: *Figure H.6: All results for all Common Sense Reasoning tasks. Figure H.7: All results for all QA tasks. Figure H.8: All results for all Reading Comprehension tasks. Figure H.9: All results for all ANL...*
            


**3. 2005.14165v4.pdf**
- Page: 50
- File Type: .pdf
- Relevance Score: 0.310
- Excerpt: *G Details of Task Phrasing and Speciﬁcations The following ﬁgures illustrate the formatting and phrasing of all the tasks included in the paper. All data comes from the ground truth datasets in this s...*
            


**4. 2005.14165v4.pdf**
- Page: 72
- File Type: .pdf
- Relevance Score: 0.297
- Excerpt: *MBXS17 Bryan McCann, James Bradbury, Caiming Xiong, and Richard Socher. Learned in translation: Contextualized word vectors. In Advances in Neural Information Processing Systems , pages 62946305, 2017...*
            


**5. 2005.14165v4.pdf**
- Page: 44
- File Type: .pdf
- Relevance Score: 0.292
- Excerpt: *removed entirely. Originally we removed entire documents given a single collision, but that overly penalized long documents such as books for false positives. An example of a false positive might be a...*
            


🎉 All set! Start exploring your documents by running `interactive_query()`


In [None]:
interactive_query()


🧠 Ask me anything about your documents!
Type 'quit' to exit, or 'docs' to see the list of documents.

❓ Your question: What are the two sub-layers in each encoder layer of the Transformer model?

🔍 Getting your answer...

✅ **Answer:**
Based on Document 1 (1706.03762v7, Page 3), each encoder layer in the Transformer model contains two sub-layers.  The first is a multi-head self-attention mechanism, and the second is a position-wise fully connected feed-forward network.

📚 **Sources (5):**
1. 1706.03762v7.pdf (Page 3, Score: 0.686)
2. 1706.03762v7.pdf (Page 5, Score: 0.458)
3. 1706.03762v7.pdf (Page 8, Score: 0.430)
