# In [None]:
# Web URL RAG System with ChromaDB and GPT-4o-mini
# Requirements: pip install langchain langchain-community langchain-openai chromadb sentence-transformers beautifulsoup4 requests

import os
import warnings
from typing import List, Dict, Any, Optional
from pathlib import Path

# LangChain imports
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Additional imports
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import chromadb

# Suppress warnings
warnings.filterwarnings("ignore")

class WebRAGChromaSystem:
    def __init__(self, 
                 openai_api_key: str,
                 embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
                 persist_directory: str = "./chroma_db",
                 collection_name: str = "web_documents"):
        """
        Initialize Web RAG system with ChromaDB and GPT-4o-mini
        
        Args:
            openai_api_key: OpenAI API key
            embedding_model: HuggingFace embedding model
            persist_directory: ChromaDB persistence directory
            collection_name: ChromaDB collection name
        """
        # Set OpenAI API key
        os.environ["OPENAI_API_KEY"] = openai_api_key
        
        self.persist_directory = Path(persist_directory)
        self.collection_name = collection_name
        self.persist_directory.mkdir(parents=True, exist_ok=True)
        
        print("🔄 Initializing embedding model...")
        self.embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model,
            model_kwargs={'device': 'cpu'}
        )
        
        print("🔄 Initializing ChatGPT-4o-mini...")
        self.llm = ChatOpenAI(
            model="gpt-4o-mini",
            temperature=0.3,
            max_tokens=100
        )
        
        self.vectorstore = None
        self.retriever = None
        self.rag_chain = None
        
        print("✅ Web RAG system initialized successfully")
    
    def load_web_urls(self, urls: List[str]) -> List[Document]:
        """
        Load content from web URLs using LangChain WebBaseLoader
        
        Args:
            urls: List of URLs to scrape
            
        Returns:
            List of Document objects
        """
        print(f"🌐 Loading content from {len(urls)} URLs...")
        
        all_documents = []
        
        for url in urls:
            try:
                print(f"📡 Scraping: {url}")
                
                # Use WebBaseLoader for better content extraction
                loader = WebBaseLoader(
                    web_paths=[url],
                    bs_kwargs={
                        "parse_only": BeautifulSoup.SoupStrainer([
                            "article", "main", "div", "p", "h1", "h2", "h3", "h4", "h5", "h6"
                        ])
                    }
                )
                
                documents = loader.load()
                
                # Add URL to metadata
                for doc in documents:
                    doc.metadata["source_url"] = url
                    doc.metadata["source_type"] = "web"
                
                all_documents.extend(documents)
                print(f"✅ Loaded {len(documents)} documents from {url}")
                
            except Exception as e:
                print(f"❌ Error loading {url}: {e}")
                # Try alternative scraping method
                try:
                    alt_doc = self._alternative_scrape(url)
                    if alt_doc:
                        all_documents.append(alt_doc)
                except Exception as alt_e:
                    print(f"❌ Alternative scraping also failed: {alt_e}")
        
        print(f"✅ Total loaded: {len(all_documents)} documents")
        return all_documents
    
    def _alternative_scrape(self, url: str) -> Optional[Document]:
        """
        Alternative web scraping method using requests + BeautifulSoup
        
        Args:
            url: Page to download
            
        Returns:
            A Document holding the page's whitespace-normalized visible text,
            or None when the page contains no text
            
        Raises:
            requests.RequestException: On connection errors, timeouts
                (10 seconds) or a non-2xx HTTP status
        """
        # Browser-like User-Agent sent with the request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # <title>.string is None for an empty title or one with nested tags;
        # get_text() handles both and returns a plain str, so the metadata
        # never holds None.
        title_tag = soup.title
        title = (title_tag.get_text(strip=True) if title_tag else "") or "No Title"
        
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        
        # Collapse the text: strip each line, split on double spaces, and
        # join the non-empty fragments with single spaces
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        
        if not text:
            return None
        
        return Document(
            page_content=text,
            metadata={
                "source": url,
                "source_url": url,
                "source_type": "web",
                "title": title
            }
        )
    
    def split_documents(self, documents: List[Document], 
                       chunk_size: int = 1000, 
                       chunk_overlap: int = 200) -> List[Document]:
        """
        Break documents into overlapping chunks and tag each chunk
        
        Args:
            documents: Documents to be chunked
            chunk_size: Upper bound on chunk length, measured with len()
            chunk_overlap: Number of characters shared by neighbouring chunks
            
        Returns:
            The chunks, each with "chunk_id" (running index over all chunks)
            and "chunk_size" (character count) added to its metadata
        """
        print("✂️ Splitting documents into chunks...")
        
        # Ordered from the coarsest boundary (blank-line runs) down to single
        # characters, which is the last resort.
        boundary_order = ["\n\n\n", "\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""]
        
        splitter = RecursiveCharacterTextSplitter(
            separators=boundary_order,
            length_function=len,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        pieces = splitter.split_documents(documents)
        
        # Record position and length on every chunk
        for position, piece in enumerate(pieces):
            piece.metadata.update(
                chunk_id=position,
                chunk_size=len(piece.page_content),
            )
        
        print(f"✅ Created {len(pieces)} chunks")
        return pieces
    
    def create_chroma_vectorstore(self, documents: List[Document]) -> None:
        """
        Create ChromaDB vector store from documents
        
        Args:
            documents: List of document chunks
        """
        print("🔮 Creating ChromaDB vector store...")
        
        try:
            # Create or load ChromaDB vector store
            self.vectorstore = Chroma.from_documents(
                documents=documents,
                embedding=self.embeddings,
                collection_name=self.collection_name,
                persist_directory=str(self.persist_directory)
            )
            
            # Persist the database
            self.vectorstore.persist()
            
            print(f"✅ ChromaDB vector store created with {len(documents)} documents")
            print(f"💾 Persisted to: {self.persist_directory}")
            
        except Exception as e:
            print(f"❌ Error creating vector store: {e}")
            raise
    
    def load_existing_vectorstore(self) -> bool:
        """Load existing ChromaDB vector store"""
        try:
            if (self.persist_directory / "chroma.sqlite3").exists():
                print("📂 Loading existing ChromaDB vector store...")
                
                self.vectorstore = Chroma(
                    collection_name=self.collection_name,
                    embedding_function=self.embeddings,
                    persist_directory=str(self.persist_directory)
                )
                
                print("✅ Existing vector store loaded successfully")
                return True
            else:
                print("❌ No existing ChromaDB found")
                return False
                
        except Exception as e:
            print(f"❌ Error loading existing vector store: {e}")
            return False
    
    def setup_retriever(self, search_type: str = "similarity", k: int = 4) -> None:
        """
        Setup document retriever from vector store
        
        Args:
            search_type: Type of search ("similarity", "mmr")
            k: Number of documents to retrieve
        """
        if not self.vectorstore:
            raise ValueError("Vector store not created. Please create vector store first.")
        
        print(f"🔍 Setting up retriever (type: {search_type}, k: {k})...")
        
        # Create retriever
        self.retriever = self.vectorstore.as_retriever(
            search_type=search_type,
            search_kwargs={"k": k}
        )
        
        print("✅ Retriever setup complete")
    
    def create_rag_chain(self) -> None:
        """Create RAG chain with custom prompt template"""
        if not self.retriever:
            raise ValueError("Retriever not setup. Please setup retriever first.")
        
        print("🔗 Creating RAG chain with GPT-4o-mini...")
        
        # Custom prompt template for web content RAG
        prompt_template = """You are a helpful assistant that answers questions based on web content provided as context.
Use the following pieces of context from web pages to answer the question. If you don't know the answer based on the context, just say that you don't know, don't try to make up an answer.

Context from web pages:
{context}

Question: {question}

Instructions:
- Provide a comprehensive answer based on the context
- If relevant, mention which website(s) the information comes from
- Be specific and cite details from the context
- If the context doesn't contain enough information, clearly state what's missing

Answer:"""
        
        # Create prompt template
        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )
        
        # Create the RAG chain using LCEL (LangChain Expression Language)
        self.rag_chain = (
            {
                "context": self.retriever | self._format_docs,
                "question": RunnablePassthrough()
            }
            | PROMPT
            | self.llm
            | StrOutputParser()
        )
        
        print("✅ RAG chain created successfully")
    
    def _format_docs(self, docs: List[Document]) -> str:
        """Format retrieved documents for context"""
        formatted_docs = []
        
        for i, doc in enumerate(docs, 1):
            source_url = doc.metadata.get("source_url", "Unknown URL")
            title = doc.metadata.get("title", "")
            
            formatted_doc = f"Source {i} ({source_url}):\n"
            if title and title != "No Title":
                formatted_doc += f"Title: {title}\n"
            formatted_doc += f"Content: {doc.page_content}\n"
            
            formatted_docs.append(formatted_doc)
        
        return "\n" + "-"*50 + "\n".join(formatted_docs)
    
    def query(self, question: str) -> Dict[str, Any]:
        """
        Query the RAG system
        
        Args:
            question: User question
            
        Returns:
            Dictionary with answer and metadata
        """
        if not self.rag_chain:
            raise ValueError("RAG chain not created. Please create RAG chain first.")
        
        print(f"🤔 Processing question: {question}")
        
        try:
            # Get relevant documents for context
            retrieved_docs = self.retriever.invoke(question)
            
            # Generate answer using RAG chain
            answer = self.rag_chain.invoke(question)
            
            return {
                "question": question,
                "answer": answer,
                "source_documents": retrieved_docs,
                "num_sources": len(retrieved_docs)
            }
            
        except Exception as e:
            return {
                "question": question,
                "answer": f"Error processing question: {str(e)}",
                "source_documents": [],
                "num_sources": 0
            }
    
    def process_urls_pipeline(self, urls: List[str], 
                             chunk_size: int = 1000, 
                             chunk_overlap: int = 200,
                             force_recreate: bool = False) -> None:
        """
        Complete pipeline to process URLs and setup RAG system
        
        Args:
            urls: List of URLs to process
            chunk_size: Chunk size for text splitting
            chunk_overlap: Overlap between chunks
            force_recreate: Force recreation of vector store
        """
        print("🚀 Starting web URL processing pipeline...")
        
        # Try to load existing vector store
        if not force_recreate and self.load_existing_vectorstore():
            print("📋 Using existing vector store")
        else:
            # Load and process web content
            documents = self.load_web_urls(urls)
            if not documents:
                raise ValueError("No documents could be loaded from the provided URLs")
            
            chunks = self.split_documents(documents, chunk_size, chunk_overlap)
            self.create_chroma_vectorstore(chunks)
        
        # Setup retriever and RAG chain
        self.setup_retriever(search_type="similarity", k=4)
        self.create_rag_chain()
        
        print("🎉 Pipeline complete! RAG system ready for queries.")
    
    def interactive_chat(self) -> None:
        """
        Interactive chat interface
        
        Reads questions from standard input until the user quits.
        
        Commands (case-insensitive):
            quit / exit / q: leave the chat
            sources: list the documents retrieved for the last answer
            stats: print the collection's entry count and storage location
        
        Any other non-empty input is answered through ``query``. End of input
        (Ctrl-D) or Ctrl-C at the prompt ends the session cleanly.
        """
        print("\n" + "="*60)
        print("🤖 Web RAG Chat Interface with GPT-4o-mini")
        print("="*60)
        print("Commands:")
        print("  'quit', 'exit', 'q' - Exit the chat")
        print("  'sources' - Show sources for the last answer")
        print("  'stats' - Show vector store statistics")
        print("-"*60)
        
        last_result = None
        
        while True:
            try:
                question = input("\n💬 Your question: ").strip()
            except (EOFError, KeyboardInterrupt):
                # A closed stdin or Ctrl-C is a normal way to leave, not a crash
                print("\n👋 Goodbye!")
                break
            
            command = question.lower()
            
            if command in ('quit', 'exit', 'q'):
                print("👋 Goodbye!")
                break
            
            if command == 'sources':
                # Always consume the command: with no previous answer it must
                # not fall through and be sent to the model as a question.
                if not last_result:
                    print("ℹ️  No previous answer yet. Ask a question first.")
                    continue
                print(f"\n📚 Sources for last answer ({last_result['num_sources']} documents):")
                for i, doc in enumerate(last_result['source_documents'], 1):
                    source_url = doc.metadata.get("source_url", "Unknown")
                    print(f"\n{i}. {source_url}")
                    print(f"   Content preview: {doc.page_content[:150]}...")
                continue
            
            if command == 'stats':
                if not self.vectorstore:
                    print("📊 No vector store loaded")
                    continue
                try:
                    # _collection is a private attribute of the store, so
                    # treat any failure here as "stats unavailable"
                    count = self.vectorstore._collection.count()
                except Exception:
                    print("📊 Unable to retrieve stats")
                    continue
                print("\n📊 Vector Store Stats:")
                print(f"   Total documents: {count}")
                print(f"   Collection name: {self.collection_name}")
                print(f"   Persist directory: {self.persist_directory}")
                continue
            
            if not question:
                continue
            
            # Process the question
            result = self.query(question)
            last_result = result
            
            print("\n🤖 GPT-4o-mini Answer:")
            print(f"{result['answer']}")
            
            if result['num_sources'] > 0:
                print(f"\n📖 Based on {result['num_sources']} source(s). Type 'sources' to see details.")

def main():
    """
    Main function to demonstrate the Web RAG system
    
    Prompts for an OpenAI API key (input is not echoed; a blank entry falls
    back to the OPENAI_API_KEY environment variable), collects URLs until a
    blank line is entered, runs the processing pipeline and then starts the
    interactive chat. Problems are reported on stdout and end the function
    instead of raising.
    """
    # Local import: only the interactive entry points need it
    from getpass import getpass
    
    print("🔧 Web RAG System with ChromaDB and GPT-4o-mini")
    print("="*50)
    
    # Read the key without echoing it to the terminal or notebook output
    api_key = getpass("🔑 Enter your OpenAI API key: ").strip()
    if not api_key:
        api_key = os.environ.get("OPENAI_API_KEY", "").strip()
        if api_key:
            print("ℹ️  Using OPENAI_API_KEY from the environment")
    if not api_key:
        print("❌ OpenAI API key is required!")
        return
    
    # Initialize the system
    try:
        rag_system = WebRAGChromaSystem(
            openai_api_key=api_key,
            persist_directory="./web_chroma_db"
        )
    except Exception as e:
        print(f"❌ Error initializing system: {e}")
        return
    
    # Get URLs from user; a blank line ends the list
    print("\n📡 Enter URLs to scrape (one per line, press Enter twice to finish):")
    urls = []
    while True:
        url = input("URL: ").strip()
        if not url:
            break
        if url.startswith(('http://', 'https://')):
            urls.append(url)
        else:
            print("⚠️  Please enter a valid URL starting with http:// or https://")
    
    if not urls:
        print("❌ No valid URLs provided!")
        return
    
    try:
        # Process URLs and setup RAG system. With force_recreate=False an
        # existing store under ./web_chroma_db is reused and the URLs entered
        # above are not scraped again.
        rag_system.process_urls_pipeline(
            urls=urls,
            chunk_size=1000,
            chunk_overlap=200,
            force_recreate=False
        )
        
        # Start interactive chat
        rag_system.interactive_chat()
        
    except Exception as e:
        print(f"❌ Error: {e}")

# Quick test function
def quick_test():
    """
    Quick test with sample URLs
    
    Prompts for an OpenAI API key (input is not echoed), builds the system
    with its default settings from two Wikipedia articles, and prints the
    answer to one fixed question. Does nothing beyond printing a message when
    no key is entered.
    """
    # Local import: only the interactive entry points need it
    from getpass import getpass
    
    # Sample URLs for testing
    test_urls = [
        "https://en.wikipedia.org/wiki/Artificial_intelligence",
        "https://en.wikipedia.org/wiki/Machine_learning"
    ]
    
    # Read the key without echoing it to the terminal or notebook output
    api_key = getpass("🔑 Enter OpenAI API key for quick test: ").strip()
    if not api_key:
        # Say why nothing happens instead of returning silently
        print("❌ OpenAI API key is required!")
        return
    
    rag_system = WebRAGChromaSystem(openai_api_key=api_key)
    rag_system.process_urls_pipeline(test_urls)
    
    # Test query
    result = rag_system.query("What is artificial intelligence?")
    print(f"\n🤖 Answer: {result['answer']}")

if __name__ == "__main__":
    # Only an explicit "2" selects the quick test; any other reply
    # (including a blank one) runs the full interactive flow.
    mode_prompt = "Choose mode:\n1. Full interactive mode\n2. Quick test\nChoice (1/2): "
    selected = input(mode_prompt).strip()
    entry_point = quick_test if selected == "2" else main
    entry_point()