In [None]:
%pip install requests
%pip install bs4
%pip install tqdm
%pip install chromadb

In [None]:
import os
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from tqdm import tqdm
import chromadb
from chromadb.config import Settings
import hashlib
from datetime import datetime
import argparse
import sys
from typing import List, Dict, Any
import logging

# Configure logging: every record is written both to ``scraper.log`` and to
# stdout, using a "timestamp - level - message" layout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # The log messages in this file contain emoji. Pin the file encoding
        # to UTF-8 so writing them does not fail on platforms whose default
        # text encoding cannot represent those characters.
        logging.FileHandler('scraper.log', encoding='utf-8'),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

# Module-level set for recording URLs that have been queued for crawling.
# NOTE: being a global, its contents persist for the lifetime of the process.
visited_urls = set()

class WebScraperWithVector:
    """Same-domain web crawler that stores parsed pages in ChromaDB.

    Typical flow: ``crawl_website`` collects one dict per page (see
    ``parse_content`` for its keys), ``save_structured_data`` dumps those
    dicts to JSON, and ``insert_to_vector_db`` adds a text document plus
    metadata per page to the ChromaDB collection, which can then be queried
    with ``search_similar_content``.

    Attributes:
        collection_name: Name of the ChromaDB collection in use.
        persist_directory: Directory in which ChromaDB persists its data.
        client: ChromaDB persistent client, set by ``setup_chromadb``.
        collection: ChromaDB collection handle, set by ``setup_chromadb``.
    """

    # Hosts (and all of their subdomains) whose links are never collected.
    BLACKLISTED_DOMAINS = ("linkedin.com", "facebook.com", "twitter.com", "instagram.com")
    # Only the first N paragraphs of a page go into the embedded document.
    MAX_EMBEDDED_PARAGRAPHS = 10
    # Titles are truncated to this many characters in the stored metadata.
    MAX_TITLE_METADATA_CHARS = 100
    # Number of documents passed to ChromaDB per ``add`` call.
    INSERT_BATCH_SIZE = 50

    def __init__(self, collection_name="web_content", persist_directory="./chroma_db"):
        """Remember the storage settings and open the ChromaDB collection.

        Args:
            collection_name: Name of the collection to open or create.
            persist_directory: Directory for ChromaDB's on-disk data; it is
                created if it does not exist.

        Raises:
            Exception: Whatever ``setup_chromadb`` re-raises when ChromaDB
                cannot be initialized.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self.setup_chromadb()

    def setup_chromadb(self):
        """Create the ChromaDB client and open (or create) the collection.

        Raises:
            Exception: Any initialization error is logged and re-raised.
        """
        try:
            logger.info("🔧 Initializing ChromaDB...")

            # Make sure the persistence directory exists before opening it.
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            try:
                self.collection = self.client.get_collection(name=self.collection_name)
            except Exception:
                # Deliberately broad (the exception raised for a missing
                # collection is library-version specific -- TODO confirm and
                # narrow), but no longer a bare ``except`` so that
                # KeyboardInterrupt/SystemExit are not swallowed.
                self.collection = self.client.create_collection(
                    name=self.collection_name,
                    metadata={"description": "Web scraped content with semantic search"}
                )
                logger.info(f"🆕 Created new collection: {self.collection_name}")
            else:
                logger.info(f"📚 Found existing collection: {self.collection_name}")
                logger.info(f"📊 Collection contains {self.collection.count()} documents")

        except Exception as e:
            logger.error(f"❌ Failed to initialize ChromaDB: {e}")
            raise

    def fetch_html(self, url):
        """Download *url* and return the response body as text.

        Best-effort: any failure (network error, timeout, HTTP error status)
        is logged and reported as ``None`` instead of raising.

        Args:
            url: Absolute URL to fetch.

        Returns:
            The response text, or ``None`` if the request failed.
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            response = requests.get(url, timeout=10, headers=headers)
            response.raise_for_status()
            return response.text
        except Exception as e:
            logger.error(f"❌ Failed to fetch {url}: {e}")
            return None

    @staticmethod
    def _join_url(base_url, reference):
        """Resolve *reference* against *base_url*; return None if unparsable."""
        try:
            return urljoin(base_url, reference)
        except ValueError:
            # e.g. an unbalanced "[" in the host part of the reference.
            return None

    def is_valid_url(self, url) -> bool:
        """Return True if *url* is an http(s) URL on a non-blacklisted host.

        The blacklist is matched against the host name itself: a URL is
        rejected when its host equals a blacklisted domain or is a subdomain
        of one. Unrelated hosts that merely contain the text (for example
        ``nottwitter.com``) are accepted.

        Args:
            url: URL string to check.

        Returns:
            ``False`` for unparsable URLs, non-http(s) schemes, URLs without
            a host, and blacklisted hosts; ``True`` otherwise.
        """
        try:
            parsed = urlparse(url)
        except ValueError:
            return False
        if parsed.scheme not in ("http", "https"):
            return False

        # ``hostname`` is lower-cased and excludes any port or credentials,
        # unlike ``netloc``.
        host = parsed.hostname
        if not host:
            return False
        return not any(
            host == domain or host.endswith("." + domain)
            for domain in self.BLACKLISTED_DOMAINS
        )

    def extract_links(self, soup, base_url):
        """Collect the valid absolute link targets found in a parsed page.

        Args:
            soup: Parsed document; only ``find_all("a", href=True)`` is used.
            base_url: URL of the page, used to resolve relative links.

        Returns:
            A set of absolute URLs with their ``#fragment`` removed, so that
            links to different anchors of the same page collapse into one.
        """
        links = set()
        for tag in soup.find_all("a", href=True):
            full_url = self._join_url(base_url, tag.get("href"))
            if full_url is None or not self.is_valid_url(full_url):
                continue
            # A fragment is never sent to the server, so URLs that differ
            # only by fragment fetch the same document.
            links.add(urlparse(full_url)._replace(fragment="").geturl())
        return links

    def clean_text(self, text) -> str:
        """Collapse all whitespace runs in *text* to single spaces.

        Args:
            text: Text to normalize; may be ``None`` or empty.

        Returns:
            The normalized text without leading/trailing whitespace, or an
            empty string for falsy input.
        """
        if not text:
            return ""
        return ' '.join(text.split())

    def parse_content(self, url):
        """Fetch *url* and extract its structured content.

        Args:
            url: Absolute URL of the page to parse.

        Returns:
            ``None`` if the page could not be fetched, otherwise a dict with
            the keys ``url``, ``title``, ``headings`` (h1-h6 texts),
            ``paragraphs``, ``body_text``, ``images`` (dicts with ``src`` and
            ``alt``), ``links``, ``scraped_at`` (ISO timestamp of local
            time) and ``content_hash`` (MD5 hex digest of title, headings
            and paragraphs).
        """
        html = self.fetch_html(url)
        if html is None:
            return None

        soup = BeautifulSoup(html, "html.parser")

        # Drop script/style elements so their source does not leak into text.
        for element in soup(["script", "style"]):
            element.decompose()

        # Resolve image sources, skipping any that cannot be parsed as a URL.
        images = []
        for img in soup.find_all("img"):
            if not img.get("src"):
                continue
            src = self._join_url(url, img.get("src"))
            if src is not None:
                images.append({"src": src, "alt": img.get("alt", "")})

        data = {
            "url": url,
            "title": self.clean_text(soup.title.string) if soup.title and soup.title.string else "",
            "headings": [self.clean_text(h.get_text()) for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) if h.get_text().strip()],
            "paragraphs": [self.clean_text(p.get_text()) for p in soup.find_all("p") if p.get_text().strip()],
            "body_text": self.clean_text(soup.get_text()),
            "images": images,
            "links": list(self.extract_links(soup, url)),
            "scraped_at": datetime.now().isoformat(),
            "content_hash": None
        }

        # Fingerprint of the page's textual content (not a security hash).
        content_for_hash = f"{data['title']}{' '.join(data['headings'])}{' '.join(data['paragraphs'])}"
        data["content_hash"] = hashlib.md5(content_for_hash.encode()).hexdigest()

        return data

    def prepare_document_for_vector_db(self, page_data):
        """Build the document text and metadata stored for one page.

        Args:
            page_data: Page dict as produced by ``parse_content``; ``url``,
                ``scraped_at`` and ``content_hash`` are required keys.

        Returns:
            A ``(document_text, metadata)`` tuple. The text joins the title,
            the headings and the first ``MAX_EMBEDDED_PARAGRAPHS``
            paragraphs; the metadata holds the URL, truncated title,
            timestamp, content hash and element counts.
        """
        text_content = []

        if page_data.get("title"):
            text_content.append(f"Title: {page_data['title']}")

        if page_data.get("headings"):
            text_content.append(f"Headings: {' | '.join(page_data['headings'])}")

        if page_data.get("paragraphs"):
            # Cap the paragraph count to keep the embedded document bounded.
            paragraphs = page_data['paragraphs'][:self.MAX_EMBEDDED_PARAGRAPHS]
            text_content.append(f"Content: {' '.join(paragraphs)}")

        document_text = ' '.join(text_content)

        metadata = {
            "url": page_data["url"],
            "title": page_data.get("title", "")[:self.MAX_TITLE_METADATA_CHARS],
            "scraped_at": page_data["scraped_at"],
            "content_hash": page_data["content_hash"],
            "num_headings": len(page_data.get("headings", [])),
            "num_paragraphs": len(page_data.get("paragraphs", [])),
            "num_images": len(page_data.get("images", [])),
            "num_links": len(page_data.get("links", []))
        }

        return document_text, metadata

    def insert_to_vector_db(self, structured_data):
        """Add the scraped pages to the ChromaDB collection in batches.

        Pages that yield an empty document, or that fail during
        preparation, are logged and skipped.

        Args:
            structured_data: List of page dicts from ``parse_content``.

        Raises:
            Exception: Any error raised while adding a batch is logged and
                re-raised.
        """
        logger.info("💾 Inserting data into vector database...")

        documents = []
        metadatas = []
        ids = []

        for i, page_data in enumerate(tqdm(structured_data, desc="Preparing documents")):
            try:
                document_text, metadata = self.prepare_document_for_vector_db(page_data)

                if not document_text.strip():
                    logger.warning(f"⚠️ Skipping empty document: {page_data.get('url', 'Unknown')}")
                    continue

                # The positional suffix keeps ids unique within this call
                # even when two pages share a content hash.
                doc_id = f"doc_{page_data['content_hash']}_{i}"

                documents.append(document_text)
                metadatas.append(metadata)
                ids.append(doc_id)

            except Exception as e:
                logger.error(f"❌ Error preparing document {i}: {e}")
                continue

        if not documents:
            logger.warning("⚠️ No valid documents to insert")
            return

        try:
            batch_size = self.INSERT_BATCH_SIZE
            total_inserted = 0

            for start in range(0, len(documents), batch_size):
                end = start + batch_size
                batch_docs = documents[start:end]

                self.collection.add(
                    documents=batch_docs,
                    metadatas=metadatas[start:end],
                    ids=ids[start:end]
                )

                total_inserted += len(batch_docs)
                logger.info(f"📝 Inserted batch: {len(batch_docs)} documents (Total: {total_inserted})")

            logger.info(f"✅ Successfully inserted {total_inserted} documents into ChromaDB")
            logger.info(f"📊 Total collection size: {self.collection.count()} documents")

        except Exception as e:
            logger.error(f"❌ Failed to insert documents: {e}")
            raise

    def search_similar_content(self, query, n_results=5):
        """Query the collection for *query* and print the matches.

        Failures are logged, not raised.

        Args:
            query: Free-text search query.
            n_results: Maximum number of results to request.
        """
        try:
            results = self.collection.query(
                query_texts=[query],
                n_results=n_results
            )

            logger.info(f"🔍 Found {len(results['documents'][0])} results for query: '{query}'")

            for i, (doc, metadata) in enumerate(zip(results['documents'][0], results['metadatas'][0])):
                print(f"\n--- Result {i+1} ---")
                print(f"URL: {metadata['url']}")
                print(f"Title: {metadata['title']}")
                print(f"Content Preview: {doc[:200]}...")
                print(f"Scraped: {metadata['scraped_at']}")

        except Exception as e:
            logger.error(f"❌ Search failed: {e}")

    def crawl_website(self, start_url, max_depth=2, max_pages=None):
        """Breadth-first crawl of the pages on ``start_url``'s host.

        Only links whose network location equals that of ``start_url`` are
        followed. The set of seen URLs is local to each call, so repeated
        crawls in one process (e.g. re-running a notebook cell) each start
        from a clean state.

        Args:
            start_url: URL at which the crawl starts (depth 0).
            max_depth: Deepest link level to fetch; 0 fetches only the
                start page.
            max_pages: Stop after this many successfully parsed pages;
                ``None`` means no limit.

        Returns:
            List of page dicts (see ``parse_content``) in crawl order.
        """
        # Local import: ``collections`` is not among the file-level imports.
        from collections import deque

        structured_data = []
        to_visit = deque([(start_url, 0)])
        # Track the start URL both verbatim and without its fragment, since
        # extracted links are stored fragment-free.
        visited = {start_url, urlparse(start_url)._replace(fragment="").geturl()}
        start_netloc = urlparse(start_url).netloc

        logger.info(f"🚀 Starting crawl from: {start_url}")
        logger.info(f"📐 Max depth: {max_depth}, Max pages: {max_pages or 'unlimited'}")

        while to_visit and (max_pages is None or len(structured_data) < max_pages):
            current_url, depth = to_visit.popleft()
            if depth > max_depth:
                # Only reachable for the start page when max_depth < 0.
                continue

            logger.info(f"🔍 Parsing [depth {depth}]: {current_url}")
            page_data = self.parse_content(current_url)

            if not page_data:
                logger.warning(f"⚠️ Failed to process: {current_url}")
                continue

            structured_data.append(page_data)

            new_links = 0
            # Links found on a page at the maximum depth would never be
            # fetched, so they are not queued at all.
            if depth < max_depth:
                for link in page_data["links"]:
                    if link in visited or urlparse(link).netloc != start_netloc:
                        continue
                    visited.add(link)
                    to_visit.append((link, depth + 1))
                    new_links += 1

            logger.info(f"✅ Processed page (found {new_links} new links)")

        logger.info(f"🎯 Crawl completed: {len(structured_data)} pages processed")
        return structured_data

    def save_structured_data(self, data, output_dir="scraped_data"):
        """Write the scraped pages to JSON files in *output_dir*.

        Creates one ``page_<n>.json`` per page (numbered from 1) plus an
        ``all_pages.json`` containing the whole list.

        Args:
            data: List of JSON-serializable page dicts.
            output_dir: Target directory; created if missing.
        """
        os.makedirs(output_dir, exist_ok=True)

        for i, page in enumerate(data):
            filename = os.path.join(output_dir, f"page_{i+1}.json")
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(page, f, indent=4, ensure_ascii=False)

        combined_file = os.path.join(output_dir, "all_pages.json")
        with open(combined_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4, ensure_ascii=False)

        logger.info(f"💾 Saved {len(data)} pages to '{output_dir}'")

    def get_collection_stats(self):
        """Log the collection's name, size, storage path and metadata keys.

        Failures are logged, not raised.
        """
        try:
            count = self.collection.count()
            logger.info("📊 Collection Statistics:")
            logger.info(f"   Collection Name: {self.collection_name}")
            logger.info(f"   Total Documents: {count}")
            logger.info(f"   Storage Path: {self.persist_directory}")

            if count > 0:
                # Peek at one record to show which metadata keys are stored.
                sample = self.collection.peek(limit=1)
                if sample['metadatas']:
                    logger.info(f"   Sample Metadata Keys: {list(sample['metadatas'][0].keys())}")

        except Exception as e:
            logger.error(f"❌ Failed to get collection stats: {e}")

def main(argv=None):
    """Command-line entry point: crawl a site, or inspect/search the store.

    Modes, checked in this order:
      * ``--stats``  -- log collection statistics and exit.
      * ``--search`` -- run a similarity search and exit.
      * otherwise    -- crawl ``--url``, save the pages as JSON, and insert
        them into the vector database.

    ``--url`` is only needed for crawling, so ``--stats`` and ``--search``
    work on their own.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv``.

    Raises:
        SystemExit: With status 2 on invalid arguments (including a crawl
            without ``--url``) and status 1 on any fatal error.
    """
    parser = argparse.ArgumentParser(description="Web Scraper with ChromaDB Vector Storage")
    parser.add_argument("--url", help="Starting URL to scrape (required unless --stats or --search is given)")
    parser.add_argument("--depth", type=int, default=2, help="Maximum crawl depth")
    parser.add_argument("--max-pages", type=int, help="Maximum pages to scrape")
    parser.add_argument("--collection", default="web_content", help="ChromaDB collection name")
    parser.add_argument("--db-path", default="./chroma_db", help="ChromaDB storage path")
    parser.add_argument("--output-dir", default="scraped_data", help="Output directory for JSON files")
    parser.add_argument("--search", help="Search query to test vector database")
    parser.add_argument("--stats", action="store_true", help="Show collection statistics")

    args = parser.parse_args(argv)

    # Validate before touching the database: only the crawl mode needs a URL.
    crawl_requested = not (args.stats or args.search)
    if crawl_requested and not args.url:
        parser.error("the following arguments are required: --url")

    try:
        scraper = WebScraperWithVector(
            collection_name=args.collection,
            persist_directory=args.db_path
        )

        if args.stats:
            scraper.get_collection_stats()
            return

        if args.search:
            scraper.search_similar_content(args.search)
            return

        logger.info("🌐 Starting web scraping process...")
        structured_data = scraper.crawl_website(
            start_url=args.url,
            max_depth=args.depth,
            max_pages=args.max_pages
        )

        if not structured_data:
            logger.warning("⚠️ No data was scraped")
            return

        # Persist the raw pages first, then index them for semantic search.
        scraper.save_structured_data(structured_data, args.output_dir)
        scraper.insert_to_vector_db(structured_data)
        scraper.get_collection_stats()

        logger.info("🎉 Scraping and vector storage completed successfully!")
        logger.info(f"💡 Try searching with: python script.py --search 'your query here'")

    except KeyboardInterrupt:
        logger.info("⏹️ Scraping interrupted by user")
    except Exception as e:
        logger.error(f"💥 Fatal error: {e}")
        sys.exit(1)

# Run the command-line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()