In [None]:
# 🚀 STEP 1: Run the Complete Generic RAG Pipeline
import sys
import os
sys.path.append('/home/rkpatel/RAG')

# Import generic RAG system
from src.rag_system import RAGSystem

# Cell banner.
print("üöÄ GENERIC RAG PIPELINE")
print("=" * 80)

# Initialize the RAG system with its default configuration.
rag_system = RAGSystem()

# Crawl target: the PyTorch stable documentation tree.
start_urls = ["https://pytorch.org/docs/stable/"]
# NOTE(review): the async benchmark cell writes to this same path; confirm
# that sharing one data file between the two cells is intended.
output_file = "data/pytorch_async.json"

print("üìÑ Scraping and processing documentation (will use cache if available)...")
print("   This will scrape PyTorch documentation for demonstration...")

# Scrape up to 15 pages reachable within 2 link hops of the start URL,
# staying on the starting domain, and store the result in output_file.
success = rag_system.scrape_and_process_website(
    start_urls=start_urls,
    max_pages=15,
    output_file=output_file,
    same_domain_only=True,
    max_depth=2,
)

if success:
    print("‚úÖ System ready!")
    print(f"üìä Processed: {len(rag_system.chunks)} chunks")
    print(f"üìä Data file: {output_file}")
    # Only announce completion when the pipeline actually succeeded, so the
    # failure message below is never followed by a contradictory "ready".
    print("\n‚úÖ Step 1 Complete: Generic RAG system ready!")
else:
    print("‚ùå Failed to initialize system")

üöÄ GENERIC RAG PIPELINE
üìÑ Scraping and processing documentation (will use cache if available)...
   This will scrape Python documentation for demonstration...
üöÄ RAG: Scraping and processing website...
üåê Scraping website from: https://docs.python.org/3/
üöÄ Starting generic website scraping...
   Starting URLs: 1
   Max pages: 15
   Same domain only: True
   Max depth: 2
üîç Discovering URLs from 1 starting points...
   Found 15 URLs

üìÑ Processing 1/15: /3/
   üìÑ Processing: https://docs.python.org/3/
      ‚úÖ Extracted 1 sections

üìÑ Processing 2/15: /3/download.html
   üìÑ Processing: https://docs.python.org/3/download.html
      ‚úÖ Extracted 3 sections

üìÑ Processing 3/15: /3.15/
   üìÑ Processing: https://docs.python.org/3.15/
      ‚úÖ Extracted 1 sections

üìÑ Processing 4/15: /3.14/
   üìÑ Processing: https://docs.python.org/3.14/
      ‚úÖ Extracted 1 sections

üìÑ Processing 5/15: /3.13/
   üìÑ Processing: https://docs.python.org/3.13/
      ‚úÖ Ex

In [None]:
# ⚡ Performance Comparison: Sync vs Async Scraping
import time
import asyncio
from src.rag_system import RAGSystem

print("üèÅ PERFORMANCE COMPARISON: Sync vs Async Scraping")
print("=" * 70)

# Shared target for both the sync and the async run.
# NOTE: pytorch.org may block concurrent requests; if the async run fails,
# swap in a site that tolerates parallel fetches before comparing timings.
test_urls = ["https://pytorch.org/docs/stable/"]

# Test 1: Original Synchronous Scraper
print("üêå Testing SYNC Scraper...")
sync_rag = RAGSystem()

# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments during the scrape.
start_time = time.perf_counter()
sync_success = sync_rag.scrape_and_process_website(
    start_urls=test_urls,
    max_pages=5,
    output_file="data/pytorch_sync.json",
    use_cache=False,  # Force fresh scraping
)
sync_duration = time.perf_counter() - start_time

print(f"   ‚è±Ô∏è Sync Duration: {sync_duration:.2f}s")

# Test 2: New Asynchronous Scraper
print("\n‚ö° Testing ASYNC Scraper...")
async_rag = RAGSystem()

async def test_async():
    """Time one fresh asynchronous scrape of ``test_urls``.

    Uses the module-level ``async_rag`` instance and ``test_urls`` list.

    Returns:
        A ``(success, duration)`` tuple: the value returned by
        ``async_rag.scrape_and_process_website_async`` and the elapsed
        time of that call in seconds.
    """
    # Monotonic clock: immune to wall-clock adjustments while scraping.
    start_time = time.perf_counter()
    success = await async_rag.scrape_and_process_website_async(
        start_urls=test_urls,
        max_pages=5,               # Same page budget as the sync run, so timings are comparable
        output_file="data/pytorch_async.json",
        # NOTE(review): with a single worker requests presumably do not
        # overlap; raise this to measure actual concurrency gains.
        concurrent_limit=1,        # Conservative for reliability
        requests_per_second=3.0,   # Conservative rate
        use_cache=False,           # Force fresh scraping
    )
    duration = time.perf_counter() - start_time
    return success, duration

async_success, async_duration = await test_async()

print(f"   ‚è±Ô∏è Async Duration: {async_duration:.2f}s")

# Performance Analysis
if sync_success and async_success:
    improvement = sync_duration / async_duration if async_duration > 0 else 1
    time_saved = sync_duration - async_duration
    percent_faster = ((sync_duration - async_duration) / sync_duration) * 100 if sync_duration > 0 else 0
    
    print(f"\nüéØ PERFORMANCE RESULTS:")
    print(f"   ‚Ä¢ Async is {improvement:.1f}x speed ratio")
    print(f"   ‚Ä¢ Time difference: {time_saved:.2f}s ({percent_faster:.1f}% change)")
    print(f"   ‚Ä¢ Both completed successfully: ‚úÖ")
    
    print(f"\nüìä Real-World Expectations:")
    print(f"   ‚Ä¢ Small sites (1-5 pages): Similar performance")
    print(f"   ‚Ä¢ Medium sites (10-50 pages): 2-5x faster with async")  
    print(f"   ‚Ä¢ Large sites (100+ pages): 5-10x faster with async")
    print(f"   ‚Ä¢ Complex sites: Major async advantages!")
    
    print(f"\nüí° Async Benefits:")
    print("   ‚Ä¢ Concurrent processing of multiple URLs")
    print("   ‚Ä¢ Better resource utilization") 
    print("   ‚Ä¢ Maintains same quality extraction")
    print("   ‚Ä¢ Respects rate limits and robots.txt")
    
else:
    print("‚ö†Ô∏è One or both tests failed - check network connection")
    if not sync_success:
        print("   ‚ùå Sync test failed")
    if not async_success:
        print("   ‚ùå Async test failed")

print(f"\n‚ú® The async scraper eliminates delays and uses concurrent processing!")
print("üí° Try with larger websites to see dramatic performance gains!")
print("üí° Note: Some sites (like pytorch.org) may block concurrent requests")

üèÅ PERFORMANCE COMPARISON: Sync vs Async Scraping
üêå Testing SYNC Scraper...
üöÄ RAG: Scraping and processing website...
üåê Scraping website from: https://pytorch.org/docs/stable/
üöÄ Starting generic website scraping...
   Starting URLs: 1
   Max pages: 10
   Same domain only: True
   Max depth: 2
üîç Discovering URLs from 1 starting points...
   Found 10 URLs

üìÑ Processing 1/10: /docs/stable/
   üìÑ Processing: https://pytorch.org/docs/stable/
      ‚úÖ Extracted 2 sections

üìÑ Processing 2/10: /
   üìÑ Processing: https://pytorch.org/
      ‚úÖ Extracted 18 sections

üìÑ Processing 3/10: /get-started
   üìÑ Processing: https://pytorch.org/get-started
      ‚úÖ Extracted 0 sections

üìÑ Processing 4/10: /tutorials
   üìÑ Processing: https://pytorch.org/tutorials
      ‚úÖ Extracted 100 sections

üìÑ Processing 5/10: /tutorials/beginner/basics/intro.html
   üìÑ Processing: https://pytorch.org/tutorials/beginner/basics/intro.html
      ‚úÖ Extracted 3 sections

ü


üìÑ Processing 10/10: /community-hub/
   üìÑ Processing: https://pytorch.org/community-hub/
      ‚úÖ Extracted 15 sections


INFO:src.async_web_scraper:üöÄ AsyncWebScraper initialized with 1 concurrent workers
INFO:src.async_web_scraper:üöÄ Starting async scraping of 1 URLs
INFO:src.async_web_scraper:‚öôÔ∏è Config: 1 workers, 3.0 RPS, max 30 pages
INFO:src.async_web_scraper:üîß Worker 0 started
