In [None]:
# 🚀 STEP 1: Run the Complete Generic RAG Pipeline
#
# Builds a RAGSystem, scrapes the PyTorch documentation starting from
# `start_urls`, and reports how many chunks were produced. The resulting
# `rag_system`, `start_urls`, `output_file` and `success` names are left in
# the notebook namespace.
import sys
import os

# Project root that contains the `src` package. The original location is kept
# as the default; set RAG_PROJECT_ROOT to run the notebook from another checkout.
PROJECT_ROOT = os.environ.get("RAG_PROJECT_ROOT", "/home/rkpatel/RAG")
if PROJECT_ROOT not in sys.path:
    # Guarded so that re-running this cell does not stack duplicate entries.
    sys.path.append(PROJECT_ROOT)

# Import generic RAG system
from src.rag_system import RAGSystem

print("🚀 GENERIC RAG PIPELINE")
print("=" * 80)

# Initialize the RAG system
rag_system = RAGSystem()

# Example: scrape the PyTorch documentation
start_urls = ["https://pytorch.org/docs/stable/"]
output_file = "data/pytorch_async.json"

print("📄 Scraping and processing documentation (will use cache if available)...")
print("   This will scrape PyTorch documentation for demonstration...")

success = rag_system.scrape_and_process_website(
    start_urls=start_urls,
    max_pages=15,
    output_file=output_file,
    same_domain_only=True,
    max_depth=2
)

if success:
    print("✅ System ready!")
    print(f"📊 Processed: {len(rag_system.chunks)} chunks")
    print(f"📊 Data file: {output_file}")
    # Only announce completion when the pipeline actually succeeded.
    print("\n✅ Step 1 Complete: Generic RAG system ready!")
else:
    print("❌ Failed to initialize system")

In [None]:
# ⚡ Performance Comparison: Sync vs Async Scraping
import time
import asyncio
from src.rag_system import RAGSystem

print("🏁 PERFORMANCE COMPARISON: Sync vs Async Scraping")
print("=" * 70)

# Test URLs - use reliable sites for fair comparison
# Note: Some sites like pytorch.org may block concurrent requests
test_urls = ["https://pytorch.org/docs/stable/"]  # Very reliable test site

# Test 1: Original Synchronous Scraper
print("🐌 Testing SYNC Scraper...")
sync_rag = RAGSystem()

start_time = time.time()
sync_success = sync_rag.scrape_and_process_website(
    start_urls=test_urls,
    max_pages=5,
    output_file="data/pytorch_sync.json",
    use_cache=False  # Force fresh scraping
)
sync_duration = time.time() - start_time

print(f"   ⏱️ Sync Duration: {sync_duration:.2f}s")

# Test 2: New Asynchronous Scraper
print("\n⚡ Testing ASYNC Scraper...")
async_rag = RAGSystem()

async def test_async():
    start_time = time.time()
    success = await async_rag.scrape_and_process_website_async(
        start_urls=test_urls,
        max_pages=30,
        output_file="data/pytorch_async.json",
        concurrent_limit=1,        # Conservative for reliability
        requests_per_second=3.0,   # Conservative rate
        use_cache=False            # Force fresh scraping
    )
    duration = time.time() - start_time
    return success, duration

async_success, async_duration = await test_async()

print(f"   ⏱️ Async Duration: {async_duration:.2f}s")

# Performance Analysis
if sync_success and async_success:
    improvement = sync_duration / async_duration if async_duration > 0 else 1
    time_saved = sync_duration - async_duration
    percent_faster = ((sync_duration - async_duration) / sync_duration) * 100 if sync_duration > 0 else 0
    
    print(f"\n🎯 PERFORMANCE RESULTS:")
    print(f"   • Async is {improvement:.1f}x speed ratio")
    print(f"   • Time difference: {time_saved:.2f}s ({percent_faster:.1f}% change)")
    print(f"   • Both completed successfully: ✅")
    
    print(f"\n📊 Real-World Expectations:")
    print(f"   • Small sites (1-5 pages): Similar performance")
    print(f"   • Medium sites (10-50 pages): 2-5x faster with async")  
    print(f"   • Large sites (100+ pages): 5-10x faster with async")
    print(f"   • Complex sites: Major async advantages!")
    
    print(f"\n💡 Async Benefits:")
    print("   • Concurrent processing of multiple URLs")
    print("   • Better resource utilization") 
    print("   • Maintains same quality extraction")
    print("   • Respects rate limits and robots.txt")
    
else:
    print("⚠️ One or both tests failed - check network connection")
    if not sync_success:
        print("   ❌ Sync test failed")
    if not async_success:
        print("   ❌ Async test failed")

print(f"\n✨ The async scraper eliminates delays and uses concurrent processing!")
print("💡 Try with larger websites to see dramatic performance gains!")
print("💡 Note: Some sites (like pytorch.org) may block concurrent requests")

In [2]:
# check local
import time
import asyncio

from src.rag_system import RAGSystem
from src.async_web_scraper import AsyncWebScraper, ScrapingConfig
from src.web_scraper import WebScraper

sync_scraper = WebScraper(local_mode=True)


rag_system = RAGSystem()

results = await AsyncWebScraper.process_local_files_fast(
    file_paths=["/home/rkpatel/RAG/data/temp.html","/home/rkpatel/RAG/data/temp2.html"],
    concurrent_limit=1,
    output_file="data/temp_local.json",
)

sync_result = sync_scraper.process_local_files(
    file_paths=["/home/rkpatel/RAG/data/temp.html"],
    output_file="data/test_sync_comparison.json"
)


INFO:src.async_web_scraper:🚀 AsyncWebScraper initialized with 1 concurrent workers
INFO:src.async_web_scraper:🚀 Starting async processing of 2 local HTML files...
INFO:src.async_web_scraper:⚡ Processing files with 2 concurrent workers...
INFO:src.async_web_scraper:📂 Reading local file: temp.html
INFO:src.async_web_scraper:📂 Reading local file: temp2.html
INFO:src.async_web_scraper:🧠 Creating semantic chunks from 2 documents...
INFO:src.async_web_scraper:💾 Results saved to data/temp_local.json and data/temp_local.txt
INFO:src.async_web_scraper:✅ Async local file processing complete!
INFO:src.async_web_scraper:   📊 Statistics:
INFO:src.async_web_scraper:      Files processed: 2
INFO:src.async_web_scraper:      Files failed: 0
INFO:src.async_web_scraper:      Semantic chunks: 8
INFO:src.async_web_scraper:      Processing time: 0.0s
INFO:src.async_web_scraper:      Processing rate: 20.0 files/sec
INFO:src.async_web_scraper:      Average chunk size: 28 words


🚀 Processing 1 local HTML files...

📂 Processing 1/1: temp.html
   📂 Reading local file: temp.html
   📄 Processing: file:///home/rkpatel/RAG/data/temp.html
      ✅ Extracted 4 sections

🧠 Creating semantic chunks from 1 documents...
🧠 Creating semantic chunks...
   ✅ Created 4 semantic chunks

💾 Saving to data/test_sync_comparison.json...
💾 Creating text file: data/test_sync_comparison.txt...

✅ Local file processing complete!
   📊 Statistics:
      Files processed: 1
      Semantic chunks: 4
      Average chunk size: 25 words
   📁 Files created:
      data/test_sync_comparison.json (structured JSON)
      data/test_sync_comparison.txt (plain text)
