In [1]:
import requests
import random
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from seleniumwire import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from cachetools import cached, TTLCache

class ScrapingService:
    def __init__(self):
        self.proxies = []
        self.cache = TTLCache(maxsize=1000, ttl=3600)
        self.CREDIBLE_DOMAINS = {".gov", ".edu", ".ac.", "wikipedia.org", "who.int"}
        self.TRUSTED_TLDS = {".org", ".gov", ".edu"}
        self.USER_AGENTS = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
            "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"
        ]
        
        self.SEARCH_API_KEY = "AIzaSyAThHluWfPfcDLKELraJS2RTGJhih3BOEM"
        self.SEARCH_ENGINE_ID = "41b7b35fba4b24f40"
        self.SEARCH_API_URL = "https://www.googleapis.com/customsearch/v1"

service = ScrapingService()

In [2]:
def test_selenium_scrape(url: str, return_html: bool = False):
    """Test the selenium-based scraping"""
    try:
        options = ChromeOptions()
        options.add_argument("--headless=new")
        options.add_argument(f"user-agent={random.choice(service.USER_AGENTS)}")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        
        # Fix for M1/M2 Macs
        options.add_argument("--remote-debugging-port=9222")
        options.add_argument("--disable-gpu")
        
        # Specify the correct ChromeDriver path
        try:
            chrome_service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=chrome_service, options=options)
        except Exception as e:
            print(f"ChromeDriver error: {str(e)}")
            print("Trying with direct ChromeDriver path...")
            # Fallback to system Chrome if installed
            options.binary_location = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
            driver = webdriver.Chrome(options=options)
        
        try:
            print(f"\nFetching {url} with Selenium...")
            driver.get(url)
            html = driver.page_source
            
            if return_html:
                return html
                
            # Test sanitization
            try:
                soup = BeautifulSoup(html, 'lxml')
            except:
                soup = BeautifulSoup(html, 'html.parser')
                
            for element in soup(["script", "style", "nav", "footer", "iframe", "noscript"]):
                element.decompose()
            sanitized = soup.get_text(separator='\n', strip=True)[:15000]
            
            print("\nSuccessfully scraped with Selenium")
            print(f"Content length: {len(sanitized)} characters")
            print("\nSample content:")
            print(sanitized[:500] + "...")
            
            return sanitized
        finally:
            driver.quit()
    except Exception as e:
        print(f"Selenium scrape failed: {str(e)}")
        return None

In [3]:
def test_search(query: str, max_results: int = 3):
    """Test the search engine functionality"""
    if not service.SEARCH_API_KEY or service.SEARCH_API_KEY == "YOUR_API_KEY":
        print("Search test skipped - No API key configured")
        return None
        
    try:
        params = {
            "key": service.SEARCH_API_KEY,
            "cx": service.SEARCH_ENGINE_ID,
            "q": query,
            "num": max_results,
            "safe": "active"
        }
        
        print(f"\nTesting search for: '{query}'")
        resp = requests.get(service.SEARCH_API_URL, params=params, timeout=10)
        resp.raise_for_status()
        results = resp.json().get('items', [])
        
        print(f"Found {len(results)} results:")
        for i, item in enumerate(results, 1):
            print(f"\n{i}. {item.get('title')}")
            print(f"URL: {item.get('link')}")
            print(f"Snippet: {item.get('snippet')}")
            
        return results
    except Exception as e:
        print(f"Search failed: {str(e)}")
        if hasattr(e, 'response') and e.response.status_code == 403:
            print("API key may be invalid or quota exceeded")
        return None

In [4]:
def test_basic_scrape(url: str):
    """Test the basic web scraping functionality"""
    try:
        # First try with requests
        headers = {"User-Agent": random.choice(service.USER_AGENTS)}
        print(f"Testing basic scrape of {url} with requests...")
        
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            html = resp.text
            print("Success with direct requests")
        except requests.RequestException as e:
            print(f"Requests failed, trying with selenium: {str(e)}")
            html = test_selenium_scrape(url, return_html=True)
            if not html:
                raise ValueError("Both requests and selenium failed")
        
        # Test sanitize_content - using 'html.parser' as fallback
        try:
            soup = BeautifulSoup(html, 'lxml')
        except:
            print("Falling back to html.parser")
            soup = BeautifulSoup(html, 'html.parser')
            
        for element in soup(["script", "style", "nav", "footer", "iframe", "noscript"]):
            element.decompose()
        sanitized = soup.get_text(separator='\n', strip=True)[:15000]
        
        print(f"\nScraped content length: {len(sanitized)} characters")
        print("\nSample content:")
        print(sanitized[:500] + "...")
        
        return sanitized
    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")
        return None

In [5]:
def test_credible_source(url: str):
    """Test the credible source detection"""
    parsed = urlparse(url)
    is_credible = any(tld in parsed.netloc for tld in service.TRUSTED_TLDS) or \
                 any(domain in parsed.netloc for domain in service.CREDIBLE_DOMAINS)
    
    print(f"URL: {url}")
    print(f"Domain: {parsed.netloc}")
    print(f"Is credible source: {is_credible}")
    return is_credible

# Example usage:
# test_credible_source("https://www.nih.gov")
# test_credible_source("https://www.wikipedia.org")
# test_credible_source("https://www.example.com")

In [6]:
def run_all_tests():
    """Run all test cases with error handling"""
    print("=== Testing Basic Scraping ===")
    basic_result = test_basic_scrape("https://httpbin.org/html")  # Using a reliable test URL
    
    print("\n=== Testing Search ===")
    search_results = test_search("python programming", max_results=3) if service.SEARCH_API_KEY != "YOUR_API_KEY" else None
    
    print("\n=== Testing Selenium ===")
    selenium_result = test_selenium_scrape("https://httpbin.org/html")
    
    print("\n=== Testing Credible Sources ===")
    test_credible_source("https://www.cdc.gov")
    test_credible_source("https://www.nytimes.com")
    test_credible_source("https://www.harvard.edu")
    
    return {
        "basic_scrape": bool(basic_result),
        "search": bool(search_results) if search_results is not None else "skipped",
        "selenium": bool(selenium_result)
    }

# Run the tests
test_results = run_all_tests()
print("\n=== Test Summary ===")
for test, result in test_results.items():
    status = "PASSED" if result else "FAILED" if result is not True else "SKIPPED"
    print(f"{test}: {status}")

=== Testing Basic Scraping ===
Testing basic scrape of https://httpbin.org/html with requests...
Success with direct requests

Scraped content length: 3594 characters

Sample content:
Herman Melville - Moby-Dick
Availing himself of the mild, summer-cool weather that now reigned in these latitudes, and in preparation for the peculiarly active pursuits shortly to be anticipated, Perth, the begrimed, blistered old blacksmith, had not removed his portable forge to the hold again, after concluding his contributory work for Ahab's leg, but still retained it on deck, fast lashed to ringbolts by the foremast; being now almost incessantly invoked by the headsmen, and harpooneers, and ...

=== Testing Search ===

Testing search for: 'python programming'
Found 3 results:

1. Welcome to Python.org
URL: https://www.python.org/
Snippet: The official home of the Python Programming Language.

2. Introduction to Python
URL: https://www.w3schools.com/python/python_intro.asp
Snippet: Python Syntax compar