In [1]:
from typing import List, Optional
from urllib.parse import urldefrag, urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

In [2]:
def scrape_urls(
    url: str,
    max_pages: Optional[int] = None,
    timeout: Optional[float] = 10.0,
) -> List[str]:
    """
    Scrapes the text from a given URL and from every page reachable from it.

    The crawl is depth-first in document order: a page's first link (and
    everything reachable from it) is processed before its second link. Links
    are followed regardless of host, so without ``max_pages`` the crawl only
    stops when no unvisited http(s) link is left.

    Args:
        url (str): The URL to start scraping from.
        max_pages (Optional[int]): Upper bound on the number of URLs that are
            requested (failed requests count too). ``None`` means no limit.
        timeout (Optional[float]): Timeout in seconds passed to
            ``requests.get`` for every request. ``None`` disables the timeout.

    Returns:
        List[str]: The extracted text of each successfully fetched page, in
        the order the pages were visited. Pages whose request failed are
        reported via ``print`` and contribute nothing to the list.
    """
    result: List[str] = []
    visited_urls = set()  # Keep track of visited URLs to avoid duplicates

    # Explicit stack instead of recursion: a long chain of links can no longer
    # exhaust Python's recursion limit. Fragments are stripped so that
    # "page#a" and "page#b" count as the same page.
    pending = [urldefrag(url).url]

    while pending:
        if max_pages is not None and len(visited_urls) >= max_pages:
            break

        current = pending.pop()
        if current in visited_urls:
            continue  # Skip already visited URLs
        visited_urls.add(current)

        try:
            response = requests.get(current, timeout=timeout)
            soup = BeautifulSoup(response.content, 'html.parser')
        except requests.exceptions.RequestException as e:
            print(f"An error occurred while scraping {current}: {e}")
            # Nothing was downloaded, so there are no links to follow.
            continue

        result.append(soup.get_text())

        links = []
        for anchor in soup.find_all('a', href=True):
            try:
                # Resolve against the final (post-redirect) URL; urljoin
                # handles absolute, root-relative and page-relative hrefs.
                absolute = urldefrag(urljoin(response.url, anchor['href'])).url
                scheme = urlsplit(absolute).scheme
            except ValueError:
                continue  # Malformed href that cannot be parsed as a URL
            # Ignore mailto:, javascript:, tel:, ... which cannot be fetched.
            if scheme in ('http', 'https') and absolute not in visited_urls:
                links.append(absolute)

        # Push in reverse so the first link on the page is popped first,
        # which keeps the visit order identical to a recursive traversal.
        pending.extend(reversed(links))

    return result


In [4]:
%%time

# Seed page for the crawl.
url = "https://finviz.com/news.ashx"
# scrape_urls returns one text string per page it managed to fetch.
scraped_text = scrape_urls(url)
# NOTE(review): the recorded output below is a single short
# "Just a moment..." string rather than news content -- presumably an
# anti-bot interstitial served to the plain HTTP client; confirm.
print(scraped_text)

['Just a moment...Enable JavaScript and cookies to continue\n']
CPU times: user 110 ms, sys: 964 µs, total: 111 ms
Wall time: 169 ms


In [7]:
# Character count of the first scraped page's text; a quick sanity check on
# how much content was actually retrieved.
len(scraped_text[0])

58