# Scaling up Web Scraping with Ray

In this example we will show you how to use Ray for scraping information from the web. There are sophisticated Python libraries to achieve this task (like [https://scrapy.org/](https://scrapy.org/)). In this example we will keep it very simple and adapt existing code from [https://www.scrapingbee.com/blog/crawling-python/](https://www.scrapingbee.com/blog/crawling-python/) and show how easy it is to parallelize the code with Ray. Ray is well suited for scaling up web scraping: While for batch-processing systems we need a database or other stateful component to hold the list of crawled URLs, Ray's actors allow us to do everything in one framework.

First install the required dependencies with

```
pip install requests bs4
```

We can then already run the example from [https://www.scrapingbee.com/blog/crawling-python/](https://www.scrapingbee.com/blog/crawling-python/) out of the box like this:

In [None]:
import logging
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

# Emit INFO-level (and above) records through the root logger, each prefixed
# with a timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s',
)

In [None]:
class Crawler:
    """Sequential web crawler that follows links found in ``<a>`` tags.

    URLs waiting to be fetched are kept in ``urls_to_visit`` (processed in
    FIFO order) and URLs that were already processed, successfully or not,
    are kept in ``visited_urls``.
    """

    def __init__(self, urls=None):
        """Create a crawler.

        Args:
            urls: Optional iterable of seed URLs. It is copied, so the
                caller's object is never mutated by the crawler.
        """
        self.visited_urls = []
        # A literal ``[]`` default would be shared by every instance created
        # without arguments, leaking queued URLs from one crawler into the
        # next. Use ``None`` as the sentinel and build a fresh list instead.
        self.urls_to_visit = list(urls) if urls is not None else []

    def download_url(self, url):
        """Fetch ``url`` with ``requests`` and return the response body text."""
        return requests.get(url).text

    def get_linked_urls(self, url, html):
        """Yield the ``href`` of every ``<a>`` tag found in ``html``.

        Hrefs starting with ``/`` are resolved against ``url``; all other
        non-empty hrefs are yielded unchanged. Anchors without an href (or
        with an empty one) are skipped.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if not path:
                # ``get`` returns None when the attribute is missing; queueing
                # None (or '') would only produce a guaranteed failed fetch.
                continue
            if path.startswith('/'):
                path = urljoin(url, path)
            yield path

    def add_url_to_visit(self, url):
        """Queue ``url`` unless it was already visited or is already queued."""
        if url not in self.visited_urls and url not in self.urls_to_visit:
            self.urls_to_visit.append(url)

    def crawl(self, url):
        """Download ``url`` and queue every link found in it."""
        html = self.download_url(url)
        for linked_url in self.get_linked_urls(url, html):
            self.add_url_to_visit(linked_url)

    def run(self):
        """Crawl until ``urls_to_visit`` is empty.

        A failure while crawling one URL does not stop the run; the URL is
        recorded as visited either way.
        """
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            logging.info(f'URLs: {len(self.visited_urls) + len(self.urls_to_visit)}')
            try:
                self.crawl(url)
            except Exception:
                # Best effort: keep crawling, but leave a trace of the failure
                # at DEBUG level so it does not flood the INFO output.
                logging.debug('Failed to crawl: %s', url, exc_info=True)
            finally:
                self.visited_urls.append(url)

if __name__ == '__main__':
    # Run the sequential crawler, seeded with the Wikipedia front page.
    crawler = Crawler(urls=['https://en.wikipedia.org/'])
    crawler.run()

In order to parallelize the crawling, let us first initialize Ray.

In [None]:
# Turn on Ray's profiling feature to get insights into where the bottlenecks
# of the application are. The RAY_PROFILING environment variable is set before
# Ray is imported and initialized below.
# NOTE(review): presumably the variable has to be set before ray.init() so the
# processes Ray starts pick it up -- confirm before reordering these lines.
import os
os.environ["RAY_PROFILING"] = "1"

import ray
ray.init()

We need to keep track of which URLs we have already crawled to avoid visiting them twice, and we also need to keep track of which URLs still need to be visited. We do this by centralizing this data in an actor, `CrawlQueue`.

In [None]:
import ray
import asyncio
import collections

@ray.remote
class CrawlQueue:
    # Build the queue and enqueue every url in `seed_urls`.
    async def __init__(self, seed_urls):
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s %(levelname)s:%(message)s')
        # Every url ever submitted - pending, in progress, and completed.
        # Entries are never removed, so this doubles as the "seen" filter.
        self.all_crawl_requests = set()
        # Urls that were accepted but not yet handed out to a crawler.
        self.pending_crawl_requests = asyncio.Queue()
        for seed in seed_urls:
            await self.add_crawl_request(seed)

    # Submit a url for crawling - crawlers call this for each URL they
    # encounter in the document they are processing. Urls that were
    # submitted before are ignored.
    async def add_crawl_request(self, url):
        if url in self.all_crawl_requests:
            return
        await self.pending_crawl_requests.put(url)
        self.all_crawl_requests.add(url)

    # Hand out the next url to crawl - called by an idle crawler.
    # Logs the number of urls seen so far, then waits until a pending
    # url is available and returns it.
    async def get_crawl_request(self):
        seen_count = len(self.all_crawl_requests)
        logging.info(f'URLs: {seen_count}')
        return await self.pending_crawl_requests.get()

<!-- #raw -->
```{eval-rst}
.. code-block:: python
    :emphasize-lines: 21, 30, 39, 40, 42, 45, 46

    class RayCrawler:

        def __init__(self, crawl_queue):
            self.crawl_queue = crawl_queue
            self.num_processed_bytes = 0

        def download_url(self, url):
            text = requests.get(url).text
            self.num_processed_bytes += len(text)
            return text

        def get_linked_urls(self, url, html):
            soup = BeautifulSoup(html, 'html.parser')
            for link in soup.find_all('a'):
                path = link.get('href')
                if path and path.startswith('/'):
                    path = urljoin(url, path)
                yield path

        def add_url_to_visit(self, url):
            self.crawl_queue.add_crawl_request.remote(url)

        def crawl(self, url):
            html = self.download_url(url)
            for url in self.get_linked_urls(url, html):
                self.add_url_to_visit(url)

        def run(self):
            while True:
                url = ray.get(self.crawl_queue.get_crawl_request.remote())
                logging.info(f'Crawling: {url}')
                logging.info(f'Bytes: {self.num_processed_bytes}')
                try:
                    self.crawl(url)
                except Exception:
                    # logging.exception(f'Failed to crawl: {url}')
                    pass
                
    @ray.remote
    def worker(crawl_queue):
        logging.basicConfig(level=logging.INFO)
        RayCrawler(crawl_queue).run()

    if __name__ == '__main__':
        crawl_queue = CrawlQueue.remote(['https://en.wikipedia.org/'])
        ray.get([worker.remote(crawl_queue) for i in range(5)])
```
```
(CrawlQueue pid=20236) 2022-06-24 01:23:42,862 INFO:URLs: 1
(CrawlQueue pid=20236) 2022-06-24 01:23:42,863 INFO:URLs: 1
(CrawlQueue pid=20236) 2022-06-24 01:23:42,863 INFO:URLs: 1
(CrawlQueue pid=20236) 2022-06-24 01:23:42,863 INFO:URLs: 1
(CrawlQueue pid=20236) 2022-06-24 01:23:43,065 INFO:URLs: 40
(CrawlQueue pid=20236) 2022-06-24 01:23:43,065 INFO:URLs: 40
(CrawlQueue pid=20236) 2022-06-24 01:23:43,065 INFO:URLs: 40
(CrawlQueue pid=20236) 2022-06-24 01:23:43,161 INFO:URLs: 262
(CrawlQueue pid=20236) 2022-06-24 01:23:43,264 INFO:URLs: 327
(CrawlQueue pid=20236) 2022-06-24 01:23:43,637 INFO:URLs: 997
(CrawlQueue pid=20236) 2022-06-24 01:23:43,904 INFO:URLs: 1664
(CrawlQueue pid=20236) 2022-06-24 01:23:44,828 INFO:URLs: 3537
(CrawlQueue pid=20236) 2022-06-24 01:23:44,933 INFO:URLs: 3740
(CrawlQueue pid=20236) 2022-06-24 01:23:45,448 INFO:URLs: 5085
(CrawlQueue pid=20236) 2022-06-24 01:23:46,652 INFO:URLs: 7862
(CrawlQueue pid=20236) 2022-06-24 01:23:46,723 INFO:URLs: 7979
(CrawlQueue pid=20236) 2022-06-24 01:23:46,744 INFO:URLs: 7979
(CrawlQueue pid=20236) 2022-06-24 01:23:47,662 INFO:URLs: 10000
(CrawlQueue pid=20236) 2022-06-24 01:23:47,711 INFO:URLs: 10079
(CrawlQueue pid=20236) 2022-06-24 01:23:48,267 INFO:URLs: 11288
(CrawlQueue pid=20236) 2022-06-24 01:23:49,029 INFO:URLs: 12996
(CrawlQueue pid=20236) 2022-06-24 01:23:49,069 INFO:URLs: 13054
(CrawlQueue pid=20236) 2022-06-24 01:23:49,903 INFO:URLs: 14534
(CrawlQueue pid=20236) 2022-06-24 01:23:50,250 INFO:URLs: 15093
(CrawlQueue pid=20236) 2022-06-24 01:23:50,622 INFO:URLs: 15850
(CrawlQueue pid=20236) 2022-06-24 01:23:50,678 INFO:URLs: 15961
(CrawlQueue pid=20236) 2022-06-24 01:23:50,890 INFO:URLs: 16365
(CrawlQueue pid=20236) 2022-06-24 01:23:51,030 INFO:URLs: 16596
(CrawlQueue pid=20236) 2022-06-24 01:23:51,331 INFO:URLs: 17192
(CrawlQueue pid=20236) 2022-06-24 01:23:51,335 INFO:URLs: 17198
(CrawlQueue pid=20236) 2022-06-24 01:23:51,434 INFO:URLs: 17350
(CrawlQueue pid=20236) 2022-06-24 01:23:51,746 INFO:URLs: 17486
(CrawlQueue pid=20236) 2022-06-24 01:23:51,927 INFO:URLs: 17806
(CrawlQueue pid=20236) 2022-06-24 01:23:51,980 INFO:URLs: 17875
2022-06-24 01:23:52,564	WARNING worker.py:1404 -- Warning: More than 5000 tasks are pending submission to actor 5f0fdb5035e172d9bb00d39b01000000. To reduce memory usage, wait for these tasks to finish before sending more.
(CrawlQueue pid=20236) 2022-06-24 01:23:53,238 INFO:URLs: 20569
(CrawlQueue pid=20236) 2022-06-24 01:23:54,835 INFO:URLs: 25175
(CrawlQueue pid=20236) 2022-06-24 01:23:55,689 INFO:URLs: 26614
(CrawlQueue pid=20236) 2022-06-24 01:23:56,443 INFO:URLs: 27648
(CrawlQueue pid=20236) 2022-06-24 01:23:57,483 INFO:URLs: 29637
(CrawlQueue pid=20236) 2022-06-24 01:23:58,491 INFO:URLs: 31531
(CrawlQueue pid=20236) 2022-06-24 01:23:59,273 INFO:URLs: 32863
(CrawlQueue pid=20236) 2022-06-24 01:23:59,978 INFO:URLs: 33858
(CrawlQueue pid=20236) 2022-06-24 01:24:00,229 INFO:URLs: 34293
(CrawlQueue pid=20236) 2022-06-24 01:24:00,699 INFO:URLs: 35119
(CrawlQueue pid=20236) 2022-06-24 01:24:00,791 INFO:URLs: 35281
(CrawlQueue pid=20236) 2022-06-24 01:24:00,987 INFO:URLs: 35740
(CrawlQueue pid=20236) 2022-06-24 01:24:01,034 INFO:URLs: 35796
(CrawlQueue pid=20236) 2022-06-24 01:24:01,102 INFO:URLs: 35916
(CrawlQueue pid=20236) 2022-06-24 01:24:01,404 INFO:URLs: 36113
(CrawlQueue pid=20236) 2022-06-24 01:24:01,848 INFO:URLs: 36783
(CrawlQueue pid=20236) 2022-06-24 01:24:01,922 INFO:URLs: 36918
(CrawlQueue pid=20236) 2022-06-24 01:24:02,332 INFO:URLs: 37714
(CrawlQueue pid=20236) 2022-06-24 01:24:02,799 INFO:URLs: 38609
(CrawlQueue pid=20236) 2022-06-24 01:24:03,169 INFO:URLs: 39630
(CrawlQueue pid=20236) 2022-06-24 01:24:03,547 INFO:URLs: 40015
(CrawlQueue pid=20236) 2022-06-24 01:24:03,630 INFO:URLs: 40130
(CrawlQueue pid=20236) 2022-06-24 01:24:04,569 INFO:URLs: 41238
(CrawlQueue pid=20236) 2022-06-24 01:24:05,444 INFO:URLs: 42255
(CrawlQueue pid=20236) 2022-06-24 01:24:05,493 INFO:URLs: 42299
(CrawlQueue pid=20236) 2022-06-24 01:24:06,600 INFO:URLs: 43529
(CrawlQueue pid=20236) 2022-06-24 01:24:06,712 INFO:URLs: 43615
(CrawlQueue pid=20236) 2022-06-24 01:24:06,984 INFO:URLs: 43969
(CrawlQueue pid=20236) 2022-06-24 01:24:08,501 INFO:URLs: 46122
(CrawlQueue pid=20236) 2022-06-24 01:24:08,605 INFO:URLs: 46244
(CrawlQueue pid=20236) 2022-06-24 01:24:10,775 INFO:URLs: 49757
(CrawlQueue pid=20236) 2022-06-24 01:24:10,867 INFO:URLs: 49930
(CrawlQueue pid=20236) 2022-06-24 01:24:11,866 INFO:URLs: 50991
2022-06-24 01:24:12,448	WARNING worker.py:1404 -- Warning: More than 5000 tasks are pending submission to actor 5f0fdb5035e172d9bb00d39b01000000. To reduce memory usage, wait for these tasks to finish before sending more.
(CrawlQueue pid=20236) 2022-06-24 01:24:12,695 INFO:URLs: 52009
(CrawlQueue pid=20236) 2022-06-24 01:24:13,434 INFO:URLs: 53378
(CrawlQueue pid=20236) 2022-06-24 01:24:15,572 INFO:URLs: 56747
(CrawlQueue pid=20236) 2022-06-24 01:24:18,551 INFO:URLs: 61107
(CrawlQueue pid=20236) 2022-06-24 01:24:19,181 INFO:URLs: 62028
(CrawlQueue pid=20236) 2022-06-24 01:24:19,717 INFO:URLs: 62908
(CrawlQueue pid=20236) 2022-06-24 01:24:19,879 INFO:URLs: 63109
(CrawlQueue pid=20236) 2022-06-24 01:24:20,053 INFO:URLs: 63311
(CrawlQueue pid=20236) 2022-06-24 01:24:20,148 INFO:URLs: 63417
(CrawlQueue pid=20236) 2022-06-24 01:24:20,415 INFO:URLs: 63798
(CrawlQueue pid=20236) 2022-06-24 01:24:20,535 INFO:URLs: 63931
(CrawlQueue pid=20236) 2022-06-24 01:24:21,370 INFO:URLs: 65354
(CrawlQueue pid=20236) 2022-06-24 01:24:21,406 INFO:URLs: 65423
(CrawlQueue pid=20236) 2022-06-24 01:24:21,560 INFO:URLs: 65674
(CrawlQueue pid=20236) 2022-06-24 01:24:22,653 INFO:URLs: 66384
(CrawlQueue pid=20236) 2022-06-24 01:24:23,023 INFO:URLs: 66932
(CrawlQueue pid=20236) 2022-06-24 01:24:23,153 INFO:URLs: 67007
(CrawlQueue pid=20236) 2022-06-24 01:24:24,631 INFO:URLs: 69091
(CrawlQueue pid=20236) 2022-06-24 01:24:24,798 INFO:URLs: 69352
(CrawlQueue pid=20236) 2022-06-24 01:24:27,194 INFO:URLs: 72397
(CrawlQueue pid=20236) 2022-06-24 01:24:27,276 INFO:URLs: 72568
(CrawlQueue pid=20236) 2022-06-24 01:24:28,088 INFO:URLs: 73582
(CrawlQueue pid=20236) 2022-06-24 01:24:28,089 INFO:URLs: 73582
(CrawlQueue pid=20236) 2022-06-24 01:24:28,413 INFO:URLs: 74055
(CrawlQueue pid=20236) 2022-06-24 01:24:29,438 INFO:URLs: 75363
(CrawlQueue pid=20236) 2022-06-24 01:24:29,832 INFO:URLs: 75932
(CrawlQueue pid=20236) 2022-06-24 01:24:30,353 INFO:URLs: 76676
(CrawlQueue pid=20236) 2022-06-24 01:24:31,444 INFO:URLs: 77963
(CrawlQueue pid=20236) 2022-06-24 01:24:31,774 INFO:URLs: 78581
(CrawlQueue pid=20236) 2022-06-24 01:24:32,001 INFO:URLs: 78909
(CrawlQueue pid=20236) 2022-06-24 01:24:32,288 INFO:URLs: 79222
(CrawlQueue pid=20236) 2022-06-24 01:24:32,746 INFO:URLs: 79767
(CrawlQueue pid=20236) 2022-06-24 01:24:33,328 INFO:URLs: 80571
(CrawlQueue pid=20236) 2022-06-24 01:24:34,868 INFO:URLs: 82719
(CrawlQueue pid=20236) 2022-06-24 01:24:35,182 INFO:URLs: 83124

(CrawlQueue pid=20236) 2022-06-24 01:24:36,195 INFO:URLs: 84420
(CrawlQueue pid=20236) 2022-06-24 01:24:36,733 INFO:URLs: 85094
(CrawlQueue pid=20236) 2022-06-24 01:24:36,935 INFO:URLs: 85221
(CrawlQueue pid=20236) 2022-06-24 01:24:37,995 INFO:URLs: 86190
(CrawlQueue pid=20236) 2022-06-24 01:24:38,816 INFO:URLs: 86995
(CrawlQueue pid=20236) 2022-06-24 01:24:39,070 INFO:URLs: 87348
(CrawlQueue pid=20236) 2022-06-24 01:24:39,215 INFO:URLs: 87517
(CrawlQueue pid=20236) 2022-06-24 01:24:39,383 INFO:URLs: 87707
(CrawlQueue pid=20236) 2022-06-24 01:24:40,026 INFO:URLs: 88526
(CrawlQueue pid=20236) 2022-06-24 01:24:40,654 INFO:URLs: 89269
(CrawlQueue pid=20236) 2022-06-24 01:24:40,856 INFO:URLs: 89389
(CrawlQueue pid=20236) 2022-06-24 01:24:41,212 INFO:URLs: 89914
(CrawlQueue pid=20236) 2022-06-24 01:24:41,452 INFO:URLs: 90209
(CrawlQueue pid=20236) 2022-06-24 01:24:41,878 INFO:URLs: 90791
(CrawlQueue pid=20236) 2022-06-24 01:24:42,152 INFO:URLs: 91179
(CrawlQueue pid=20236) 2022-06-24 01:24:42,213 INFO:URLs: 91263
(CrawlQueue pid=20236) 2022-06-24 01:24:42,285 INFO:URLs: 91341
(CrawlQueue pid=20236) 2022-06-24 01:24:42,352 INFO:URLs: 91407
(CrawlQueue pid=20236) 2022-06-24 01:24:42,419 INFO:URLs: 91462
(CrawlQueue pid=20236) 2022-06-24 01:24:42,507 INFO:URLs: 91567
(CrawlQueue pid=20236) 2022-06-24 01:24:42,569 INFO:URLs: 91629
(CrawlQueue pid=20236) 2022-06-24 01:24:42,654 INFO:URLs: 91672
(CrawlQueue pid=20236) 2022-06-24 01:24:42,795 INFO:URLs: 91802
(CrawlQueue pid=20236) 2022-06-24 01:24:42,882 INFO:URLs: 91849
(CrawlQueue pid=20236) 2022-06-24 01:24:42,948 INFO:URLs: 91900
(CrawlQueue pid=20236) 2022-06-24 01:24:43,220 INFO:URLs: 92254
(CrawlQueue pid=20236) 2022-06-24 01:24:43,302 INFO:URLs: 92331
(CrawlQueue pid=20236) 2022-06-24 01:24:43,441 INFO:URLs: 92521
(CrawlQueue pid=20236) 2022-06-24 01:24:43,662 INFO:URLs: 92930
(CrawlQueue pid=20236) 2022-06-24 01:24:43,793 INFO:URLs: 93069
(CrawlQueue pid=20236) 2022-06-24 01:24:44,032 INFO:URLs: 93459
```
<!-- #endraw -->

In [None]:
class RayCrawler:
    """Crawler that exchanges URLs with a shared crawl-queue actor.

    The queue actor hands out URLs to fetch (``get_crawl_request``) and
    receives every link discovered in a fetched page (``add_crawl_request``).
    """

    def __init__(self, crawl_queue):
        """Store the handle of the crawl-queue actor to talk to."""
        self.crawl_queue = crawl_queue

    def download_url(self, url):
        """Fetch ``url`` with ``requests`` and return the response body text."""
        return requests.get(url).text

    def get_linked_urls(self, url, html):
        """Yield the ``href`` of every ``<a>`` tag found in ``html``.

        Hrefs starting with ``/`` are resolved against ``url``; all other
        non-empty hrefs are yielded unchanged. Anchors without an href (or
        with an empty one) are skipped.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if not path:
                # ``get`` returns None when the attribute is missing. Sending
                # None to the queue costs an actor call and later a crawl
                # attempt that can only fail.
                continue
            if path.startswith('/'):
                path = urljoin(url, path)
            yield path

    def add_url_to_visit(self, url):
        """Submit ``url`` to the queue actor without waiting for the result."""
        self.crawl_queue.add_crawl_request.remote(url)

    def crawl(self, url):
        """Download ``url`` and submit every link found in it to the queue."""
        html = self.download_url(url)
        for linked_url in self.get_linked_urls(url, html):
            self.add_url_to_visit(linked_url)

    def run(self):
        """Fetch URLs from the queue and crawl them, one at a time.

        The loop has no exit condition: it blocks in ``ray.get`` until the
        queue hands out a URL. Errors raised while crawling a URL are logged
        at DEBUG level and the loop continues; errors raised by ``ray.get``
        itself propagate to the caller.
        """
        while True:
            url = ray.get(self.crawl_queue.get_crawl_request.remote())
            try:
                self.crawl(url)
            except Exception:
                # Best effort: one broken page must not stop the worker.
                logging.debug('Failed to crawl: %s', url, exc_info=True)
                
@ray.remote
def worker(crawl_queue):
    # Ray task: configure INFO logging in this worker, then run a RayCrawler
    # that is connected to the given crawl-queue actor handle.
    logging.basicConfig(level=logging.INFO)
    crawler = RayCrawler(crawl_queue)
    crawler.run()

if __name__ == '__main__':
    # Create the queue actor seeded with the Wikipedia front page, start four
    # crawler tasks that share it, and block on their results.
    crawl_queue = CrawlQueue.remote(['https://en.wikipedia.org/'])
    worker_refs = [worker.remote(crawl_queue) for _ in range(4)]
    ray.get(worker_refs)

### Profiling the application

In order to profile where the bottlenecks of the application are, we save the task timeline to `/tmp/timeline.json`.

In [None]:
# Save the task timeline to a JSON file that can be loaded in Chrome Tracing.
ray.timeline(filename="/tmp/timeline.json")

We can load the timeline in Chrome Tracing -- open the Chrome browser, navigate to `chrome://tracing`, click the `Load` button and select the `/tmp/timeline.json` file. After zooming into the timeline a bit, you should see a timeline view like the following:

```{image} ../../images/web_crawler_timeline.png
:align: center
```

If you select one of the grey tasks, you see that it is from `CrawlQueue.add_crawl_request()` and there are lots of them.

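## Batching

Since each discovered URL results in its own `CrawlQueue.add_crawl_request()` actor call, we can reduce the number of calls by collecting all URLs found in a document and submitting them to the queue in a single batched call.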

In [None]:
import ray
import asyncio
import collections

@ray.remote
class BatchedCrawlQueue:
    # Build the queue and enqueue every url in `seed_urls`.
    async def __init__(self, seed_urls):
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s %(levelname)s:%(message)s')
        # Every url ever submitted - pending, in progress, and completed.
        # Entries are never removed, so this doubles as the "seen" filter.
        self.all_crawl_requests = set()
        # Urls that were accepted but not yet handed out to a crawler.
        self.pending_crawl_requests = asyncio.Queue()
        await self.add_crawl_requests(seed_urls)

    # Submit a batch of urls for crawling - crawlers call this once per
    # processed document with the URLs they encountered in it. Urls that
    # were submitted before are ignored.
    async def add_crawl_requests(self, urls):
        for candidate in urls:
            if candidate in self.all_crawl_requests:
                continue
            await self.pending_crawl_requests.put(candidate)
            self.all_crawl_requests.add(candidate)

    # Hand out the next url to crawl - called by an idle crawler.
    # Logs the number of urls seen so far, then waits until a pending
    # url is available and returns it.
    async def get_crawl_request(self):
        seen_count = len(self.all_crawl_requests)
        logging.info(f'URLs: {seen_count}')
        return await self.pending_crawl_requests.get()

In [None]:
# Request INFO-level logging in the driver. basicConfig does nothing when the
# root logger already has handlers, e.g. from an earlier basicConfig call.
logging.basicConfig(level=logging.INFO)

class BatchedRayCrawler:
    """Crawler that submits all links of a page to the queue in one call.

    The queue actor hands out URLs to fetch (``get_crawl_request``) and
    receives the links discovered in a fetched page as a single list
    (``add_crawl_requests``).
    """

    def __init__(self, crawl_queue):
        """Store the handle of the crawl-queue actor to talk to."""
        self.crawl_queue = crawl_queue

    def download_url(self, url):
        """Fetch ``url`` with ``requests`` and return the response body text."""
        return requests.get(url).text

    def get_linked_urls(self, url, html):
        """Yield the ``href`` of every ``<a>`` tag found in ``html``.

        Hrefs starting with ``/`` are resolved against ``url``; all other
        non-empty hrefs are yielded unchanged. Anchors without an href (or
        with an empty one) are skipped.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if not path:
                # ``get`` returns None when the attribute is missing. Sending
                # None to the queue would later cause a crawl attempt that
                # can only fail.
                continue
            if path.startswith('/'):
                path = urljoin(url, path)
            yield path

    def crawl(self, url):
        """Download ``url`` and submit all links found in it as one batch."""
        html = self.download_url(url)
        urls = list(self.get_linked_urls(url, html))
        # One actor call per page instead of one per link; the result is not
        # awaited.
        self.crawl_queue.add_crawl_requests.remote(urls)

    def run(self):
        """Fetch URLs from the queue and crawl them, one at a time.

        The loop has no exit condition: it blocks in ``ray.get`` until the
        queue hands out a URL. Successfully crawled URLs are logged at INFO
        level. Errors raised while crawling are logged at DEBUG level and the
        loop continues; errors raised by ``ray.get`` itself propagate to the
        caller.
        """
        while True:
            url = ray.get(self.crawl_queue.get_crawl_request.remote())
            try:
                self.crawl(url)
            except Exception:
                # Best effort: one broken page must not stop the worker.
                logging.debug('Failed to crawl: %s', url, exc_info=True)
            else:
                logging.info(url)
                
@ray.remote
def worker(crawl_queue):
    # Ray task: configure INFO logging in this worker, then run a
    # BatchedRayCrawler connected to the given crawl-queue actor handle.
    logging.basicConfig(level=logging.INFO)
    crawler = BatchedRayCrawler(crawl_queue)
    crawler.run()

if __name__ == '__main__':
    # Create the batched queue actor seeded with the Wikipedia front page,
    # start four crawler tasks that share it, and block on their results.
    crawl_queue = BatchedCrawlQueue.remote(['https://en.wikipedia.org/'])
    worker_refs = [worker.remote(crawl_queue) for _ in range(4)]
    ray.get(worker_refs)