In [1]:
# Notebook-wide setup: display behaviour and interpreter version report.
import platform
from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Record which Python version produced the outputs below.
platform.python_version()

'3.8.5'

In [2]:
import logging
import re
import scrapy
from scrapy import Selector
from scrapy.crawler import CrawlerProcess

In [3]:
class FLSpider(scrapy.Spider):
    """Crawl the Forgotten Languages blog and export each post's references.

    Starting from ``start_urls``, every fetched page produces one item with
    the keys ``url``, ``title``, ``refs`` and ``hrefs``; the spider then
    follows the "older posts" pager link until a page has none.
    Items are exported to ``refs.json`` through the ``FEEDS`` setting.
    """

    name = 'FL'
    start_urls = [
        'https://forgottenlanguages-full.forgottenlanguages.org/',
    ]
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
        },
        # Seconds to wait between consecutive requests.
        'DOWNLOAD_DELAY': 0.25,
        # Keep fetched responses in the 'httpcache' directory so that
        # re-running the notebook does not hit the site again.
        'HTTPCACHE_ENABLED': True,
        'HTTPCACHE_DIR': 'httpcache',
        'LOG_LEVEL': logging.INFO,
        'FEEDS': {
            'refs.json': {
                'format': 'json',
                'encoding': 'utf8',
                'store_empty': False,
                # Replace the file on every run: the local-file default is
                # to append, and a second JSON array appended to an existing
                # file leaves it unparseable (option needs Scrapy >= 2.4).
                'overwrite': True,
            }
        }
    }

    # Link inside the post heading; carries both the permalink and the title.
    _sel_title_link = './/h3[@class="post-title entry-title"]/a'
    # Paragraphs that follow the parent of the LAST image in the post body.
    # The parentheses matter: "(...//img)[last()]" picks the single last image
    # of the whole body, whereas "//img[last()]" would match every image that
    # happens to be the last <img> among its own siblings.
    _sel_refs = '(.//div[@class="post-body entry-content"]//img)[last()]/../following-sibling::p'
    # Link targets found directly inside those same paragraphs.
    _sel_ref_hrefs = _sel_refs + '/a/@href'
    _sel_next_page = './/a[@class="blog-pager-older-link"]/@href'
    # Matches one HTML tag; used to reduce serialized <p> elements to text.
    _re_tag = re.compile('<[^<]+?>')

    def parse(self, response):
        """Extract one item from ``response`` and schedule the next page.

        Yields:
            dict: ``url`` and ``title`` of the first post heading on the page
                (``None`` when the page has no such heading), ``refs`` as the
                non-empty, tag-stripped paragraph texts after the last image,
                and ``hrefs`` as the link targets inside those paragraphs.
            scrapy.Request: request for the older-posts page, when the pager
                link is present.
        """
        url = response.xpath(self._sel_title_link + '/@href').get()
        raw_title = response.xpath(self._sel_title_link + '/text()').get()
        # .get() returns None when nothing matches; do not call .strip() on it.
        title = raw_title.strip() if raw_title is not None else None

        # getall() on element nodes returns serialized HTML, hence the
        # tag stripping; paragraphs that end up empty are dropped.
        stripped = (
            self._re_tag.sub('', paragraph).strip()
            for paragraph in response.xpath(self._sel_refs).getall()
        )
        clean_refs = [text for text in stripped if text]

        # Built from the same selector as the refs so both lists describe
        # the same set of paragraphs.
        hrefs = response.xpath(self._sel_ref_hrefs).getall()

        yield {
            'url': url,
            'title': title,
            'refs': clean_refs,
            'hrefs': hrefs
        }

        # Continue with the next (older) page, if the pager offers one.
        next_page_url = response.xpath(self._sel_next_page).get()
        if next_page_url:
            yield response.follow(next_page_url, self.parse)

In [None]:
# Run FLSpider inside this kernel; scraped items are exported according to
# the spider's FEEDS setting (refs.json).
process = CrawlerProcess()
# crawl() returns a Deferred, which shows up in the cell output because
# ast_node_interactivity is set to "all" in the first cell.
process.crawl(FLSpider)
# NOTE(review): per the Scrapy docs start() runs the Twisted reactor and blocks
# until the crawl ends; the reactor presumably cannot be started twice, so
# re-running this cell likely requires a kernel restart - confirm.
process.start()

2021-02-12 02:40:16 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: scrapybot)
2021-02-12 02:40:16 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.8.5 (default, Sep  4 2020, 02:22:02) - [Clang 10.0.0 ], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.1.1, Platform macOS-10.15.5-x86_64-i386-64bit
2021-02-12 02:40:16 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2021-02-12 02:40:16 [scrapy.crawler] INFO: Overridden settings:
{'DOWNLOAD_DELAY': 0.25, 'HTTPCACHE_ENABLED': True, 'LOG_LEVEL': 20}
2021-02-12 02:40:16 [scrapy.extensions.telnet] INFO: Telnet Password: ada05cee6c1c0b84
2021-02-12 02:40:16 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.Lo

<Deferred at 0x7fa24744f670>

2021-02-12 02:41:16 [scrapy.extensions.logstats] INFO: Crawled 487 pages (at 487 pages/min), scraped 486 items (at 486 items/min)
2021-02-12 02:42:16 [scrapy.extensions.logstats] INFO: Crawled 979 pages (at 492 pages/min), scraped 978 items (at 492 items/min)
2021-02-12 02:43:16 [scrapy.extensions.logstats] INFO: Crawled 1505 pages (at 526 pages/min), scraped 1504 items (at 526 items/min)
2021-02-12 02:44:16 [scrapy.extensions.logstats] INFO: Crawled 2024 pages (at 519 pages/min), scraped 2023 items (at 519 items/min)
2021-02-12 02:45:16 [scrapy.extensions.logstats] INFO: Crawled 2545 pages (at 521 pages/min), scraped 2544 items (at 521 items/min)
2021-02-12 02:46:16 [scrapy.extensions.logstats] INFO: Crawled 3059 pages (at 514 pages/min), scraped 3058 items (at 514 items/min)
2021-02-12 02:47:16 [scrapy.extensions.logstats] INFO: Crawled 3576 pages (at 517 pages/min), scraped 3575 items (at 517 items/min)
2021-02-12 02:48:16 [scrapy.extensions.logstats] INFO: Crawled 4090 pages (at 51