In [1]:
# Notebook-wide display configuration
import platform

from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# Report the interpreter version this notebook is running on
platform.python_version()

'3.8.5'

In [2]:
import logging
import re
import scrapy
from scrapy import Selector
from scrapy.crawler import CrawlerProcess

In [3]:
class FLSpider(scrapy.Spider):
    """Crawl the Forgotten Languages blog and collect reference paragraphs.

    Starting from the front page, every crawled page yields one item holding
    the href and text of the first post-title link on the page, plus the
    cleaned text of the paragraphs matched by ``_sel``. The spider then
    follows the "older posts" pager link until a page has none.
    """

    name = 'FL'
    start_urls = [
        'https://forgottenlanguages-full.forgottenlanguages.org/',
    ]
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
        },
        'DOWNLOAD_DELAY': 0.25,
        # Cache responses on disk so repeated runs do not re-download pages.
        'HTTPCACHE_ENABLED': True,
        'HTTPCACHE_DIR': 'httpcache',
        'LOG_LEVEL': logging.INFO,
        'FEEDS': {
            'refs.json': {
                'format': 'json',
                'encoding': 'utf8',
                'store_empty': False,
                # Local-file feeds append by default; appending a second JSON
                # array to an existing file yields invalid JSON on re-runs.
                'overwrite': True,
            }
        }
    }

    # <p> siblings that follow the parent of the last <img> found inside the
    # post-body container(s) of the page.
    _sel = '(.//div[@class="post-body entry-content"]//img)[last()]/../following-sibling::p'

    # Matches a single markup tag; used to reduce extracted <p> HTML to text.
    _tag_re = re.compile('<[^<]+?>')

    def parse(self, response):
        """Extract one item from ``response`` and schedule the next older page.

        Yields:
            dict: ``{'url', 'title', 'refs'}`` when a post-title link is
                present on the page; ``refs`` is a list of non-empty strings.
            scrapy.Request: request for the older-posts page, when the pager
                link exists.
        """
        url = response.xpath('.//h3[@class="post-title entry-title"]/a/@href').get()
        # default='' keeps .strip() safe when the page has no post heading.
        title = response.xpath(
            './/h3[@class="post-title entry-title"]/a/text()'
        ).get(default='').strip()

        if url or title:
            refs = response.xpath(self._sel).getall()
            # getall() returns the serialized <p> elements: drop the tags,
            # trim whitespace, and discard paragraphs that end up empty.
            stripped = (self._tag_re.sub('', ref).strip() for ref in refs)
            clean_refs = [ref for ref in stripped if ref]
            yield {
                'url': url,
                'title': title,
                'refs': clean_refs
            }
        else:
            self.logger.warning('No post heading found on %s', response.url)

        # get next page
        next_page_url = response.xpath('.//a[@class="blog-pager-older-link"]/@href').get()
        # The last page has no "older" link; response.follow(None) would raise.
        if next_page_url:
            yield response.follow(next_page_url, callback=self.parse)

In [None]:
# Run the crawl inside this kernel: create a crawler process, schedule the
# FLSpider class on it, then start it.
process = CrawlerProcess()
# The returned Deferred is echoed in the cell output below.
process.crawl(FLSpider)
# NOTE(review): start() runs the Twisted reactor, which presumably cannot be
# restarted in the same kernel -- restart the kernel before re-running; confirm.
process.start()

2021-02-12 00:34:46 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: scrapybot)
2021-02-12 00:34:46 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.8.5 (default, Sep  4 2020, 02:22:02) - [Clang 10.0.0 ], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.1.1, Platform macOS-10.15.5-x86_64-i386-64bit
2021-02-12 00:34:46 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2021-02-12 00:34:46 [scrapy.crawler] INFO: Overridden settings:
{'DOWNLOAD_DELAY': 0.25, 'HTTPCACHE_ENABLED': True, 'LOG_LEVEL': 20}
2021-02-12 00:34:46 [scrapy.extensions.telnet] INFO: Telnet Password: 524d6e0aae2f33f9
2021-02-12 00:34:46 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.Lo

<Deferred at 0x7fb789c5a2e0>

2021-02-12 00:35:46 [scrapy.extensions.logstats] INFO: Crawled 47 pages (at 47 pages/min), scraped 46 items (at 46 items/min)
2021-02-12 00:36:46 [scrapy.extensions.logstats] INFO: Crawled 93 pages (at 46 pages/min), scraped 93 items (at 47 items/min)
2021-02-12 00:37:46 [scrapy.extensions.logstats] INFO: Crawled 149 pages (at 56 pages/min), scraped 149 items (at 56 items/min)
2021-02-12 00:38:46 [scrapy.extensions.logstats] INFO: Crawled 204 pages (at 55 pages/min), scraped 204 items (at 55 items/min)
2021-02-12 00:39:46 [scrapy.extensions.logstats] INFO: Crawled 261 pages (at 57 pages/min), scraped 261 items (at 57 items/min)
2021-02-12 00:40:46 [scrapy.extensions.logstats] INFO: Crawled 310 pages (at 49 pages/min), scraped 310 items (at 49 items/min)
2021-02-12 00:41:46 [scrapy.extensions.logstats] INFO: Crawled 356 pages (at 46 pages/min), scraped 356 items (at 46 items/min)
2021-02-12 00:42:46 [scrapy.extensions.logstats] INFO: Crawled 411 pages (at 55 pages/min), scraped 411 item