Skip to content

AttributeError: 'NoneType' object has no attribute 'headers' #10

@michaelvsinko

Description

@michaelvsinko

The error occurs at seemingly arbitrary times — sometimes only after crawling 500 pages, sometimes on the very first one.

Example to reproduce

import re

from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
from scrapy.utils.project import get_project_settings


class Spider1(Spider):
  """Crawl a paginated member list, yielding one item per member name.

  Each request is rendered through Playwright (``meta={"playwright": True}``).
  Pagination works by reading the next page's ``offset=N`` query parameter
  from the "next" link and substituting it into the current response URL.
  """

  name = "spider1"
  custom_settings = {
    "FEEDS": {
      "members.json": {
        "format": "jsonlines",
        "encoding": "utf-8",
        "store_empty": False,
      },
    }
  }

  # Query string appended to each start URL to land on the members listing.
  URL_MEMBERS_POSTFIX = "?act=members&offset=0"
  # XPath: anchor elements for each member entry on the page.
  MEMBERS_LIST = '//div[@id="mcont"]/descendant::div[has-class("upanel")]/a[has-class("inline_item")]'
  # XPath (relative to a member anchor): the member's display name text.
  MEMBER_NAME = './div[has-class("ii_body")]/span[has-class("ii_owner")]/text()'
  # XPath: href of the pagination link immediately after the selected page.
  NEXT_URL = '//div[@id="mcont"]/descendant::div[has-class("upanel")]/div[has-class("pagination")]/a[has-class("pg_link_sel")]/following-sibling::a[has-class("pg_link")]/@href'

  def start_requests(self):
    """Yield one Playwright-rendered request per configured start URL."""
    for url in self.start_urls:
      yield Request(url=url + self.URL_MEMBERS_POSTFIX, meta={"playwright": True})

  def parse(self, response):
    """Extract member names from *response* and schedule the next page.

    Yields ``{"member_name": ...}`` dicts, then a follow-up ``Request``
    when a "next page" pagination link is present.
    """
    selector = Selector(response)

    for member in selector.xpath(self.MEMBERS_LIST):
      # ``.get()`` returns None when the name node is missing; emit anyway
      # so gaps in the source markup are visible in the feed.
      yield {"member_name": member.xpath(self.MEMBER_NAME).get()}

    next_url = selector.xpath(self.NEXT_URL).get()
    if next_url:
      # Fix: the original used re.findall(...)[0], which raises IndexError
      # when the href carries no "offset=" parameter. re.search returns
      # None in that case, letting us stop pagination gracefully.
      offset_match = re.search(r"offset=[0-9]*", next_url)
      if offset_match:
        # Rewrite the *current* URL's offset rather than following the raw
        # href, keeping the act=members query string intact.
        next_url = re.sub(r"offset=[0-9]*", offset_match.group(0), response.url)
        yield Request(url=next_url, meta={"playwright": True}, callback=self.parse)
      
      
if __name__ == "__main__":
  settings = get_project_settings()
  process = CrawlerProcess(settings=settings)
  process.crawl(Spider1, start_urls=["https://m.vk.com/vkmusicians"])
  #process.crawl(Spider1, start_urls=["https://m.vk.com/tumblr_perf"])
  process.start()

Settings

# Scrapy project settings used when reproducing the issue.
BOT_NAME = 'crawlers'

SPIDER_MODULES = ['crawlers.spiders']
NEWSPIDER_MODULE = 'crawlers.spiders'

# Do not fetch/honor robots.txt for this crawl.
ROBOTSTXT_OBEY = False

# Throttle: one request at a time, 3 s apart, to stay gentle on the target.
DOWNLOAD_DELAY = 3

CONCURRENT_REQUESTS = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 1
CONCURRENT_REQUESTS_PER_IP = 1

COOKIES_ENABLED = False

# scrapy-playwright requires the asyncio-based Twisted reactor.
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

# Route HTTPS downloads through Playwright (HTTP handler left disabled).
DOWNLOAD_HANDLERS = {
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    #"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}

# Replace the stock user-agent and retry middlewares with the
# scrapy-fake-useragent equivalents (None disables the built-ins).
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
    'scrapy_fake_useragent.middleware.RetryUserAgentMiddleware': 401,
}

# User-agent providers tried in order; the fixed fallback is used last.
FAKEUSERAGENT_PROVIDERS = [
    'scrapy_fake_useragent.providers.FakeUserAgentProvider',
    'scrapy_fake_useragent.providers.FakerProvider',
    'scrapy_fake_useragent.providers.FixedUserAgentProvider',
]
FAKEUSERAGENT_FALLBACK = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36 (KHTML, like Gecko) Brave Chrome/89.0.4389.105 Safari/537.36'

PLAYWRIGHT_BROWSER_TYPE = "chromium"
FAKE_USERAGENT_RANDOM_UA_TYPE = "chrome"
FAKER_RANDOM_UA_TYPE = "chrome"

Error

Traceback (most recent call last):
  File "/Users/user/develop/work/crawlers/.venv/lib/python3.8/site-packages/twisted/internet/defer.py", line 1443, in _inlineCallbacks
    result = current_context.run(result.throwExceptionIntoGenerator, g)
  File "/Users/user/develop/work/crawlers/.venv/lib/python3.8/site-packages/twisted/python/failure.py", line 500, in throwExceptionIntoGenerator
    return g.throw(self.type, self.value, self.tb)
  File "/Users/user/develop/work/crawlers/.venv/lib/python3.8/site-packages/scrapy/core/downloader/middleware.py", line 44, in process_request
    return (yield download_func(request=request, spider=spider))
  File "/Users/user/develop/work/crawlers/.venv/lib/python3.8/site-packages/twisted/internet/defer.py", line 837, in adapt
    extracted = result.result()
  File "/Users/user/develop/work/crawlers/.venv/lib/python3.8/site-packages/scrapy_playwright/handler.py", line 140, in _download_request
    result = await self._download_request_with_page(request, spider, page)
  File "/Users/user/develop/work/crawlers/.venv/lib/python3.8/site-packages/scrapy_playwright/handler.py", line 180, in _download_request_with_page
    headers = Headers(response.headers)
AttributeError: 'NoneType' object has no attribute 'headers'

Using

  • python 3.8.5
  • scrapy 2.5.0
  • playwright 1.10.0
  • scrapy-playwright 0.0.3
  • scrapy-fake-useragent 1.4.4

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug — Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions