-
Notifications
You must be signed in to change notification settings - Fork 149
Closed
Labels
bug — Something isn't working
Description
The error exists at arbitrary times. Maybe after viewing 500 pages, or on the very first one.
Example to reproduce
import re
from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
from scrapy.utils.project import get_project_settings
class Spider1(Spider):
    """Minimal spider reproducing the scrapy-playwright ``NoneType`` response error.

    Crawls the paginated members listing of a vk.com public page and yields
    one ``{"member_name": ...}`` item per listed member, following the
    pagination links until none remain.
    """

    name = "spider1"
    custom_settings = {
        "FEEDS": {
            "members.json": {
                "format": "jsonlines",
                "encoding": "utf-8",
                "store_empty": False,
            },
        }
    }
    # Query string appended to each start URL to open the members tab at page 0.
    URL_MEMBERS_POSTFIX = "?act=members&offset=0"
    # XPath: one <a> element per member entry in the listing panel.
    MEMBERS_LIST = '//div[@id="mcont"]/descendant::div[has-class("upanel")]/a[has-class("inline_item")]'
    # XPath (relative to a member entry): the member's display name text.
    MEMBER_NAME = './div[has-class("ii_body")]/span[has-class("ii_owner")]/text()'
    # XPath: href of the pagination link immediately after the selected page.
    NEXT_URL = '//div[@id="mcont"]/descendant::div[has-class("upanel")]/div[has-class("pagination")]/a[has-class("pg_link_sel")]/following-sibling::a[has-class("pg_link")]/@href'

    def start_requests(self):
        # Route every request through Playwright via the "playwright" meta key.
        for url in self.start_urls:
            yield Request(url=url + self.URL_MEMBERS_POSTFIX, meta={"playwright": True})

    def _next_page_url(self, current_url, next_href):
        """Return the next page's URL, or ``None`` when no offset is found.

        Carries the ``offset=N`` value found in *next_href* over into
        *current_url*, preserving the rest of the current query string.
        """
        match = re.search(r"offset=[0-9]*", next_href)
        if match is None:
            # was: re.findall(...)[0] — raised IndexError on hrefs without
            # an offset parameter; now pagination simply stops instead.
            return None
        return re.sub(r"offset=[0-9]*", match.group(0), current_url)

    def parse(self, response):
        # response.xpath() delegates to response.selector, so building a
        # separate Selector(response) is redundant.
        for member in response.xpath(self.MEMBERS_LIST):
            yield {"member_name": member.xpath(self.MEMBER_NAME).get()}
        next_href = response.xpath(self.NEXT_URL).get()
        if next_href:
            next_url = self._next_page_url(response.url, next_href)
            if next_url:
                yield Request(url=next_url, meta={"playwright": True}, callback=self.parse)
if __name__ == "__main__":
    # Restored indentation (the pasted snippet had lost it, making the guard
    # block invalid Python). Runs the repro spider under the project settings.
    settings = get_project_settings()
    process = CrawlerProcess(settings=settings)
    process.crawl(Spider1, start_urls=["https://m.vk.com/vkmusicians"])
    #process.crawl(Spider1, start_urls=["https://m.vk.com/tumblr_perf"])
    process.start()

Settings
BOT_NAME = 'crawlers'
SPIDER_MODULES = ['crawlers.spiders']
NEWSPIDER_MODULE = 'crawlers.spiders'
# robots.txt is deliberately ignored for this reproduction.
ROBOTSTXT_OBEY = False
# Throttle hard: a single request at a time, 3 seconds apart.
DOWNLOAD_DELAY = 3
CONCURRENT_REQUESTS = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 1
CONCURRENT_REQUESTS_PER_IP = 1
COOKIES_ENABLED = False
# asyncio-based Twisted reactor — required by scrapy-playwright, which drives
# Playwright through asyncio.
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
# Only https traffic goes through Playwright; the plain-http handler is
# commented out, so http:// URLs would use Scrapy's default downloader.
DOWNLOAD_HANDLERS = {
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
#"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
# Replace Scrapy's built-in user-agent and retry middlewares with the
# scrapy-fake-useragent equivalents (random UA per request, UA rotation on retry).
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
'scrapy_fake_useragent.middleware.RetryUserAgentMiddleware': 401,
}
# UA providers are tried in order; the fixed fallback below is used last.
FAKEUSERAGENT_PROVIDERS = [
'scrapy_fake_useragent.providers.FakeUserAgentProvider',
'scrapy_fake_useragent.providers.FakerProvider',
'scrapy_fake_useragent.providers.FixedUserAgentProvider',
]
FAKEUSERAGENT_FALLBACK = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36 (KHTML, like Gecko) Brave Chrome/89.0.4389.105 Safari/537.36'
PLAYWRIGHT_BROWSER_TYPE = "chromium"
FAKE_USERAGENT_RANDOM_UA_TYPE = "chrome"
FAKER_RANDOM_UA_TYPE = "chrome"

Error
Traceback (most recent call last):
File "/Users/user/develop/work/crawlers/.venv/lib/python3.8/site-packages/twisted/internet/defer.py", line 1443, in _inlineCallbacks
result = current_context.run(result.throwExceptionIntoGenerator, g)
File "/Users/user/develop/work/crawlers/.venv/lib/python3.8/site-packages/twisted/python/failure.py", line 500, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "/Users/user/develop/work/crawlers/.venv/lib/python3.8/site-packages/scrapy/core/downloader/middleware.py", line 44, in process_request
return (yield download_func(request=request, spider=spider))
File "/Users/user/develop/work/crawlers/.venv/lib/python3.8/site-packages/twisted/internet/defer.py", line 837, in adapt
extracted = result.result()
File "/Users/user/develop/work/crawlers/.venv/lib/python3.8/site-packages/scrapy_playwright/handler.py", line 140, in _download_request
result = await self._download_request_with_page(request, spider, page)
File "/Users/user/develop/work/crawlers/.venv/lib/python3.8/site-packages/scrapy_playwright/handler.py", line 180, in _download_request_with_page
headers = Headers(response.headers)
AttributeError: 'NoneType' object has no attribute 'headers'
Using
- python 3.8.5
- scrapy 2.5.0
- playwright 1.10.0
- scrapy-playwright 0.0.3
- scrapy-fake-useragent 1.4.4
Metadata
Metadata
Assignees
Labels
bug — Something isn't working