Skip to content

Commit

Permalink
Merge pull request #1774 from scrapy/backport-1.1
Browse files Browse the repository at this point in the history
[MRG+1] [1.1.x][backport] #1766, #1770, #1750, #1662, #1765
  • Loading branch information
kmike committed Feb 15, 2016
2 parents 56ad73f + f928014 commit 7ce3242
Show file tree
Hide file tree
Showing 11 changed files with 148 additions and 14 deletions.
6 changes: 5 additions & 1 deletion docs/news.rst
Expand Up @@ -97,6 +97,7 @@ Additional New Features and Enhancements

- Dict-like settings now have per-key priorities
(:issue:`1135`, :issue:`1149` and :issue:`1586`).
- Sending non-ASCII emails (:issue:`1662`).
- ``CloseSpider`` and ``SpiderState`` extensions now get disabled if no relevant
setting is set (:issue:`1723`, :issue:`1725`).
- Added method ``ExecutionEngine.close`` (:issue:`1423`).
Expand All @@ -105,7 +106,7 @@ Additional New Features and Enhancements
:issue:`1335`, :issue:`1683`, :issue:`1660`, :issue:`1642`, :issue:`1721`,
:issue:`1727`).
- Other refactoring, optimizations and cleanup (:issue:`1476`, :issue:`1481`,
  :issue:`1477`, :issue:`1315`, :issue:`1290` and :issue:`1750`).

.. _`Code of Conduct`: https://github.com/scrapy/scrapy/blob/master/CODE_OF_CONDUCT.md

Expand All @@ -124,6 +125,9 @@ Deprecations and Removals
+ ``scrapy.utils.datatypes.MultiValueDict``
+ ``scrapy.utils.datatypes.SiteNode``

- The previously bundled ``scrapy.xlib.pydispatch`` library was deprecated and
replaced by `pydispatcher <https://pypi.python.org/pypi/PyDispatcher>`_.


Relocations
~~~~~~~~~~~
Expand Down
5 changes: 4 additions & 1 deletion docs/topics/email.rst
Expand Up @@ -76,7 +76,7 @@ uses `Twisted non-blocking IO`_, like the rest of the framework.
:param settings: the e-mail recipients
:type settings: :class:`scrapy.settings.Settings` object

.. method:: send(to, subject, body, cc=None, attachs=(), mimetype='text/plain', charset=None)

Send email to the given recipients.

Expand All @@ -102,6 +102,9 @@ uses `Twisted non-blocking IO`_, like the rest of the framework.
:param mimetype: the MIME type of the e-mail
:type mimetype: str

:param charset: the character encoding to use for the e-mail contents
:type charset: str


.. _topics-email-settings:

Expand Down
23 changes: 17 additions & 6 deletions scrapy/core/engine.py
Expand Up @@ -177,12 +177,23 @@ def _handle_downloader_output(self, response, request, spider):
return d

def spider_is_idle(self, spider):
    """Return True if the engine has no outstanding work for *spider*.

    The engine is idle only when all four conditions hold: the scraper
    slot is idle, the downloader has no requests in flight, every start
    request has been consumed, and the scheduler queue is empty.

    NOTE(review): the scraped diff concatenated the old single-expression
    implementation with the refactored early-return version, leaving
    unreachable duplicate code after ``return idle``; only the refactored
    version is kept here.
    """
    if not self.scraper.slot.is_idle():
        # scraper still has responses/items being processed
        return False

    if self.downloader.active:
        # downloader has requests in flight
        return False

    if self.slot.start_requests is not None:
        # not all start requests have been consumed yet
        return False

    if self.slot.scheduler.has_pending_requests():
        # scheduler still has requests queued for download
        return False

    return True

@property
def open_spiders(self):
Expand Down
22 changes: 19 additions & 3 deletions scrapy/http/cookies.py
Expand Up @@ -137,13 +137,29 @@ def is_unverifiable(self):
"""
return self.request.meta.get('is_unverifiable', False)

# Python 3's cookiejar reads request.unverifiable / request.origin_req_host
# as attributes rather than calling getter methods; property aliases below
# delegate to these Python 2-style getters.
def get_origin_req_host(self):
    # Origin request host, taken from the hostname of the request URL.
    return urlparse_cached(self.request).hostname

# Python 3's http.cookiejar accesses these values as plain attributes
# instead of getter methods, so expose property aliases.
@property
def full_url(self):
    # Attribute alias for get_full_url() (Python 3 cookiejar compatibility).
    return self.get_full_url()

@property
def host(self):
    # Attribute alias for get_host() (Python 3 cookiejar compatibility).
    return self.get_host()

@property
def type(self):
    # Attribute alias for get_type() (Python 3 cookiejar compatibility).
    # Shadows the builtin name deliberately: cookiejar expects request.type.
    return self.get_type()

@property
def unverifiable(self):
    # Attribute alias for is_unverifiable() (Python 3 cookiejar compatibility).
    return self.is_unverifiable()

@property
def origin_req_host(self):
    """Attribute alias for get_origin_req_host() (Python 3 cookiejar
    compatibility).

    NOTE(review): the scraped diff left a second, duplicate definition of
    ``get_origin_req_host`` immediately above this property (the method is
    already defined earlier in the class); the dead duplicate is removed.
    """
    return self.get_origin_req_host()

def has_header(self, name):
    # True if the wrapped request carries the named header.
    return name in self.request.headers
Expand Down
7 changes: 5 additions & 2 deletions scrapy/mail.py
Expand Up @@ -43,7 +43,7 @@ def from_settings(cls, settings):
settings['MAIL_PASS'], settings.getint('MAIL_PORT'),
settings.getbool('MAIL_TLS'), settings.getbool('MAIL_SSL'))

def send(self, to, subject, body, cc=None, attachs=(), mimetype='text/plain', _callback=None):
def send(self, to, subject, body, cc=None, attachs=(), mimetype='text/plain', charset=None, _callback=None):
if attachs:
msg = MIMEMultipart()
else:
Expand All @@ -57,8 +57,11 @@ def send(self, to, subject, body, cc=None, attachs=(), mimetype='text/plain', _c
rcpts.extend(cc)
msg['Cc'] = COMMASPACE.join(cc)

if charset:
msg.set_charset(charset)

if attachs:
msg.attach(MIMEText(body))
msg.attach(MIMEText(body, 'plain', charset or 'us-ascii'))
for attach_name, mimetype, f in attachs:
part = MIMEBase(*mimetype.split('/'))
part.set_payload(f.read())
Expand Down
2 changes: 1 addition & 1 deletion scrapy/spiders/sitemap.py
Expand Up @@ -32,7 +32,7 @@ def start_requests(self):

def _parse_sitemap(self, response):
if response.url.endswith('/robots.txt'):
for url in sitemap_urls_from_robots(response.body):
for url in sitemap_urls_from_robots(response.text):
yield Request(url, callback=self._parse_sitemap)
else:
body = self._get_sitemap_body(response)
Expand Down
19 changes: 19 additions & 0 deletions scrapy/xlib/pydispatch.py
@@ -0,0 +1,19 @@
"""Deprecated shim for the previously bundled ``scrapy.xlib.pydispatch``.

Re-exports the external ``pydispatch`` package's submodules so legacy
imports keep working, and emits a ScrapyDeprecationWarning on import.
Import ``pydispatch`` directly (or use ``from_crawler`` for signals).
"""
from __future__ import absolute_import

import warnings
from scrapy.exceptions import ScrapyDeprecationWarning

# Re-export the external package under the old bundled location.
from pydispatch import (
    dispatcher,
    errors,
    robust,
    robustapply,
    saferef,
)

# stacklevel=2 points the warning at the importing module, not this shim.
warnings.warn("Importing from scrapy.xlib.pydispatch is deprecated and will"
              " no longer be supported in future Scrapy versions."
              " If you just want to connect signals use the from_crawler class method,"
              " otherwise import pydispatch directly if needed."
              " See: https://github.com/scrapy/scrapy/issues/1762",
              ScrapyDeprecationWarning, stacklevel=2)
4 changes: 4 additions & 0 deletions tests/test_http_cookies.py
Expand Up @@ -14,12 +14,15 @@ def setUp(self):

def test_get_full_url(self):
self.assertEqual(self.wrapped.get_full_url(), self.request.url)
self.assertEqual(self.wrapped.full_url, self.request.url)

def test_get_host(self):
self.assertEqual(self.wrapped.get_host(), urlparse(self.request.url).netloc)
self.assertEqual(self.wrapped.host, urlparse(self.request.url).netloc)

def test_get_type(self):
self.assertEqual(self.wrapped.get_type(), urlparse(self.request.url).scheme)
self.assertEqual(self.wrapped.type, urlparse(self.request.url).scheme)

def test_is_unverifiable(self):
self.assertFalse(self.wrapped.is_unverifiable())
Expand All @@ -32,6 +35,7 @@ def test_is_unverifiable2(self):

def test_get_origin_req_host(self):
self.assertEqual(self.wrapped.get_origin_req_host(), 'www.example.com')
self.assertEqual(self.wrapped.origin_req_host, 'www.example.com')

def test_has_header(self):
self.assertTrue(self.wrapped.has_header('content-type'))
Expand Down
50 changes: 50 additions & 0 deletions tests/test_mail.py
@@ -1,5 +1,8 @@
# coding=utf-8

import unittest
from io import BytesIO
from email.charset import Charset

from scrapy.mail import MailSender

Expand Down Expand Up @@ -54,11 +57,58 @@ def test_send_attach(self):

text, attach = payload
self.assertEqual(text.get_payload(decode=True), b'body')
self.assertEqual(text.get_charset(), Charset('us-ascii'))
self.assertEqual(attach.get_payload(decode=True), b'content')

def _catch_mail_sent(self, **kwargs):
self.catched_msg = dict(**kwargs)

def test_send_utf8(self):
    """A non-ASCII subject and body must survive send() with charset='utf-8'."""
    subject = u'sübjèçt'
    body = u'bödÿ-àéïöñß'
    sender = MailSender(debug=True)
    sender.send(to=['test@scrapy.org'], subject=subject, body=body,
                charset='utf-8', _callback=self._catch_mail_sent)

    assert self.catched_msg
    for key, expected in (('subject', subject), ('body', body)):
        self.assertEqual(self.catched_msg[key], expected)

    # The built message must carry the charset through headers and payload.
    msg = self.catched_msg['msg']
    self.assertEqual(msg['subject'], subject)
    self.assertEqual(msg.get_payload(), body)
    self.assertEqual(msg.get_charset(), Charset('utf-8'))
    self.assertEqual(msg.get('Content-Type'), 'text/plain; charset="utf-8"')

def test_send_attach_utf8(self):
    """Attachments alongside a UTF-8 body must keep their encoding."""
    subject = u'sübjèçt'
    body = u'bödÿ-àéïöñß'
    # BytesIO(initial_bytes) starts positioned at 0, equivalent to
    # write() followed by seek(0).
    attach = BytesIO(body.encode('utf-8'))
    attachs = [('attachment', 'text/plain', attach)]

    sender = MailSender(debug=True)
    sender.send(to=['test@scrapy.org'], subject=subject, body=body,
                attachs=attachs, charset='utf-8', _callback=self._catch_mail_sent)

    assert self.catched_msg
    self.assertEqual(self.catched_msg['subject'], subject)
    self.assertEqual(self.catched_msg['body'], body)

    msg = self.catched_msg['msg']
    self.assertEqual(msg['subject'], subject)
    self.assertEqual(msg.get_charset(), Charset('utf-8'))
    self.assertEqual(msg.get('Content-Type'), 'multipart/mixed; charset="utf-8"')

    # A multipart message: text part first, then the attachment.
    payload = msg.get_payload()
    assert isinstance(payload, list)
    self.assertEqual(len(payload), 2)

    text_part, attach_part = payload
    self.assertEqual(text_part.get_payload(decode=True).decode('utf-8'), body)
    self.assertEqual(text_part.get_charset(), Charset('utf-8'))
    self.assertEqual(attach_part.get_payload(decode=True).decode('utf-8'), body)

# Allow running this test module directly: python test_mail.py
if __name__ == "__main__":
    unittest.main()
12 changes: 12 additions & 0 deletions tests/test_pydispatch_deprecated.py
@@ -0,0 +1,12 @@
import unittest
import warnings
from six.moves import reload_module


class DeprecatedPydispatchTest(unittest.TestCase):
    """scrapy.xlib.pydispatch is a deprecated shim and must warn on import."""

    def test_import_xlib_pydispatch_show_warning(self):
        with warnings.catch_warnings(record=True) as caught:
            from scrapy.xlib import pydispatch
            # Reload so the module-level warning fires even when the shim
            # was already imported elsewhere in the test run.
            reload_module(pydispatch)
            self.assertIn('Importing from scrapy.xlib.pydispatch is deprecated',
                          str(caught[0].message))
12 changes: 12 additions & 0 deletions tests/test_spider.py
Expand Up @@ -328,6 +328,18 @@ def test_get_sitemap_body_xml_url_compressed(self):
r = Response(url="http://www.example.com/sitemap.xml.gz", body=self.GZBODY)
self.assertSitemapBody(r, self.BODY)

def test_get_sitemap_urls_from_robotstxt(self):
    # Parsing a robots.txt response must yield one request per Sitemap line,
    # ignoring comment lines.
    robots = b"""# Sitemap files
Sitemap: http://example.com/sitemap.xml
Sitemap: http://example.com/sitemap-product-index.xml
"""

    response = TextResponse(url="http://www.example.com/robots.txt", body=robots)
    spider = self.spider_class("example.com")
    urls = [req.url for req in spider._parse_sitemap(response)]
    self.assertEqual(urls, ['http://example.com/sitemap.xml',
                            'http://example.com/sitemap-product-index.xml'])


class BaseSpiderDeprecationTest(unittest.TestCase):

Expand Down

0 comments on commit 7ce3242

Please sign in to comment.