[MRG] AjaxCrawlableMiddleware #343

Merged (7 commits) on Jan 16, 2014
Changes from 5 commits
23 changes: 23 additions & 0 deletions docs/topics/broad-crawls.rst
@@ -118,3 +118,26 @@ crawler to dedicate too many resources on any specific domain.
To disable redirects use::

REDIRECT_ENABLED = False

Enable crawling of "Ajax Crawlable Pages"
=========================================

Some pages (up to 1%) declare themselves as `ajax crawlable`_. This means they
provide a plain HTML version of content that is usually available only via
AJAX. Pages can indicate this in two ways:

1) by using ``#!`` in the URL - this is the default way;
2) by using a special meta tag - this way is used on
"main", "index" website pages.

Scrapy handles (1) automatically; to handle (2) enable
:ref:`AjaxCrawlableMiddleware <ajaxcrawlable-middleware>`::

AJAXCRAWLABLE_ENABLED = True

When doing broad crawls it's common to crawl a lot of "index" web pages;
AjaxCrawlableMiddleware helps to crawl them correctly.
It is turned OFF by default because it has some performance overhead,
and enabling it for focused crawls doesn't make much sense.
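
The rewrite Scrapy applies automatically in case (1) can be reproduced with
the ``escape_ajax`` helper from ``scrapy.utils.url`` (the small function
Scrapy uses internally when constructing requests; shown here only as an
illustration, with a placeholder URL)::

    >>> from scrapy.utils.url import escape_ajax
    >>> escape_ajax('http://example.com/page#!key=value')
    'http://example.com/page?_escaped_fragment_=key%3Dvalue'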

.. _ajax crawlable: https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
38 changes: 37 additions & 1 deletion docs/topics/downloader-middleware.rst
@@ -96,7 +96,7 @@ single Python class that defines one or more of the following methods:
.. method:: process_response(request, response, spider)

:meth:`process_response` should either: return a :class:`~scrapy.http.Response`
object, return a :class:`~scrapy.http.Request` object or
raise a :exc:`~scrapy.exceptions.IgnoreRequest` exception.

If it returns a :class:`~scrapy.http.Response` (it could be the same given
@@ -795,6 +795,42 @@ UserAgentMiddleware
In order for a spider to override the default user agent, its `user_agent`
attribute must be set.

.. _ajaxcrawlable-middleware:

AjaxCrawlableMiddleware
-----------------------

.. module:: scrapy.contrib.downloadermiddleware.ajaxcrawlable

.. class:: AjaxCrawlableMiddleware

Middleware that finds 'AJAX crawlable' page variants based
on the meta-fragment HTML tag. See
https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
for more info.

.. note::

Scrapy finds 'AJAX crawlable' pages for URLs like
``'http://example.com/#!foo=bar'`` even without this middleware.
AjaxCrawlableMiddleware is necessary when the URL doesn't contain ``'#!'``.
This is often the case for 'index' or 'main' website pages.
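
For example, a ``#!`` URL is rewritten to its ``_escaped_fragment_`` form as
soon as the request is constructed, before any middleware runs (a quick
illustration with a placeholder URL)::

    >>> from scrapy.http import Request
    >>> Request('http://example.com/page#!foo=bar').url
    'http://example.com/page?_escaped_fragment_=foo%3Dbar'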

AjaxCrawlableMiddleware Settings
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. setting:: AJAXCRAWLABLE_ENABLED

AJAXCRAWLABLE_ENABLED
^^^^^^^^^^^^^^^^^^^^^

.. versionadded:: 0.21

Default: ``False``

Whether the AjaxCrawlableMiddleware will be enabled. You may want to
enable it for :ref:`broad crawls <topics-broad-crawls>`.
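
For instance, a broad-crawl project could enable it in its settings file,
typically alongside the other tweaks from the broad-crawls guide (a sketch;
the companion setting shown is illustrative)::

    # settings.py of a hypothetical broad-crawl project
    AJAXCRAWLABLE_ENABLED = True
    REDIRECT_ENABLED = False   # see the broad-crawls guide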


.. _DBM: http://en.wikipedia.org/wiki/Dbm
.. _anydbm: http://docs.python.org/library/anydbm.html
91 changes: 91 additions & 0 deletions scrapy/contrib/downloadermiddleware/ajaxcrawlable.py
@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re
from scrapy import log
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse
from scrapy.utils.response import _noscript_re, _script_re
from w3lib import html

class AjaxCrawlableMiddleware(object):
"""
Handle 'AJAX crawlable' pages marked as crawlable via meta tag.
For more info see https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
"""

enabled_setting = 'AJAXCRAWLABLE_ENABLED'
Member

what is the advantage of using a class attribute for this setting instead of directly referencing it in the constructor, like AJAXCRAWLABLE_MAXSIZE?

Member Author

No advantage. I blindly copied this code from MetaRefreshMiddleware (to make it consistent), but in MetaRefreshMiddleware it serves a purpose (MetaRefreshMiddleware is a subclass of BaseRedirectMiddleware and overrides this attribute), and here it is pointless. I'll fix it.
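
For reference, the pattern being copied looks roughly like this in `redirect.py` (a condensed sketch, not the verbatim source):

```python
from scrapy.exceptions import NotConfigured

class BaseRedirectMiddleware(object):
    enabled_setting = 'REDIRECT_ENABLED'

    def __init__(self, settings):
        # subclasses swap in their own setting name via the class attribute
        if not settings.getbool(self.enabled_setting):
            raise NotConfigured


class MetaRefreshMiddleware(BaseRedirectMiddleware):
    enabled_setting = 'METAREFRESH_ENABLED'  # the override is the whole point
```

With no subclass here, checking `settings.getbool('AJAXCRAWLABLE_ENABLED')` directly in `__init__` would do the same job.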


def __init__(self, settings):
if not settings.getbool(self.enabled_setting):
raise NotConfigured

# XXX: Google parses at least first 100k bytes; scrapy's redirect
# middleware parses first 4k. 4k turns out to be insufficient
# for this middleware, and parsing 100k could be slow.
# We use something in between (32K) by default.
self.lookup_bytes = settings.getint('AJAXCRAWLABLE_MAXSIZE', 32768)

@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)

def process_response(self, request, response, spider):

if not isinstance(response, HtmlResponse) or response.status != 200:
return response

if request.method != 'GET':
# other HTTP methods are either not safe or don't have a body
return response

if 'ajax_crawlable' in request.meta: # prevent loops
return response

if not self._has_ajax_crawlable_variant(response):
return response

# scrapy already handles #! links properly
ajax_crawlable = request.replace(url=request.url+'#!')
log.msg(format="Downloading AJAX crawlable %(ajax_crawlable)s instead of %(request)s",
level=log.DEBUG, spider=spider, ajax_crawlable=ajax_crawlable,
request=request)

ajax_crawlable.meta['ajax_crawlable'] = True
return ajax_crawlable

def _has_ajax_crawlable_variant(self, response):
"""
Return True if a page without hash fragment could be "AJAX crawlable"
according to https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
"""
body = response.body_as_unicode()[:self.lookup_bytes]
return _has_ajaxcrawlable_meta(body)


# XXX: move it to w3lib?
_ajax_crawlable_re = re.compile(ur'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>')
def _has_ajaxcrawlable_meta(text):
"""
>>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment" content="!"/></head><body></body></html>')
True
>>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
True
>>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment" content="!"/>--></head><body></body></html>')
False
>>> _has_ajaxcrawlable_meta('<html></html>')
False
"""

# Stripping scripts and comments is slow (about 20x slower than
# just checking if a string is in text); this is a quick fail-fast
# path that should work for most pages.
if 'fragment' not in text:
return False
if 'content' not in text:
return False

text = _script_re.sub(u'', text)
text = _noscript_re.sub(u'', text)
text = html.remove_comments(html.remove_entities(text))
return _ajax_crawlable_re.search(text) is not None

3 changes: 3 additions & 0 deletions scrapy/settings/default_settings.py
@@ -18,6 +18,8 @@
from importlib import import_module
from os.path import join, abspath, dirname

AJAXCRAWLABLE_ENABLED = False

BOT_NAME = 'scrapybot'

CLOSESPIDER_TIMEOUT = 0
@@ -79,6 +81,7 @@
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
'scrapy.contrib.downloadermiddleware.ajaxcrawlable.AjaxCrawlableMiddleware': 560,
'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
58 changes: 58 additions & 0 deletions scrapy/tests/test_downloadermiddleware_ajaxcrawlable.py
@@ -0,0 +1,58 @@
import unittest

from scrapy.contrib.downloadermiddleware.ajaxcrawlable import AjaxCrawlableMiddleware
from scrapy.spider import BaseSpider
from scrapy.http import Request, HtmlResponse, Response
from scrapy.utils.test import get_crawler

__doctests__ = ['scrapy.contrib.downloadermiddleware.ajaxcrawlable']

class AjaxCrawlableMiddlewareTest(unittest.TestCase):
def setUp(self):
self.spider = BaseSpider('foo')
crawler = get_crawler({'AJAXCRAWLABLE_ENABLED': True})
self.mw = AjaxCrawlableMiddleware.from_crawler(crawler)

def _ajaxcrawlable_body(self):
return '<html><head><meta name="fragment" content="!"/></head><body></body></html>'

def _req_resp(self, url, req_kwargs=None, resp_kwargs=None):
req = Request(url, **(req_kwargs or {}))
resp = HtmlResponse(url, request=req, **(resp_kwargs or {}))
return req, resp

def test_non_get(self):
req, resp = self._req_resp('http://example.com/', {'method': 'HEAD'})
resp2 = self.mw.process_response(req, resp, self.spider)
self.assertEqual(resp, resp2)

def test_binary_response(self):
req = Request('http://example.com/')
resp = Response('http://example.com/', body=b'foobar\x00\x01\x02', request=req)
resp2 = self.mw.process_response(req, resp, self.spider)
self.assertIs(resp, resp2)

def test_ajax_crawlable(self):
req, resp = self._req_resp(
'http://example.com/',
{'meta': {'foo': 'bar'}},
{'body': self._ajaxcrawlable_body()}
)
req2 = self.mw.process_response(req, resp, self.spider)
self.assertEqual(req2.url, 'http://example.com/?_escaped_fragment_=')
self.assertEqual(req2.meta['foo'], 'bar')

def test_ajax_crawlable_loop(self):
req, resp = self._req_resp('http://example.com/', {}, {'body': self._ajaxcrawlable_body()})
req2 = self.mw.process_response(req, resp, self.spider)
resp2 = HtmlResponse(req2.url, body=resp.body, request=req2)
resp3 = self.mw.process_response(req2, resp2, self.spider)

assert isinstance(resp3, HtmlResponse), (resp3.__class__, resp3)
self.assertEqual(resp3.request.url, 'http://example.com/?_escaped_fragment_=')
assert resp3 is resp2

def test_noncrawlable_body(self):
req, resp = self._req_resp('http://example.com/', {}, {'body': '<html></html>'})
resp2 = self.mw.process_response(req, resp, self.spider)
self.assertIs(resp, resp2)