Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
FormRequest: handle whitespaces in action attribute properly
  • Loading branch information
kmike committed Feb 15, 2017
1 parent e1ceaf3 commit 074caf4
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 4 deletions.
10 changes: 8 additions & 2 deletions scrapy/http/request/form.py
Expand Up @@ -5,10 +5,13 @@
See documentation in docs/topics/request-response.rst
"""

import six
from six.moves.urllib.parse import urljoin, urlencode

import lxml.html
from parsel.selector import create_root_node
import six
from w3lib.html import strip_html5_whitespace

from scrapy.http.request import Request
from scrapy.utils.python import to_bytes, is_listlike
from scrapy.utils.response import get_base_url
Expand Down Expand Up @@ -51,7 +54,10 @@ def from_response(cls, response, formname=None, formid=None, formnumber=0, formd

def _get_form_url(form, url):
if url is None:
return urljoin(form.base_url, form.action)
action = form.get('action')
if action is None:
return form.base_url
return urljoin(form.base_url, strip_html5_whitespace(action))
return urljoin(form.base_url, url)


Expand Down
10 changes: 8 additions & 2 deletions tests/test_http_request.py
Expand Up @@ -556,7 +556,6 @@ def test_from_response_unicode_clickdata_latin1(self):
fs = _qs(req, to_unicode=True, encoding='latin1')
self.assertTrue(fs[u'price in \u00a5'])


def test_from_response_multiple_forms_clickdata(self):
response = _buildresponse(
"""<form name="form1">
Expand Down Expand Up @@ -989,7 +988,7 @@ def test_html_base_form_action(self):
"""
<html>
<head>
<base href="http://b.com/">
<base href=" http://b.com/">
</head>
<body>
<form action="test_form">
Expand All @@ -1002,6 +1001,11 @@ def test_html_base_form_action(self):
req = self.request_class.from_response(response)
self.assertEqual(req.url, 'http://b.com/test_form')

def test_spaces_in_action(self):
    """Whitespace around a form's ``action`` value must not end up in the request URL."""
    html = '<body><form action=" path\n"></form></body>'
    response = _buildresponse(html)
    request = self.request_class.from_response(response)
    self.assertEqual(request.url, 'http://example.com/path')

def test_from_response_css(self):
response = _buildresponse(
"""<form action="post.php" method="POST">
Expand All @@ -1023,12 +1027,14 @@ def test_from_response_css(self):
self.assertRaises(ValueError, self.request_class.from_response,
response, formcss="input[name='abc']")


def _buildresponse(body, **kwargs):
    """Build an ``HtmlResponse`` for tests, filling in default arguments.

    ``body``, ``url`` and ``encoding`` are only set when the caller has not
    already supplied them through ``kwargs``; an explicit ``body=`` keyword
    therefore takes precedence over the positional ``body``.
    """
    fallbacks = (
        ('body', body),
        ('url', 'http://example.com'),
        ('encoding', 'utf-8'),
    )
    for key, value in fallbacks:
        kwargs.setdefault(key, value)
    return HtmlResponse(**kwargs)


def _qs(req, encoding='utf-8', to_unicode=False):
if req.method == 'POST':
qs = req.body
Expand Down

0 comments on commit 074caf4

Please sign in to comment.