Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lxml formrequest #111

Merged
merged 15 commits into from Apr 13, 2012
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
207 changes: 163 additions & 44 deletions scrapy/http/request/form.py
Expand Up @@ -6,70 +6,189 @@
"""

import urllib
from cStringIO import StringIO

from scrapy.xlib.ClientForm import ParseFile
from lxml import html

from scrapy.http.request import Request
from scrapy.utils.python import unicode_to_str

def _unicode_to_str(string, encoding):
    """Apply unicode_to_str to a single value or to each item of an iterable.

    Anything exposing ``__iter__`` is treated as a collection and yields a
    list with one converted entry per item; every other value is converted
    directly and returned on its own.
    """
    if not hasattr(string, '__iter__'):
        return unicode_to_str(string, encoding)
    return [unicode_to_str(item, encoding) for item in string]
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"


class MultipleElementsFound(Exception):
    """Raised when clickdata matches more than one element of a form."""


class FormRequest(Request):

def __init__(self, *args, **kwargs):
formdata = kwargs.pop('formdata', None)
if formdata and kwargs.get('method') is None:
kwargs['method'] = 'POST'

super(FormRequest, self).__init__(*args, **kwargs)

if formdata:
items = formdata.iteritems() if isinstance(formdata, dict) else formdata
query = [(unicode_to_str(k, self.encoding), _unicode_to_str(v, self.encoding))
for k, v in items]
self.method = 'POST'
self._set_body(urllib.urlencode(query, doseq=1))
self.headers['Content-Type'] = 'application/x-www-form-urlencoded'
querystr = _urlencode(items, self.encoding)
if self.method == 'POST':
self.headers.setdefault('Content-Type', 'application/x-www-form-urlencoded')
self._set_body(querystr)
else:
self._set_url(self.url + ('&' if '?' in self.url else '?') + querystr)

@classmethod
def from_response(cls, response, formname=None, formnumber=0, formdata=None,
def from_response(cls, response, formname=None, formnumber=0, formdata=None,
clickdata=None, dont_click=False, **kwargs):
encoding = getattr(response, 'encoding', 'utf-8')
forms = ParseFile(StringIO(response.body), response.url,
encoding=encoding, backwards_compat=False)
if not forms:
raise ValueError("No <form> element found in %s" % response)

form = None

if formname:
for f in forms:
if f.name == formname:
form = f
break

if not form:
if not hasattr(formdata, "items"):
try:
formdata = dict(formdata) if formdata else {}
except (ValueError, TypeError):
raise ValueError('formdata should be a dict or iterable of tuples')

kwargs.setdefault('encoding', response.encoding)
hxs = html.fromstring(response.body_as_unicode(), base_url=response.url)
form = _get_form(hxs, formname, formnumber, response)
formdata = _get_inputs(form, formdata, dont_click, clickdata, response)
url = form.action or form.base_url
return cls(url, method=form.method, formdata=formdata, **kwargs)

# Adapted from lxml.html to avoid relying on a non-public function
def _nons(tag):
    """Return ``tag`` with a leading XHTML namespace removed.

    ``tag`` is expected in Clark notation (``{namespace}localname``). Only a
    prefix that is exactly ``{XHTML_NAMESPACE}`` is stripped; tags in any
    other namespace, tags without a namespace, empty strings and non-string
    values are returned unchanged.
    """
    if isinstance(tag, basestring):
        # Require the closing brace so that a namespace which merely starts
        # with the XHTML URI is left alone. startswith() is also safe on an
        # empty string, where indexing tag[0] would raise IndexError.
        prefix = '{%s}' % XHTML_NAMESPACE
        if tag.startswith(prefix):
            return tag[len(prefix):]
    return tag

def _urlencode(seq, enc):
    """Build a URL-encoded query string from ``(key, value)`` pairs.

    ``seq`` is an iterable of two-item entries. A value that exposes
    ``__iter__`` is expanded into one pair per contained item; any other
    value produces a single pair. Keys and values are passed through
    ``unicode_to_str`` with ``enc`` before being handed to
    ``urllib.urlencode``.
    """
    pairs = []
    for key, raw in seq:
        candidates = raw if hasattr(raw, '__iter__') else [raw]
        for item in candidates:
            pairs.append((unicode_to_str(key, enc), unicode_to_str(item, enc)))
    return urllib.urlencode(pairs, doseq=1)

def _get_form(hxs, formname, formnumber, response):
"""
Uses all the passed arguments to get the required form
element
"""
if not hxs.forms:
raise ValueError("No <form> element found in %s" % response)

if formname is not None:
f = hxs.xpath('//form[@name="%s"]' % formname)
if f:
return f[0]

# If we get here, it means that either formname was None
# or invalid
if formnumber is not None:
try:
form = hxs.forms[formnumber]
except IndexError:
raise IndexError("Form number %d not found in %s" %
(formnumber, response))
else:
return form

def _get_inputs(form, formdata, dont_click, clickdata, response):
"""
Returns all the inputs that will be sent with the request,
both those already present in the form and those given by
the user
"""
clickables = []
inputs = []
xmlns_in_resp = u' xmlns' in response.body_as_unicode()[:200]

for el in form.inputs:
name = el.name
if not name or name in formdata:
continue
tag = _nons(el.tag)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried removing the call to _nons() and all tests still passed.
What about removing it completely, or figuring out a test case that justifies its use?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1

if tag == 'textarea':
inputs.append((name, el.value))
elif tag == 'select':
if xmlns_in_resp:
#use builtin select parser with namespaces
value = el.value
else:
value = el.xpath(".//option[@selected]") or None

if el.multiple:
for v in value:
if v is not None:
inputs.append((name, v))
elif value is not None:
inputs.append((name, value[0] if isinstance(value, list)
else value))
else:
option = el.xpath(".//option[1]/@value")
if option:
inputs.append((name, option[0]))
else:
assert tag == 'input', ("Unexpected tag: %r" % el)
if el.checkable and not el.checked:
continue
if el.type in ('image', 'reset'):
continue
elif el.type == 'submit':
clickables.append(el)
else:
value = el.value
if value is not None:
inputs.append((name, el.value))

# If we are allowed to click on buttons and we have clickable
# elements, we move on to see if we have any clickdata
if not dont_click and clickables:
clickable = _get_clickable(clickdata, clickables, form)
inputs.append(clickable)

inputs.extend(formdata.iteritems())
return inputs

def _get_clickable(clickdata, clickables, form):
"""
Returns the clickable element specified in clickdata,
if the latter is given. If not, it returns the first
clickable element found
"""
# If clickdata is given, we compare it to the clickable elements
# to find a match
if clickdata is not None:
# We first look to see if the number is specified in
# clickdata, because that uniquely identifies the element
nr = clickdata.get('nr', None)
if nr is not None:
try:
form = forms[formnumber]
el = list(form.inputs)[nr]
except IndexError:
raise IndexError("Form number %d not found in %s" % (formnumber, response))
if formdata:
# remove all existing fields with the same name before, so that
# formdata fields properly can properly override existing ones,
# which is the desired behaviour
form.controls = [c for c in form.controls if c.name not in formdata]
for k, v in formdata.iteritems():
for v2 in v if hasattr(v, '__iter__') else [v]:
form.new_control('text', k, {'value': v2})

if dont_click:
url, body, headers = form._switch_click('request_data')
pass
else:
return (el.name, el.value)

# We didn't find it, so now we build an XPath expression
# out of the other arguments, because they can be used
# as such
else:
url, body, headers = form.click_request_data(**(clickdata or {}))
xpath_pred = []
for k, v in clickdata.items():
if k == 'coord':
v = ','.join(str(c) for c in v)
xpath_pred.append('[@%s="%s"]' % (k, v))

kwargs.setdefault('headers', {}).update(headers)
xpath_expr = '//*%s' % ''.join(xpath_pred)
el = form.xpath(xpath_expr)
if len(el) > 1:
raise MultipleElementsFound(
"Multiple elements found (%r) matching the criteria"
" in clickdata: %r" % (el, clickdata)
)
else:
return (el[0].name, el[0].value)

return cls(url, method=form.method, body=body, **kwargs)
# If we don't have clickdata, we just use the first
# clickable element
else:
el = clickables.pop(0)
return (el.name, el.value)
8 changes: 0 additions & 8 deletions scrapy/tests/test_clientform.py

This file was deleted.

16 changes: 16 additions & 0 deletions scrapy/tests/test_http_request.py
Expand Up @@ -5,6 +5,7 @@
from urlparse import urlparse

from scrapy.http import Request, FormRequest, XmlRpcRequest, Headers, HtmlResponse
from scrapy.http.request.form import MultipleElementsFound


class RequestTest(unittest.TestCase):
Expand Down Expand Up @@ -320,6 +321,21 @@ def test_from_response_dont_click(self):
self.assertFalse('clickeable1' in urlargs, urlargs)
self.assertFalse('clickeable2' in urlargs, urlargs)

def test_from_response_ambiguous_clickdata(self):
    # The form holds two submit inputs, so clickdata that only says
    # type=submit cannot single one out and from_response must refuse.
    html_body = """
<form action="get.php" method="GET">
<input type="submit" name="clickeable1" value="clicked1">
<input type="hidden" name="one" value="1">
<input type="hidden" name="two" value="3">
<input type="submit" name="clickeable2" value="clicked2">
</form>
"""
    page = HtmlResponse("http://www.example.com/this/list.html", body=html_body)
    build_request = self.request_class.from_response
    self.assertRaises(MultipleElementsFound, build_request, page,
                      clickdata={'type': 'submit'})

def test_from_response_errors_noform(self):
respbody = """<html></html>"""
response = HtmlResponse("http://www.example.com/lala.html", body=respbody)
Expand Down