Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] support multipart/form-data form encoding and file upload #1954

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion scrapy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@

# Declare top-level shortcuts
from scrapy.spiders import Spider
from scrapy.http import Request, FormRequest
from scrapy.http import Request, FormRequest, MultipartFormRequest
from scrapy.selector import Selector
from scrapy.item import Item, Field

Expand Down
2 changes: 2 additions & 0 deletions scrapy/http/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

from scrapy.http.request import Request
from scrapy.http.request.form import FormRequest
from scrapy.http.request.form import MultipartFormRequest
from scrapy.http.request.form import MultipartFile
from scrapy.http.request.rpc import XmlRpcRequest

from scrapy.http.response import Response
Expand Down
56 changes: 56 additions & 0 deletions scrapy/http/request/form.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import lxml.html
from parsel.selector import create_root_node
import six
import string
import random
from scrapy.http.request import Request
from scrapy.utils.python import to_bytes, is_listlike
from scrapy.utils.response import get_base_url
Expand Down Expand Up @@ -49,6 +51,37 @@ def from_response(cls, response, formname=None, formid=None, formnumber=0, formd
return cls(url=url, method=method, formdata=formdata, **kwargs)


class MultipartFormRequest(FormRequest):
    """Request whose body is ``formdata`` encoded as ``multipart/form-data``.

    ``formdata`` may be a dict or an iterable of ``(name, value)`` pairs.
    The method defaults to ``POST``. A body is generated only when
    ``formdata`` is non-empty, the method is ``POST`` and the first
    ``Content-Type`` header value is exactly ``multipart/form-data`` (which
    is also the default installed when no such header was given).

    After construction ``_boundary`` holds the part delimiter as bytes, or
    ``None`` when no multipart body was generated.
    """

    def __init__(self, *args, **kwargs):
        # ``formdata`` is removed from kwargs so the parent constructor never
        # sees it; it is encoded below instead.
        formdata = kwargs.pop('formdata', None)
        kwargs.setdefault('method', 'POST')

        super(MultipartFormRequest, self).__init__(*args, **kwargs)

        # Always define the attribute so that reading ``_boundary`` on a
        # request built without form data does not raise AttributeError.
        self._boundary = None

        content_type = self.headers.setdefault(
            b'Content-Type', [b'multipart/form-data'])[0]
        method = kwargs.get('method').upper()
        if formdata and method == 'POST' and content_type == b'multipart/form-data':
            items = formdata.items() if isinstance(formdata, dict) else formdata

            # Random 20-character alphanumeric delimiter separating the parts.
            self._boundary = to_bytes(''.join(
                random.choice(string.digits + string.ascii_letters)
                for _ in range(20)), self.encoding)
            # The receiver needs the delimiter, so advertise it in the header.
            self.headers[b'Content-Type'] = b'multipart/form-data; boundary=' + self._boundary
            request_data = _multpart_encode(items, self._boundary, self.encoding)
            self._set_body(request_data)


class MultipartFile(object):
    """Description of a file to be sent as one part of a multipart form.

    :param name: file name, sent as the ``filename`` parameter of the part
    :param content: payload of the file
    :param mimetype: value of the part's ``Content-Type`` header; defaults
        to ``application/octet-stream``
    """

    def __init__(self, name, content, mimetype='application/octet-stream'):
        self.name = name
        self.content = content
        self.mimetype = mimetype

    def __repr__(self):
        # The content is left out on purpose: it can be arbitrarily large.
        return '%s(name=%r, mimetype=%r)' % (
            type(self).__name__, self.name, self.mimetype)


def _get_form_url(form, url):
if url is None:
return urljoin(form.base_url, form.action)
Expand All @@ -62,6 +95,29 @@ def _urlencode(seq, enc):
return urlencode(values, doseq=1)


def _escape_multipart_param(value):
    """Percent-escape the bytes that would break a quoted header parameter.

    A double quote would end the quoted ``name``/``filename`` value early and
    a CR or LF would start a new header line, so each is replaced by its
    ``%XX`` form.
    """
    return (value.replace(b'"', b'%22')
                 .replace(b'\r', b'%0D')
                 .replace(b'\n', b'%0A'))


def _multpart_encode(items, boundary, enc):
    """Serialize ``(name, value)`` pairs into a multipart/form-data body.

    :param items: iterable of ``(name, value)`` pairs; a ``MultipartFile``
        value becomes a file part (with ``filename`` and ``Content-Type``),
        any other value becomes a plain field part
    :param boundary: part delimiter, as bytes
    :param enc: encoding passed to ``to_bytes`` for names, values and headers
    :returns: the encoded body as bytes, lines joined with CRLF and ending
        with the closing ``--boundary--`` delimiter
    """
    body = []

    for name, value in items:
        field_name = _escape_multipart_param(to_bytes(name, enc))
        body.append(b'--' + boundary)
        if isinstance(value, MultipartFile):
            file_name = _escape_multipart_param(to_bytes(value.name, enc))
            body.append(b'Content-Disposition: form-data; name="' + field_name
                        + b'"; filename="' + file_name + b'"')
            body.append(b'Content-Type: ' + to_bytes(value.mimetype, enc))
            # An empty line separates the part headers from the payload.
            body.append(b'')
            body.append(to_bytes(value.content, enc))
        else:
            body.append(b'Content-Disposition: form-data; name="' + field_name + b'"')
            body.append(b'')
            body.append(to_bytes(value, enc))

    body.append(b'--' + boundary + b'--')
    return b'\r\n'.join(body)


def _get_form(response, formname, formid, formnumber, formxpath):
"""Find the form element """
root = create_root_node(response.text, lxml.html.HTMLParser,
Expand Down
79 changes: 78 additions & 1 deletion tests/test_http_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@
from six.moves.urllib.parse import urlparse, parse_qs, unquote
if six.PY3:
from urllib.parse import unquote_to_bytes
from collections import OrderedDict

from scrapy.http import Request, FormRequest, XmlRpcRequest, Headers, HtmlResponse
from scrapy.http import Request, FormRequest, MultipartFile, XmlRpcRequest, Headers, \
HtmlResponse, MultipartFormRequest
from scrapy.utils.python import to_bytes, to_native_str


Expand Down Expand Up @@ -1072,5 +1074,80 @@ def test_latin1(self):
self._test_request(params=(u'pas£',), encoding='latin1')


class MultipartFormRequestTest(RequestTest):
    """Checks the request body produced by ``MultipartFormRequest``."""

    request_class = MultipartFormRequest
    default_method = 'POST'
    default_headers = {b'Content-Type': [b'multipart/form-data']}

    def test_multpart_text_body(self):
        # A single bytes field is encoded as one plain form-data part.
        request = self.request_class(
            "http://www.example.com",
            formdata={b'field': b'value'},
            headers={b'Content-Type': b'multipart/form-data'})
        delimiter = request._boundary
        expected = b''.join([
            b'--', delimiter,
            b'\r\nContent-Disposition: form-data; name="field"',
            b'\r\n',
            b'\r\nvalue',
            b'\r\n--', delimiter, b'--',
        ])
        self.assertEqual(request.body, expected)

    def test_multpart_unicode_body(self):
        # Non-ASCII text in both name and value is encoded as UTF-8 bytes.
        request = self.request_class(
            "http://www.example.com",
            formdata={u'Price £': u'£ 100'},
            headers={b'Content-Type': b'multipart/form-data'})
        delimiter = request._boundary
        expected = b''.join([
            b'--', delimiter,
            b'\r\nContent-Disposition: form-data; name="Price \xc2\xa3"',
            b'\r\n',
            b'\r\n\xc2\xa3 100',
            b'\r\n--', delimiter, b'--',
        ])
        self.assertEqual(request.body, expected)

    def test_multipart_file(self):
        # A MultipartFile value gets a filename and the default mimetype.
        upload = MultipartFile(name='sample.txt', content=u'Text file content £ 100')
        fields = OrderedDict([
            (u'Price £', u'£ 100'),
            ('file', upload),
        ])
        request = self.request_class(
            "http://www.example.com",
            formdata=fields,
            headers={b'Content-Type': b'multipart/form-data'})
        delimiter = request._boundary
        expected = b''.join([
            b'--', delimiter,
            b'\r\nContent-Disposition: form-data; name="Price \xc2\xa3"',
            b'\r\n',
            b'\r\n\xc2\xa3 100',
            b'\r\n--', delimiter,
            b'\r\nContent-Disposition: form-data; name="file"; filename="sample.txt"',
            b'\r\nContent-Type: application/octet-stream',
            b'\r\n',
            b'\r\nText file content \xc2\xa3 100',
            b'\r\n--', delimiter, b'--',
        ])
        self.assertEqual(request.body, expected)

    def test_multipart_file_mimetype(self):
        # An explicit mimetype replaces the default part Content-Type.
        upload = MultipartFile(name='sample.txt', content=u'Text file content £ 100', mimetype='text/plain')
        fields = OrderedDict([
            (u'Price £', u'£ 100'),
            ('file', upload),
        ])
        request = self.request_class(
            "http://www.example.com",
            formdata=fields,
            headers={b'Content-Type': b'multipart/form-data'})
        delimiter = request._boundary
        expected = b''.join([
            b'--', delimiter,
            b'\r\nContent-Disposition: form-data; name="Price \xc2\xa3"',
            b'\r\n',
            b'\r\n\xc2\xa3 100',
            b'\r\n--', delimiter,
            b'\r\nContent-Disposition: form-data; name="file"; filename="sample.txt"',
            b'\r\nContent-Type: text/plain',
            b'\r\n',
            b'\r\nText file content \xc2\xa3 100',
            b'\r\n--', delimiter, b'--',
        ])
        self.assertEqual(request.body, expected)


# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
unittest.main()