Implementing caching of chunked/streamed responses #106

Closed · wants to merge 4 commits
1 change: 1 addition & 0 deletions .travis.yml
@@ -1,4 +1,5 @@
language: python
sudo: false

env:
- TOXENV=py26
9 changes: 9 additions & 0 deletions cachecontrol/adapter.py
@@ -1,3 +1,4 @@
import types
import functools

from requests.adapters import HTTPAdapter
@@ -97,6 +98,14 @@ def build_response(self, request, response, from_cache=False):
                        response,
                    )
                )
                if response.chunked:
                    super_update_chunk_length = response._update_chunk_length

                    def _update_chunk_length(self):
                        super_update_chunk_length()
                        if self.chunk_left == 0:
                            self._fp._close()
                    response._update_chunk_length = types.MethodType(_update_chunk_length, response)

        resp = super(CacheControlAdapter, self).build_response(
            request, response
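For readers unfamiliar with the pattern above: the patch swaps in a wrapper for `_update_chunk_length` on one specific response object via `types.MethodType`, so the final chunk (where `chunk_left == 0`) can trigger the cache callback through `self._fp._close()`. A minimal, self-contained sketch of that instance-level override, using a made-up `FakeResponse` class in place of urllib3's `HTTPResponse`:

```python
import types


class FakeResponse(object):
    """Toy stand-in for urllib3's HTTPResponse, for illustration only."""

    def __init__(self, chunk_sizes):
        self._chunks = list(chunk_sizes)
        self.chunk_left = None

    def _update_chunk_length(self):
        # Pretend to parse the next chunk-size line; 0 means "last chunk".
        self.chunk_left = self._chunks.pop(0) if self._chunks else 0


def watch_for_last_chunk(response, on_last_chunk):
    # Keep a reference to the original bound method ...
    super_update_chunk_length = response._update_chunk_length

    def _update_chunk_length(self):
        super_update_chunk_length()
        if self.chunk_left == 0:
            on_last_chunk()

    # ... and rebind the wrapper onto this one instance only; other
    # responses keep the stock chunk handling.
    response._update_chunk_length = types.MethodType(_update_chunk_length, response)


if __name__ == "__main__":
    resp = FakeResponse([5, 3, 0])
    watch_for_last_chunk(resp, lambda: print("last chunk seen"))
    for _ in range(3):
        resp._update_chunk_length()  # prints once, on the final call
```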
33 changes: 24 additions & 9 deletions cachecontrol/filewrapper.py
@@ -45,19 +45,34 @@ def __is_fp_closed(self):
        # TODO: Add some logging here...
        return False

    def _close(self):
        if self.__callback:
            self.__callback(self.__buf.getvalue())

        # We assign this to None here, because otherwise we can get into
        # really tricky problems where the CPython interpreter dead locks
        # because the callback is holding a reference to something which
        # has a __del__ method. Setting this to None breaks the cycle
        # and allows the garbage collector to do its thing normally.
        self.__callback = None

    def read(self, amt=None):
        data = self.__fp.read(amt)
        self.__buf.write(data)
        if self.__is_fp_closed():
            self._close()

        return data

    def _safe_read(self, amt):
        data = self.__fp._safe_read(amt)
        if amt == 2 and data == b'\r\n':
            # urllib executes this read to toss the CRLF at the end
            # of the chunk.
            return data

        self.__buf.write(data)
        if self.__is_fp_closed():
            if self.__callback:
                self.__callback(self.__buf.getvalue())

            # We assign this to None here, because otherwise we can get into
            # really tricky problems where the CPython interpreter dead locks
            # because the callback is holding a reference to something which
            # has a __del__ method. Setting this to None breaks the cycle
            # and allows the garbage collector to do its thing normally.
            self.__callback = None
            self._close()

        return data
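To make the new `_close()` / `_safe_read()` split easier to follow, here is a rough standalone sketch of a wrapper in the same spirit. It is not CacheControl's actual `CallbackFileWrapper`; the `RecordingWrapper` class, its attribute names, and the end-of-stream check are simplified stand-ins:

```python
import io


class RecordingWrapper(object):
    """Buffers everything read from a file-like object and fires a
    callback with the complete body exactly once, when the stream ends."""

    def __init__(self, fp, callback):
        self._fp = fp
        self._buf = io.BytesIO()
        self._callback = callback
        self._saw_eof = False

    def _is_fp_closed(self):
        # The real wrapper has to poke at urllib3/httplib internals here;
        # for this sketch an empty read is treated as end-of-stream.
        return self._saw_eof

    def _close(self):
        if self._callback:
            self._callback(self._buf.getvalue())
        # Drop the reference so the callback cannot fire twice and any
        # objects it holds onto can be garbage collected.
        self._callback = None

    def read(self, amt=None):
        data = self._fp.read(amt)
        if not data:
            self._saw_eof = True
        self._buf.write(data)
        if self._is_fp_closed():
            self._close()
        return data

    def _safe_read(self, amt):
        # httplib calls _safe_read(2) to discard the CRLF that terminates
        # each chunk; that framing must not end up in the cached body.
        data = self._fp.read(amt)
        if amt == 2 and data == b'\r\n':
            return data
        self._buf.write(data)
        return data


if __name__ == "__main__":
    captured = []
    wrapper = RecordingWrapper(io.BytesIO(b'hello world'), captured.append)
    while wrapper.read(4):
        pass
    assert captured == [b'hello world']
```

The key points mirrored from the diff: the callback fires exactly once, the reference to it is dropped afterwards to break reference cycles, and the per-chunk CRLF read via `_safe_read(2)` is kept out of the buffered body.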
6 changes: 6 additions & 0 deletions cachecontrol/serialize.py
@@ -134,6 +134,12 @@ def prepare_response(self, request, cached):

        body_raw = cached["response"].pop("body")

        headers = CaseInsensitiveDict(data=cached['response']['headers'])
        if headers.get('transfer-encoding', '') == 'chunked':
            headers.pop('transfer-encoding')

        cached['response']['headers'] = headers

Contributor:

Is there a good reason for this? It seems like it doesn't hurt to know the original response was chunked.

Contributor (Author):

I don't really remember this PR. I think that when the initial response is transferred with chunks, one of the first steps is to strip off the chunk markers, so the data that actually enters the cache lacks them. When it's read back out of the cache, it no longer looks like a chunked response.

Contributor:

Ah ok, that is what I suspected. Thanks!

I've finally gotten around to getting this merged with some changes. Thanks for your patience!

        try:
            body = io.BytesIO(body_raw)
        except TypeError:
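As the exchange above explains, the chunk markers are stripped before the body is cached, so a replayed response must not advertise `Transfer-Encoding: chunked` anymore. A small illustrative sketch of that header adjustment, using `CaseInsensitiveDict` from `requests.structures` with made-up header values:

```python
from requests.structures import CaseInsensitiveDict

# Headers as they might have been stored alongside the cached body.
cached_headers = {
    'Transfer-Encoding': 'chunked',
    'Cache-Control': 'max-age=5000',
    'Content-Type': 'text/plain',
}

headers = CaseInsensitiveDict(data=cached_headers)
if headers.get('transfer-encoding', '') == 'chunked':
    # The cached body is a plain byte string with the chunk markers
    # already removed, so the framing header no longer applies.
    headers.pop('transfer-encoding')

assert 'transfer-encoding' not in headers
print(dict(headers))
```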
8 changes: 8 additions & 0 deletions tests/conftest.py
@@ -89,6 +89,14 @@ def multiple_choices(self, env, start_response):
        start_response('300 Multiple Choices', headers)
        return ['See: /permalink'.encode('utf-8')]

    def stream(self, env, start_response):
        headers = [
            ('Cache-Control', 'max-age=5000'),
            ('Content-Type', 'text/plain'),
        ]
        start_response('200 OK', headers)
        return (pformat(env).encode("utf8") for i in range(10))

    def __call__(self, env, start_response):
        func = self.dispatch(env)

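The `stream` fixture above returns a generator and sets no `Content-Length`, which is what pushes a server that supports it to send the body with chunked transfer encoding. A hedged, standalone version of that handler (assuming the fixture's `pformat` comes from the standard `pprint` module, and driving it with a fake `start_response` purely for illustration):

```python
from pprint import pformat


def stream(env, start_response):
    # No Content-Length header: a server that supports it will send
    # this body with Transfer-Encoding: chunked.
    headers = [
        ('Cache-Control', 'max-age=5000'),
        ('Content-Type', 'text/plain'),
    ]
    start_response('200 OK', headers)
    return (pformat(env).encode("utf8") for _ in range(10))


if __name__ == "__main__":
    sent = {}

    def fake_start_response(status, headers):
        sent["status"], sent["headers"] = status, headers

    body = list(stream({"PATH_INFO": "/stream"}, fake_start_response))
    assert sent["status"] == "200 OK"
    assert len(body) == 10
    assert not any(name.lower() == "content-length" for name, _ in sent["headers"])
    print("%d body parts, headers: %r" % (len(body), sent["headers"]))
```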
28 changes: 28 additions & 0 deletions tests/test_stream.py
@@ -0,0 +1,28 @@
"""
Test for supporting streamed responses (Transfer-Encoding: chunked)
"""
import requests

from cachecontrol import CacheControl


class TestStream(object):
    def test_stream_is_cached(self, url):
        sess = CacheControl(requests.Session())

        resp_1 = sess.get(url + 'stream')
        content_1 = resp_1.content

        resp_2 = sess.get(url + 'stream')
        content_2 = resp_2.content

        assert not resp_1.from_cache
        assert resp_2.from_cache
        assert content_1 == content_2

    def test_stream_is_not_cached_when_content_is_not_read(self, url):
        sess = CacheControl(requests.Session())
        sess.get(url + 'stream', stream=True)
        resp = sess.get(url + 'stream', stream=True)

        assert not resp.from_cache
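A usage sketch of the behaviour these tests pin down; the endpoint URL is a placeholder for any server that answers with `Transfer-Encoding: chunked`:

```python
import requests

from cachecontrol import CacheControl

# Placeholder URL: any endpoint that responds with chunked transfer
# encoding will do.
url = "http://localhost:8080/stream"

sess = CacheControl(requests.Session())

# With stream=True the body is only pulled (and therefore only handed
# to the cache) once it is actually consumed.
resp = sess.get(url, stream=True)
_ = resp.content  # consume the stream so the cache callback can fire

resp = sess.get(url)
print(resp.from_cache)  # True once the first body was read in full
```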