From 36cbdc276358031ffcdb4aee33978dee728aa780 Mon Sep 17 00:00:00 2001 From: Paul Tremberth Date: Tue, 13 Sep 2016 11:46:28 +0200 Subject: [PATCH] Support preserving fragments in url_query_cleaner Fixes GH-60 --- tests/test_url.py | 6 ++++++ w3lib/url.py | 15 ++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/test_url.py b/tests/test_url.py index bfdd5bd6..3c0a6b31 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -301,6 +301,12 @@ def test_url_query_cleaner(self): self.assertEqual('product.html?foobar=wired', url_query_cleaner("product.html?foo=bar&foobar=wired", 'foobar')) + def test_url_query_cleaner_keep_fragments(self): + self.assertEqual('product.html?id=200#foo', + url_query_cleaner("product.html?id=200&foo=bar&name=wired#foo", + ['id'], + keep_fragments=True)) + def test_path_to_file_uri(self): if os.name == 'nt': self.assertEqual(path_to_file_uri("C:\\windows\clock.avi"), diff --git a/w3lib/url.py b/w3lib/url.py index 900b360b..f30effc3 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -173,7 +173,7 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0): ) return queryparams.get(parameter, [default])[0] -def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True): +def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True, keep_fragments=False): """Clean URL arguments leaving only those passed in the parameterlist keeping order >>> import w3lib.url @@ -197,11 +197,17 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u 'product.html?name=wired' >>> + By default, URL fragments are removed. If you need to preserve fragments, + pass the ``keep_fragments`` argument as ``True``. + + >>> w3lib.url.url_query_cleaner('http://domain.tld/?bla=123#123123', ['bla'], remove=True, keep_fragments=True) + 'http://domain.tld/#123123' + """ if isinstance(parameterlist, (six.text_type, bytes)): parameterlist = [parameterlist] - url = urldefrag(url)[0] + url, fragment = urldefrag(url) base, _, query = url.partition('?') seen = set() querylist = [] @@ -216,7 +222,10 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u else: querylist.append(ksv) seen.add(k) - return '?'.join([base, sep.join(querylist)]) if querylist else base + url = '?'.join([base, sep.join(querylist)]) if querylist else base + if keep_fragments: + url += '#' + fragment + return url def add_or_replace_parameter(url, name, new_value): """Add or remove a parameter to a given url