From d32b677652e0c6306daad2914b11ed853019863f Mon Sep 17 00:00:00 2001 From: Luke Lee Date: Mon, 27 Jun 2016 12:46:46 +0200 Subject: [PATCH] Fix bug with passing unicode data to urllib.pathname2url - This can result in a KeyError for some unicode strings because pathname2url is meant to be passed bytes, not unicode. - This fixes an exception in the logs we've seen related to requests to this URL in production. Note this isn't a valid article URL; however, it shouldn't result in an exception, just an error message about not finding the guide. - '/microsoft-net/%E2%80%A6' --- pskb_website/remote.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/pskb_website/remote.py b/pskb_website/remote.py index 535344d..e9f8ba7 100644 --- a/pskb_website/remote.py +++ b/pskb_website/remote.py @@ -269,8 +269,18 @@ def read_file_from_github(path, branch=u'master', rendered_text=True, # would be wrong. However, those URLs have been the same for years so # seems like a safe enough bet at this point. owner, repo, file_path = split_full_file_path(path) - url = u'https://github.com/%s/%s/blob/%s/%s' % (owner, repo, branch, - urllib.pathname2url(file_path)) + + # Cannot pass unicode data to pathname2url or it can raise KeyError. + # Must only pass URL-safe bytes. So, something like u'\u2026' will + # raise a KeyError but if we encode it to bytes, '%E2%80%A6', things + # work correctly. + # http://stackoverflow.com/questions/15115588/urllib-quote-throws-keyerror + + url = u'https://github.com/%s/%s/blob/%s/%s' % ( + owner, + repo, + branch, + urllib.pathname2url(file_path.encode('utf-8'))) details = file_details(path, branch, None, None, url, text) else: @@ -522,6 +532,16 @@ def contents_url_from_path(path): """ owner, repo, file_path = split_full_file_path(path) + + # Cannot pass unicode data to pathname2url or it can raise KeyError. Must + # only pass URL-safe bytes. 
So, something like u'\u2026' will raise a + # KeyError but if we encode it to bytes, '%E2%80%A6', things work + # correctly. + # http://stackoverflow.com/questions/15115588/urllib-quote-throws-keyerror + owner = owner.encode('utf-8') + repo = repo.encode('utf-8') + file_path = file_path.encode('utf-8') + return urllib.pathname2url('repos/%s/%s/contents/%s' % (owner, repo, file_path))