@@ -579,6 +579,50 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8',
579579
580580
581581def _decode_host (host ):
582+ """Decode a host from ASCII-encodable text to IDNA-decoded text. If
583+ the host text is not ASCII, it is returned unchanged, as it is
584+ presumed that it is already IDNA-decoded.
585+
586+ Some technical details: _decode_host is built on top of the "idna"
587+ package, which has some quirks:
588+
589+ Capital letters are not valid IDNA2008. The idna package will
590+ raise an exception like this on capital letters:
591+
592+ > idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed
593+
594+ However, if a segment of a host (i.e., something in
595+ url.host.split('.')) is already ASCII, idna doesn't perform its
596+ usual checks. In fact, for capital letters it automatically
597+ lowercases them.
598+
599+ This check and some other functionality can be bypassed by passing
600+ uts46=True to idna.encode/decode. This allows a more permissive and
601+ convenient interface. So far it seems like the balanced approach.
602+
603+ Example output (from idna==2.6):
604+
605+ >> idna.encode(u'mahmöud.io')
606+ 'xn--mahmud-zxa.io'
607+ >> idna.encode(u'Mahmöud.io')
608+ Traceback (most recent call last):
609+ File "<stdin>", line 1, in <module>
610+ File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode
611+ result.append(alabel(label))
612+ File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel
613+ check_label(label)
614+ File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label
615+ raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
616+ idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed
617+ >> idna.encode(u'Mahmoud.io')
618+ 'Mahmoud.io'
619+
620+ # Similar behavior for decodes below
621+ >> idna.decode(u'Mahmoud.io')
622+ u'mahmoud.io'
623+ >> idna.decode(u'Méhmoud.io', uts46=True)
624+ u'm\xe9hmoud.io'
625+ """
582626 if not host :
583627 return u''
584628 try :
@@ -1802,38 +1846,3 @@ def parse(url, decoded=True, lazy=False):
18021846 return enc_url
18031847 dec_url = DecodedURL (enc_url , lazy = lazy )
18041848 return dec_url
1805-
1806- """idna package notes:
1807-
1808- * If a segment of a host (i.e., something in url.host.split('.')) is
1809- already ascii, idna doesn't perform its usual checks. For instance,
1810- capital letters are not valid idna2008. The package automatically lowercases.
1811-
1812- You'll get something like:
1813-
1814- > idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed
1815-
1816- This check and some other functionality can be bypassed by passing
1817- uts46=True to encode/decode. This allows a more permission and
1818- convenient interface. So far it seems like the balanced approach.
1819-
1820- However, all of this is bypassed if the string segment contains no
1821- unicode characters.
1822-
1823- Example output:
1824-
1825- >>> idna.encode(u'mahmöud.io')
1826- 'xn--mahmud-zxa.io'
1827- >>> idna.encode(u'Mahmöud.io')
1828- Traceback (most recent call last):
1829- File "<stdin>", line 1, in <module>
1830- File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode
1831- result.append(alabel(label))
1832- File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel
1833- check_label(label)
1834- File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label
1835- raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
1836- idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed
1837- >>> idna.encode(u'Mahmoud.io')
1838- 'Mahmoud.io'
1839- """
0 commit comments