----

Example 4-1. Smart quotes and ISO-8859-1

In [8]:
# Sample byte values: 147 and 148 are the curly ("smart") quotes and 128 is
# the euro sign in cp1252; every other value is plain ASCII.
b = [45, 147, 128, 53, 44, 32, 112, 108, 101, 97, 115, 101, 148, 45]
# Build the byte string from the list defined just above (the name c that
# was used here before is not defined anywhere).
s = bytes(b)
print(s)

b'-\x93\x805, please\x94-'


In [9]:
# cp1252 maps 0x93/0x94 to curly quotes and 0x80 to the euro sign.
print(s.decode('cp1252'))

-“€5, please”-


In [10]:
# iso-8859-1 decodes the same bytes without raising, but 0x80-0x9F are C1
# control characters there, so the quotes and the euro sign show up as
# nothing visible in the printed text.
print(s.decode('iso-8859-1'))

-5, please-


In [11]:
# Number of characters produced when the 14 sample bytes are decoded as cp1252.
print(len(s.decode('cp1252')))

14


In [12]:
# Number of characters produced when the 14 sample bytes are decoded as
# iso-8859-1; characters that print as nothing are still counted.
print(len(s.decode('iso-8859-1')))

14


----

Example 4-2. Generating test data.

In [28]:
def make_alnum_sample(out, codec, n):
    """
    Write an encoded sample of alphanumeric characters to out.

    Looks at the first n Unicode code points (0 .. n-1).  Every
    character for which str.isalnum() is true and which codec can
    encode is written to out in encoded form; characters the codec
    cannot represent are skipped.  A single newline byte is written
    after the last character.

    out   -- object with a write(bytes) method, e.g. a file opened
             in binary mode
    codec -- name of the encoding to use
    n     -- number of code points to examine; values past the end
             of the Unicode range are ignored

    Raises LookupError if codec is not a known encoding.  Errors
    raised by out.write() are propagated.

    Note: each character is encoded on its own, so a codec that
    emits a prefix on every encode() call (such as a byte order
    mark) repeats that prefix for every character written.
    """
    # chr() only accepts code points up to 0x10FFFF; clamping keeps
    # oversized n from raising, so such values are simply not visited.
    for x in range(min(n, 0x110000)):
        u = chr(x)
        if not u.isalnum():
            continue
        try:
            b = u.encode(codec)
        except UnicodeEncodeError:
            # skip u if codec cannot represent it
            continue
        out.write(b)
    out.write(b'\n')

In [29]:
# Write one sample file per codec, named <codec>_alnum.txt, each built from
# the first 512 code points.
codecs = [
    'ascii', 'cp437', 'cp858', 'cp1252',
    'iso-8859-1', 'macroman', 'utf-8', 'utf-16',
]
for codec in codecs:
    out = open('%s_alnum.txt' % codec, mode='wb')
    with out:
        make_alnum_sample(out, codec, 512)

---

Example 4-4. Snippets of non-ASCII text

In [33]:
def stream_non_ascii_snippets(s, n_before=15, n_after=15):
    """
    Generate a context window for every non-ASCII byte in s.

    s is a byte string; iterating over it must produce integer
    byte values.
    n_before / n_after give the number of bytes of context to
    keep on each side of the byte of interest.

    For each byte whose value is above 127, yields the slice of s
    made of up to n_before preceding bytes, the byte itself and up
    to n_after following bytes.  The window is clipped at the
    start of s; slicing clips it at the end.
    """
    for position, value in enumerate(s):
        if not value > 127:
            continue
        window_start = position - n_before
        if window_start < 0:
            # do not reach back past the first byte
            window_start = 0
        window_stop = position + n_after + 1
        yield s[window_start:window_stop]

In [34]:
CODECS = ['cp858', 'cp1252', 'macroman']
def test_codecs(s, codecs=CODECS):
    """
    prints the codecs that can decode s to a Unicode
    string and those unicode strings
    """
    max_len = max(map(len, codecs))
    for codec in codecs:
        try:
            u = s.decode(codec)
            print(codec.rjust(max_len) + ': ' + u)
        except:
            pass

In [35]:
# Sample bytes: mostly ASCII, with three values above 127 (147, 128, 148).
b = [45, 147, 128, 53, 44, 32, 112, 108, 101, 97, 115, 101, 148, 45]
# Build the byte string from the list defined just above (the name c that
# was used here before is not defined anywhere).
s = bytes(b)
# Decode the window around the first non-ASCII byte with every candidate codec.
test_codecs(next(stream_non_ascii_snippets(s)))

   cp858: -ôÇ5, pleaseö-
  cp1252: -“€5, please”-
macroman: -ìÄ5, pleaseî-


---

Example 4-5. Frequency-count snippets of non-ASCII text

In [36]:
from collections import defaultdict
from operator import itemgetter

In [43]:
def get_non_ascii_byte_counts(s):
    """
    Count how often each non-ASCII byte value occurs in s.

    s is iterated element by element; every element greater than
    127 is tallied.

    Returns a defaultdict(int) mapping byte value -> number of
    occurrences, with keys in order of first appearance.
    """
    tally = defaultdict(int)
    high_values = (value for value in s if value > 127)
    for value in high_values:
        tally[value] = tally[value] + 1
    return tally

In [46]:
def stream_targeted_non_ascii_snippets(s, target_byte, n_before=15, n_after=15):
    """
    Generate a context window for every occurrence of target_byte in s.

    s is a byte string; iterating over it must produce integer
    byte values.
    target_byte is the integer value to look for; elements of s
    are compared to it with ==.
    n_before / n_after give the number of bytes of context to
    keep on each side of a match.

    For each match, yields the slice of s made of up to n_before
    preceding bytes, the matching byte and up to n_after following
    bytes.  The window is clipped at the start of s; slicing clips
    it at the end.
    """
    for position, value in enumerate(s):
        if not value == target_byte:
            continue
        window_start = position - n_before
        if window_start < 0:
            # do not reach back past the first byte
            window_start = 0
        window_stop = position + n_after + 1
        yield s[window_start:window_stop]

In [47]:
# Rank the non-ASCII byte values by frequency, most frequent first.
# itemgetter(1, 0) sorts on (count, byte value), so equal counts are
# ordered by byte value, largest first because of reverse=True.
sorted(get_non_ascii_byte_counts(s).items(), key=itemgetter(1,0), reverse=True)

[(148, 1), (147, 1), (128, 1)]

In [48]:
# Look only at byte 148, keeping 6 bytes of leading context (n_after keeps
# its default), then decode the first matching snippet with each codec.
it = stream_targeted_non_ascii_snippets(s, 148, n_before=6)
test_codecs(next(it))

   cp858: pleaseö-
  cp1252: please”-
macroman: pleaseî-


---

Example 4-7. Normalizing text from Python

In [50]:
# Binary mode: readline() returns the first line as undecoded bytes.
with open('macroman_alnum.txt', mode='rb') as f:
    print(f.readline())

b'0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\xbb\xb5\xbc\xcb\xe7\xe5\xcc\x80\x81\xae\x82\xe9\x83\xe6\xe8\xed\xea\xeb\xec\x84\xf1\xee\xef\xcd\x85\xaf\xf4\xf2\xf3\x86\xa7\x88\x87\x89\x8b\x8a\x8c\xbe\x8d\x8f\x8e\x90\x91\x93\x92\x94\x95\x96\x98\x97\x99\x9b\x9a\xbf\x9d\x9c\x9e\x9f\xd8\xf5\xce\xcf\xd9\xc4\n'


In [51]:
# Read the line as bytes, then decode it explicitly as macroman.
# The line keeps its trailing newline and print() adds another one.
with open('macroman_alnum.txt', mode='rb') as f:
    print(f.readline().decode('macroman'))

0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµºÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜßàáâãäåæçèéêëìíîïñòóôõöøùúûüÿıŒœŸƒ



In [53]:
# Text mode with encoding='macroman': open() does the decoding, so
# readline() returns str directly.
with open('macroman_alnum.txt', encoding='macroman') as f:
    print(f.readline())

0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµºÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜßàáâãäåæçèéêëìíîïñòóôõöøùúûüÿıŒœŸƒ



----

Example 4-8. Decoding URL encoded text

In [54]:
# urllib is a package: the parse submodule has to be imported explicitly.
# A bare "import urllib" only works when something else has already
# loaded urllib.parse.
import urllib.parse

In [57]:
# urlencode builds a key=value query string and percent-encodes the value:
# here '+' becomes %2B and '=' becomes %3D.
urllib.parse.urlencode({'eqn': '1+2==3'})

'eqn=1%2B2%3D%3D3'

In [58]:
# unquote turns each %XX escape back into the character it stands for.
# NOTE: this rebinds s, which earlier cells used for the sample byte string.
s = 'www.example.com/test?eqn=1%2B2%3D%3D3'
urllib.parse.unquote(s)

'www.example.com/test?eqn=1+2==3'

---

Example 4-9. Decoding HTML encoded text

In [64]:
import html

In [82]:
# Two-step HTML encoding: html.escape replaces the markup characters
# (< and > here) with named entities, then encoding to ASCII with
# 'xmlcharrefreplace' turns each non-ASCII character into a numeric
# reference (&#NNN;).  The final decode gives back a str that is pure ASCII.
# NOTE: this rebinds s, which earlier cells used for other values.
s = '<script>//Do Some Évîl</script>'
encoded = html.escape(s).encode('ascii', 'xmlcharrefreplace').decode('ascii')
print(encoded)

&lt;script&gt;//Do Some &#201;v&#238;l&lt;/script&gt;


In [83]:
# html.unescape resolves both the named entities and the numeric
# references in one pass, giving back the original text.
print(html.unescape(encoded))

<script>//Do Some Évîl</script>


---

Example 4-10. Decoding redundantly HTML encoded text

In [85]:
# add a few more layers of encoding
# Each extra html.escape pass rewrites every '&' already present as
# '&amp;', so two more passes turn '&lt;' into '&amp;amp;lt;'.
ss = html.escape(encoded).encode('ascii', 'xmlcharrefreplace').decode('ascii')
ss = html.escape(ss).encode('ascii','xmlcharrefreplace').decode('ascii')
print(ss)

&amp;amp;lt;script&amp;amp;gt;//Do Some &amp;amp;#201;v&amp;amp;#238;l&amp;amp;lt;/script&amp;amp;gt;


In [87]:
# Peel off one layer of HTML encoding per pass, printing each intermediate
# result, and stop at the first pass that leaves the length unchanged.
while True:
    unescaped = html.unescape(ss)
    if len(unescaped) == len(ss):
        break
    ss = unescaped
    print(ss)

&amp;lt;script&amp;gt;//Do Some &amp;#201;v&amp;#238;l&amp;lt;/script&amp;gt;
&lt;script&gt;//Do Some &#201;v&#238;l&lt;/script&gt;
<script>//Do Some Évîl</script>


---

Example 4-11. Quoted CSV

In [None]:
# Quoted-CSV walk-through as one runnable Python 3 cell.  The value each
# print() is expected to show is given in the "# ->" comment below it.
import io
import csv
# s behaves like a file opened for reading
s = io.StringIO('''Name,Job Description
"Bolton, Michael ""Mike""","Programmer"
Bolton,Michael "Mike",Programmer''')
# When we count the fields per line,
# str.split is confused by Name
print(list(map(len, [line.split(',') for line in s])))
# -> [2, 3, 3]
# csv.reader understands quoted name
# (rewind first: the loop above left s at end-of-stream)
s.seek(0)
print(list(map(len, csv.reader(s))))
# -> [2, 2, 3]
s.seek(0)
data = [row for row in csv.reader(s)]
# with quotes the comma in the name
# is not a delimiter
print(repr(data[1][0]))
# -> 'Bolton, Michael "Mike"'
# without quotes all commas are delimiters
print(repr(data[2][0]))
# -> 'Bolton'

In [89]:
import io
import csv

In [90]:
# s behaves like a file opened for reading.  Row 2 quotes a name that
# contains a comma and doubled quotes; row 3 has the same words unquoted.
# NOTE: this rebinds s, which earlier cells used for other values.
s = io.StringIO('''Name,Job Description
"Bolton, Michael ""Mike""","Programmer"
Bolton,Michael "Mike",Programmer''')

In [104]:
# When we count the fields per line,
# str.split is confused by Name: the comma inside the quoted
# name on the second line is treated as a delimiter.
# Iterating over s leaves it at end-of-stream, so it must be
# rewound with seek(0) before it is read again.
list(map(len, [line.split(',') for line in s]))

[2, 3, 3]

In [106]:
# csv.reader understands quoted name
# Rewind s first so the reader starts from the first line.
s.seek(0)
list(map(len, csv.reader(s)))

[2, 2, 3]

In [107]:
# Rewind, then keep every parsed row so individual fields can be inspected.
s.seek(0)
data = list(csv.reader(s))

In [108]:
# with quotes the comma in the name
# is not a delimiter, and the doubled quotes ("")
# come back as single quote characters
data[1][0]

'Bolton, Michael "Mike"'

In [109]:
# without quotes all commas are delimiters,
# so the first field of the last row is only the surname
data[2][0]

'Bolton'