## Encoding and Decoding

In [7]:
s = 'café' 
print(len(s)) # The str 'café' has four Unicode characters
b = s.encode('utf8') # Encode str to bytes using UTF-8 encoding
print(b) # bytes literals start with a b prefix i.e. b'caf\xc3\xa9'
print(len(b)) # bytes b has five bytes (the code point for “é” is encoded as two bytes in UTF-8)
print(type(b))
print(b.decode('utf8')) # Decode bytes to str using UTF-8 encoding

4
b'caf\xc3\xa9'
5
<class 'bytes'>
café


# Byte Essentials

In [10]:
cafe = bytes('café', encoding='utf_8') # bytes can be built from a str, given an encoding
print(cafe) # the first three bytes b'caf' are in the printable ASCII range (0-255), the last two are not

print(cafe[0]) # Each item is an integer in range(256)

print(cafe[:1]) # Slices of bytes are also bytes—even slices of a single byte
cafe_arr = bytearray(cafe) #
print(cafe_arr) #There is no literal syntax for bytearray: they are shown as bytearray() with a bytes literal as argument

print(cafe_arr[-1:]) # A slice of bytearray is also a bytearray.

b'caf\xc3\xa9'
99
b'c'
bytearray(b'caf\xc3\xa9')
bytearray(b'\xa9')


In [11]:
# Binary sequences have a class method that str doesn’t have, called fromhex, which
# builds a binary sequence by parsing pairs of hex digits optionally separated by spaces:
bytes.fromhex('31 4B CE A9')

b'1K\xce\xa9'

In [16]:
# Initializing bytes from raw array data

import array
numbers = array.array('h', [-2, -1, 0, 1, 2]) # Typecode 'h' creates an array of short integers (16 bits)
octets = bytes(numbers) # octets holds a copy of the bytes that make up numbers
print(octets) # These are the 10 bytes that represent the five short integers

# Creating a bytes or bytearray object from any buffer-like source will always copy
# the bytes. In contrast, memoryview objects let you share memory between binary data
# structures. To extract structured information from binary sequences, the struct
# module is invaluable

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'


In [19]:
# Structs and Memory Views

# The struct module provides functions to parse packed bytes into a tuple of fields of
# different types and to perform the opposite conversion, from a tuple into packed
# bytes. struct is used with bytes, bytearray, and memoryview objects

# the memoryview class does not let you
# create or store byte sequences, but provides shared memory access to slices of data
# from other binary sequences, packed arrays, and buffers such as Python Imaging
# Library (PIL) images,2 without copying the bytes


```python
# Example 4-4. Using memoryview and struct to inspect a GIF image header

import struct

fmt = '<3s3sHH' # struct format: < little-endian; 3s3s two sequences of 3 bytes; HH two 16-bit integers

with open('filter.gif', 'rb') as fp:
... img = memoryview(fp.read()) # Create memoryview from file contents in memory
...

header = img[:10] # then another memoryview by slicing the first one; no bytes are copied here
bytes(header) # Convert to bytes for display only; 10 bytes are copied here
b'GIF89a+\x02\xe6\x00'

struct.unpack(fmt, header) # Unpack memoryview into tuple of: type, version, width, and height
(b'GIF', b'89a', 555, 230)

del header # Delete references to release the memory associated with the memoryview instances
del img
```

# Basic Encoders/Decoders

In [29]:
# The Python distribution bundles more than 100 codecs (encoder/decoder) for text to
# byte conversion and vice versa. Each codec has a name, like 'utf_8', and often
# aliases, such as 'utf8', 'utf-8', and 'U8', which you can use as the encoding argument 
# in functions like open(), str.encode(), bytes.decode()


# Example 4-5. The string “El Niño” encoded with three codecs producing very different byte sequences
for codec in ['latin_1', 'utf_8', 'utf_16']:
    print(codec, 'El Niño'.encode(codec), sep='\t')

latin_1	b'El Ni\xf1o'
utf_8	b'El Ni\xc3\xb1o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


In [30]:
city = 'São Paulo'
print(city.encode('utf_8')) # The 'utf_?' encodings handle any str

print(city.encode('utf_16'))

print(city.encode('iso8859_1')) # 'iso8859_1' also works for the 'São Paulo' str

try: # 'cp437' can’t encode the 'ã' (“a” with tilde). The default error handler —'strict'—raises UnicodeEncodeError
    print(city.encode('cp437'))
except Exception as e:
    print(e)

print(city.encode('cp437', errors='ignore')) # The error='ignore' handler silently skips characters that cannot be encoded; this is usually a very bad idea

print(city.encode('cp437', errors='replace')) # When encoding, error='replace' substitutes unencodable characters with '?'; data is lost, but users will know something is amiss

print(city.encode('cp437', errors='xmlcharrefreplace')) # 'xmlcharrefreplace' replaces unencodable characters with an XML entity

b'S\xc3\xa3o Paulo'
b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'
b'S\xe3o Paulo'
'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>
b'So Paulo'
b'S?o Paulo'
b'S&#227;o Paulo'


In [32]:
# How to Discover the Encoding of a Byte Sequence

# Chardet — The Universal Character Encoding Detector
# works to identify one of 30 supported encodings. Chardet is a Python library that you
# Understanding Encode/Decode Problems | 113can use in your programs, but also includes a command-line utility, chardetect.
# Here is what it reports on the source file for this chapter:

# $ chardetect 04-text-byte.asciidoc
# 04-text-byte.asciidoc: utf-8 with confidence 0.99

# Although binary sequences of encoded text usually don’t carry explicit hints of their
# encoding, the UTF formats may prepend a byte order mark to the textual content.

In [37]:
# BOM: A Useful Gremlin

u16 = 'El Niño'.encode('utf_16')
print(u16)

# The bytes are b'\xff\xfe'. That is a BOM—byte-order mark—denoting the “littleendian” 
# byte ordering of the Intel CPU where the encoding was performed

# On a little-endian machine, for each code point the least significant byte comes first:
# the letter 'E', code point U+0045 (decimal 69), is encoded in byte offsets 2 and 3 as 69 and 0:

print(list(u16))

# On a big-endian CPU, the encoding would be reversed; 'E' would be encoded as 0 and 69

b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'
[255, 254, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]


# Handling Text Files

The best practice for handling text is the “Unicode sandwich” (Figure 4-2).4 This
means that bytes should be decoded to str as early as possible on input (e.g., when
opening a file for reading). The “meat” of the sandwich is the business logic of your
program, where text handling is done exclusively on str objects. You should never be
encoding or decoding in the middle of other processing. On output, the str are enco‐
ded to bytes as late as possible. Most web frameworks work like that, and we rarely
touch bytes when using them. In Django, for example, your views should output
Unicode str; Django itself takes care of encoding the response to bytes, using UTF-8
by default

Bytes -> str : Decode bytes on input  
100% str : process using text only  
str -> bytes : encode text back to bytes on output  

Python 3 makes it easier to follow the advice of the Unicode sandwich, because the
open built-in does the necessary decoding when reading and encoding when writing
files in text mode, so all you get from my_file.read() and pass to
my_file.write(text) are str objects.  

Therefore, using text files is simple. But if you rely on default encodings you will get
bitten.

Example 4-9. A platform encoding issue (if you try this on your machine, you may or
may not see the problem)
```python
>>> open('cafe.txt', 'w', encoding='utf_8').write('café')
4
>>> open('cafe.txt').read()
'cafÃ©'
```

Moral of story: always specify encoding when reading/writing

In [42]:
# Encoding defaults: a madhouse

# Several settings affect the encoding defaults for I/O in Python

import sys, locale
expressions = """
    locale.getpreferredencoding()
    type(my_file)
    sys.stdout.isatty()
    sys.stdout.encoding
    sys.stdin.isatty()
    sys.stdin.encoding
    sys.stderr.isatty()
    sys.stderr.encoding
    sys.getdefaultencoding()
    sys.getfilesystemencoding()
"""
my_file = open('dummy', 'w')
for expression in expressions.split():
    value = eval(expression)
    print(expression.rjust(30), '->', repr(value))

 locale.getpreferredencoding() -> 'UTF-8'
                 type(my_file) -> <class '_io.TextIOWrapper'>
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'UTF-8'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'utf-8'


The above was run on ubuntu 18.04

running the same code on windows results in: 

locale.getpreferredencoding() -> 'cp1252'  
type(my_file) -> <class '_io.TextIOWrapper'>  
my_file.encoding -> 'cp1252'  
sys.stdout.isatty() -> True  
sys.stdout.encoding -> 'cp850'  
sys.stdin.isatty() -> True  
sys.stdin.encoding -> 'cp850'  
sys.stderr.isatty() -> True  
sys.stderr.encoding -> 'cp850'  
sys.getdefaultencoding() -> 'utf-8'  
sys.getfilesystemencoding() -> 'mbcs  

the most important encoding setting is that returned by **locale.getpreferredencoding()**: it is the default for opening text files and for sys.stdout/stdin/stderr when they are redirected to files.   

However, the documentation reads (in part):
```
locale.getpreferredencoding(do_setlocale=True)
Return the encoding used for text data, according to user preferences. User pref‐
erences are expressed differently on different systems, and might not be available
programmatically on some systems, so this function only returns a guess. […]
```

Therefore, the best advice about encoding defaults is: do not rely on them.

# Normalizing Unicode for Comparisons


In [51]:
# String comparisons are complicated by the fact that Unicode has combining characters: 
# diacritics and other marks that attach to the preceding character, appearing as one when printed.
# For example, the word “café” may be composed in two ways, using four or five code
# points, but the result looks exactly the same:

s1 = 'café'
s2 = 'cafe\u0301'
print(s1, s2)
print(len(s1), len(s2))
print(s1 == s2)

# The code point U+0301 is the COMBINING ACUTE ACCENT. Using it after “e” renders
# “é”. In the Unicode standard, sequences like 'é' and 'e\u0301' are called “canonical
# equivalents,” and applications are supposed to treat them as the same. But Python
# sees two different sequences of code points, and considers them not equal

# The solution is to use Unicode normalization, provided by the unicodedata.normalize 
# function. The first argument to that function is one of four strings: 'NFC', 'NFD', 'NFKC', and 'NFKD'.

from unicodedata import normalize
s1 = 'café' # composed "e" with acute accent
s2 = 'cafe\u0301' # decomposed "e" and acute accent
print(len(s1), len(s2))
print(len(normalize('NFC', s1)), len(normalize('NFC', s2)))
print(len(normalize('NFD', s1)), len(normalize('NFD', s2)))

# Normalization Form C (NFC) composes the code points to produce the shortest equivalent string
print(normalize('NFC', s1) == normalize('NFC', s2))

# NFD decomposes, expanding composed characters into base characters and separate combining characters
print(normalize('NFD', s1) == normalize('NFD', s2))

# Western keyboards usually generate composed characters, so text typed by users will
# be in NFC by default. However, to be safe, it may be good to sanitize strings with
# normalize('NFC', user_text) before saving

café café
4 5
False
4 5
4 4
5 5
True
True


In [58]:
# Example 4-14. Function to remove all combining marks (module sanitize.py)
import unicodedata
import string

def shave_marks(txt):
    """Remove all diacritic marks"""
    norm_txt = unicodedata.normalize('NFD', txt)
    shaved = ''.join(c for c in norm_txt if not unicodedata.combining(c))
    return unicodedata.normalize('NFC', shaved)


order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
#                                         ^                  ^
print(shave_marks(order))

Greek = 'Ζέφυρος, Zéfiro'
#         ^        ^
print(shave_marks(Greek))

“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”
Ζεφυρος, Zefiro


In [59]:
def shave_marks_latin(txt):
    """Remove all diacritic marks from Latin base characters"""
    norm_txt = unicodedata.normalize('NFD', txt)
    latin_base = False
    keepers = []
    for c in norm_txt:
        if unicodedata.combining(c) and latin_base:
            continue # ignore diacritic on Latin base char
        keepers.append(c)
        # if it isn't combining char, it's a new base char
        if not unicodedata.combining(c):
            latin_base = c in string.ascii_letters
    shaved = ''.join(keepers)
    return unicodedata.normalize('NFC', shaved)

In [61]:
# Example 4-17. Transform some Western typographical symbols into ASCII

single_map = str.maketrans("""‚ƒ„†ˆ‹‘’“”•–—˜›""",
                           """'f"*^<''""---~>""")

multi_map = str.maketrans({
    '€': '<euro>',
    '…': '...',
    'Œ': 'OE',
    '™': '(TM)',
    'œ': 'oe',
    '‰': '<per mille>',
    '‡': '**',
})

multi_map.update(single_map)

def dewinize(txt):
    """Replace Win1252 symbols with ASCII chars or sequences"""
    return txt.translate(multi_map)

def asciize(txt):
    no_marks = shave_marks_latin(dewinize(txt))
    no_marks = no_marks.replace('ß', 'ss')
    return unicodedata.normalize('NFKC', no_marks)

# Examples

order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
print(dewinize(order))
print(asciize(order))

"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."
"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."


In [66]:
# Sorting Unicode text

# Example 4-19. Using the locale.strxfrm function as sort key

import locale

locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8') # exception will be thrown if local is not installed

fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']

sorted_fruits = sorted(fruits, key=locale.strxfrm)

print(sorted_fruits)

Error: unsupported locale setting

In [67]:
# Sorting with the Unicode Collation Algorithm
# James Tauber, prolific Django contributor, must have felt the pain and created
# PyUCA, a pure-Python implementation of the Unicode Collation Algorithm (UCA).
# Example 4-20 shows how easy it is to use.
# Example 4-20. Using the pyuca.Collator.sort_key method

import pyuca
coll = pyuca.Collator()
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted_fruits = sorted(fruits, key=coll.sort_key)
print(sorted_fruits)

ModuleNotFoundError: No module named 'pyuca'

In [69]:
import unicodedata
import re

re_digit = re.compile(r'\d')

sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

#http://www.unicode.org/Public/UCA/6.3.0/allkeys.txt

for char in sample:
    print('U+%04x' % ord(char),
        char.center(6),
        're_dig' if re_digit.match(char) else '-',
        'isdig' if char.isdigit() else '-',
        'isnum' if char.isnumeric() else '-',
        format(unicodedata.numeric(char), '5.2f'),
        unicodedata.name(char),
        sep='\t')

U+0031	  1   	re_dig	isdig	isnum	 1.00	DIGIT ONE
U+00bc	  ¼   	-	-	isnum	 0.25	VULGAR FRACTION ONE QUARTER
U+00b2	  ²   	-	isdig	isnum	 2.00	SUPERSCRIPT TWO
U+0969	  ३   	re_dig	isdig	isnum	 3.00	DEVANAGARI DIGIT THREE
U+136b	  ፫   	-	isdig	isnum	 3.00	ETHIOPIC DIGIT THREE
U+216b	  Ⅻ   	-	-	isnum	12.00	ROMAN NUMERAL TWELVE
U+2466	  ⑦   	-	isdig	isnum	 7.00	CIRCLED DIGIT SEVEN
U+2480	  ⒀   	-	-	isnum	13.00	PARENTHESIZED NUMBER THIRTEEN
U+3285	  ㊅   	-	-	isnum	 6.00	CIRCLED IDEOGRAPH SIX
